-1

I am designing a scraping project for my research, but I am stuck on writing the scraped data to a CSV file. Could you please help me with that?

I have successfully scraped the data, but I want to store it in a CSV file. Below is my code.

I need to write code to pull all of the HTML from a website and then save it to a CSV file.

I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.

This is what I have so far:

import requests
import time
from bs4 import BeautifulSoup
import csv



# Collect and parse the first page of job listings.
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')

print("Wait Scraper is working on ")
if page.status_code != 200:
    print("Error in Scraping check the url")
else:
    print("Successfully scrape the data")
    print("Loading data in csv")
    # newline='' prevents blank interleaved rows on Windows; the 'with'
    # block guarantees the file is flushed and closed (the original left
    # the handle open for the life of the process).
    with open('dataminer.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])

        # Iterate per job card so all fields of one posting land on one
        # CSV row. The original looped over each field class separately,
        # which cannot keep a job's fields aligned, and only the profile
        # name was ever written.
        for job in soup.find_all('div', attrs={'class': 'job-page'}):
            fields = (
                job.find(class_='profile-name'),
                job.find(class_='company_name'),
                job.find(class_='salary'),
                job.find(class_='jobText'),
                job.find(class_='location'),
            )
            # A missing element becomes an empty cell instead of crashing.
            writer.writerow([
                el.get_text(strip=True) if el else ''
                for el in fields
            ])

        

            



  • First, save the result in a list using `.append()`. Then save into `csv` file. Refer to this [thread](https://stackoverflow.com/questions/2084069/create-a-csv-file-with-values-from-a-python-list) – YusufUMS Mar 26 '19 at 09:05
  • I am new to this can you please show how to do that ? – Bhushan Patil Mar 26 '19 at 09:12

2 Answers2

0

Make a dict and save the data into it then save to csv, check below code!

import requests
import time
from bs4 import BeautifulSoup
import csv



# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait Scrapper is working on ")
if(page.status_code != 200):
    print("Error in Srapping check the url")
else:
    print("Successfully scrape the data")
    # One dict per job card keeps all of a posting's fields on one row.
    for x in soup.find_all('div', attrs={'class': 'job-page'}):
        # get_text(strip=True) returns a clean str. The original used
        # .text.encode('utf-8'), which produced bytes objects that the
        # csv module renders as the literal text "b'...'" — the
        # "unwanted content" reported in the comments. strip=True also
        # removes the stray tabs/newlines mentioned there.
        data.append({
            'pname': x.find(class_="profile-name").get_text(strip=True),
            'cname': x.find(class_="company_name").get_text(strip=True),
            'salary': x.find(class_="salary").get_text(strip=True),
            'lpa': x.find(class_="jobText").get_text(strip=True),
            'loc': x.find(class_="location").get_text(strip=True)})

print("Loading data in csv")
# newline='' avoids blank interleaved rows on Windows; an explicit
# encoding makes the output file deterministic across platforms.
with open('dataminer.csv', 'w', newline='', encoding='utf-8') as f:
    fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
Sohan Das
  • 1,560
  • 2
  • 15
  • 16
  • Hi thank you so much for the code it's really works great!. I have one question why some unwanted contain are getting in csv file? – Bhushan Patil Mar 26 '19 at 09:47
  • you can use `replace()` to replace those unwanted content and for tab and new line use `strip()` – Sohan Das Mar 26 '19 at 09:51
0

Apart from what you got in the other answer, you can also scrape and write the content at the same time. I used `.select()` instead of `.find_all()` to achieve the same result.

import csv
import requests
from bs4 import BeautifulSoup

URL = "https://www.myamcat.com/jobs"

# Fetch the listings page once and parse it with the lxml backend.
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')

with open('myamcat_doc.csv', 'w', newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(['pname', 'cname', 'salary', 'loc'])

    # One CSS selector per output column, in header order; each
    # ".job-listing .content" node is a single job card, so each card
    # yields exactly one CSV row.
    column_selectors = (
        ".profile-name h3",
        ".company_name",
        ".salary .jobText",
        ".location .jobText",
    )
    for card in soup.select(".job-listing .content"):
        row = [
            card.select_one(selector).get_text(strip=True)
            for selector in column_selectors
        ]
        writer.writerow(row)
SIM
  • 21,997
  • 5
  • 37
  • 109