I have some code for a web scraper which exports to Excel. The person who helped me write it uses Python 2.7, but I have only been learning for a couple of months and am using Python 3.5. Would anyone be able to help me adapt this code, please?
#import urllib2 as urllib
#import urllib.request
import requests
from bs4 import BeautifulSoup
import datetime
import xlsxwriter
import sys


# Web scraping
def make_soup(url):
    #the_page = urllib.request.urlopen(url)
    res = requests.get(url)
    the_page = res.text
    soup_data = BeautifulSoup(the_page, "html.parser")
    return soup_data

soup = make_soup('http://www.url.co.uk/')


def getNames():
    ret_names_list = ['Names']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('td', {"class": "propname"}):
                #import unicodedata
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                #unicodedata.normalize('NFKD', td_in_data).encode('ascii','ignore')
                print(td_in_data)
                ret_names_list.append(td_in_data)
    return ret_names_list


def getRooms():
    ret_rooms_list = ['Rooms']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('span', {"class": "beds"}):
                print(td_in_data.text)
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                ret_rooms_list.append(td_in_data)
    return ret_rooms_list


def getRents():
    ret_rents_list = ['Rents']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('td', {"class": "rentprice"}):
                print(td_in_data.text)
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                ret_rents_list.append(td_in_data)
    return ret_rents_list


''' To do: get the scraped data to an Excel doc.'''

if __name__ == '__main__':
    # Create a workbook and add a worksheet.
    todays_date = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")) + '.xlsx'
    todays_date = todays_date.replace(" ", "_").replace(":", "_")
    workbook = xlsxwriter.Workbook(todays_date)
    worksheet = workbook.add_worksheet()

    # Data to Excel.
    excel_dump = zip(getNames(), getRents(), getRooms())
    #Excel_dump = (
    #    ['Name', getNames()],
    #    ['Rent', getRents()],
    #    ['Rooms', getRooms()]
    #)

    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0

    # Iterate over the data and write it out row by row.
    for name, rent, room in excel_dump:
        try:
            reload(sys)
            sys.setdefaultencoding('Cp1252')
            worksheet.write(col, row, str(name))
            worksheet.write(col + 1, row, rent)
            worksheet.write(col + 2, row, room)
        except Exception as e:
            raise e
        #col += 1
        row += 1

    workbook.close()
The error messages I am getting for this are:
^Scraped data in memory^
Traceback (most recent call last):
  File "C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py", line 91, in <module>
    raise e
  File "C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py", line 85, in <module>
    reload(sys)
NameError: name 'reload' is not defined
Exception ignored in: <bound method Workbook.__del__ of <xlsxwriter.workbook.Workbook object at 0x...>>
Traceback (most recent call last):
  File "C:\Users\joseph.devlin\AppData\Roaming\Python\Python35\site-packages\xlsxwriter\workbook.py", line 149, in __del__
Exception: Exception caught in workbook destructor. Explicit close() may be required for workbook.
Process finished with exit code 1
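
From the digging I have done so far, `reload` was a builtin in Python 2 but lives in `importlib` in Python 3, and `sys.setdefaultencoding` was removed from Python 3 altogether, so I can see why the NameError happens; the second traceback seems to follow from it, because the exception stops `workbook.close()` from ever running. Since `worksheet.write()` accepts Unicode strings directly, my guess is the whole try/except block can go. A rough, untested sketch of what the write loop might become (note it also swaps the first two arguments, because xlsxwriter's signature is `write(row, col, data)`):

    # Row 0 gets the 'Names'/'Rents'/'Rooms' headers, since each
    # get* function puts its header at the front of its list.
    for row, (name, rent, room) in enumerate(zip(getNames(), getRents(), getRooms())):
        worksheet.write(row, 0, name)  # column A: names
        worksheet.write(row, 1, rent)  # column B: rents
        worksheet.write(row, 2, room)  # column C: rooms
    workbook.close()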
That is as far as my own research has got me, but any help would be appreciated! The other Python 2 leftover I think I have spotted is the `.encode(sys.stdout.encoding, errors='replace')` line in each scraping function.
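
On 3.5 that call turns each cell's text into `bytes`, which would explain why `str(name)` ends up writing values like `b'Some Name'` into the sheet. Since `td_in_data.text` is already a `str` in Python 3, my understanding is the encoding step can be dropped entirely. A rough, untested sketch of getNames() along those lines (the other two functions would follow the same pattern):

    def getNames():
        ret_names_list = ['Names']
        for record in soup.findAll('tr'):
            for data in record.findAll('td'):
                for cell in data.findAll('td', {"class": "propname"}):
                    # .text is already a str in Python 3, so no .encode() needed
                    print(cell.text)
                    ret_names_list.append(cell.text)
        return ret_names_list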