I'm building a simple scraper in order to learn python. After writing the csvWriter function below, I'm having issues. It seems that the encoding can't be written to csv file (I assume this is because of price information I'm scraping).
Also, I'm wondering if I am correct in thinking that in this case, it is best to go from set -> list to get the information zipped and presented in the way that I want before writing.
Also - any general advice on how I am approaching this?
from bs4 import BeautifulSoup
import requests
import time
import csv
response = request.get('http://website.com/subdomain/logqueryhere')
baseurl = 'http://website.com'
soup = BeautifulSoup(response.text)
hotelInfo = soup.find_all("div", {'class': "hotel-wrap"})
#retrieveLinks: A function to generate a list of hotel URL's to be passed to the price checker.
def retrieveLinks():
for hotel in hotelInfo:
urllist = []
hotelLink = hotel.find('a', attrs={'class': ''})
urllist.append(hotelLink['href'])
scraper(urllist)
hotelnameset = set()
hotelurlset = set()
hotelpriceset = set()
# Scraper: A function to scrape from the lists generated above with retrieveLinks
def scraper(inputlist):
global hotelnameset
global hotelurlset
global hotelpriceset
#Use a set here to avoid any dupes.
for url in inputlist:
fullurl = baseurl + url
hotelurlset.add(str(fullurl))
hotelresponse = requests.get(fullurl)
hotelsoup = BeautifulSoup(hotelresponse.text)
hoteltitle = hotelsoup.find('div', attrs={'class': 'vcard'})
hotelhighprice = hotelsoup.find('div', attrs={'class': 'pricing'}).text
hotelpriceset.add(hotelhighprice)
for H1 in hoteltitle:
hotelName = hoteltitle.find('h1').text
hotelnameset.add(str(hotelName))
time.sleep(2)
csvWriter()
#csvWriter: A function to write the above mentioned sets/lists to a CSV file.
def csvWriter():
global hotelnameset
global hotelurlset
global hotelpriceset
csvname = list(hotelnameset)
csvurl = list(hotelurlset)
csvprice = list(hotelpriceset)
#lets zip the values we neded (until we learn a better way to do it)
zipped = zip(csvname, csvurl, csvprice)
c = csv.writer(open("hoteldata.csv", 'wb'))
for row in zipped:
c.writerow(row)
retrieveLinks()
Error is as follows -
± |Add_CSV_Writer U:2 ✗| → python main.py
Traceback (most recent call last):
File "main.py", line 62, in <module>
retrieveLinks()
File "main.py", line 18, in retrieveLinks
scraper(urllist)
File "main.py", line 44, in scraper
csvWriter()
File "main.py", line 60, in csvWriter
c.writerow(row)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u20ac' in position 0: ordinal not in range(128)