I have a problem in my scraping function.
In this project I have a sqlite3 database which contains links to musical albums' reviews. I create a scraper.py file which contains these 2 methods:
from bs4 import BeautifulSoup
import requests
def take_source(url):
if 'http://' or 'https://' in url:
source = requests.get(url).text
return source
else:
print("Invalid URL")
def extract_corpus(source):
soup = BeautifulSoup(source, "html.parser")
soup.prettify().encode('cp1252', errors='ignore')
corpus = []
for e in soup.select("p"):
corpus.append(e.text)
return corpus
I call the extract_corpus method in a file called embedding.py, In this file I create a connection with the sqlite3 database and I put data in a Pandas Dataframe. I want to store the content of all the links in a csv file. My embedding.py file contains:
import sqlite3
import pandas as pd
import scraper
import csv
#create connection with sqlite db
con = sqlite3.connect("database.sqlite")
#creating a pandas data frame
query = pd.read_sql_query("SELECT url, artist, title FROM reviews;", con)
#populating data frame with urls
df = pd.DataFrame(query, columns=['url', 'artist', 'title'])
#preparing the .csv file for storing the reviews
with open('reviews.csv', 'w') as csvfile:
fieldnames = ['title', 'artist', 'review']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
def append_csv(tit,art,rev):
with open('reviews.csv','a') as csv_f:
writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
writer.writerow({'title': tit, 'artist':art,'review':rev})
for i, row in df.iterrows():
album = (str(row.__getitem__('title')))
artist = (str(row.__getitem__('artist')))
review = str(scraper.extract_corpus(scraper.take_source(str(row.__getitem__('url')))))
append_csv(album,artist,review)
When I run this file, it works for an initial group of links, then it breaks returning the error in the title. This is the error:
Traceback (most recent call last): File "C:/Users/kikko/PycharmProjects/SongsBot/embedding.py", line 59, in append_csv(album,artist,review) File "C:/Users/kikko/PycharmProjects/SongsBot/embedding.py", line 52, in append_csv writer.writerow({'title': tit, 'artist':art,'review':rev}) File "C:\Users\kikko\AppData\Local\Programs\Python\Python37-32\lib\csv.py", line 155, in writerow return self.writer.writerow(self._dict_to_list(rowdict)) File "C:\Users\kikko\AppData\Local\Programs\Python\Python37-32\lib\encodings\cp1252.py", line 19, in encode return codecs.charmap_encode(input,self.errors,encoding_table)[0] UnicodeEncodeError: 'charmap' codec can't encode character '\u011f' in position 1087: character maps to
Unfortunately, I can't find the error.