I am scraping artists from discogs.com, but I am unable to get the artist names as they appear on the page. For example, the artist Andrés appears as Andr\xe9s when I run my code.
Can anyone explain what I'm doing wrong?
from bs4 import BeautifulSoup
import requests
import urllib2
from itertools import chain
import codecs
# Request headers sent with every page fetch; supplies a browser-style
# User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
}

# Filled in by load_artists() with the values scraped from each page.
all_artists = list()

# Number of search result pages to fetch, starting at page 1.
# NOTE(review): the trailing 446 is presumably the full page count -- confirm.
result_pages = 1  # 446
def load_artists():
    """Fetch the Discogs search result pages and collect the listed titles.

    Requests pages 1 through ``result_pages`` (inclusive) of the search URL,
    parses each response body as HTML, and appends the ``title`` attribute of
    every element matching ``div#search_results h5 span`` to the module-level
    ``all_artists`` list.

    Returns:
        None. Results accumulate in ``all_artists``.
    """
    # The query string is constant; only the page number varies per request.
    base_url = (
        'https://www.discogs.com/search/'
        '?sort=have%2Cdesc&style_exact=House&genre_exact=Electronic'
        '&decade=2010&page='
    )
    # range() behaves the same here on Python 2 and Python 3.
    for page in range(1, result_pages + 1):
        response = requests.get(base_url + str(page), headers=headers)
        # Decode the raw bytes explicitly so the parser receives text.
        soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
        # extend() appends in place without building a throwaway list.
        all_artists.extend(
            tag["title"] for tag in soup.select('div#search_results h5 span')
        )
load_artists()
# A bare `all_artists` expression has no effect in a script, and in an
# interactive session it only displays the list's repr(), which escapes
# non-ASCII characters (e.g. \xe9). Printing each name individually shows
# the text itself.
for artist_name in all_artists:
    print(artist_name)