i'm having some issues with python encode and cant find any help in the web!
In the first, i begin developer in python a 3 months ago, so i'm a beginer!
I'm doing some scraper and i having a issue with a encode, the error code is:
Traceback (most recent call last):
File "/home/thiago/crawler/src/link_produto.py", line 145, in <module>
crawler(link)
File "/home/thiago/crawler/src/link_produto.py", line 125, in crawler
cursor.execute(sql)
File "/usr/local/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 181, in execute
query = query.encode(db.unicode_literal.charset)
UnicodeEncodeError: 'latin-1' codec can't encode character u'\u201d' in position 5013: ordinal not in range(256)
[Finished in 1.633s]
My Code here:
from time import gmtime, strftime
import MySQLdb
import requests
from bs4 import BeautifulSoup as bs
import re
import json
def crawler(link_cat):
html = requests.get(link_cat)
soup = bs(html.content, "lxml")
for div in soup.find_all('a', {"class" : "last-page"}):
lp = div['href']
regex = r"^.*\/([0-9]+)\/$"
strlp = lp
matches = re.search(regex, strlp)
if matches:
for groupNum in range(0, len(matches.groups())):
groupNum = groupNum + 1
valor_final = matches.group(groupNum)
valor_final = int(matches.group(groupNum))
print('1º STEP: A Qtd de páginas da categoria é', valor_final)
vetor = []
for i in range(0, valor_final):
vetor.insert (i,'%s%d' %(link_cat, i+1) + '/')
print(i)
vetor[0] = link_cat
for i in vetor:
html = requests.get(i)
soup = bs(html.content, "lxml")
for a in soup.find_all('a', {"class" : "product-li"}):
last_update = strftime("%Y-%m-%d %H:%M:%S", gmtime())
rank_cat = str(i)
rank_pagina_cat = 2
rank_site = 300
url = requests.get(a['href'])
html_sku = url.content
soup_sku = bs(html_sku, 'html.parser')
title = soup_sku.title.string
SKU = soup_sku.find(string=re.compile("digo.*"))
rows = soup_sku.find_all('tr')
specList = []
for row in rows:
data = row.find_all('td')
spec = {data[0].get_text() : data[1].get_text()}
specList.append(spec)
product = {'title': title, 'sku' : SKU[+7:], 'spec' : specList}
productJSON = json.dumps(product)
productJSON = productJSON.encode(encoding='UTF-8',errors='strict')
print(a['href'], last_update, rank_cat, rank_pagina_cat, rank_site, SKU[+7:], str(productJSON), 'title')
db = MySQLdb.connect("ipbd","user","pass","bd" )
cursor = db.cursor()
sql = "INSERT INTO link_produto(desc_link, \
last_update, rank_cat, rank_pagina_cat, rank_site, sku, json_encode, titulo) \
VALUES ('%s', '%s', '%s', '%d', '%d', '%s', '%s', '%s' )" % \
(a['href'], last_update, rank_cat, rank_pagina_cat, rank_site, SKU[+7:], productJSON, title)
#try:
# Execute the SQL command
cursor.execute(sql)
# Commit your changes in the database
db.commit()
#except:
# Rollback in case there is any error
db.rollback()
# disconnect from server
db.close()
print ("3ºSTEP:", a['href'])
link = 'https://www.linktodoscrapper.com'
crawler(link)
`
I understand that I am not using the best practices of code organization, so if you want to give an opinion on how I should structure this code, I thank you