I'm running following code for web scraper:
25 # save source page and return xpath tree
26 def scrape_Page(url, path):
27 page = requests.get(url)
28 tree = html.fromstring(page.text)
29 # save html content
30 file_name = url.split('/')[-1] + ".html"
31 with open(os.path.join(path, file_name), 'wb') as srcFile:
32 webPage = urllib.urlopen(url)
33 wPageSrc = webPage.read()
34 webPage.close()
35 # write to text file
36 srcFile.write(wPageSrc)
37 return tree
The code works well for some url, but fails for few others, and here's the error message I got:
tree = html.fromstring(page.text)
File "/Library/Python/2.7/site-packages/lxml/html/__init__.py", line 669, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/Library/Python/2.7/site-packages/lxml/html/__init__.py", line 563, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2993, in lxml.etree.fromstring (src/lxml/lxml.etree.c:62433)
File "parser.pxi", line 1584, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:91750)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.