I met UnicodeEncodeError while crawling Wikipedia dump json file. Here are my code snippet and the error message. It seems like the character 'é' cause this problem. However, I do not know how to solve this issue.
import urllib2
import json
# List of philosopher's name: mergel list
# print mergel
i = 0
for name in mergel:
# Use the API to get the page content in a format that we like.
# https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
# set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
i = i+1
baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
titlename = name.replace(" ", "_")
print titlename
title = "titles="+titlename
content = "prop=revisions&rvprop=content"
dataformat = "format=json"
# construct the query
query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
print query
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
# print wikisource
wikijson = json.loads(wikisource)
jsonfilename = './json/'+titlename+'.json'
with open(jsonfilename, 'w') as outfile:
json.dump(wikijson, outfile)
Error message:
Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
21 query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
22 print query
---> 23 wikiresponse = urllib2.urlopen(query)
24 wikisource = wikiresponse.read()
25 # print wikisource
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
429 req = meth(req)
430
--> 431 response = self._open(req, data)
432
433 # post-process response
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
447 protocol = req.get_type()
448 result = self._call_chain(self.handle_open, protocol, protocol +
--> 449 '_open', req)
450 if result:
451 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
407 func = getattr(handler, meth_name)
408
--> 409 result = func(*args)
410 if result is not None:
411 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
1238 def https_open(self, req):
1239 return self.do_open(httplib.HTTPSConnection, req,
-> 1240 context=self._context)
1241
1242 https_request = AbstractHTTPHandler.do_request_
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
1192
1193 try:
-> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers)
1195 except socket.error, err: # XXX what error?
1196 h.close()
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1051 def request(self, method, url, body=None, headers={}):
1052 """Send a complete request to the server."""
-> 1053 self._send_request(method, url, body, headers)
1054
1055 def _set_content_length(self, body, method):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1091 for hdr, value in headers.iteritems():
1092 self.putheader(hdr, value)
-> 1093 self.endheaders(body)
1094
1095 def getresponse(self, buffering=False):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
867 datablock = data.read(blocksize)
868 else:
--> 869 self.sock.sendall(data)
870
871 def _output(self, s):
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
719 count = 0
720 while (count < amount):
--> 721 v = self.send(data[count:])
722 count += v
723 return amount
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
685 self.__class__)
686 try:
--> 687 v = self._sslobj.write(data)
688 except SSLError as x:
689 if x.args[0] == SSL_ERROR_WANT_READ:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)
However, below simple & direct code without getting a title from a list, just works without any issues.
import urllib2
import json
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
wikijson = json.loads(wikisource)
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json'
with open(jsonfilename, 'w') as outfile:
json.dump(wikijson, outfile)