
I ran into a UnicodeEncodeError while crawling Wikipedia pages as JSON via the API. Here are my code snippet and the error message. It seems that the character 'é' causes the problem, but I do not know how to solve it.

import urllib2
import json

# List of philosophers' names: mergel
# print mergel
i = 0
for name in mergel:
# Use the API to get the page content in a format that we like.
# https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
# set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
    i = i+1
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    titlename = name.replace(" ", "_")
    print titlename
    title = "titles="+titlename
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"

# construct the query
    query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
    print query
    wikiresponse = urllib2.urlopen(query)
    wikisource = wikiresponse.read()
#     print wikisource
    wikijson = json.loads(wikisource)
    jsonfilename = './json/'+titlename+'.json'
    with open(jsonfilename, 'w') as outfile:
        json.dump(wikijson, outfile)

Error message:

Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
     21     query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
     22     print query
---> 23     wikiresponse = urllib2.urlopen(query)
     24     wikisource = wikiresponse.read()
     25 #     print wikisource

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    152     else:
    153         opener = _opener
--> 154     return opener.open(url, data, timeout)
    155 
    156 def install_opener(opener):

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
    429             req = meth(req)
    430 
--> 431         response = self._open(req, data)
    432 
    433         # post-process response

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
    447         protocol = req.get_type()
    448         result = self._call_chain(self.handle_open, protocol, protocol +
--> 449                                   '_open', req)
    450         if result:
    451             return result

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    407             func = getattr(handler, meth_name)
    408 
--> 409             result = func(*args)
    410             if result is not None:
    411                 return result

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
   1238         def https_open(self, req):
   1239             return self.do_open(httplib.HTTPSConnection, req,
-> 1240                 context=self._context)
   1241 
   1242         https_request = AbstractHTTPHandler.do_request_

/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
   1192 
   1193         try:
-> 1194             h.request(req.get_method(), req.get_selector(), req.data, headers)
   1195         except socket.error, err: # XXX what error?
   1196             h.close()

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
   1051     def request(self, method, url, body=None, headers={}):
   1052         """Send a complete request to the server."""
-> 1053         self._send_request(method, url, body, headers)
   1054 
   1055     def _set_content_length(self, body, method):

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
   1091         for hdr, value in headers.iteritems():
   1092             self.putheader(hdr, value)
-> 1093         self.endheaders(body)
   1094 
   1095     def getresponse(self, buffering=False):

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
   1047         else:
   1048             raise CannotSendHeader()
-> 1049         self._send_output(message_body)
   1050 
   1051     def request(self, method, url, body=None, headers={}):

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
    891             msg += message_body
    892             message_body = None
--> 893         self.send(msg)
    894         if message_body is not None:
    895             #message_body was not a string (i.e. it is a file) and

/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
    867                 datablock = data.read(blocksize)
    868         else:
--> 869             self.sock.sendall(data)
    870 
    871     def _output(self, s):

/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
    719             count = 0
    720             while (count < amount):
--> 721                 v = self.send(data[count:])
    722                 count += v
    723             return amount

/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
    685                     self.__class__)
    686             try:
--> 687                 v = self._sslobj.write(data)
    688             except SSLError as x:
    689                 if x.args[0] == SSL_ERROR_WANT_READ:

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)

However, the simple, direct code below, which hard-codes the title instead of taking it from a list, works without any issues.

import urllib2
import json
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
wikijson = json.loads(wikisource)
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json'
with open(jsonfilename, 'w') as outfile:
    json.dump(wikijson, outfile)
SUNDONG
  • I found that my code works after adding the line `title = title.encode('utf-8')` after `title = "titles="+titlename`. However, I don't clearly understand why that fixes it. – SUNDONG Sep 27 '15 at 15:12
  • While it's not an answer to your question: using urllib2 to perform HTTP requests is a bit un-pythonic. I would suggest using [requests](http://www.python-requests.org/en/latest/) to make your life a lot easier; see if using that solves your problem. With regard to your actual problem, you probably need to encode your title using the 'idna' encoding: https://docs.python.org/2/library/codecs.html#python-specific-encodings – Michael Aquilina Sep 27 '15 at 16:25
  • @MichaelAquilina: `idna` has nothing to do with it: the `en.wikipedia.org` domain name is pure ASCII. Also, there is no need to use `requests` to make a simple HTTP GET request here. This question might have the same issue as [How do I post non-ASCII characters using httplib when content-type is “application/xml”](http://stackoverflow.com/q/7993175/4279), i.e., bytestrings and unicode strings are mixed. – jfs Sep 27 '15 at 23:45
  • related: [How to deal with unicode string in URL in python3?](http://stackoverflow.com/q/11818362/4279) – jfs Sep 28 '15 at 00:46

1 Answer


Don't mix Unicode and bytestrings: use Unicode strings to work with text in Python.
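For example, here's a minimal sketch of the failure, assuming name comes out of mergel as a Unicode string: concatenating it into the URL produces a unicode string, which then has to be encoded to ASCII bytes before being sent over the socket:

title = u"Claude L\xe9vi-Strauss"  # Unicode, as read from mergel
query = "titles=" + title          # bytestring + unicode -> unicode
query.encode('ascii')              # what httplib effectively does before sending
# UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' ...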

Don't create URLs by hand; use urllib functions such as quote() and urlencode(). Also, consider functions from the urlparse module, such as urljoin() and urlunsplit().
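For instance, a minimal sketch using urlencode() (parameter values taken from the query above); it percent-encodes the UTF-8 bytes of the title for you, and the escaped form u"Claude L\xe9vi-Strauss" stands in for the accented literal:

from urllib import urlencode

params = urlencode({
    'action': 'query',
    'prop': 'revisions',
    'rvprop': 'content',
    'format': 'json',
    'titles': u"Claude L\xe9vi-Strauss".encode('utf-8'),
})
query = "https://en.wikipedia.org/w/api.php?" + params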

You've already requested JSON format, so there is no need to parse the response only to dump it back immediately in the same format; you can use shutil.copyfileobj() to copy the file-like response object directly. You can check the resulting file later, to make sure it has been downloaded correctly.

Putting it all together, here's how to save a wiki-page with a given title to a file in JSON format:

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from contextlib import closing
from urllib import quote
from urllib2 import urlopen
from shutil import copyfileobj

def urlretrieve(url, filename, chunksize=8096):
    with closing(urlopen(url)) as response, open(filename, 'wb') as file:
        copyfileobj(response, file, chunksize)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss" #NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))

Note:

  • you don't need to .replace(' ', '_') in this case

  • the os.path.join('json', name + '.json') line mixes bytestrings ('json', '.json') and Unicode (type(name) == unicode). It is OK here because both 'json' and '.json' are ASCII-only literals in the source code

  • the # -*- coding: utf-8 -*- declaration affects only characters that appear literally in your Python source code; it is accidental that the query string happens to use the same encoding in this particular case. The encoding of your source code bears no relation to the character encodings that may be used for filenames, for transferring data over HTTP, or for writing Unicode text to the terminal (all of these encodings may differ from one another)

  • In principle, you could have used urllib.urlretrieve(url, filename) here instead of urlopen() + copyfileobj() (see the sketch after these notes), but urllib.urlretrieve() behaves differently from urllib2.urlopen() on Python 2
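
A sketch of that alternative, with the caveat that on Python 2 urllib.urlretrieve() may silently save an HTTP error page instead of raising, so you would need to check the saved file yourself:

import os
from urllib import quote, urlretrieve

name = u"Claude L\xe9vi-Strauss"  # Unicode title
url = ("https://en.wikipedia.org/w/api.php?"
       "action=query&prop=revisions&rvprop=content&format=json&"
       "titles=" + quote(name.encode('utf-8')))
urlretrieve(url, os.path.join('json', name + '.json'))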

Here's the same code using requests:

#!/usr/bin/env python2
# -*- coding: utf-8 -*-    
import os
from urllib import quote
import requests # $ pip install requests

def urlretrieve(url, filename, chunksize=8096):
    r = requests.get(url, stream=True)
    r.raise_for_status() # raise on http error
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunksize): 
            f.write(chunk)

#XXX for name in mergel:
name = u"Claude Lévi-Strauss" #NOTE: Unicode string
urlretrieve("https://en.wikipedia.org/w/api.php?"
            "action=query&prop=revisions&rvprop=content&format=json&"
            "titles=" + quote(name.encode('utf-8')),
            os.path.join('json', name + '.json'))

However, the simple, direct code below, which hard-codes the title instead of taking it from a list, works without any issues.

Your code uses a non-ASCII bytestring literal (which is illegal in Python 3). There is no encoding error because all of the data is already bytes. The issue with bytestrings is that they break whenever different environments use different character encodings, and they do (you can't expect everything to use utf-8, however desirable that might be). Also, the query part of the URL should be properly percent-encoded, e.g., é should be sent as '%C3%A9'.
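
A quick check of that encoding (using the escaped form u'\xe9' for é):

from urllib import quote
print quote(u'\xe9'.encode('utf-8'))  # -> %C3%A9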


Unrelated: to download several web-pages at once, you could use a thread pool:

from multiprocessing.dummy import Pool # use threads

def download(name):
    urlretrieve("https://en.wikipedia.org/w/api.php?"
                "action=query&prop=revisions&rvprop=content&format=json&"
                "titles=" + quote(name.encode('utf-8')),
                os.path.join('json', name + '.json'))

pool = Pool(4) # download 4 titles concurrently
for _ in pool.imap_unordered(download, mergel, chunksize=100):
    pass

It is polite to set the maxlag query parameter and to respect the Retry-After HTTP header. There are several wrappers around the Wikipedia API that may do this for you.
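
A rough sketch of that behavior using requests; the retry loop, the maxlag value, and the polite_get() helper are illustrative assumptions, not an existing API:

import time
import requests

API = "https://en.wikipedia.org/w/api.php"

def polite_get(params, max_tries=5):
    params = dict(params, maxlag=5, format='json')  # let lagged servers refuse early
    for _ in range(max_tries):
        r = requests.get(API, params=params)
        r.raise_for_status()
        data = r.json()
        if data.get('error', {}).get('code') != 'maxlag':
            return data
        # the server is lagged; wait as long as it asks before retrying
        time.sleep(int(r.headers.get('Retry-After', 5)))
    raise RuntimeError('servers too lagged, giving up')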

jfs