I have recently started my Python journey and stackoverflow has helped me a lot in resolving most of the issues I came across. However, this is one that I don't seem to be able to catch, despite trying the different solutions suggested here.
I am collecting urls from a website in a list. My next step is to go through the urls and download them if they don't already exist in the folder. However, some of the URLs contain non-ascii characters, such as ú, é, ç. Which leads to the unicode error below.
UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)
I escaped it for now with try/except but would need to download those manually.
When I use .encode('utf-8') it also result in an error: "TypeError: cannot use a string pattern on a bytes-like object".
This is my code:
import os
import urllib
dict = (this includes a large dictionary scraped from a website)
links = []
for d in dict :
links.append(d["EncodedAbsUrl"])
# For every line in the file
for url in links:
# Split on the rightmost / and take everything on the right side of that
name = url.rsplit('/', 1)[-1]
# Combine the name and the downloads directory to get the local filename
filename = os.path.join(r'C:\\PATH', name)
# Download the file if it does not exist
if not os.path.isfile(filename):
try:
urllib.request.urlretrieve(url, filename)
except UnicodeEncodeError:
print(filename + " could not be saved.")
pass
else:
print(filename + " already exists.")
Edit
Based on Ardiya's suggestion in the comments (thanks a million for that), I have changed used the urllib.parse.quote_plus method. This seems to work but also returns an http error 400. Revised code now reads:
for url in links:
# Split on the rightmost / and take everything on the right side of that
name = url.rsplit('/', 1)[-1]
# Combine the name and the downloads directory to get the local filename
filename = os.path.join(r'C:\\PATH', name)
# Download the file if it does not exist
if not os.path.isfile(filename):
try:
urllib.request.urlretrieve(url, filename)
except UnicodeEncodeError:
new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
urllib.request.urlretrieve(new_url, filename)
else:
print(filename + " already exists.")
For example, the following link is in the source dictionary: https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%20Perú%20castellano.pdf
is translated into https://www4.unfccc.int/sites/ndcstaging/PublishedDocuments/Peru%20First/iNDC%2520Per%C3%BA%2520castellano.pdf
which does not properly work.
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
25 try:
---> 26 urllib.request.urlretrieve(url, filename)
27 except UnicodeEncodeError:
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
524 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 525 response = self._open(req, data)
526
~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
541 protocol = req.type
--> 542 result = self._call_chain(self.handle_open, protocol, protocol +
543 '_open', req)
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
~\Anaconda3\lib\urllib\request.py in https_open(self, req)
1392 def https_open(self, req):
-> 1393 return self.do_open(http.client.HTTPSConnection, req,
1394 context=self._context, check_hostname=self._check_hostname)
~\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
1349 try:
-> 1350 h.request(req.get_method(), req.selector, req.data, headers,
1351 encode_chunked=req.has_header('Transfer-encoding'))
~\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
1254 """Send a complete request to the server."""
-> 1255 self._send_request(method, url, body, headers, encode_chunked)
1256
~\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
1265
-> 1266 self.putrequest(method, url, **skips)
1267
~\Anaconda3\lib\http\client.py in putrequest(self, method, url, skip_host, skip_accept_encoding)
1103
-> 1104 self._output(self._encode_request(request))
1105
~\Anaconda3\lib\http\client.py in _encode_request(self, request)
1183 # ASCII also helps prevent CVE-2019-9740.
-> 1184 return request.encode('ascii')
1185
UnicodeEncodeError: 'ascii' codec can't encode character '\xfa' in position 64: ordinal not in range(128)
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
<ipython-input-6-12f5f676515d> in <module>
27 except UnicodeEncodeError:
28 new_url = str(root + url.split('/')[-2] + '/' + urllib.parse.quote_plus(name))
---> 29 urllib.request.urlretrieve(new_url, filename)
30 else:
31 print(filename + " already exists.")
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = _splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
638 # request was successfully received, understood, and accepted.
639 if not (200 <= code < 300):
--> 640 response = self.parent.error(
641 'http', request, response, code, msg, hdrs)
642
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
500 for handler in handlers:
501 func = getattr(handler, meth_name)
--> 502 result = func(*args)
503 if result is not None:
504 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 400: Bad Request