Sometimes I can handle the socket.timeout successfully, but other times I get that socket timeout error and my script stops abruptly... Is there something I'm missing in my exception handling? How come the exception goes right through it?
It happens randomly in either one of the following pieces of code:
First snippet:
# Request `url`, retrying up to `max_retries` times until urlopen() succeeds.
response = None  # stays None when every attempt fails
for _ in range(max_retries):
    try:
        req = Request(url, headers={'User-Agent' :'Mozilla/5.0'})
        response = urlopen(req,timeout=5)
        break
    except error.HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be listed first;
        # after the URLError clause this branch would never run.
        print("URL that generated the error code: ", url)
        print("Error code:", err.code)
        print("Error description:", err.reason)
    except error.URLError as err:
        print("URL that generated the error code: ", url)
        print("Error description:",err.reason)
    except socket.timeout:
        print("URL that generated the error code: ", url)
        print("Error description: No response.")
    except socket.error:
        print("URL that generated the error code: ", url)
        print("Error description: Socket error.")

# Guard against a request that never succeeded and against a response that
# carries no Content-Type header (getheader() returns None in that case).
content_type = response.getheader('Content-Type') if response is not None else None
if content_type and content_type.startswith('text/html'):
    # NOTE: urlopen() returns once the response headers have arrived. The body
    # is pulled from the socket by response.read(), which sits outside the try
    # block, so a socket.timeout raised while reading the body is not handled
    # by any of the except clauses above.
    htmlBytes = response.read()
    htmlString = htmlBytes.decode("utf-8")
    self.feed(htmlString)
Second snippet:
import os  # local import: needed for os.path.join below

# Download `i` into the file `aux`, retrying up to `max_retries` times, and
# append one descriptive line to the log file after a successful download.
for _ in range(max_retries):
    try:
        req = Request(i, headers={'User-Agent' :'Mozilla/5.0'})
        with urlopen(req,timeout=5) as response, open(aux, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # NOTE(review): the original line read `open(path, fname), 'a')`, which
        # does not parse. Joining `path` and `fname` is the presumed intent;
        # confirm against the real script.
        with open(os.path.join(path, fname), 'a') as f:
            f.write(("link" + str(intaux) + "-" + auxstr + str(index) + i[-4:] + " --- " + metadata[index%batch] + '\n'))
        break
    except error.HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be listed first;
        # after the URLError clause this branch would never run.
        print("URL that generated the error code: ", i)
        print("Error code:", err.code)
        print("Error description:", err.reason)
    except error.URLError as err:
        print("URL that generated the error code: ", i)
        print("Error description:",err.reason)
    except socket.timeout:
        print("URL that generated the error code: ", i)
        print("Error description: No response.")
    except socket.error:
        print("URL that generated the error code: ", i)
        print("Error description: Socket error.")
The error:
Traceback (most recent call last):
File "/mydir/crawler.py", line 202, in <module>
spider("urls.txt", maxPages=0, debug=1, dailyRequests=9600)
File "/mydir/crawler.py", line 142, in spider
parser.getLinks(url + "?start=" + str(currbot) + "&tab=" + auxstr,auxstr)
File "/mydir/crawler.py", line 81, in getLinks
htmlBytes = response.read()
File "/usr/lib/python3.5/http/client.py", line 455, in read
return self._readall_chunked()
File "/usr/lib/python3.5/http/client.py", line 561, in _readall_chunked
value.append(self._safe_read(chunk_left))
File "/usr/lib/python3.5/http/client.py", line 607, in _safe_read
chunk = self.fp.read(min(amt, MAXAMOUNT))
File "/usr/lib/python3.5/socket.py", line 575, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.5/ssl.py", line 929, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.5/ssl.py", line 791, in read
return self._sslobj.read(len, buffer)
File "/usr/lib/python3.5/ssl.py", line 575, in read
v = self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
EDIT:
Thanks to @tdelaney, I noticed I had left out a few lines of code. I have added them to the code above, and I'm posting the solution I wrote below. If you post the solution, or if you have a better approach to solving it, I will mark your answer as correct.
Solution:
# Fetch `url` and read its body inside ONE try block, retrying the whole
# request up to `max_retries` times.
#
# urlopen() returns as soon as the response headers arrive; the body is pulled
# from the socket later by response.read(), so that call can raise
# socket.timeout as well. Retrying read() on the same response does not help:
# whatever was consumed before the timeout is gone, so a second read() can at
# best return a truncated body. Each retry therefore issues a fresh request.
htmlString = None  # stays None when every attempt fails or the page is not HTML
for _ in range(max_retries):
    try:
        req = Request(url, headers={'User-Agent' :'Mozilla/5.0'})
        with urlopen(req, timeout=5) as response:
            # getheader() returns None when the header is absent.
            content_type = response.getheader('Content-Type') or ''
            if content_type.startswith('text/html'):
                htmlString = response.read().decode("utf-8")
        break
    except error.HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be listed first;
        # after the URLError clause this branch would never run.
        print("URL that generated the error code: ", url)
        print("Error code:", err.code)
        print("Error description:", err.reason)
    except error.URLError as err:
        print("URL that generated the error code: ", url)
        print("Error description:",err.reason)
    except socket.timeout:
        print("URL that generated the error code: ", url)
        print("Error description: No response.")
    except socket.error:
        print("URL that generated the error code: ", url)
        print("Error description: Socket error.")

# Feed the parser outside the try block so that only network failures trigger
# a retry and the page is parsed at most once.
if htmlString is not None:
    self.feed(htmlString)