Managed to solve it by accepting that the connection will die and writing a second function that resumes the download at the exact byte offset where it stopped. The theory is explained in this question: How to resume file download in Python?
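The core of it is just an HTTP Range request plus appending to the partial file. A minimal sketch of the idea (names are illustrative, and it assumes the server honors Range requests):

import os
import requests

def resume_download(session, url, dest, chunk_size=1024 * 1024):
    # Start where the partial file on disk ends.
    offset = os.path.getsize(dest) if os.path.exists(dest) else 0
    headers = {'Range': 'bytes=%d-' % offset}
    with session.get(url, stream=True, headers=headers, timeout=600) as r:
        with open(dest, 'ab') as f:  # append to the partial file, don't truncate
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)

Sending 'Accept-Encoding': None (as in the full code below) keeps Content-Length equal to the raw byte count, so comparing it against the size on disk works.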
My code (warning: still a bit messy):
import logging
import os
from pathlib import Path

import requests

# Root folder on the external drive where leaks are stored.
DATALEAK_ROOT = '/media/archangel/Elements/clop/dataleaks/'


def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
    """Resume a partial download starting at resume_byte_pos.

    onionurl is a [companyname, url] pair. The server is asked for the
    remaining bytes via a Range header and they are appended to the
    partial file on disk.
    """
    print("rerunning")
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = DATALEAK_ROOT + companyname + "/"
    try:
        os.makedirs(dataloc, exist_ok=True)  # also creates the root if needed
    except Exception as e:
        print(e)
        print("folder not created")
    filename = os.path.basename(onionurl)
    dataloc = dataloc + filename
    try:
        # Route the request through the local Tor SOCKS proxy.
        session = requests.session()
        session.proxies = {
            'http': 'socks5h://localhost:9050',
            'https': 'socks5h://localhost:9050',
        }
        url = onionurl
        print("dataloc:", dataloc)
        print("onionurl:", url)
        try:
            # Timeout is a watchdog helper defined elsewhere in the script;
            # if it is missing, this except simply skips it.
            seconds = 20
            timeout = Timeout(seconds)
            timeout.start()
        except Exception as ex:
            print(ex)
        # Ask only for the bytes we don't have yet. Accept-Encoding is
        # disabled so Content-Length reflects raw bytes on disk.
        resume_header = {'Accept-Encoding': None,
                         'Range': 'bytes=%d-' % resume_byte_pos}
        try:
            with session.get(url, stream=True, verify=False,
                             headers=resume_header, timeout=600) as response:
                # For a 206 response this is the remaining byte count.
                file_size = int(response.headers['Content-Length'])
                print(file_size)
                # Append ("ab") so the partial file is extended, not replaced.
                with open(dataloc, "ab") as text_file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            text_file.write(chunk)
                            text_file.flush()
            print("exited with for file")
        except Exception as ex:
            logging.error(f'Request failed with error: {ex}')
            print(ex)
    except Exception as e:
        print("FAILED DOWNLOAD (resume)")
        print(e)
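And the main download function, which makes the first full attempt and then keeps calling the resume function above until the size on disk matches the server's Content-Length: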
def onionrequestthreadeddataleakdownload2(onionurl):
    """Download one leak file over Tor, then keep resuming until the size
    on disk matches the server's Content-Length.

    Returns [dataleaklocation, filename, dataleakurl, contentsize in MB].
    """
    companyname = onionurl[0]
    onionurl = onionurl[1]
    dataloc = DATALEAK_ROOT + companyname + "/"
    try:
        os.makedirs(dataloc, exist_ok=True)
    except Exception as e:
        print(e)
        print("folder not created")
    filenamebasename = os.path.basename(onionurl)
    dataloc = dataloc + filenamebasename
    filename = dataloc  # full path the file is written to
    try:
        session = requests.session()
        session.proxies = {
            'http': 'socks5h://localhost:9050',
            'https': 'socks5h://localhost:9050',
        }
        url = onionurl
        print("dataloc:", dataloc)
        print("onionurl:", url)
        try:
            # Same watchdog helper as in the resume function.
            seconds = 20
            timeout = Timeout(seconds)
            timeout.start()
        except Exception as ex:
            print(ex)
        # Disable compression so Content-Length equals the raw file size,
        # which lets us compare it against the size on disk.
        headersac = {'Accept-Encoding': None}
        file_size = 0
        filesizemb = 1
        try:
            with session.get(url, stream=True, verify=False,
                             headers=headersac, timeout=600) as response:
                file_size = int(response.headers['Content-Length'])
                print(file_size)
                # Size in MB for the returned metadata (minimum 1).
                if file_size > 1000000:
                    filesizemb = file_size / 1000000
                # First pass: write from scratch ("wb"), chunk by chunk.
                with open(filename, "wb") as text_file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            text_file.write(chunk)
                            text_file.flush()
        except Exception as ex:
            logging.error(f'Request failed with error: {ex}')
            print(ex)
        print("exited with for file")
        # The connection to the onion service almost always dies before the
        # file is complete, so keep resuming from the current offset until
        # the sizes match. Note this retries indefinitely on a stubborn file.
        file_size_offline = Path(filename).stat().st_size
        while file_size_offline != file_size:
            try:
                print(file_size_offline)
                print(file_size)
                print("file size incomplete")
                onionrequestthreadeddataleakdownloadresume(
                    [companyname, onionurl], file_size_offline)
            except Exception as ex:
                print("redownload failed")
                print(ex)
            file_size_offline = Path(filename).stat().st_size
        print("LOOP FINISHED")
        print(file_size)
        print(file_size_offline)
        print(filename)
        # [dataleaklocation (location on external drive), filename (after last
        #  slash), dataleakurl (onion url), contentsize in MB]
        returnedlist = [dataloc, filenamebasename, url, filesizemb]
        return returnedlist
    except Exception as e:
        print("FAILED DOWNLOAD")
        print(e)
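It gets called with a [companyname, url] pair, e.g. (values here are made up):

result = onionrequestthreadeddataleakdownload2(
    ["examplecorp", "http://exampleaddress.onion/files/dump.zip"])
# -> ['/media/archangel/Elements/clop/dataleaks/examplecorp/dump.zip',
#     'dump.zip', 'http://exampleaddress.onion/files/dump.zip', <size in MB>]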