I'm putting up an example of how you might want to do this with the python-requests library. The script below checks if the destinations file already exists. If the partially destination file exists, it's assumed to be the partially downloaded file, and tries to resume the download. If the server claims support for a HTTP Partial Request (i.e. the response to a HEAD request contains Accept-Range header), then the script resume based on the size of the partially downloaded file; otherwise it just does a regular download and discard the parts that are already downloaded. I think it should be fairly straight forward to convert this to use just urllib2 if you don't want to use python-requests, it'll probably be just much more verbose.
Note that resuming downloads may corrupt the file if the file on the server is modified between the initial download and the resume. This can be detected if the server supports strong HTTP ETag header so the downloader can check whether it's resuming the same file.
I make no claim that it is bug-free.
You should probably add a checksum logic around this script to detect download errors and retry from scratch if the checksum doesn't match.
import logging
import os
import re
import requests
CHUNK_SIZE = 5*1024 # 5KB
logging.basicConfig(level=logging.INFO)
def stream_download(input_iterator, output_stream):
for chunk in input_iterator:
output_stream.write(chunk)
def skip(input_iterator, output_stream, bytes_to_skip):
total_read = 0
while total_read <= bytes_to_skip:
chunk = next(input_iterator)
total_read += len(chunk)
output_stream.write(chunk[bytes_to_skip - total_read:])
assert total_read == output_stream.tell()
return input_iterator
def resume_with_range(url, output_stream):
dest_size = output_stream.tell()
headers = {'Range': 'bytes=%s-' % dest_size}
resp = requests.get(url, stream=True, headers=headers)
input_iterator = resp.iter_content(CHUNK_SIZE)
if resp.status_code != requests.codes.partial_content:
logging.warn('server does not agree to do partial request, skipping instead')
input_iterator = skip(input_iterator, output_stream, output_stream.tell())
return input_iterator
rng_unit, rng_start, rng_end, rng_size = re.match('(\w+) (\d+)-(\d+)/(\d+|\*)', resp.headers['Content-Range']).groups()
rng_start, rng_end, rng_size = map(int, [rng_start, rng_end, rng_size])
assert rng_start <= dest_size
if rng_start != dest_size:
logging.warn('server returned different Range than requested')
output_stream.seek(rng_start)
return input_iterator
def download(url, dest):
''' Download `url` to `dest`, resuming if `dest` already exists
If `dest` already exists it is assumed to be a partially
downloaded file for the url.
'''
output_stream = open(dest, 'ab+')
output_stream.seek(0, os.SEEK_END)
dest_size = output_stream.tell()
if dest_size == 0:
logging.info('STARTING download from %s to %s', url, dest)
resp = requests.get(url, stream=True)
input_iterator = resp.iter_content(CHUNK_SIZE)
stream_download(input_iterator, output_stream)
logging.info('FINISHED download from %s to %s', url, dest)
return
remote_headers = requests.head(url).headers
remote_size = int(remote_headers['Content-Length'])
if dest_size < remote_size:
logging.info('RESUMING download from %s to %s', url, dest)
support_range = 'bytes' in [s.strip() for s in remote_headers['Accept-Ranges'].split(',')]
if support_range:
logging.debug('server supports Range request')
logging.debug('downloading "Range: bytes=%s-"', dest_size)
input_iterator = resume_with_range(url, output_stream)
else:
logging.debug('skipping %s bytes', dest_size)
resp = requests.get(url, stream=True)
input_iterator = resp.iter_content(CHUNK_SIZE)
input_iterator = skip(input_iterator, output_stream, bytes_to_skip=dest_size)
stream_download(input_iterator, output_stream)
logging.info('FINISHED download from %s to %s', url, dest)
return
logging.debug('NOTHING TO DO')
return
def main():
TEST_URL = 'http://mirror.internode.on.net/pub/test/1meg.test'
DEST = TEST_URL.split('/')[-1]
download(TEST_URL, DEST)
main()