I have a script that parses HTML and saves the images to disk. However, for some reason it outputs the filename wrongly.
It is not saving the file with the correct file extension in Windows. Eg, the image should be saved as <filename>.jpg
or <filename>.gif
. Instead the images are being saved with no filename extension.
Could you help me to see why this script is not saving the extension correctly in the filename?
I'm running Python 2.7.
""" Tumbrl downloader
This program will download all the images from a Tumblr blog """
from urllib import urlopen, urlretrieve
import os, sys, re
def download_images(images, path):
for im in images:
print(im)
filename = re.findall("([^/]*).(?:jpg|gif|png)",im)[0]
filename = os.path.join(path,filename)
try:
urlretrieve(im, filename.replace("500","1280"))
except:
try:
urlretrieve(im, filename)
except:
print("Failed to download "+im)
def main():
#Check input arguments
if len(sys.argv) < 2:
print("usage: ./tumblr_rip.py url [starting page]")
sys.exit(1)
url = sys.argv[1]
if len(sys.argv) == 3:
pagenum = int(sys.argv[2])
else:
pagenum = 1
if (check_url(url) == ""):
print("Error: Malformed url")
sys.exit(1)
if (url[-1] != "/"):
url.append("/")
blog_name = url.replace("http://", "")
blog_name = re.findall("(?:.[^\.]*)", blog_name)[0]
current_path = os.getcwd()
path = os.path.join(current_path, blog_name)
#Create blog directory
if not os.path.isdir(path):
os.mkdir(path)
html_code_old = ""
while(True):
#fetch html from url
print("\nFetching images from page "+str(pagenum)+"\n")
f = urlopen(url+"page/"+str(pagenum))
html_code = f.read()
html_code = str(html_code)
if(check_end(html_code, html_code_old, pagenum)):
break
images = get_images_page(html_code)
download_images(images, path)
html_code_old = html_code
pagenum += 1
print("Done downloading all images from " + url)
if __name__ == '__main__':
main()