I am currently working on a project in the form of a silly gift for my girlfriend: a program that automatically downloads pictures of puppies, kitties, and other animals. (With a few changes it could download PDFs, books, movies, or whatever, so that's the serious goal...)
The program mostly works: it searches for images on a series of websites stored in a list and automatically downloads one of them at random. The problem is that some pictures are not fully downloaded: their file size is much smaller than that of the correctly downloaded pictures, and they show no image when opened.
Why are some pictures downloaded fine while others, though the file is there, are tiny and show no image when opened? Where am I making a mistake?
I really want to fix this. I am also really new to Python, so any other advice is more than welcome (especially anything that would reduce the messy bunch of code in the look_images(x, z) function, with all those try/except blocks and while loops...). Thank you! Here's the script:
#! python3
import os, requests, bs4, random, shelve, wget

def create_folder():  # Creates the folder 'Puppies'
    print('Comprobando pelusitas y pulguitas...')
    os.makedirs('Puppies', exist_ok=True)

def download_image(url, request_response):  # Saves the image in the folder 'Puppies'
    image_file = open(os.path.join('Puppies', os.path.basename(url)), 'wb')
    for chunk in request_response.iter_content(chunk_size=1024):
        image_file.write(chunk)
        image_file.flush()
    image_file.close()

def create_saves(saves_list):  # Creates a data file, if not yet created, storing the pictures already downloaded.
    if os.path.isfile('C:\\Users\\usuario\\Documents\\santi\\saves.dat') == False:
        page_files = shelve.open('saves')
        page_files['saves'] = saves_list
        page_files.close()

def look_images(pages_list, saves_list):
    for i in pages_list:  # For every item in the pages_list (for every page...)
        res = requests.get(i)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        object = soup.select('img')
        object_number = len(object)
        random_number = random.randint(0, int(object_number))  # Creates a random number between 0 and the number of images found.
        print('\nSe encontraron {} cachorritos potenciales...'.format(int(object_number)))
        try:
            image_url = object[random_number].get('srcset')
        except IndexError:
            print('No se hallaron vauitos, se continuará la búsqueda...')
        print('\nEvaluando colmillitos y patitas...')
        page_files = shelve.open('saves')  # Opens the saves data file.
        while str(image_url) in open('saves.dat', encoding='Latin-1').read():  # While the image found is already stored in the saves.dat file, select another image.
            try:
                image_url = object[random.randint(0, int(object_number))].get('src')
            except IndexError:
                continue
        while not '.jpg' in str(image_url):
            try:
                image_url = object[random.randint(0, int(object_number))].get('src')
                if str(image_url) in open('saves.dat', encoding='Latin-1').read():
                    image_url = object[random.randint(0, int(object_number))].get('src')
                    continue
            except IndexError:
                print('No se hallaron vauitos, se continuará la búsqueda...')
                continue
        print('\nSe encontraron vauitos...')
        print('\nAdoptando cachorrito...')
        if str(image_url).endswith('.jpg 2x'):  # Lots of images were downloaded as '.jpg 2x', so I made this if statement to erase the ' 2x' final part.
            image_url = str(image_url.replace(' ', '')[:-2])
        response = requests.get(image_url, stream=True)
        res.raise_for_status()
        saves_list.append(image_url)  # Adds the image to the saves_list, which is then saved in the 'saves' .dat file.
        download_image(image_url, response)
        print('¡Cachorrito adoptado!')
        page_files[image_url] = page_files  # Saves the image url in the 'saves' .dat file
        page_files.close()

def get_page():
    page = ['https://pixabay.com/es/photos/puppy/',
            'https://www.petsworld.in/blog/cute-pictures-of-puppies-and-kittens-together.html',
            'https://pixabay.com/es/photos/bear%20cubs/',
            'https://pixabay.com/es/photos/?q=cute+little+animals&hp=&image_type=photo&order=popular&cat=',
            'https://pixabay.com/es/photos/?q=baby+cows&hp=&image_type=photo&order=popular&cat=',
            'https://www.boredpanda.com/cute-baby-animals/',
            'http://abduzeedo.com/node/74367']
    alr_dow = []
    create_folder()
    create_saves(alr_dow)
    look_images(page, alr_dow)

get_page()
P.S. The printed messages are in Spanish and are irrelevant to the question. Please note that the goal is to make this a serious program at some point, downloading serious things.
The problem has been solved. It came from the requests for the image URLs on some of the pages. I hadn't noticed the error because of a mistake of mine at the end of the look_images function: the "res.raise_for_status()" should have been "response.raise_for_status()", since "response" was the variable holding the request for the image URL. Once that was corrected, I saw that a request error was being raised on the second and the last pages of the pages list: the files that didn't download properly came from those sites, apparently due to some port and firewall issue.
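For reference, raise_for_status() is what turns a bad HTTP response into a visible exception. A minimal sketch (the URL is just a placeholder) of how the image request could be wrapped so that a failing page gets reported instead of silently producing a truncated file:

import requests

url = 'https://example.com/picture.jpg'  # placeholder url, for illustration only
try:
    response = requests.get(url, stream=True, timeout=10)
    response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx responses
except requests.exceptions.RequestException as err:
    # RequestException also covers timeouts and connection-level failures,
    # like the port/firewall errors mentioned above.
    print('No se pudo descargar {}: {}'.format(url, err))

With that in place, here is the code now, working properly and also cleaner than the first version: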
#! python3
import os, requests, bs4, random, shelve, shutil

def create_folder():
    print('Comprobando pelusitas y pulguitas...')
    os.makedirs('Puppies', exist_ok=True)

def download_image_shutil(url, request_response):
    local_filename = url.split('/')[-1]
    request_response.raw.decode_content = True  # decompress gzip/deflate responses before writing
    with open(os.path.join('Puppies', local_filename), 'wb') as f:
        shutil.copyfileobj(request_response.raw, f)
    return local_filename

def create_saves(saves_list):
    if not os.path.isfile('C:\\Users\\usuario\\Documents\\santi\\saves.dat'):
        page_files = shelve.open('saves')
        page_files['saves'] = saves_list
        page_files.close()

def get_random_url(web_object, web_object_number):
    while True:
        try:
            random_number = random.randint(0, web_object_number - 1)  # randint includes both ends, so the upper bound is len - 1
            url_variable = web_object[random_number].get('src')
            if '.jpg' not in str(url_variable):
                continue
            elif str(url_variable) in open('saves.dat', encoding='Latin-1').read():
                continue
            print('Finish')
            print(url_variable)
            return url_variable
        except IndexError:
            print('Algo salió mal: reanudando búsqueda...')
            continue

def look_images(pages_list, saves_list):
    for i in pages_list:
        res = requests.get(i)
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")
        object = soup.select('img')
        object_number = len(object)
        print('\nSe encontraron {} cachorritos potenciales...'.format(int(object_number)))
        print('\nEvaluando colmillitos y patitas...')
        page_files = shelve.open('saves')
        image_url = get_random_url(object, object_number)
        print('\nSe encontraron vauitos...')
        print('\nAdoptando cachorrito...')
        if str(image_url).endswith('.jpg 2x'):
            image_url = str(image_url.replace(' ', '')[:-2])
        if not str(image_url).startswith('https://'):
            image_url = 'https://' + str(image_url)
        response = requests.get(image_url, stream=True)
        response.raise_for_status()
        saves_list.append(image_url)
        download_image_shutil(image_url, response)
        print('¡Cachorrito adoptado!')
        page_files[image_url] = True  # only the key matters here: it marks this url as already downloaded
        page_files.close()

def get_page():
    page = ['https://pixabay.com/es/photos/puppy/',
            'https://www.petsworld.in/blog/cute-pictures-of-puppies-and-kittens-together.html',
            'https://pixabay.com/es/photos/bear%20cubs/',
            'https://pixabay.com/es/photos/?q=cute+little+animals&hp=&image_type=photo&order=popular&cat=',
            'https://pixabay.com/es/photos/?q=baby+cows&hp=&image_type=photo&order=popular&cat=',
            'https://www.boredpanda.com/cute-baby-animals/']
    alr_dow = []
    create_folder()
    create_saves(alr_dow)
    look_images(page, alr_dow)

get_page()
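On the original question about trimming the try/except/while tangle in look_images: one possible further cleanup (a sketch only, not tested against these sites; pick_image_url and the 'downloaded' shelf key are my own names) is to collect every candidate .jpg URL first and pick one with random.choice, which removes the IndexError handling entirely, and to keep the already-downloaded URLs in a set stored in the shelf instead of re-reading saves.dat as Latin-1 text:

import random, shelve
import bs4, requests

def pick_image_url(soup, downloaded):
    # Collect every usable candidate in one pass; tag.get('src') may be None
    # for tags without that attribute, so filter those out first.
    candidates = [url for url in (tag.get('src') for tag in soup.select('img'))
                  if url and '.jpg' in url and url not in downloaded]
    return random.choice(candidates) if candidates else None  # no IndexError possible

res = requests.get('https://pixabay.com/es/photos/puppy/')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")

with shelve.open('saves') as page_files:
    downloaded = page_files.get('downloaded', set())  # a set of urls instead of raw text searches
    image_url = pick_image_url(soup, downloaded)
    if image_url is not None:
        downloaded.add(image_url)
        page_files['downloaded'] = downloaded  # persist the set back to the shelf

If srcset is read instead of src, keep in mind that srcset is a comma-separated list of "url descriptor" pairs such as 'photo.jpg 2x', so entry.split()[0] recovers the URL more robustly than stripping a fixed ' 2x' suffix.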
I converted some variables to strings (with str()) because otherwise an error saying "NoneType is not iterable" appeared. I didn't understand where that came from; if someone could tell me, that would be nice.
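A likely explanation, sketched below: BeautifulSoup's tag.get('src') returns None when an img tag has no src attribute, and Python's in operator cannot search inside None, which produces exactly the error quoted above; str() hides the problem by turning None into the literal text 'None'.

image_url = None                 # what tag.get('src') returns when the attribute is missing
# '.jpg' in image_url           -> TypeError: argument of type 'NoneType' is not iterable
print('.jpg' in str(image_url))  # False, but no error: str(None) is the text 'None'

# Cleaner than wrapping everything in str(): skip the missing values explicitly.
if image_url is not None and '.jpg' in image_url:
    print(image_url)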
My thanks to bruno desthuilliers, who advised me well and had the patience to do so.