0

I created a web crawler using Python 2.7 and I tried to download some comics in JPEG format. Everything works fine until I open the images I downloaded. A message says that each image is damaged or too big, even though it is only about 100 kB. All links are checked and correct, and all paths are too. I can see the folders and files being created, but when I open the JPGs there is nothing there except the error message.

Here is my code:

import os
import shutil
import urllib

import requests
from bs4 import BeautifulSoup


def manga_crawl(from_manga, to_manga):
    """Crawl chapters `from_manga` through `to_manga` (inclusive).

    For each chapter: fetch the chapter page, create a local folder for it,
    work out how many pages the chapter has, and download every page image.
    """
    base_url = 'https://www.mangareader.net/one-piece/'
    for chapter in range(from_manga, to_manga + 1):
        chapter_url = base_url + str(chapter) + '/'
        response = requests.get(chapter_url)
        soup = BeautifulSoup(response.text, "html.parser")

        folder = create_folder(chapter)
        page_count = find_manga_pages(soup)
        download_jpg(page_count, chapter_url, folder)


def create_folder(manga):
    """Create (if necessary) the download folder for chapter `manga`.

    Returns the folder name, e.g. 'one-piece-42'.

    Fix: the original called os.makedirs unconditionally, which raises
    OSError when the folder already exists — so re-running the crawler
    crashed on the first chapter it had already fetched.  Creating the
    folder only when it is missing makes the function idempotent.
    """
    pathname = 'one-piece-' + str(manga)
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    return pathname


def find_manga_pages(soup):
    """Return the page count of a chapter as a string.

    The chapter page lists its pages in a <select> whose <option> texts are
    '1', '2', ..., N — so the last option's text is the number of pages.

    Fix: the original looped over every option only to keep the last one,
    and left `counter` unbound (NameError) when the page had no <option>
    elements at all.  Index the last option directly and raise a clear
    error on an empty/unexpected page instead.
    """
    options = soup.find_all('option')
    if not options:
        raise ValueError("no <option> elements found - page layout changed?")
    return options[-1].text


def download_jpg(pages, url, path_name):
    page = 1
    while page <= int(pages):
        thisurl = url + str(page)
        #print "THIS URL->" + str(thisurl)
        source_code = requests.get(thisurl)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        urlsoup = soup.find('img', {'id': 'img'})
        iconurl = str(urlsoup['src'])
        this_path_name = path_name + '/' + str(page) + '.jpg'

        print "ICON URL->" + iconurl

        urllib.urlretrieve(iconurl, this_path_name)

        page = page + 1


def main():
    """Read the first and last chapter numbers from stdin and crawl them."""
    first_chapter = raw_input()
    last_chapter = raw_input()
    manga_crawl(int(first_chapter), int(last_chapter))


if __name__ == "__main__":
    main()

Any suggestions?

G. Sliepen
  • 7,637
  • 1
  • 15
  • 31
a_user
  • 207
  • 3
  • 13

1 Answer

1

Well I changed this line

urllib.urlretrieve(iconurl, this_path_name)

with these

    response = requests.get(iconurl, stream=True)
    with open(this_path_name, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
    del response

and it worked just fine!

a_user
  • 207
  • 3
  • 13