0

I'm unable to save/download the images at the location. I can't figure out the problem although the code seems right.

I'm using requests library for scraping the images.

import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re

from lxml.html import fromstring

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")

title = fromstring(r.content).findtext('.//title')

#print(title)


newPath = r'C:\Users\Vicky\Desktop\ScrappedImages\ ' + title

for link in soup.find_all('img'):
    image = link.get('src')
    if 'http' in image:
        print(image)
        imageName = os.path.split(image)[1]
        print(imageName)

        r2 = requests.get(image)

        if not os.path.exists(newPath):
            os.makedirs(newPath)
            with open(imageName, "wb") as f:
                f.write(r2.content)
Cœur
  • 37,241
  • 25
  • 195
  • 267
Vikas Tomar
  • 69
  • 1
  • 2
  • 11
  • What error are you getting, if any? – Nordle Jul 04 '18 at 09:58
  • you have to add and else, to that if, because if the path exist then it will do nothing – efirvida Jul 04 '18 at 09:59
  • Possible duplicate of [How to save an image locally using Python whose URL address I already know?](https://stackoverflow.com/questions/8286352/how-to-save-an-image-locally-using-python-whose-url-address-i-already-know) – Metalik Jul 04 '18 at 09:59

2 Answers2

0

Try wrapping your r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci") in a try: or while: statement to make sure that the website you are scraping is returning a 200 response, it could be that the website is timing out or not serving your request.

Nordle
  • 2,915
  • 3
  • 16
  • 34
0
import os
from bs4 import BeautifulSoup
import urllib
import requests
import urlparse

from lxml.html import fromstring

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")

for link in soup.find_all('img'):
    image = link.get('src')
    if bool(urlparse.urlparse(image).netloc):
        print(image)
        imageName = image[image.rfind("/")+1:]
        print(imageName)

        urllib.urlretrieve(image,imageName)
efirvida
  • 4,592
  • 3
  • 42
  • 68