I'm modifying this script to scrape pages like this one for the book page images. When I use the script exactly as posted on Stack Overflow, it downloads all the images correctly except the one image I want. The page image is saved as an empty file with a name like this: img.php?dir=39d761947ad84e71e51e3c300f7af8ff&file=1.png.
In my modified version below I'm only pulling the book page image.
Here's my script:
from bs4 import BeautifulSoup as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
# Default download directory; the command-line handling at the bottom of the
# script rebinds this name when an output folder argument is supplied.
out_folder = "/Users/Craig/Desktop/img"
def _image_filename(image_url):
    """Return a local file name for ``image_url`` with no query string in it."""
    parts = urlparse.urlsplit(image_url)
    # Script-served images (e.g. ``img.php?dir=...&file=1.png``) carry the
    # real file name in the ``file`` query parameter, so prefer it when
    # present and fall back to the last segment of the URL path.
    candidates = urlparse.parse_qs(parts.query).get("file", [])
    candidates.append(parts.path)
    for candidate in candidates:
        # basename() also discards any directory part, so a crafted value
        # such as "../../x" cannot escape the output folder.
        name = os.path.basename(candidate)
        if name:
            return name
    return "image"


def main(url, out_folder):
    """Download every ``<img id="page_image">`` found on the page at ``url``.

    Args:
        url: Address of the HTML page to scrape.
        out_folder: Directory the image files are written into. It is not
            created here.

    Each image ``src`` is resolved against ``url`` before downloading, so
    absolute, root-relative and page-relative sources all work and the
    image's own query string is preserved.
    """
    response = urlopen(url)
    try:
        soup = bs(response)
    finally:
        # Release the connection even if parsing fails.
        response.close()

    for image in soup.findAll('img', id='page_image'):
        # A single parenthesized argument prints identically on Python 2.
        print("Image: %(src)s" % image)
        # Previously a relative src was written into the *path* slot of the
        # parsed page URL, which buried the image's query string inside the
        # path, kept the page's own query/fragment and ignored the page's
        # directory. urljoin performs proper relative-reference resolution.
        image_url = urlparse.urljoin(url, image["src"])
        outpath = os.path.join(out_folder, _image_filename(image_url))
        urlretrieve(image_url, outpath)
def _usage():
    """Print the one-line command-line synopsis to standard output."""
    usage_text = "usage: python dumpimages.py http://example.com [outpath]"
    print(usage_text)
if __name__ == "__main__":
    # Accepted forms:  <url>   or   <url> <out_folder>
    # Only look at real arguments (not the script name) so that running with
    # no arguments, or with a lone non-URL argument, prints the usage message
    # instead of raising IndexError or treating the script name as the URL.
    args = sys.argv[1:]
    if args and args[-1].lower().startswith("http"):
        url = args[-1]
    elif len(args) >= 2 and args[-2].lower().startswith("http"):
        # The last argument is not a URL: it is the output folder, and the
        # URL is the argument just before it.
        url, out_folder = args[-2], args[-1]
    else:
        _usage()
        sys.exit(-1)
    main(url, out_folder)
Any ideas?