1

I can get the width attribute of an image using BeautifulSoup as follows:

img = soup.find("img")
width = img["width"]

The problem is that width can be set in CSS file or not set at all.

I would like to extract the value without downloading the image from img["src"]. How can I do this in Python, so that I get the value if it is set somewhere (in the HTML or the CSS), or otherwise the default value the browser would use when rendering?

xralf
  • 3,312
  • 45
  • 129
  • 200

2 Answers

3

The quick answer is: you can't - the resultant size of an image is based on evaluation of CSS, and indeed JS. You'd need to do all that work in order to find your answer.

Another approach might be to use a real browser to do that work for you, and then ask it what the width is. See PhantomJS, and Selenium.

ahri
  • 351
  • 2
  • 11
2

You can download only part of the image, just enough to read its width and height, by setting the Range header in the request, and then parse those bytes with some variant of getimageinfo.py.

Example usage:

def check_is_small_pic(url, pic_size, max_bytes=5000):
    """Return True when the image at *url* is smaller than *pic_size*.

    Only the leading part of the file is requested (via an HTTP ``Range``
    header) and handed to ``getimageinfo.getImageInfo`` to read the
    dimensions from the image header.

    Args:
        url: Address of the image to inspect.
        pic_size: Threshold in pixels; the image is "small" when its width
            or its height is below this value.
        max_bytes: How many leading bytes to request. JPEG files may carry
            metadata segments before the size information, so a few
            kilobytes is a safer default than a few dozen bytes.

    Returns:
        True if the reported width or height is below *pic_size*. Note that
        a dimension reported as -1 also compares as below any positive
        threshold, so such an image is treated as small.
    """
    # A Range value has to name its unit and bounds ("bytes=first-last").
    # A bare number such as "50" is not a valid byte range, so a server is
    # likely to ignore it and return the whole file.
    byte_range = "bytes=0-{}".format(max_bytes - 1)
    r_check = requests.get(url, headers={"Range": byte_range})
    image_info = getimageinfo.getImageInfo(r_check.content)
    # image_info is (content_type, width, height).
    return image_info[1] < pic_size or image_info[2] < pic_size

Some getimageinfo.py, quickly adjusted for python 3.5:

import io
import struct
# import urllib.request as urllib2

def _jpeg_dimensions(data):
    """Scan JPEG segments in *data* and return ``(width, height)`` or None.

    Returns None when no SOF0-SOF3 frame header is found before the data
    ends, which is expected when only the start of a file was downloaded.
    """
    stream = io.BytesIO(data)
    stream.read(2)  # skip the SOI marker (FF D8)
    marker = stream.read(1)
    try:
        while marker and ord(marker) != 0xDA:
            # Advance to the next marker: skip anything that is not 0xFF,
            # then skip the 0xFF fill bytes themselves. An empty read means
            # the data ran out, so stop instead of calling ord() on b''.
            while marker and ord(marker) != 0xFF:
                marker = stream.read(1)
            while marker and ord(marker) == 0xFF:
                marker = stream.read(1)
            if not marker:
                return None
            if 0xC0 <= ord(marker) <= 0xC3:
                # Frame header: 2-byte length, 1-byte precision, then
                # big-endian height and width.
                stream.read(3)
                h, w = struct.unpack(">HH", stream.read(4))
                return int(w), int(h)
            # Any other segment: read its 2-byte length (which counts the
            # length field itself) and skip the payload.
            segment_length = struct.unpack(">H", stream.read(2))[0]
            stream.read(segment_length - 2)
            marker = stream.read(1)
    except (struct.error, ValueError):
        # A short read in the middle of a length or size field.
        pass
    return None


def getImageInfo(data):
    """Read the content type and pixel size from the start of an image.

    Args:
        data: Bytes from the beginning of a GIF, PNG or JPEG file. The data
            may be truncated, as long as the header is included.

    Returns:
        A tuple ``(content_type, width, height)``. ``content_type`` is
        ``'image/gif'``, ``'image/png'``, ``'image/jpeg'`` or ``''`` when the
        format is not recognised. ``width`` and ``height`` are -1 when they
        could not be determined.
    """
    size = len(data)
    height = -1
    width = -1
    content_type = ''
    png_signature = b'\211PNG\r\n\032\n'

    # GIF: 6-byte signature followed by little-endian 16-bit width, height.
    if size >= 10 and data[:6] in (b'GIF87a', b'GIF89a'):
        content_type = 'image/gif'
        w, h = struct.unpack("<HH", data[6:10])
        width = int(w)
        height = int(h)

    # PNG (see the PNG spec, http://www.w3.org/TR/PNG/): 8-byte signature,
    # 4-byte chunk length, 'IHDR', then big-endian 32-bit width, height.
    elif (size >= 24 and data.startswith(png_signature)
          and data[12:16] == b'IHDR'):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[16:24])
        width = int(w)
        height = int(h)

    # PNG signature without IHDR at the expected offset; possibly an older
    # PNG variant. The size is read directly after the signature.
    elif size >= 16 and data.startswith(png_signature):
        content_type = 'image/png'
        w, h = struct.unpack(">LL", data[8:16])
        width = int(w)
        height = int(h)

    # JPEG: starts with the SOI marker; the size lives in a frame header
    # that can sit behind any number of other segments.
    elif size >= 2 and data.startswith(b'\377\330'):
        content_type = 'image/jpeg'
        dimensions = _jpeg_dimensions(data)
        if dimensions is not None:
            width, height = dimensions

    return content_type, width, height



# from PIL import Image
# import requests
# hrefs = ['http://farm4.staticflickr.com/3894/15008518202_b016d7d289_m.jpg','https://farm4.staticflickr.com/3920/15008465772_383e697089_m.jpg','https://farm4.staticflickr.com/3902/14985871946_86abb8c56f_m.jpg']
# RANGE = 5000
# for href in hrefs:
#     req  = requests.get(href,headers={'User-Agent':'Mozilla5.0(Google spider)','Range':'bytes=0-{}'.format(RANGE)})
#     im = getImageInfo(req.content)
# 
#     print(im)
# req = urllib2.Request("http://vn-sharing.net/forum/images/smilies/onion/ngai.gif", headers={"Range": "5000"})
# r = urllib2.urlopen(req)
# 
# f = open("D:\\Pictures\\1.jpg", "rb")
# print(getImageInfo(r.read()))
# Output: >> ('image/gif', 50, 50)
# print(getImageInfo(f.read()))
Hellohowdododo
  • 396
  • 3
  • 12