In order to scrape the full resolution image URL using requests
and beautifulsoup
library, you need to scrape data from the page source code with regex
.
Basic explanation:
# find all <script> tags:
soup.select('script')
# match images data via regex:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# match desired images (full res size) via regex:
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps() it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
matched_images_data_json)
# Extract and decode them using bytes() and decode():
for fixed_full_res_image in matched_google_full_resolution_images:
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Code and full example in the online IDE that also downloads images to a folder:
import requests, lxml, re, json
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "pexels cat",
"tbm": "isch",
"hl": "en",
"ijn": "0",
}
html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
def get_images_data():
print('\nGoogle Images Metadata:')
for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
source = google_image.select_one('.fxgdke').text
link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
print(f'{title}\n{source}\n{link}\n')
# this steps could be refactored to a more compact
all_script_tags = soup.select('script')
# # https://regex101.com/r/48UZhY/4
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
# https://regex101.com/r/pdZOnW/3
matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)
# https://regex101.com/r/NnRg27/1
matched_google_images_thumbnails = ', '.join(
re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
str(matched_google_image_data))).split(', ')
print('Google Image Thumbnails:') # in order
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
# after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
print(google_image_thumbnail)
# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))
# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
removed_matched_google_images_thumbnails)
print('\nDownloading Google Full Resolution Images:') # in order
for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
# https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
print(original_size_img)
get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
https://images.pexels.com/photos/3777622/pexels-photo-3777622.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500
...
'''
Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to deal with regex to match and extract needed data from the source code of the page, instead, you only need to iterate over structured JSON and get what you want faster.
Code to integrate to achieve your goal:
import os, json # json for pretty output
from serpapi import GoogleSearch
def get_google_images():
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "pexels cat",
"tbm": "isch"
}
search = GoogleSearch(params)
results = search.get_dict()
print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))
get_google_images()
---------------
'''
[
...
{
"position": 100, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
"source": "pexels.com",
"title": "Close-up of Cat · Free Stock Photo",
"link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
"original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
]
'''
P.S - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web scraping search engines.
Disclaimer, I work for SerpApi.