An ideal scenario is when you have good proxies. Residential proxies are ideal, as they let you choose a specific location (country, city, or mobile carrier), combined with a CAPTCHA-solving service.
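For illustration, here's how a residential proxy could be passed to requests (a minimal sketch; the proxy endpoint and credentials below are placeholders, not a real service):
import requests

# placeholder endpoint/credentials -- substitute your proxy provider's values
proxies = {
    'http': 'http://username:password@proxy.example.com:8080',
    'https': 'http://username:password@proxy.example.com:8080',
}

response = requests.get('https://scholar.google.com/scholar', proxies=proxies, timeout=30)
print(response.status_code)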
Here's a code snippet that extracts data from all available pages using parsel:
from parsel import Selector
import requests, json

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'samsung medical center seoul semiconductor element simulation x-ray fetch',  # search query
    'hl': 'en',   # interface language
    'start': 0    # pagination offset: 0 = first page, 10 = second page, etc.
}

# JSON data will be collected here
data = []

while True:
    html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
    selector = Selector(text=html)

    print(f'Extracting page #{params["start"] // 10 + 1}...')

    # Container where all needed data is located
    for result in selector.css('.gs_r.gs_or.gs_scl'):
        title = result.css('.gs_rt').xpath('normalize-space()').get()
        title_link = result.css('.gs_rt a::attr(href)').get()
        publication_info = result.css('.gs_a').xpath('normalize-space()').get()
        snippet = result.css('.gs_rs').xpath('normalize-space()').get()
        cited_by_link = result.css('.gs_or_btn.gs_nph+ a::attr(href)').get()

        data.append({
            'page_num': params['start'] // 10 + 1,  # start=0 -> page 1, start=10 -> page 2, etc.
            'title': title,
            'title_link': title_link,
            'publication_info': publication_info,
            'snippet': snippet,
            # the link is relative, so prepend the domain; guard against a missing link
            'cited_by_link': f'https://scholar.google.com{cited_by_link}' if cited_by_link else None,
        })

    # paginate while a "next page" arrow is present
    if selector.css('.gs_ico_nav_next').get():
        params['start'] += 10
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
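Even with this approach, Google may still throw a CAPTCHA after a number of paginated requests. One common mitigation is to pause between pages (a minimal sketch; the 2-6 second range is an arbitrary assumption on my part, not a documented threshold):
import time, random

# inside the while loop, after each page request:
time.sleep(random.uniform(2, 6))  # random delay to make traffic look less bot-like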
As an alternative solution, you can use the Google Scholar API from SerpApi.
It's a paid API with a free plan. It bypasses blocks from Google via proxies and CAPTCHA-solving solutions, scales to enterprise-level usage, and removes the need for the end user to create a parser from scratch and maintain it over time when something in the HTML changes.
It also supports cite, profile, and author results (see the author sketch after the sample output below).
Example code to parse organic results:
import json
from serpapi import GoogleScholarSearch

params = {
    "api_key": "Your SerpApi API key",  # your SerpApi API key
    "engine": "google_scholar",         # parsing engine
    "q": "biology",                     # search query
    "hl": "en"                          # interface language
}

search = GoogleScholarSearch(params)
results = search.get_dict()

for result in results["organic_results"]:
    print(json.dumps(result, indent=2))
# first organic result output:
'''
{
  "position": 0,
  "title": "The biology of mycorrhiza.",
  "result_id": "6zRLFbcxtREJ",
  "link": "https://www.cabdirect.org/cabdirect/abstract/19690600367",
  "snippet": "In the second, revised and extended, edition of this work [cf. FA 20 No. 4264], two new chapters have been added (on carbohydrate physiology physiology Subject Category \u2026",
  "publication_info": {
    "summary": "JL Harley - The biology of mycorrhiza., 1969 - cabdirect.org"
  },
  "inline_links": {
    "serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=6zRLFbcxtREJ",
    "cited_by": {
      "total": 704,
      "link": "https://scholar.google.com/scholar?cites=1275980731835430123&as_sdt=5,50&sciodt=0,50&hl=en",
      "cites_id": "1275980731835430123",
      "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=5%2C50&cites=1275980731835430123&engine=google_scholar&hl=en"
    },
    "related_pages_link": "https://scholar.google.com/scholar?q=related:6zRLFbcxtREJ:scholar.google.com/&scioq=biology&hl=en&as_sdt=0,50",
    "versions": {
      "total": 4,
      "link": "https://scholar.google.com/scholar?cluster=1275980731835430123&hl=en&as_sdt=0,50",
      "cluster_id": "1275980731835430123",
      "serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C50&cluster=1275980731835430123&engine=google_scholar&hl=en"
    },
    "cached_page_link": "https://scholar.googleusercontent.com/scholar?q=cache:6zRLFbcxtREJ:scholar.google.com/+biology&hl=en&as_sdt=0,50"
  }
}
... other results
'''
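Since the API also covers author results, here's a sketch of pulling an author page with the same library (assuming the explicit "engine" parameter is honored the same way as in the snippet above; the author_id value is a placeholder, not a real ID):
import json
from serpapi import GoogleScholarSearch

# a minimal sketch of the Google Scholar Author engine;
# "AUTHOR_ID_HERE" is a placeholder -- use a real Scholar author ID
author_params = {
    "api_key": "Your SerpApi API key",
    "engine": "google_scholar_author",  # switch engines to author results
    "author_id": "AUTHOR_ID_HERE",
    "hl": "en"
}

author_search = GoogleScholarSearch(author_params)
author_results = author_search.get_dict()

# print author info and the list of their articles, if present
print(json.dumps(author_results.get("author", {}), indent=2))
print(json.dumps(author_results.get("articles", []), indent=2))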
There's also a dedicated Scrape historic Google Scholar results using Python blog post of mine on the SerpApi blog.
Disclaimer: I work for SerpApi.