BeautifulSoup get 'f slp' items from google

Question

Hi guys, I'm trying to get the citation counts for a number of papers from Google. This is my code:

import urllib
import mechanize
from bs4 import BeautifulSoup

import csv
import os #change directory
import re #for regular expressions



# Submit a query through Google's home-page search form with mechanize and
# try to pull the 'f slp' divs out of the returned page.
# NOTE: this is Python 2 code (see the print statement on the last line).
br = mechanize.Browser()

br.set_handle_equiv(False)
br.set_handle_robots(False)   # ignore robots

br.addheaders = [('User-agent', 'Firefox')]             # list of (header name, value) tuples
br.open('http://google.com/')

br.select_form(name='f')   # Note: select the form named 'f' here
# NOTE(review): spaces are replaced with '+' by hand before the value is put
# into the form; the form submission presumably applies its own URL encoding
# as well - confirm the query reaches Google as intended.
term = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease".replace(" ","+")
br.form['q'] = term # query
data = br.submit()

# Parse the response returned by the form submission.
soup = BeautifulSoup(data)


# Collect every <div> whose class attribute is 'f slp'.
cite= soup.findAll('div',{'class': 'f slp'})
# Takes the second match (index 1); this fails when fewer than two such divs
# were found on the page.
ref = str(cite[1])
print ref

However, I keep getting errors. I want the number of citations this paper has.

score 0 · Accepted Answer · edited May 23 '17 at 12:06

The problem is that there is no citation info on the page you get after submitting the form; in other words, there are no divs with the f slp class.

You have several options to solve it:

instead of mechanize, automate it in a real browser with selenium
use google search api

See also:

Hope that helps.

score 0 · Answer 2 · answered Jan 07 '23 at 06:21

To get the citations for a number of papers from Google, you can use a regular expression to extract the "Cited by" text from the snippet:

# Text of the element with CSS class "lEBKkf" inside the current result.
snippet = result.select_one(".lEBKkf").text
# .group() with no argument returns the whole match (e.g. "Cited by 238"),
# not just the captured digits.
cited_by = re.search(r'Cited by (\d+)', snippet).group()

In order to collect information from all pages, you need to use pagination with a while loop.

Pagination is possible as long as the next button exists, which is determined by the presence of a button selector on the page (in our case the CSS selector ".d6cvqb a[id=pnnext]"). If the button is present, increase the value of params["start"] by 10 to access the next page; otherwise, exit the while loop:

# A "next" link is present on the page: move the result offset forward by 10.
if soup.select_one('.d6cvqb a[id=pnnext]'):
    params["start"] += 10
else:
    # No "next" link: leave the enclosing while loop.
    break

Check code in online IDE.

from bs4 import BeautifulSoup
import requests, json, re, lxml

# Scrape Google's organic results for `query`, page by page, and collect the
# title, snippet and "Cited by N" text of every result into `citations`.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": query,          # query
    "hl": "en",          # language
    "gl": "uk",          # country of the search, UK -> United Kingdom
    "start": 0,          # offset of the first result; 0 is the first page
    #"num": 100          # parameter defines the maximum number of results to return.
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# Compiled once because it is applied to every result on every page.
cited_by_pattern = re.compile(r'Cited by (\d+)')

page_num = 0

citations = []

while True:
    page_num += 1
    print(f"page: {page_num}")

    html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
        # select_one() returns None when nothing matches, so test for that
        # explicitly instead of relying on a bare `except:` to hide the
        # resulting AttributeError (which would also hide unrelated errors).
        title_tag = result.select_one(".DKV0Md")
        title = title_tag.text if title_tag is not None else None

        snippet_tag = result.select_one(".lEBKkf")
        snippet = snippet_tag.text if snippet_tag is not None else None

        # Only search when there is a snippet; a result without a
        # "Cited by" marker yields None rather than raising.
        match = cited_by_pattern.search(snippet) if snippet is not None else None
        # .group() keeps the whole matched text, e.g. "Cited by 238".
        cited_by = match.group() if match is not None else None

        citations.append({
            "title": title,
            "snippet": snippet,
            "cited_by": cited_by,
        })

    # Keep going while the page offers a "next" link; each page holds 10 results.
    if soup.select_one('.d6cvqb a[id=pnnext]'):
        params["start"] += 10
    else:
        break

print(json.dumps(citations, indent=2, ensure_ascii=False))

Example output:

[
  {
    "title": "Targeted therapeutic options and future perspectives for ...",
    "snippet": "by J Wang · 2019 · Cited by 238 — Since its launch in 1998, trastuzumab became a therapeutic for breast cancer patients with HER2 overexpression and is widely administrated as ...",
    "cited_by": "Cited by 238"
  },
  {
    "title": "Trastuzumab Regimens for HER2-Overexpressing Metastatic ...",
    "snippet": "by DR Spigel · 2003 · Cited by 30 — Multinational study of the efficacy and safety of humanized anti-HER2 ... breast cancer that has progressed after chemotherapy for metastatic disease.",
    "cited_by": "Cited by 30"
  },
  other results...
]

Another solution is to use the Google Search Engine Results API from SerpApi. It's a paid API with a free plan. The difference is that it bypasses blocks (including CAPTCHA) from Google, so there is no need to create and maintain a parser.

Code example:

from serpapi import GoogleSearch
import os, json

# Fetch Google results for `query` through SerpApi and collect the title,
# snippet and citation count of every organic result into `citations`.
query = "Multinational Study of the Efficacy and Safety of Humanized Anti-HER2 Monoclonal Antibody in Women Who Have HER2-Overexpressing Metastatic Breast Cancer That Has Progressed After Chemotherapy for Metastatic Disease"

params = {
    "api_key": "...",          # https://serpapi.com/manage-api-key
    "device": "desktop",       # device
    "engine": "google",        # serpapi parser engine
    "q": query,                # query
    "gl": "uk",                # country of the search, UK -> United Kingdom
    "hl": "en"                 # language
}

search = GoogleSearch(params)  # search client configured with the parameters above
pages = search.pagination()    # iterated below, one result page at a time

citations = []

for page in pages:
    # Use .get() like the lookups below: a page that carries no
    # "organic_results" key (presumably an error or empty response - confirm
    # against the API docs) is skipped instead of raising KeyError.
    for organic_result in page.get("organic_results", []):
        title = organic_result.get("title")
        snippet = organic_result.get("snippet")
        # Each level falls back to an empty dict so a result without a rich
        # snippet simply yields cited_by = None.
        cited_by = (
            organic_result.get("rich_snippet", {})
            .get("top", {})
            .get("detected_extensions", {})
            .get("cited_by")
        )

        citations.append({
            "title": title,
            "snippet": snippet,
            "cited_by": cited_by,
        })

print(json.dumps(citations, indent=2))

Output:

[
   {
    "title": "Targeting Bcl-2 in Herceptin-Resistant Breast Cancer Cell Lines",
    "snippet": "recombinant humanized anti-HER2 monoclonal antibody approved for treatment of HER2-overexpressing metastatic breast cancer. Clinical studies have shown that ...",
    "cited_by": 71
  },
  {
    "title": "Estabilidad a largo plazo del trastuzumab en plasma y suero ...",
    "snippet": "Multinational study of the efficacy and safety of humanized anti-HER2 monoclonal antibody in women who have HER2-overexpressing metastatic breast cancer ...",
    "cited_by": 1
  }
  other results...
]

BeautifulSoup get 'f slp' items from google

2 Answers