I am scraping Google Patents with Selenium and, as expected, I am getting rate limited. I have tried all kinds of configurations (such as the ones in How can I make a Selenium script undetectable using GeckoDriver and Firefox through Python?, proxies, and so on), but nothing works.

import functools
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def call_and_wait(_func=None, *, n=3):
    """Call the wrapped function, then sleep for n seconds."""
    def decorator_wait(func):
        @functools.wraps(func)
        def wrapper_repeat(*args, **kwargs):
            value = func(*args, **kwargs)
            time.sleep(n)  # fixed pause after every call
            return value
        return wrapper_repeat

    if _func is None:
        return decorator_wait  # used as @call_and_wait(n=...)
    else:
        return decorator_wait(_func)  # used as bare @call_and_wait
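
# For reference, the "_func is None" branch makes both decorator forms work;
# hypothetical usage, not part of the original script:
#
#     @call_and_wait          # bare form: default 3-second pause after the call
#     def load_page(): ...
#
#     @call_and_wait(n=10)    # parametrized form: 10-second pause
#     def slow_step(): ...
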
class GooglePatentClient:
    def __init__(self):
        profile = webdriver.FirefoxProfile()

        # Route all HTTP traffic through a proxy
        PROXY_HOST = "103.149.162.195"
        PROXY_PORT = "80"
        profile.set_preference("network.proxy.type", 1)  # 1 = manual proxy configuration
        profile.set_preference("network.proxy.http", PROXY_HOST)
        profile.set_preference("network.proxy.http_port", int(PROXY_PORT))

        # Anti-detection tweaks: hide navigator.webdriver and the automation extension
        profile.set_preference("dom.webdriver.enabled", False)
        profile.set_preference("useAutomationExtension", False)
        profile.update_preferences()
        desired = DesiredCapabilities.FIREFOX

        driver = webdriver.Firefox(firefox_profile=profile, desired_capabilities=desired)

        self.driver = driver

    def connect(self, url='https://patents.google.com/'):
        self.driver.get(url)

    @call_and_wait(n=3)
    def query(self, q):
        elem = self.driver.find_element(By.NAME, "q")
        elem.clear()
        elem.send_keys(q)
        elem.send_keys(Keys.RETURN)

    @call_and_wait(n=3)
    def filter_patent(self):
        self.driver.find_element(By.CSS_SELECTOR,
                                 value='#box > div:nth-child(2) > dropdown-menu:nth-child(2) > span > span:nth-child(1)').click()  # Click on the Type dropdown
        self.driver.find_element(By.XPATH,
                                 value='/html/body/search-app/search-results/search-ui/div/div/div[1]/div[1]/div/workspace-ui/div[1]/workspace-ui-search/metadata-editor/div[4]/restrict-editor/div/div[2]/dropdown-menu[2]/iron-dropdown/div/div/div/div[1]').click()  # Click on patent

    @call_and_wait(n=3)
    def filter_english(self):
        self.driver.find_element(By.XPATH,
                                 value='/html/body/search-app/search-results/search-ui/div/div/div[1]/div[1]/div/workspace-ui/div[1]/workspace-ui-search/metadata-editor/div[4]/restrict-editor/div/div[1]/dropdown-menu[2]/span/span[1]').click()  # Click on Language
        self.driver.find_element(By.XPATH,
                                 value='/html/body/search-app/search-results/search-ui/div/div/div[1]/div[1]/div/workspace-ui/div[1]/workspace-ui-search/metadata-editor/div[4]/restrict-editor/div/div[1]/dropdown-menu[2]/iron-dropdown/div/div/div/div[1]').click()  # Click on English

    @call_and_wait(n=3)
    def download_csv(self):
        self.driver.find_element(By.XPATH,
                                 value='/html/body/search-app/search-results/search-ui/div/div/div[2]/div/div/div[1]/div[2]/div[1]/span[2]/a').click()  # Click on Download button

and the main file:

from GooglePatentClient import GooglePatentClient    

queries = ['guz']
if __name__ == '__main__':
    client = GooglePatentClient()

    for query in queries:
        client.connect()
        client.query(query)
        client.filter_patent()
        client.filter_english()
        client.download_csv() # Should take some time

Unfortunately, after about 20 downloads I get rate limited. What's strange to me is that when I download with a normal Chrome/Firefox session, I never get a 429. It's detecting Selenium somehow, but I don't know how to get around the 429.
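
One mitigation I have sketched, but not verified against Google's limits, is replacing the fixed 3-second sleep with a randomized delay, plus an exponential backoff whenever a step fails (for example, when the download link is missing on a 429 page). The jitter bounds and retry count below are arbitrary:

import functools
import random
import time

def call_and_wait_jittered(_func=None, *, low=2.0, high=7.0):
    """Like call_and_wait, but sleep a random interval to look less robotic."""
    def decorator_wait(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            value = func(*args, **kwargs)
            time.sleep(random.uniform(low, high))  # jittered pause
            return value
        return wrapper
    return decorator_wait if _func is None else decorator_wait(_func)

def with_backoff(func, retries=3, base=30):
    """Call func; on an exception, wait base, 2*base, 4*base... seconds and retry."""
    for attempt in range(retries):
        try:
            return func()
        except Exception:
            time.sleep(base * 2 ** attempt)
    return func()  # final attempt; let the exception propagate

# e.g. with_backoff(client.download_csv)

Any ideas?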

  • Try sleeping in between requests. A human can't click as soon as the previous thing is done loading, which your code is doing. – Esther Jun 08 '22 at 13:26
  • I did; I forgot to include the call_and_wait function in the post. It sleeps 3 seconds between requests. – Petar Ulev Jun 08 '22 at 13:27

0 Answers