I am scraping Google Patents with Selenium and, as expected, I am getting rate limited. I have tried all kinds of configurations (such as the suggestions in "How can I make a Selenium script undetectable using GeckoDriver and Firefox through Python?", proxies, and so on), but nothing works.
import functools
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

from utils import call_and_wait
def call_and_wait(_func=None, *, n=3):
    """Decorate a callable so that every call is followed by a pause.

    Works both bare (``@call_and_wait``) and with arguments
    (``@call_and_wait(n=5)``).

    Args:
        _func: The callable being decorated when the decorator is applied
            without parentheses; ``None`` when it is applied with arguments.
        n: Number of seconds to sleep after the wrapped callable returns.

    Returns:
        The wrapped callable when used bare, otherwise a decorator that
        produces the wrapped callable.
    """

    def decorator_wait(func):
        @functools.wraps(func)
        def wrapper_repeat(*args, **kwargs):
            value = func(*args, **kwargs)
            # The pause happens only after a successful call; if ``func``
            # raises, the exception propagates immediately without sleeping.
            time.sleep(n)
            return value

        return wrapper_repeat

    if _func is None:
        # Called as ``@call_and_wait(n=...)``: hand back the real decorator.
        return decorator_wait
    # Called as ``@call_and_wait``: ``_func`` is the function to wrap.
    return decorator_wait(_func)
class GooglePatentClient:
    """Drive a Firefox session through a Google Patents search and CSV export.

    Each UI action (``query``, ``filter_patent``, ``filter_english``,
    ``download_csv``) is wrapped with ``call_and_wait(n=3)``, so it is
    followed by a three-second pause before the next action can start.

    The instance can be used as a context manager so the browser is shut
    down even when an action raises::

        with GooglePatentClient() as client:
            client.connect()
            client.query("guz")
    """

    DEFAULT_PROXY_HOST = "103.149.162.195"
    DEFAULT_PROXY_PORT = 80

    # Absolute locators copied from the live page; they break whenever the
    # page layout changes.
    _RESTRICT_EDITOR_XPATH = (
        '/html/body/search-app/search-results/search-ui/div/div/div[1]/div[1]'
        '/div/workspace-ui/div[1]/workspace-ui-search/metadata-editor/div[4]'
        '/restrict-editor/div'
    )
    _TYPE_DROPDOWN_CSS = (
        '#box > div:nth-child(2) > dropdown-menu:nth-child(2) > span > span:nth-child(1)'
    )
    _PATENT_OPTION_XPATH = (
        _RESTRICT_EDITOR_XPATH
        + '/div[2]/dropdown-menu[2]/iron-dropdown/div/div/div/div[1]'
    )
    _LANGUAGE_DROPDOWN_XPATH = (
        _RESTRICT_EDITOR_XPATH + '/div[1]/dropdown-menu[2]/span/span[1]'
    )
    _ENGLISH_OPTION_XPATH = (
        _RESTRICT_EDITOR_XPATH
        + '/div[1]/dropdown-menu[2]/iron-dropdown/div/div/div/div[1]'
    )
    _DOWNLOAD_LINK_XPATH = (
        '/html/body/search-app/search-results/search-ui/div/div/div[2]/div/div'
        '/div[1]/div[2]/div[1]/span[2]/a'
    )

    def __init__(self, proxy_host=DEFAULT_PROXY_HOST, proxy_port=DEFAULT_PROXY_PORT):
        """Start Firefox, optionally routed through a manual proxy.

        Args:
            proxy_host: Proxy host name or IP address. Pass ``None`` to
                start the browser without any proxy preferences.
            proxy_port: Proxy port; anything accepted by ``int()``.
        """
        options = webdriver.FirefoxOptions()
        if proxy_host is not None:
            port = int(proxy_port)
            options.set_preference("network.proxy.type", 1)  # 1 = manual proxy
            options.set_preference("network.proxy.http", proxy_host)
            options.set_preference("network.proxy.http_port", port)
            # Firefox keeps a separate proxy setting for HTTPS. The site is
            # opened over https://, so without these two preferences the
            # requests would not go through the proxy at all.
            options.set_preference("network.proxy.ssl", proxy_host)
            options.set_preference("network.proxy.ssl_port", port)
        options.set_preference("dom.webdriver.enabled", False)
        # NOTE(review): 'useAutomationExtension' looks like a Chrome option;
        # it is presumably ignored by Firefox. Kept to match the original
        # configuration - confirm and drop if unused.
        options.set_preference('useAutomationExtension', False)
        # Preferences go through Options because the ``firefox_profile`` and
        # ``desired_capabilities`` keyword arguments are deprecated in
        # Selenium 4.
        self.driver = webdriver.Firefox(options=options)

    def __enter__(self):
        """Return the client itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down when leaving the ``with`` block."""
        self.close()

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def _click(self, by, locator):
        """Find a single element with the given strategy and click it."""
        self.driver.find_element(by, value=locator).click()

    def connect(self, url='https://patents.google.com/'):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    @call_and_wait(n=3)
    def query(self, q):
        """Type ``q`` into the element named ``q`` and submit with Return."""
        search_box = self.driver.find_element(By.NAME, "q")
        search_box.clear()
        search_box.send_keys(q)
        search_box.send_keys(Keys.RETURN)

    @call_and_wait(n=3)
    def filter_patent(self):
        """Open the type dropdown and pick its first entry (patent)."""
        self._click(By.CSS_SELECTOR, self._TYPE_DROPDOWN_CSS)  # Type button
        self._click(By.XPATH, self._PATENT_OPTION_XPATH)  # "Patent" entry

    @call_and_wait(n=3)
    def filter_english(self):
        """Open the language dropdown and pick its first entry (English)."""
        self._click(By.XPATH, self._LANGUAGE_DROPDOWN_XPATH)  # Language button
        self._click(By.XPATH, self._ENGLISH_OPTION_XPATH)  # "English" entry

    @call_and_wait(n=3)
    def download_csv(self):
        """Click the download link on the results page."""
        self._click(By.XPATH, self._DOWNLOAD_LINK_XPATH)
And the main file:
from GooglePatentClient import GooglePatentClient
queries = ['guz']


def main():
    """Run every search in ``queries`` and download its CSV export.

    A single browser session is reused for all queries and is always shut
    down at the end, including when one of the steps raises.
    """
    client = GooglePatentClient()
    try:
        for query in queries:
            client.connect()
            client.query(query)
            client.filter_patent()
            client.filter_english()
            client.download_csv()  # Should take some time
    finally:
        # Without this the Firefox process outlives the script.
        # NOTE(review): download_csv is followed by a short built-in pause;
        # confirm that is long enough for the last download to finish
        # before the browser quits.
        client.driver.quit()


if __name__ == '__main__':
    main()
Unfortunately, after about 20 downloads, I get rate limited. It's strange to me that when I download with a normal Chrome/Firefox session, I don't get a 429. Something is detecting Selenium, but I don't know how to get around the 429. Any ideas?