I want to parallelize this script I've written for scraping some patent-specific info from Google Patent, picking patents from a list, with Python and Selenium toolbox. It performs the task perfectly:
- Open the browser
- If it is the first patent search in the general interface, and get four objects
- If it is the second patent onwards clear the search box, paste the new patent and repeat the same data extraction.
- Fill line by a line of a list with the info, and retrieve a CSV file.
This is my working script:
# Working env
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import csv
# Scraping function
def scrape_patent_data(code):
line = [code]
# Current assignee
# Wait for the current assignee element to be present, if so get it, if not state a missing (.)
try:
wait = WebDriverWait(driver, 5)
dl_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'dl.important-people.style-scope.patent-result')))
dd_elements = dl_element.find_elements(By.CSS_SELECTOR, 'dd.style-scope.patent-result')
dd_element = dd_elements[-1]
assignee = dd_element.text.strip()
line.append(assignee)
except NoSuchElementException:
line.append('.')
print('Patent', index, 'with no assignee found')
# Patent status
# Wait for patent status element to be present, if so get it, if not state a missing (.)
try:
dl_element = driver.find_element(By.CSS_SELECTOR,'div.wrap.style-scope.application-timeline')
dd_elements = dl_element.find_elements(By.CSS_SELECTOR, 'span.title-text.style-scope.application-timeline')
dd_element = dd_elements[-1]
status = dd_element.text.strip()
line.append(status)
except NoSuchElementException:
line.append('.')
print('Patent', index, 'with no status found')
# Relevant Dates
# Wait for the application date element to be present, if so get it, if not state a missing (.)
try:
app_date = driver.find_element(By.CSS_SELECTOR,'div.filed.style-scope.application-timeline')
line.append(app_date.text)
except NoSuchElementException:
line.append('.')
print('Patent', index, 'with no application date found')
# Extract data if patent was granted
# Wait for the granted date element to be present, if so get it, if not state a missing (.)
try:
grant_date = driver.find_element(By.CSS_SELECTOR, 'div.granted.style-scope.application-timeline')
line.append(grant_date.text)
except NoSuchElementException:
line.append('.')
print('Patent', index, 'Non granted')
return line
# Main code
csv_file_path = "patent_codes.csv"
with open(csv_file_path, "r") as csvfile:
reader = csv.reader(csvfile)
patents = list(reader)
codes = patents[:100]
codes = [item[0] for item in codes]
data = [['code', 'cur_assig', 'status', 'app_date', 'grant_date']]
index = 0
start_time = time.time()
driver = webdriver.Chrome()
driver.get('https://patents.google.com/')
time.sleep(1)
for code in codes:
if index == 0:
input_box = driver.find_element(By.CSS_SELECTOR, 'input.style-scope.search-box')
input_box.send_keys(code)
input_box.send_keys(Keys.ENTER)
time.sleep(1.3)
else:
input_box = driver.find_element(By.NAME, 'q')
input_box.clear()
input_box.send_keys(code)
input_box.send_keys(Keys.ENTER)
time.sleep(0.5)
line = scrape_patent_data(code)
data.append(line)
index += 1
print("Scrapped patent", index)
driver.quit()
elapsed_time = time.time() - start_time
print(f"The process took {elapsed_time} seconds.")
# CSV export
csv_file_path = "patent_data.csv"
with open(csv_file_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerows(data)
Every method I've tryied doesn't complete the task properly, when I manage to operate in parallel it doesn't get the data, and when it get the data it does so by opening one browser for each patent or partially because it overlaps request in the same browser.
I've written this auxiliar code to operate in parallel
def process_code(args):
index, code, driver = args
if index == 0:
input_box = driver.find_element(By.CSS_SELECTOR, 'input.style-scope.search-box')
input_box.send_keys(code)
input_box.send_keys(Keys.ENTER)
time.sleep(1.3)
else:
input_box = driver.find_element(By.NAME, 'q')
input_box.clear()
input_box.send_keys(code)
input_box.send_keys(Keys.ENTER)
time.sleep(0.5)
line = scrape_patent_data(driver, code)
return line
And this is the part of the script that tries to parallel perform the instructions
python
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
# Number of processes to use (based on available CPU cores)
num_processes = multiprocessing.cpu_count()
# Create a list to hold the driver instances
drivers = []
# Create the initial driver instances
for _ in range(num_processes):
driver = webdriver.Chrome()
driver.get('https://patents.google.com/')
#time.sleep(1)
drivers.append(driver)
# Create a ThreadPoolExecutor with the desired number of processes
with ThreadPoolExecutor(max_workers=num_processes) as executor:
args_list = [(index, code, drivers[index % num_processes]) for index, code in enumerate(codes)]
futures = list(executor.map(process_code, args_list))
# Wait for all the driver instances to quit
for driver in drivers:
driver.quit()
# Process the results from futures
for line in futures:
data.append(line)
# Print the data to verify
for item in data:
print(item)
And I always get some issues with the "index" operator, different changes and ways of including it delivers the problems mention before, but never what I want.