I'm running a webdriver with Selenium to scrape items from a page after sending keys and clicking a button. However, my data is fairly large (~28,000 rows) and the time it takes to complete a single page is ~0.8–1.3 seconds. Are there ways of improving the speed so it drops below 0.5 seconds per page? I have thought of using multiprocessing; however, I'm inexperienced in that field.
Here's what I've managed to create which is most efficient so far but I'm pretty convinced it could go faster.
#Example data
df = pd.DataFrame(defaultdict(list,
{'salary': [28452, 28452, 31000, 35000, 35000],
'tuition': [27750, 27750, 27750, 27750, 27750],
'country': ['England',
'England',
'England',
'England',
'England'],
'category': ['anatomy',
'physiology',
'finance',
'finance',
'finance']}))
The imports:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from collections import defaultdict
import time
import timeit
This data is used to drive the `send_keys` calls and clicks:
debt1 = []
salary1 = []
loan1 = []
plan21 = []
button1 = []
for i in range(0, len(df)):
i
debt1.append("//input[@id='debt']")
salary1.append("//input[@id='salary']")
loan1.append("//select[@id='loan-type']")
plan21.append("//select[@id='loan-type']/option[2]")
button1.append("//button[@class='btn btn-primary calculate-button']")
Finally, here's the selenium driver to scrape the information:
i = 0
driver = webdriver.Chrome()
while i < len(df):
for salary,tuition,category,country,deb, sal, plan, lo, but in zip(df.salary,df.tuition,df.category, df.country,debt1,salary1, loan1, plan21, button1):
start = timeit.default_timer()
driver.get("https://www.student-loan-calculator.co.uk/")
driver.find_element(By.XPATH, sal
).clear()
driver.find_element(By.XPATH, sal
).send_keys(salary)
driver.find_element(By.XPATH, deb
).clear()
driver.find_element(By.XPATH, deb
).send_keys(str(tuition))
driver.find_element(By.XPATH, lo).click()
driver.find_element(By.XPATH, plan).click()
driver.find_element(By.XPATH, but).click()
driver2 = driver.page_source
tables_debt['table'].append(pd.read_html(driver2))
tables_debt['category'].append(category)
tables_debt['country'].append(country)
i+= 1
stop = timeit.default_timer()
print(f"You're on this country: {country} and this row number {i}", 'And the total time is:', stop - start)
Here's the output speed that I get. It starts off slow but settles in the 0.8–1.3 second range afterwards:
You're on this country: England and this row number 1 And the total time is: 2.415818831999786
You're on this country: England and this row number 2 And the total time is: 1.8458935059970827
You're on this country: England and this row number 3 And the total time is: 1.2618036500025482
You're on this country: England and this row number 4 And the total time is: 0.8504524469972239
You're on this country: England and this row number 5 And the total time is: 0.8366585959993245
With the webpage provided, here's what I have come up with:
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup
from selenium import webdriver
import threading
import gc
class Driver:
def __init__(self):
options = webdriver.ChromeOptions()
options.add_argument("--headless")
# suppress logging:
options.add_experimental_option('excludeSwitches', ['enable-logging'])
self.driver = webdriver.Chrome(options=options)
print('The driver was just created.')
def __del__(self):
self.driver.quit() # clean up driver when we are cleaned up
print('The driver has terminated.')
threadLocal = threading.local()
def create_driver():
the_driver = getattr(threadLocal, 'the_driver', None)
if the_driver is None:
the_driver = Driver()
setattr(threadLocal, 'the_driver', the_driver)
return the_driver.driver
def get_title(url):
driver = create_driver()
i = 0
#driver = webdriver.Chrome()
tables_debt = defaultdict(list)
while i < len(df):
for salary,tuition,category,country,deb, sal, plan, lo, but in zip(df.salary,df.tuition,df.category, df.country,debt1,salary1, loan1, plan21, button1):
start = timeit.default_timer()
driver.get(url)
driver.find_element(By.XPATH, sal
).clear()
driver.find_element(By.XPATH, sal
).send_keys(salary)
driver.find_element(By.XPATH, deb
).clear()
driver.find_element(By.XPATH, deb
).send_keys(str(tuition))
driver.find_element(By.XPATH, lo).click()
driver.find_element(By.XPATH, plan).click()
driver.find_element(By.XPATH, but).click()
driver2 = driver.page_source
tables_debt['table'].append(pd.read_html(driver2))
tables_debt['category'].append(category)
tables_debt['country'].append(country)
i+= 1
stop = timeit.default_timer()
print(f"You're on this country: {country} and this row number {i}", 'And the total time is:', stop - start)
# just 2 threads in our pool for demo purposes:
with ThreadPool(10) as pool:
urls = [
"https://www.student-loan-calculator.co.uk/"
]
pool.map(get_title, urls)
# must be done before terminate is explicitly or implicitly called on the pool:
del threadLocal
gc.collect()
# pool.terminate() is called at exit of with block
However, the output shows timings similar to the original code:
The driver was just created.
You're on this country: England and this row number 1 And the total time is: 1.7789302960009081
You're on this country: England and this row number 2 And the total time is: 1.3455885630028206
You're on this country: England and this row number 3 And the total time is: 1.7194338539993623
You're on this country: England and this row number 4 And the total time is: 0.8721739090033225
You're on this country: England and this row number 5 And the total time is: 0.9608322030035197