
I have around 30k license numbers that I want to search on a website, extracting all the relevant information for each one. When I extract the information with the function below by looping through multiple license_nums, the code works fine and gives me what I am looking for:

# imports needed by the snippets below
from concurrent import futures
from random import randint
import threading
import time

import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import NoSuchWindowException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# create a UserAgent object to generate random user agents
user_agent = UserAgent()

# create a ChromeOptions object to set the user agent in the browser header
chrome_options = Options()
chrome_options.add_argument(f'user-agent={user_agent.random}')
chrome_options.add_argument("start-maximized")

# create a webdriver instance with the ChromeOptions object
driver = webdriver.Chrome(options=chrome_options,executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')

driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
print(driver.execute_script("return navigator.userAgent;"))

form_url = "https://cdicloud.insurance.ca.gov/cal/LicenseNumberSearch?handler=Search"
driver.get(form_url)

license_num = ['0726675', '0747600', '0691046', '0D95524', '0E77989', '0L78427']

def get_license_info(license):
    if license not in license_num:
        return pd.DataFrame()
    df_license = []
    driver.find_element(By.ID, 'SearchLicenseNumber').send_keys(license)
    time.sleep(randint(15,100))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "btnSearch"))).click()
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    table = soup.find('table', id='searchResult')
    license_name = []
    license_number =[]

    # extract all license names and numbers on the page
    for row in table.tbody.find_all('tr'):    
        # Find all data for each column
        columns = row.find_all('td')

        if(columns != []):
            l_name = columns[0].text.strip().replace("\t"," ")
            license_name.append(l_name)
            license_number.append(columns[1].text.strip())
            print(l_name)
    for row in range(0, len(license_name)):      
            first_page_handle = driver.current_window_handle
            time.sleep(5)
    
            WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, f"//table[@id='searchResult']/tbody/tr[{row+1}]/td[2]/a"))).click()
            try:
                driver.switch_to.window(driver.window_handles[1])
                html = driver.page_source
                soup = BeautifulSoup(html, "lxml")
                #Grab license type and Expiration date
                table_l = soup.find('table', id='licenseDetailGrid')
                data = []
                for tr in table_l.find_all('tr'):
                    row = [td.text for td in tr.find_all('td')]
                    data.append(row)
                df1 = pd.DataFrame(data, columns=['license_type','original_issue_date','status','status_date','exp_date'])
                time.sleep(5)
                business = soup.find("div",id="collapse-LicenseDetailSection").extract()
                b_list = list(business.stripped_strings)
                df_final = df1[df1['license_type'].str.contains("Accident",na=False)]
                df_final = df_final.assign(license_type=df_final['license_type'].str.extract('(.*)\n'))
                df_final['license_name'] = l_name
                df_final['license_number'] = license
                df_license.append(df_final)
                driver.close()
                driver.switch_to.window(first_page_handle)
            except NoSuchWindowException:
                    print("Window closed, skipping to next license")


    driver.find_element('id','SearchLicenseNumber').clear()
    time.sleep(5)

    return pd.concat(df_license)

When I try to run it with multiple threads, it doesn't show the value in the search field and throws errors.

Approach 1 (from Scraping multiple webpages at once with Selenium):

Error: It runs for the first item in license_num, then starts searching for an empty license number and throws 'An exception occurred: 'NoneType' object has no attribute 'tbody''

with futures.ThreadPoolExecutor() as executor:     
    # store the url for each thread as a dict, so we can know which thread fails
    future_results = {license: executor.submit(get_license_info, license) for license in license_num}
    
    for license, future in future_results.items(): 
        try:
            df_license = pd.concat([f.result() for f in future_results.values()])
        except Exception as exc:
            print('An exception occurred: {}'.format(exc))

Approach 2 (from How to run `selenium-chromedriver` in multiple threads):

Error: This approach also only searches for the first item in the list and then throws 'Message: stale element reference: element is not attached to the page document'

start_time = time.time()    
threads = [] 
for license in license_num: # each thread could be like a new 'click' 
    th = threading.Thread(target=get_license_info, args=(license,))    
    th.start() # could `time.sleep` between 'clicks' to see whats'up without headless option
    threads.append(th)        
for th in threads:
    th.join() # Main thread wait for threads finish
print("multiple threads took ", (time.time() - start_time), " seconds")

Can anybody help me with this? Thank you in advance.

anonymous13
  • What is the error you are getting for both the Approaches? And you could elaborate more about what is happening from both the Approaches. – pmadhu Apr 28 '23 at 11:37
  • You don't include a stack trace or describe in detail your error. But if you have 30K licenses and you are attempting to create a web driver for each one, that can be a real problem. See [this question](https://stackoverflow.com/questions/53475578/python-selenium-multiprocessing/64513719#64513719) and in particular my answer. – Booboo Apr 28 '23 at 11:40
  • @Booboo I tried your approach it continuously gives me captcha validation failed, which I was not getting earlier – anonymous13 Apr 28 '23 at 18:35
  • 1
    @anonymous13 so the `captcha validation failed` error occurred when you were querying `cdicloud.insurance.ca.gov` to quickly? If so, you likely triggered some rate limit threshold. Why do you need to scrape the website fast for data, which is likely historical? – Life is complex May 01 '23 at 18:13
  • 1
    A suggestion: use the `multiprocessing` and divide the license number into batches. – Memristor May 01 '23 at 19:52
  • I noted when I was looking into this question that the site uses `CloudFlare` and `recaptcha`. If you hit this site too hard you will have other issues that you will have to deal with. – Life is complex May 03 '23 at 03:39
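Following up on the comments above about driver reuse, rate limits, and batching: rather than one browser per license, a pattern worth trying is a small fixed pool of worker threads where each thread lazily creates and reuses its own driver via `threading.local`. This is only a sketch, not code from the question: it assumes `get_license_info` has been modified to take a driver argument (as the answer below does), that `license_num` is the list from the question, and the worker count and sleep are guesses to be tuned against the site's rate limits.

import threading
import time
from concurrent import futures

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

thread_local = threading.local()

def get_driver():
    # lazily create one driver per worker thread and reuse it across licenses
    driver = getattr(thread_local, 'driver', None)
    if driver is None:
        chrome_options = Options()
        chrome_options.add_argument("start-maximized")
        driver = webdriver.Chrome(options=chrome_options)
        driver.get("https://cdicloud.insurance.ca.gov/cal/LicenseNumberSearch?handler=Search")
        thread_local.driver = driver
    return driver

def scrape_one(license):
    driver = get_driver()
    time.sleep(2)  # crude throttle; tune against the site's rate limits
    # get_license_info is assumed to be the question's function,
    # modified to accept the driver (as in the answer below)
    return get_license_info(license, driver)

# a handful of workers is usually enough; more just triggers rate limiting
with futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(scrape_one, license_num))

df_all = pd.concat(results, ignore_index=True)

Note that the per-thread drivers are not quit when the pool shuts down, so for a long run you would want to keep track of them and call `driver.quit()` at the end.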

1 Answer


You are encountering these errors because you are trying to access the same webdriver instance from multiple threads. A webdriver instance is not thread-safe and must not be shared across threads.

One solution is to create a new webdriver instance for each thread. You can modify your get_license_info function to take a webdriver instance as an argument, and then create a new webdriver instance for each thread outside of the function.

I had no access to the web page that you're trying to scrape, so you should replace the selectors and modify this example code as needed:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchWindowException
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pandas as pd
import threading
import time

def get_license_info(license, driver):
    df_license = []
    search_box = driver.find_element(By.ID, 'SearchLicenseNumber')
    search_box.clear()
    search_box.send_keys(license)
    time.sleep(1)
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "btnSearch"))).click()
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    table = soup.find('table', id='searchResult')
    if table is None or table.tbody is None:
        # guard against the 'NoneType' object has no attribute 'tbody' error
        # when no results are rendered (empty search, captcha page, etc.)
        return pd.DataFrame()

    license_name = []
    license_number = []

    # extract all license names and numbers on the page
    for row in table.tbody.find_all('tr'):
        # Find all data for each column
        columns = row.find_all('td')

        if(columns != []):
            l_name = columns[0].text.strip().replace("\t"," ")
            license_name.append(l_name)
            license_number.append(columns[1].text.strip())
            print(l_name)
    for row in range(0, len(license_name)):      
        first_page_handle = driver.current_window_handle
        time.sleep(1)

        WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, f"//table[@id='searchResult']/tbody/tr[{row+1}]/td[2]/a"))).click()
        try:
            driver.switch_to.window(driver.window_handles[1])
            html = driver.page_source
            soup = BeautifulSoup(html, "lxml")
            #Grab license type and Expiration date
            table_l = soup.find('table', id='licenseDetailGrid')
            data = []
            for tr in table_l.find_all('tr'):
                row = [td.text for td in tr.find_all('td')]
                data.append(row)
            df1 = pd.DataFrame(data, columns=['license_type','original_issue_date','status','status_date','exp_date'])
            time.sleep(1)
            business = soup.find("div",id="collapse-LicenseDetailSection").extract()
            b_list = list(business.stripped_strings)
            df_final = df1[df1['license_type'].str.contains("Accident",na=False)]
            df_final = df_final.assign(license_type=df_final['license_type'].str.extract('(.*)\n'))
            df_final['license_name'] = l_name
            df_final['license_number'] = license
            df_license.append(df_final)
            driver.close()
            driver.switch_to.window(first_page_handle)
        except NoSuchWindowException:
            print("Window closed, skipping to next license")

    search_box = driver.find_element(By.ID, 'SearchLicenseNumber')
    search_box.clear()
    time.sleep(1)

    return pd.concat(df_license)

def run_in_thread(license):
    driver = None
    try:
        # create a UserAgent object to generate random user agents
        user_agent = UserAgent()

        # create a ChromeOptions object to set the user agent in the browser header
        chrome_options = Options()
        chrome_options.add_argument(f'user-agent={user_agent.random}')
        chrome_options.add_argument("start-maximized")

        # create a webdriver instance with the ChromeOptions object
        driver = webdriver.Chrome(options=chrome_options,executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')

        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
        print(driver.execute_script("return navigator.userAgent;"))

        form_url = "https://cdicloud.insurance.ca.gov/cal/LicenseNumberSearch?handler=Search"
        driver.get(form_url)

        # license_num is the list defined in the question
        if license not in license_num:
            return pd.DataFrame()

        # reuse the scraping logic above with this thread's own driver
        # instead of duplicating it inline
        return get_license_info(license, driver)
    except Exception as exc:
        print(f'An exception occurred for license {license}: {exc}')
        return pd.DataFrame()
    finally:
        if driver is not None:
            driver.quit()
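The snippet above ends before the threads are actually started. A minimal, hypothetical launcher, using a bounded `ThreadPoolExecutor` rather than one bare `threading.Thread` per license (30k concurrent browsers would never work), could look like this:

# launch the per-thread workers with a bounded pool
# license_num is the list of license numbers from the question
from concurrent import futures

start_time = time.time()
with futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(run_in_thread, license_num))

df_license = pd.concat(results, ignore_index=True)
print("multiple threads took ", (time.time() - start_time), " seconds")

Keep `max_workers` small: as noted in the comments, this site sits behind Cloudflare and reCAPTCHA, and hitting it with many parallel browsers will trigger captcha failures long before threading becomes the bottleneck.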