#scrape.py
import threading
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options


threadLocal = threading.local()

def get_driver():
    browser = getattr(threadLocal, 'browser', None)
    if browser is None:
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument("--headless")
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--lang=en")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
        chrome_options.binary_location = "/usr/bin/google-chrome"
        browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', options=chrome_options)
        setattr(threadLocal, 'browser', browser)
    return browser

def run_scrape(link):
    browser = get_driver()
    browser.get(link)
    try:
        # scrape process
        pass
    except:
        # other stuff
        pass
#multiprocess.py
from scrape import run_scrape
from multiprocessing.pool import ThreadPool
import time
if __name__ == '__main__':
    start_time = time.time()
    #links = list of links to be scraped
    pool = ThreadPool(20)
    results = pool.map(run_scrape, links)
    print("Total Time Processed: "+"--- %s seconds ---" % (time.time() - start_time))

The chromedriver exits after the workers finish, but my question is: are there instances of the Chrome browser that don't exit? Is there a way that I can store it also in the threadLocal?

1 Answer

As you instantiate one WebDriver instance for each thread, this approach will launch multiple Browsing Contexts, but conceptually your program looks good to go.

To exit the ChromeDriver / Chrome sessions from the thread pool, you have to invoke quit() individually for each worker thread that gets instantiated. However, I would avoid any attempt to store any session within the threadLocal, to avoid a potential crash.

So, I modified your code a bit, adding quit(), which is invoked individually for each worker thread that gets instantiated, and here are the execution results:

  • Code Block:

    • multiprocess.py:

      from scrape import run_scrape
      from multiprocessing.pool import ThreadPool
      import time
      
      if __name__ == '__main__':
          start_time = time.time()
          links = ["https://selenium.dev/downloads/", "https://selenium.dev/documentation/en/"] 
          pool = ThreadPool(20)
          results = pool.map(run_scrape, links)
          print("Total Time Processed: "+"--- %s seconds ---" % (time.time() - start_time)) 
      
    • scrape.py:

      #scrape.py
      import threading
      from selenium import webdriver
      from selenium.common.exceptions import NoSuchElementException, TimeoutException
      from selenium.webdriver.chrome.options import Options
      
      
      threadLocal = threading.local()
      
      def get_driver():
          browser = getattr(threadLocal, 'browser', None)
          if browser is None:
              chrome_options = Options()
              chrome_options.add_argument('--no-sandbox')
              chrome_options.add_argument("--headless")
              chrome_options.add_argument('--disable-dev-shm-usage')
              chrome_options.add_argument("--lang=en")
              chrome_options.add_argument("--start-maximized")
              chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
              chrome_options.add_experimental_option('useAutomationExtension', False)
              chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
              chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
              browser = webdriver.Chrome(executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe', options=chrome_options)
              setattr(threadLocal, 'browser', browser)
          return browser
      
      def run_scrape(link):
          browser = get_driver()
          browser.get(link)
          try:
              print(browser.title)
          except (NoSuchElementException, TimeoutException):
              print("Error")
          browser.quit()
      
  • Console Output:

    Downloads
    The Selenium Browser Automation Project :: Documentation for Selenium
    Total Time Processed: --- 10.329657554626465 seconds ---
    
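As a hedged refinement of the modified code above (my own sketch, not part of the original answer), you can move quit() into a finally block inside run_scrape() so that each worker's Chrome instance is closed even when browser.get() or the scraping step raises; it reuses get_driver() and the exception imports from scrape.py shown above:

    # Variant of run_scrape() from scrape.py above -- a hedged sketch.
    # quit() sits in a finally block so the per-task Chrome instance is
    # torn down even if navigation or scraping raises an exception.
    def run_scrape(link):
        browser = get_driver()
        try:
            browser.get(link)
            print(browser.title)
        except (NoSuchElementException, TimeoutException):
            print("Error")
        finally:
            browser.quit()

Because quit() runs once per task, the driver cached in threadLocal is not reusable for a thread's next link; each task effectively follows a create-use-quit cycle, which is consistent with the advice above not to rely on keeping the session in threadLocal.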
undetected Selenium