I have built a web scraper with Selenium that crawls a web page indefinitely. I am trying to create two instances of this scraper and run them in parallel, so that two different portions of the site (or two different sites entirely) are scraped at the same time. With my current code, both processes start and two Chrome instances launch, but only one of them actually starts scraping; the other just sits on the landing page and never moves. My current scraper class looks like this:
# Module-level imports used by the class. The Scraper base class (which provides
# the scraper_wait_* helper methods) is defined elsewhere in my module.
import re
import time
from random import randint

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException


class clBot(Scraper):
    def __init__(self, light_or_dark):
        light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                             '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
        dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
        if light_or_dark == "light":
            self.xpaths_to_scrape = light_side_xpaths
            self.csv_file = "lightside.csv"
        elif light_or_dark == "dark":
            self.xpaths_to_scrape = dark_side_xpaths
            self.csv_file = "darkside.csv"
        else:
            print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
            quit()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        self.options.add_argument(f'user-agent={self.user_agent}')  # f-string so the agent string is actually substituted
        self.current_region = ''
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()

    def identify_phone_number(self, string, phone_number_list):
        # Pull anything that looks like a phone number out of the listing text
        # and append it to the CSV if it has not been seen before.
        reg = re.findall(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
        if len(reg) > 0:
            for r in reg:
                if r.strip() not in phone_number_list:
                    with open(self.csv_file, 'a') as csv:
                        csv.write("{}\n".format(r.strip()))
                    print("Extracted {} from listing".format(r.strip()))
                else:
                    print('Phone number already in list.')

    def extract_phone_number(self):
        try:
            with open(self.csv_file, 'r') as csv:
                current_phone_numbers = csv.read()
            posting_body = self.driver.find_element_by_id('postingbody')
            self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
            contact_info = self.driver.find_element_by_class_name('showcontact')
            contact_info.click()
            time.sleep(1)
            self.identify_phone_number(posting_body.text, current_phone_numbers)
        except TimeoutException:
            # No "show contact" link appeared; scan whatever text is already visible.
            self.identify_phone_number(posting_body.text, current_phone_numbers)
            print('There is no phone number in this listing.')

    def scrape_pages(self):
        # Click through every result row on the page; an IndexError means we have
        # run out of rows, so move on to the next page of results.
        i = 1
        while True:
            try:
                self.scraper_wait_class_until_all(self.driver, 'result-row')
                results = self.driver.find_elements_by_class_name('result-row')
                print("clicking result {}".format(i))
                results[i].find_element_by_class_name('result-title').click()
                self.extract_phone_number()
                self.driver.back()
                i += 1
            except IndexError:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                print('Navigating to next page.')
                next_button.click()
                i = 1

    def choose_xpath_to_scrape(self, list_of_xpaths):
        xpath_index = randint(0, len(list_of_xpaths) - 1)
        xpath = list_of_xpaths[xpath_index]
        return xpath

    def navigate_pages(self):
        # Pick a random nearby craigslist region, then scrape each configured section.
        try:
            while True:
                try:
                    self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                    rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                    nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                    child_items = nearby_cl.find_elements_by_class_name('s')
                    random = randint(1, len(child_items) - 1)
                    time.sleep(3)
                    print("Clicking {}".format(child_items[random].text))
                    child_items[random].click()
                    for xpath in self.xpaths_to_scrape:
                        area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                        area_to_scrape.click()
                        self.scrape_pages()
                        self.driver.back()
                        time.sleep(1)
                except WebDriverException:
                    continue
        except Exception as e:
            print(e)
            return
        finally:
            self.driver.quit()
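For reference, the class itself behaves as expected when only one instance is driving a browser. A minimal way to exercise a single scraper on its own looks roughly like this (a simplified sketch; the file name and variable name are just for illustration):

    # single_scraper_test.py -- simplified sketch of running one instance by itself
    import scraper

    if __name__ == "__main__":
        bot = scraper.clBot('light')   # or 'dark'
        bot.run()                      # run() just delegates to navigate_pages()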
The main.py file that creates the two processes and starts them is as follows:
import scraper
from multiprocessing import Process, Manager

if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))

    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')

    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())

    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()
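For comparison, this is the bare two-process pattern I am trying to reproduce, stripped of Selenium entirely; the worker function and the names in it are placeholders, not part of my scraper. With plain functions like this, both processes do run side by side:

    from multiprocessing import Process
    import time

    def worker(name):
        # stand-in for navigate_pages(): loop for a bit so both processes stay alive
        for _ in range(3):
            print("{} is working".format(name))
            time.sleep(1)

    if __name__ == "__main__":
        p1 = Process(target=worker, args=("darkside",))
        p2 = Process(target=worker, args=("lightside",))
        p1.start()
        p2.start()
        p1.join()
        p2.join()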
Any help would be appreciated!