I have built a web scraper with Selenium that crawls a web page indefinitely. I am trying to create two instances of this scraper and run them in parallel, so that two different portions of the site (or two different sites entirely) are scraped at the same time. With my current code, both processes start and two Chrome instances launch, but only one of them actually starts scraping; the other just sits on the landing page and never moves. My current scraper class looks like this:
# Module-level imports used by the class. The Scraper base class (which provides
# the scraper_wait_* helper methods) is defined elsewhere in my module.
import re
import time
from random import randint

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException


class clBot(Scraper):
    def __init__(self, light_or_dark):
        light_side_xpaths = ['//*[@id="hhh"]/h4/a', '//*[@id="sss"]/h4/a/', '//*[@id="jjj"]/h4/a',
                             '//*[@id="bbb"]/h4/a', '//*[@id="ggg"]/h4/a']
        dark_side_xpaths = ['//*[@id="ccc"]/h4/a', '//*[@id="ppp"]/h4', '//*[@id="forums"]/h4/a']
        if light_or_dark == "light":
            self.xpaths_to_scrape = light_side_xpaths
            self.csv_file = "lightside.csv"
        elif light_or_dark == "dark":
            self.xpaths_to_scrape = dark_side_xpaths
            self.csv_file = "darkside.csv"
        else:
            print('Incorrect variable entered. Please enter "light" or "dark" when initializing this class')
            quit()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        self.options.add_argument(f'user-agent={self.user_agent}')  # f-string so the agent string is actually substituted
        self.current_region = ''
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.driver.get('https://craigslist.org')

    def run(self):
        self.navigate_pages()

    def identify_phone_number(self, string, phone_number_list):
        # Pull anything that looks like a phone number out of the listing text
        # and append it to the CSV if it has not been seen before.
        reg = re.findall(r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", string)
        if len(reg) > 0:
            for r in reg:
                if r.strip() not in phone_number_list:
                    with open(self.csv_file, 'a') as csv:
                        csv.write("{}\n".format(r.strip()))
                    print("Extracted {} from listing".format(r.strip()))
                else:
                    print('Phone number already in list.')

    def extract_phone_number(self):
        try:
            with open(self.csv_file, 'r') as csv:
                current_phone_numbers = csv.read()
            posting_body = self.driver.find_element_by_id('postingbody')
            self.scraper_wait_class_until_all(self.driver, 'showcontact', seconds_to_wait=5)
            contact_info = self.driver.find_element_by_class_name('showcontact')
            contact_info.click()
            time.sleep(1)
            self.identify_phone_number(posting_body.text, current_phone_numbers)
        except TimeoutException:
            # No "show contact" link appeared; scan whatever text is already visible.
            self.identify_phone_number(posting_body.text, current_phone_numbers)
            print('There is no phone number in this listing.')

    def scrape_pages(self):
        # Click through every result row on the page; an IndexError means we have
        # run out of rows, so move on to the next page of results.
        i = 1
        while True:
            try:
                self.scraper_wait_class_until_all(self.driver, 'result-row')
                results = self.driver.find_elements_by_class_name('result-row')
                print("clicking result {}".format(i))
                results[i].find_element_by_class_name('result-title').click()
                self.extract_phone_number()
                self.driver.back()
                i += 1
            except IndexError:
                self.scraper_wait_xpath_until_any(self.driver, '//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                next_button = self.driver.find_element_by_xpath('//*[@id="searchform"]/div[5]/div[3]/span[2]/a[3]')
                print('Navigating to next page.')
                next_button.click()
                i = 1

    def choose_xpath_to_scrape(self, list_of_xpaths):
        xpath_index = randint(0, len(list_of_xpaths) - 1)
        xpath = list_of_xpaths[xpath_index]
        return xpath

    def navigate_pages(self):
        # Pick a random nearby craigslist region, then scrape each configured section.
        try:
            while True:
                try:
                    self.scraper_wait_xpath_until_any(self.driver, '//*[@id="rightbar"]')
                    rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
                    nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
                    child_items = nearby_cl.find_elements_by_class_name('s')
                    random = randint(1, len(child_items) - 1)
                    time.sleep(3)
                    print("Clicking {}".format(child_items[random].text))
                    child_items[random].click()
                    for xpath in self.xpaths_to_scrape:
                        area_to_scrape = self.driver.find_element_by_xpath(self.choose_xpath_to_scrape(self.xpaths_to_scrape))
                        area_to_scrape.click()
                        self.scrape_pages()
                        self.driver.back()
                        time.sleep(1)
                except WebDriverException:
                    continue
        except Exception as e:
            print(e)
            return
        finally:
            self.driver.quit()
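For reference, the class itself behaves as expected when only one instance is driving a browser. A minimal way to exercise a single scraper on its own looks roughly like this (a simplified sketch; the file name and variable name are just for illustration):

    # single_scraper_test.py -- simplified sketch of running one instance by itself
    import scraper

    if __name__ == "__main__":
        bot = scraper.clBot('light')   # or 'dark'
        bot.run()                      # run() just delegates to navigate_pages()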
The main.py file that creates the two processes and starts them is as follows:
import scraper
from multiprocessing import Process, Manager

if __name__ == "__main__":
    manager = Manager()
    d = manager.dict()
    l = manager.list(range(10))

    darksideScraper = scraper.clBot('light')
    lightsideScraper = scraper.clBot('dark')

    darkside = Process(target=darksideScraper.navigate_pages())
    lightside = Process(target=lightsideScraper.navigate_pages())

    darkside.start()
    lightside.start()
    darkside.join()
    lightside.join()
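For comparison, this is the bare two-process pattern I am trying to reproduce, stripped of Selenium entirely; the worker function and the names in it are placeholders, not part of my scraper. With plain functions like this, both processes do run side by side:

    from multiprocessing import Process
    import time

    def worker(name):
        # stand-in for navigate_pages(): loop for a bit so both processes stay alive
        for _ in range(3):
            print("{} is working".format(name))
            time.sleep(1)

    if __name__ == "__main__":
        p1 = Process(target=worker, args=("darkside",))
        p2 = Process(target=worker, args=("lightside",))
        p1.start()
        p2.start()
        p1.join()
        p2.join()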
Any help would be appreciated!