
I'm building a web scraper and I'm facing an issue where the search page only provides links to the items, so I would like:

function 1 to read postcodes from a .txt file and search for links to items

function 2 to take the links to the items and scrape them for details

I had both scrapers as separate .py files.

I have now combined them into one file and made each script a function.

I have implemented a deque to add and retrieve data, and this works. But how do I get them both to run together?

# -*- coding: UTF-8 -*-
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
from time import sleep
import csv
from collections import deque

dq = deque()

#Search The links Via Post Code

def linkScrape():
    recordnum = 0
    pagnum = 0
    with open("catlist.txt") as catlist:
        postkeys = []
        for line in catlist:
            postkeys.append(line.strip())
    with open("pcodnum.txt") as pagesnum:
        postpages = []
        for line in pagesnum:
            postpages.append(line.strip())
    with open("pcodes.txt") as pcodes:
        postcodes = []
        for line in pcodes:
            postcodes.append(line.strip())
    for y in postcodes:
        for z in postkeys:
            for x in postpages:
                surl = 'https://www.checkatrade.com/Search/?location={}&cat={}&page={}'.format(y, z, x)
                options = Options()
                options.headless = True
                driver = webdriver.Firefox(options=options)
                #driver = webdriver.Firefox()
                driver.implicitly_wait(10) # seconds
                driver.get(surl)
                print("Link Scraper: Headless Firefox Scraping: " + surl)
                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                questions = soup.select('.ch-listing__result')
                for question in questions:
                    comlink = question.find('a', attrs={"class": "catnow-search-click"})
                    if comlink is None:
                        comlink = 'None'
                    else:
                        comlink = comlink.attrs['href']
                    comlink = 'https://www.checkatrade.com' + comlink
                    recordnum += 1
                    dq.appendleft(str(comlink))
                pagnum += 1
                print("Link Scraper: " + str(pagnum) + ' pages finished with ' + str(recordnum) + ' records')
                print(list(dq))
                driver.close()




# Scrape Company Details From Url
def datScrape(xurl):
    f = csv.writer(open('companydetails.csv', 'w'))
    f.writerow(['Business Name', 'Business Owner', 'Business Telephone', 'Business Mobile', 'Business Email', 'Business Managed Email'])
    surl = xurl
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    #driver = webdriver.Firefox()
    driver.implicitly_wait(5) # seconds
    driver.get(surl)
    print("Company Details Scraper: Headless Firefox Scraping: " + surl)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    details = soup.select('.contact-card__details')
    #print(details)
    for detail in details:
        busname = detail.select('h1')[0].get_text()
        #print(busname)
        #busowner = question.find(class_='contact-card__contact-name').get_text()
        busowner = detail.find('div', attrs={"class": "contact-card__contact-name"})
        if busowner is None:
            busowner = 'None'
        else:
            busowner = busowner.text
        #print(busowner)
        comtelephone = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlTel"})
        if comtelephone is None:
            comtelephone = 'None'
        else:
            comtelephone = comtelephone.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelephone)
        comtelemob = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlMobile"})
        if comtelemob is None:
            comtelemob = 'None'
        else:
            comtelemob = comtelemob.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelemob)
        comemail = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlEmail"})
        if comemail is None:
            comemail = 'None'
        else:
            comemail = comemail.attrs['href'].rsplit(":", 1)[-1]
        comemanmail = detail.find('a', attrs={"id": "ctl00_ctl00_content_managedEmail"})
        if comemanmail is None:
            comemanmail = 'None'
        else:
            comemanmail = comemanmail.attrs['href'].rsplit(":", 1)[-1]
        #print(comemail)
        print("Company Details Scraper: " + busname + "\n" + busowner + "\n" + comtelephone + "\n" + comtelemob + "\n" + comemail + "\n" + comemanmail)
        f.writerow([busname, busowner, comtelephone, comtelemob, comemail, comemanmail])
    driver.close()
    driver.quit()
from multiprocessing import Process

p = Process(target=linkScrape)
p.start()
p2 = Process(target=datScrape)
sleep(20)
p2.start(dq.pop())

p.join()
p2.join()

The updated code with multiprocessing gives a new error:

Traceback (most recent call last):
  File "script.py", line 120, in <module>
    p2.start(dq.pop())
IndexError: pop from an empty deque

even though the queue definitely has data at this point.

1 Answer


In order to accomplish this, you'll probably need to use some multiprocessing. See the Python multiprocessing documentation for more details.
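For reference, here is a minimal sketch of running two functions as separate processes (the producer/consumer names and print statements are placeholders, not your actual scrapers):

from multiprocessing import Process

def producer():
    print("producer running")

def consumer():
    print("consumer running")

if __name__ == '__main__':
    p1 = Process(target=producer)
    p2 = Process(target=consumer)
    p1.start()
    p2.start()
    p1.join()  # wait for both processes to finish
    p2.join()

Note that Process.start() takes no arguments; anything the target function needs must be passed to the Process constructor via args=, which is part of what tripped up your p2.start(dq.pop()) attempt.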

You may also want to review this Stack Overflow post on running Python functions in parallel (linked in the code below) for a similar issue. What you are going to need to do is create a process for each function.

I would consider changing your script to the following:

# -*- coding: UTF-8 -*-
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
from time import sleep
import csv
from collections import deque
# Added this library
from multiprocessing import Process

dq = deque()

#Search The links Via Post Code

def linkScrape(scrapeInput):
    recordnum = 0
    pagnum = 0
    spost = scrapeInput
    with open("catlist.txt") as catlist:
        postkeys = []
        for line in catlist:
            postkeys.append(line.strip())
    with open("pcodnum.txt") as pagesnum:
        postpages = []
        for line in pagesnum:
            postpages.append(line.strip())
    for z in postkeys:
        for x in postpages:
            surl = 'https://www.checkatrade.com/Search/?location=' + spost + '&cat=' + str(z) + '&page=' + str(x)
            options = Options()
            options.headless = True
            driver = webdriver.Firefox(options=options)
            #driver = webdriver.Firefox()
            driver.implicitly_wait(10) # seconds
            driver.get(surl)
            print("Headless Firefox Scraping: " + surl)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            questions = soup.select('.ch-listing__result')
            for question in questions:
                comlink = question.find('a', attrs={"class": "catnow-search-click"})
                if comlink is None:
                    comlink = 'None'
                else:
                    comlink = comlink.attrs['href']
                comlink = 'https://www.checkatrade.com' + comlink
                recordnum += 1
                dq.appendleft(comlink)
            pagnum += 1
            print("Link Scraper: " + str(pagnum) + ' pages finished with ' + str(recordnum) + ' records')
            driver.close()




# Scrape Company Details From Url
def datScrape(xurl):
    f = csv.writer(open('companydetails.csv', 'w'))
    f.writerow(['Business Name', 'Business Owner', 'Business Telephone', 'Business Mobile', 'Business Email', 'Business Managed Email'])
    surl = xurl
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    #driver = webdriver.Firefox()
    driver.implicitly_wait(5) # seconds
    driver.get(surl)
    print("Headless Firefox Scraping: " + surl)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    details = soup.select('.contact-card__details')
    #print(details)
    for detail in details:
        busname = detail.select('h1')[0].get_text()
        #print(busname)
        #busowner = question.find(class_='contact-card__contact-name').get_text()
        busowner = detail.find('div', attrs={"class": "contact-card__contact-name"})
        if busowner is None:
            busowner = 'None'
        else:
            busowner = busowner.text
        #print(busowner)
        comtelephone = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlTel"})
        if comtelephone is None:
            comtelephone = 'None'
        else:
            comtelephone = comtelephone.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelephone)
        comtelemob = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlMobile"})
        if comtelemob is None:
            comtelemob = 'None'
        else:
            comtelemob = comtelemob.attrs['href'].rsplit(":", 1)[-1]
        #print(comtelemob)
        comemail = detail.find('a', attrs={"id": "ctl00_ctl00_content_ctlEmail"})
        if comemail is None:
            comemail = 'None'
        else:
            comemail = comemail.attrs['href'].rsplit(":", 1)[-1]
        comemanmail = detail.find('a', attrs={"id": "ctl00_ctl00_content_managedEmail"})
        if comemanmail is None:
            comemanmail = 'None'
        else:
            comemanmail = comemanmail.attrs['href'].rsplit(":", 1)[-1]
        #print(comemail)
        print("Company Details Scraper: " + busname + "\n" + busowner + "\n" + comtelephone + "\n" + comtelemob + "\n" + comemail + "\n" + comemanmail)
        f.writerow([busname, busowner, comtelephone, comtelemob, comemail, comemanmail])
    driver.close()
    driver.quit()

# Added this function to run two functions in parallel,
# adapted here to pass arguments through to each process.
# Based on: https://stackoverflow.com/questions/7207309/python-how-can-i-run-python-functions-in-parallel
# Credit to NPE
def runInParallel(*fns):
    proc = []
    # Each entry is a (function, args) pair so arguments can be
    # handed to the target via the Process constructor
    for fn, args in fns:
        p = Process(target=fn, args=args)
        p.start()
        proc.append(p)
    for p in proc:
        p.join()

with open("pcodes.txt") as pcodes:
    postcodes = []
    for line in pcodes:
        postcodes.append(line.strip())

# You will probably need to edit the below...
for postcode in postcodes:
    # Pass each function and its argument tuple instead of calling it here;
    # calling them directly would run them one after the other, not in parallel.
    # (Note: datScrape really expects an item URL, so this part will need editing.)
    runInParallel((linkScrape, (postcode,)), (datScrape, (postcode,)))

It may require a bit of editing, since it's unclear whether you really want the two things to occur simultaneously, but this would get both functions running at (as close as possible to) the same time.

One important caveat: each Process gets its own copy of the module's globals, so a plain deque filled in the link-scraper process will still look empty in the main process; that is exactly why your updated code raised IndexError: pop from an empty deque. To pass data between processes, use a multiprocessing.Queue (or similar) instead of a module-level deque.
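Here is a minimal sketch of that producer/consumer pattern, assuming linkScrape pushes URLs and datScrape consumes them (the function bodies and example URLs below are placeholders, not your actual scraping logic):

from multiprocessing import Process, Queue

def produce_links(q):
    # stand-in for linkScrape: put each scraped link on the shared queue
    for link in ['https://example.com/item/1', 'https://example.com/item/2']:
        q.put(link)
    q.put(None)  # sentinel value: tells the consumer there is no more work

def consume_links(q):
    # stand-in for datScrape: process links as they arrive
    while True:
        link = q.get()  # blocks until an item is available
        if link is None:
            break
        print('scraping details from ' + link)

if __name__ == '__main__':
    q = Queue()  # multiprocessing.Queue can be shared safely across processes
    producer = Process(target=produce_links, args=(q,))
    consumer = Process(target=consume_links, args=(q,))
    producer.start()
    consumer.start()
    producer.join()
    consumer.join()

With this shape the consumer simply reads and processes whatever the producer has queued so far, which also covers the case where you don't need the two functions to run in lockstep.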

– artemis
  • Yeah, I tried this: ```from multiprocessing import Process p = Process(target=linkScrape) p.start() p2 = Process(target=datScrape) sleep(20) p2.start(dq.pop()) p.join() p2.join()``` and it gives the error ```IndexError: pop from an empty deque```, but if I print the queue there is stuff in there. – Patrick Kenneally Jul 17 '19 at 14:51
  • And no, not simultaneously; I just want function 2 to read and execute the data from function 1 automatically. – Patrick Kenneally Jul 17 '19 at 14:53