
I'm a newbie getting into web scraping. I've made something that works, but it takes hours and hours to get everything I need. I read something about using parallel processes to handle the URLs, but I have no clue how to go about it or how to incorporate it into what I already have. Help is much appreciated!

Here is my, still extremely messy, code. I'm still learning :)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time
import random
import pprint
import itertools
import csv
import pandas as pd

start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"

driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
driver.find_element_by_xpath('//*[@id="form_save"]').click() #accepts cookies

wait = WebDriverWait(driver, random.randint(1500,3200)/1000.0)
j = random.randint(1500,3200)/1000.0
time.sleep(j)
num_jobs = int(driver.find_element_by_xpath('/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
num_pages = int(num_jobs/102)

urls = []
list_of_links = []

for i in range(num_pages+1):
        try:
            elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="search-results-container"]//article/job/a')))
            for element in elements:
                list_of_links.append(element.get_attribute('href'))

            j = random.randint(1500,3200)/1000.0
            time.sleep(j) 

            if 'page=3' not in driver.current_url:
                driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
            else:
                driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()

            url = driver.current_url
            if url not in urls:
                print(url)
                urls.append(url)

            else:
                break


        except:
            continue


set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")                
driver.close()

def grouper(n, iterable):
    it = iter(iterable)
    while True:
       chunk = tuple(itertools.islice(it, n))
       if not chunk:
           return
       yield chunk

def remove_empty_lists(l):
    keep_going = True
    prev_l = l
    while keep_going:
        new_l = remover(prev_l)
        #no changes on this pass, so we're done?
        if new_l == prev_l:
            keep_going = False
        #set prev to new
        prev_l = new_l
    #return the result
    return new_l


def remover(l):
    newlist = []
    for i in l:
        if isinstance(i, list) and len(i) != 0:
            newlist.append(remover(i))
        if not isinstance(i, list):
            newlist.append(i)

    return newlist

vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0

for chunk in chunks: 
    chunk_count +=1
    print(chunk_count)
    j = random.randint(1500,3200)/1000.0
    time.sleep(j)

    for url in chunk:

        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)

        try: 
            driver.get(url)
            driver.find_element_by_xpath('//*[@id="form_save"]').click() #accepts cookies

            vacature = []
            vacature.append(url)

            j = random.randint(1500,3200)/1000.0
            time.sleep(j)

            elements = driver.find_elements_by_tag_name('dl')
            p_elements = driver.find_elements_by_tag_name('p')
            li_elements = driver.find_elements_by_tag_name('li')

            for i in elements:
                if "Salaris:" not in i.text:
                    vacature.append(i.text)

            running_text = list()
            for p in p_elements:
                running_text.append(p.text)

            text = [''.join(running_text)]

            remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
                        'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
                        'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
                         'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
                         'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']

            for li in li_elements:
                if li.text not in remove_ls: 
                    text.append(li.text)

            text = ''.join(text)
            vacature.append(text)

            vacatures.append(vacature)

            driver.close() 

        except TimeoutException as ex:
            isrunning = 0
            print("Exception has been thrown. " + str(ex))
            driver.close()

        except NoSuchElementException:
            # close this browser window too before moving on to the next URL
            driver.close()
            continue
Lunalight
  • If you want to improve *working code* you'd better post your question on [CodeReview](https://codereview.stackexchange.com) – Andersson Oct 31 '18 at 12:23
  • I'm not really sure, but the use of Selenium might be a reason behind the slow nature of this. Selenium visually renders the page and loads all the images, ads, etc. If you just fetch the HTML data and scrape that, it might be a lot faster. For example, I built a script with `requests` and `BeautifulSoup`, and it scrapes all the data (not the images) from Jaap in about 10-15 minutes (3000+ pages). So Nationale Vacaturebank should also be possible in a reasonable time... – Niels Henkens Oct 31 '18 at 12:52
  • @Andersson I did, but got the response that they do not help with code that does not exist yet, i.e. how to go about parallel processing. – Lunalight Oct 31 '18 at 12:56

4 Answers

2

The Python Selenium webdriver is not thread-safe: a single browser instance cannot correctly handle calls coming from multiple threads, so you can't simply share your existing driver across parallel workers. Try scraping the site with requests and bs4 + lxml instead; it's much faster than Selenium because no browser has to render the page. This answer can be helpful.
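
A rough sketch of that approach, combined with a small thread pool so the job URLs are fetched in parallel (this assumes the job pages are plain server-rendered HTML; the tag names mirror the question's Selenium code and set_list_of_links is the URL list the question already builds, so treat it as a starting point rather than a drop-in replacement):

import concurrent.futures

import requests
from bs4 import BeautifulSoup

def scrape_vacature(url):
    # one plain HTTP request per job page, no browser rendering
    response = requests.get(url, timeout=20)
    soup = BeautifulSoup(response.text, 'lxml')
    # collect roughly the same text the Selenium version gathered
    dl_text = [dl.get_text(' ', strip=True) for dl in soup.find_all('dl')
               if 'Salaris:' not in dl.get_text()]
    p_text = ' '.join(p.get_text(' ', strip=True) for p in soup.find_all('p'))
    return [url] + dl_text + [p_text]

# each call to requests.get is independent, so a handful of worker threads
# can fetch pages side by side
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    vacatures = list(executor.map(scrape_vacature, set_list_of_links))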

Rezvanov Maxim
  • Oh really? I thought Selenium was normally used for this kind of thing. Thanks :) – Lunalight Oct 31 '18 at 12:56
  • No, like I said, using requests and BeautifulSoup is a lot faster because you only get the HTML. – Niels Henkens Oct 31 '18 at 13:00
  • @Lunalight you can try to use the PhantomJS webdriver with Selenium. It is a headless browser and can be faster than Firefox. – Rezvanov Maxim Oct 31 '18 at 13:22
  • @NielsHenkens nothing prevents you from inspecting the web page for API calls. But if the data is generated with JS, we have to use a browser, of course. – Rezvanov Maxim Oct 31 '18 at 13:24
  • But how do I go to a next page with requests? – Lunalight Oct 31 '18 at 14:54
  • @Lunalight `requests.get(url_of_next_page)`. Do you know HTTP protocol? – Rezvanov Maxim Oct 31 '18 at 14:58
  • @RezvanovMaxim I know nothing as stated in my OP :) I followed a tutorial on webscraping with selenium and thought I could extend that knowledge by coding what you see above. – Lunalight Oct 31 '18 at 15:08
  • Also, I don't have the URL of the next page, since there is a "go to next page" control at the bottom of the page. – Lunalight Oct 31 '18 at 15:13
  • @Lunalight for scraping this page you should not need to navigate through the pages at all. I inspected the web page and found that all jobs are returned as JSON in one API call. Just learn how to inspect a web page with the Chrome browser. – Rezvanov Maxim Oct 31 '18 at 15:41
  • @RezvanovMaxim I did inspect the page; that's how I got the tag names and XPaths. As for the rest, I don't understand what you mean. – Lunalight Oct 31 '18 at 15:43
  • @Lunalight I mean inspect which requests are sent from this web page. You should use the Network tab of the inspect tool in the Chrome browser. Look at my screenshot: https://ibb.co/eiZK50 – Rezvanov Maxim Oct 31 '18 at 15:47
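
To make those last comments concrete: the page number is just a query parameter in the search URL (the question's own loop checks for page=3 in driver.current_url), so with requests you step through result pages by changing that parameter instead of clicking a "next" link. A rough sketch, with the caveat from the comments above that the listing may be filled in by JavaScript, in which case the JSON endpoint visible in the Network tab is what you would call instead:

import requests

base_url = 'https://www.nationalevacaturebank.nl/vacature/zoeken'
params = {
    'query': '',
    'location': '',
    'distance': 'city',
    'limit': 100,
    'sort': 'relevance',
    'filters[careerLevel][]': 'Starter',
    'filters[educationLevel][]': 'MBO',
}

for page in range(1, 4):      # first three result pages, just as an example
    params['page'] = page     # "next page" is only another query parameter
    response = requests.get(base_url, params=params, timeout=20)
    print(page, response.status_code, len(response.text))
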
2
  1. You're using Firefox, which is slower than Chrome in almost all real-life applications.
  2. XPath is the slowest selector; match by id or class instead, and if that is not possible, by CSS.
  3. Use headless mode and don't load images unless you need to (see the sketch below).
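
A short sketch of points 1-3 combined (the CSS selector is adapted from the XPath in the question and is an assumption to verify against the real markup; the prefs entry is a Chrome profile setting that blocks image downloads):

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By

options = ChromeOptions()
options.add_argument('--headless')  # no visible browser window
# tell Chrome not to download images at all
options.add_experimental_option(
    'prefs', {'profile.managed_default_content_settings.images': 2})

driver = webdriver.Chrome(options=options)
driver.get(start_url)  # the search URL from the question

# prefer id/class/CSS selectors over long absolute XPaths
anchors = driver.find_elements(By.CSS_SELECTOR, '#search-results-container article a')
links = [a.get_attribute('href') for a in anchors]
driver.quit()
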
misantroop
0

You can use Scrapy, which is much faster and more flexible than anything else mentioned here, because it issues its requests concurrently instead of driving a browser. See the link for more information.
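
For instance, a minimal spider could look roughly like this (the CSS selectors are guesses based on the XPaths in the question, and plain Scrapy does not execute JavaScript, so check that the listing is actually present in the HTML):

import scrapy

class VacatureSpider(scrapy.Spider):
    name = 'vacatures'
    # the search URL from the question; Scrapy issues the follow-up requests concurrently
    start_urls = ['https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO']

    def parse(self, response):
        # follow every job link on a result page
        for href in response.css('#search-results-container article a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_vacature)

    def parse_vacature(self, response):
        yield {
            'url': response.url,
            'text': ' '.join(response.css('p::text').getall()),
        }

Run it with scrapy runspider vacatures.py -o vacatures.csv and Scrapy handles the parallel requests and the CSV export for you.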

SushilG
  • He IS using Scrapy. He wants to use Selenium to load JavaScript-based (SPA) pages, probably. This does not answer his question. – PrashanD Jan 24 '20 at 09:38
0

You can scrape by opening multiple tabs side by side in the same scraper program. To do so, you can refer to this code:

import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions

options = ChromeOptions()
profile = rf'\\PATH'  # placeholder for the path to a Chrome user-data directory
my_profile = rf'user-data-dir={profile}'
options.add_argument(my_profile)
driver = webdriver.Chrome(options=options)

driver.execute_script("window.open('about:blank','google');")  # open a new tab named "google"
driver.switch_to.window("google")
driver.get('http://google.com')

driver.execute_script("window.open('about:blank','facebook');")
driver.switch_to.window("facebook")
driver.get('https://www.facebook.com/')

driver.execute_script("window.open('about:blank','twitter');")
driver.switch_to.window("twitter")
driver.get('https://twitter.com/')

driver.execute_script("window.open('about:blank','instagram');")
driver.switch_to.window("instagram")
driver.get('https://www.instagram.com/')

time.sleep(1)
driver.switch_to.window("twitter")
time.sleep(1)
driver.switch_to.window("facebook")
time.sleep(1)
driver.switch_to.window("google")
time.sleep(1)
driver.switch_to.window("instagram")
time.sleep(1)
driver.switch_to.window("facebook")

driver.close()  # closes only the tab that currently has focus
driver.quit()   # shuts down the whole browser session