
I've built and finished a web scraper, but the problem is that it gathers about 80k links and then visits each one individually to scrape data. This takes extremely long, so I've been trying to figure out multi-threading. I understand it at a small scale on simple functions, but I can't figure out how to implement it in my code.

Here's my code:

import os
import time

import pandas as pd
from cleantext import clean
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

import const

class Steamscrape(webdriver.Chrome):
    def __init__(self, driver_path = r'PATH', teardown = False):
        self.driver_path = driver_path
        self.teardown = teardown
        os.environ['PATH'] += os.pathsep + self.driver_path
        super(Steamscrape, self).__init__()
        self.implicitly_wait(1)
        self.maximize_window()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.teardown:
            self.quit()

    def land_first_page(self):
        self.get(const.BASE_URL)

    def scroll_down(self):
        SCROLL_PAUSE_TIME = 0.5
        last_height = self.execute_script("return document.body.scrollHeight")
        while True: #Main
            # Scroll down to bottom
            self.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height
            new_height = self.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def get_links(self):
        links = self.find_elements(By.CSS_SELECTOR, "a[href^='https://store.steampowered.com/app/']")
        link_list = []
        for link in links:
            link_list.append(link.get_attribute("href"))
            if len(link_list) == 12:
                break
        return link_list

    def link_tester(self, link_list):
        GamesList = []

        for game in link_list:
            self.get(game)

            # Bypass the age gate if it appears; either way, continue scraping.
            try:
                dropdown = self.find_element(By.XPATH, "//select[@id='ageYear']")
                dd = Select(dropdown)
                dd.select_by_value("1990")
                self.find_element(By.XPATH, "//a[@id='view_product_page_btn']").click()
            except Exception:
                pass

            self.implicitly_wait(0)

            RecentReview = ""
            FullReview = ""
            try:
                element = self.find_element(By.CSS_SELECTOR, "#userReviews")
                text = element.text.split("\n")
                RecentReview = text[1]
                FullReview = text[3]
            except Exception:
                RecentReview = "null"
                FullReview = "null"

            Title = ""
            Genre_s = ""
            Developer_s = ""
            Publisher_s = ""
            ReleaseDate = ""
            try:
                element = self.find_element(By.CSS_SELECTOR, "#genresAndManufacturer")
                text = element.text.split("\n")
                text = [x for x in text if not x.startswith('FRANCHISE:')]
                Title = clean(text[0], fix_unicode=True, to_ascii=True).replace("title: ", "")
                Genre_s = clean(text[1], fix_unicode=True, to_ascii=True).replace("genre: ", "")
                Developer_s = clean(text[2], fix_unicode=True, to_ascii=True).replace("developer: ", "")
                Publisher_s = clean(text[3], fix_unicode=True, to_ascii=True).replace("publisher: ", "")
                ReleaseDate = clean(text[4], fix_unicode=True, to_ascii=True).replace("release date: ", "")
            except Exception:
                pass

            BasePrice = ""
            try:
                element = self.find_element(By.CLASS_NAME, "game_purchase_action_bg")
                BasePrice = clean(element.text, no_line_breaks=True).replace("add to cart", "")
            except Exception:
                pass

            GameRow = [Title, Genre_s, Developer_s, Publisher_s, ReleaseDate, RecentReview, FullReview, BasePrice]
            GamesList.append(GameRow)

        return GamesList

    def load_data(self, Gameslist):
        df = pd.DataFrame(Gameslist, columns = ["Title", "Genre", "Developers", "Publisher", "Release Date", "Recent Reviews", "Total Reviews", "Price"])
        df.to_csv(r'PATH')
        print(df.head(10))

    def run(self):
        start_time = time.time()
        # Execute the whole process
        self.land_first_page()
        self.scroll_down()
        link_list = self.get_links()
        GamesList = self.link_tester(link_list=link_list)
        self.load_data(GamesList)
        total_time = time.time()-start_time
        print(total_time)
asked by Ornsteiner
  • Right now, `link_tester` does a `for game in link_list` to run through the whole list. You would need to divide that into two functions, so that one function simply fetched one link. NOW you can create a thread pool and spawn off the list of links to the thread pool. Are you absolutely certain you need Selenium? `scrapy` can automate a lot of this for you. – Tim Roberts Jul 13 '23 at 00:08
  • I'd replace the `link_list` with a thread pool. For each URL I wanted to fetch, I'd submit a task to the thread pool that would fetch the one URL, scrape it looking for more links, and add a new task for each link found. But I'm not really an active developer any more. Maybe one of the kiddies here knows a more modern, stylish way. – Solomon Slow Jul 13 '23 at 00:09
  • Do note that Python threads run concurrently but not in parallel (thanks to [the GIL](https://stackoverflow.com/q/1294382/11082165)). Only one Python thread ever executes at a time. Multithreading can help with I/O blocking, but it doesn't make your Python code run any faster. To do work faster and in parallel, you'll want to look into multiprocessing. See also [Multiprocessing vs Threading](https://stackoverflow.com/q/3044580/11082165) – Brian61354270 Jul 13 '23 at 00:27
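Following the comments above, one way to restructure this is to pull the per-link scraping out of `link_tester` into a standalone function and hand the link list to a `concurrent.futures.ThreadPoolExecutor`. The sketch below only illustrates that pattern: `scrape_one` is a hypothetical stand-in for the per-link body of `link_tester`. In the real scraper it would drive a WebDriver instance, and since a single WebDriver is not safe to share across threads, each worker would need its own (e.g. via `threading.local`).

```python
from concurrent.futures import ThreadPoolExecutor

def scrape_one(link):
    # Hypothetical stand-in for the per-link body of link_tester:
    # in the real scraper this would use a per-thread WebDriver,
    # bypass the age gate, and return one GameRow list.
    return [link, "", "", "", "", "", "", ""]

def scrape_all(link_list, max_workers=8):
    # executor.map preserves input order, so each row
    # lines up with the link it came from.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(scrape_one, link_list))

rows = scrape_all([
    "https://store.steampowered.com/app/1/",
    "https://store.steampowered.com/app/2/",
])
```

Because each page fetch is I/O-bound (the thread mostly waits on the network and the browser), threads do help here despite the GIL; the CPU-side parsing is small. The resulting `rows` list can be passed straight to `load_data`.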

0 Answers