I've created and finished a web scraper, but the problem is that it gathers about 80k links and then visits each one individually to scrape data. This takes extremely long, so I've been trying to figure out multi-threading. I understand it at a small scale with simple functions, but I can't figure out how to implement it in my code.
Here's my code:
class Steamscrape(webdriver.Chrome):
    """Chrome-driver wrapper that scrapes Steam store game pages into a CSV.

    Instantiate (optionally with ``teardown=True`` so the browser quits when
    the instance is used as a context manager) and call :meth:`run`.
    """

    def __init__(self, driver_path=r'PATH', teardown=False):
        """Start Chrome with the chromedriver found at *driver_path*.

        Args:
            driver_path: Directory containing the chromedriver binary.
            teardown: When True, ``__exit__`` quits the browser.
        """
        self.driver_path = driver_path
        self.teardown = teardown
        # BUGFIX: append with os.pathsep so the driver directory becomes a new
        # PATH entry instead of being fused onto the last existing entry.
        os.environ['PATH'] += os.pathsep + self.driver_path
        super().__init__()
        self.implicitly_wait(1)
        self.maximize_window()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Quit the browser on context-manager exit if teardown was requested."""
        if self.teardown:
            self.quit()

    def land_first_page(self):
        """Open the configured Steam landing page."""
        self.get(const.BASE_URL)

    def scroll_down(self):
        """Scroll to the bottom of an infinite-scroll page.

        Repeatedly scrolls and waits; stops once the document height no
        longer grows, i.e. no more content is being loaded.
        """
        SCROLL_PAUSE_TIME = 0.5
        last_height = self.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to the current bottom and give new content time to load.
            self.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = self.execute_script("return document.body.scrollHeight")
            # BUGFIX: the original broke when the height *changed* (stopping
            # after the first scroll) and looped forever on a static page.
            # The loop must stop when the height is unchanged.
            if new_height == last_height:
                break
            last_height = new_height

    def get_links(self):
        """Collect up to 12 Steam app-page URLs from the current page.

        Returns:
            list[str]: hrefs of the first (at most) 12 matching anchors.
        """
        anchors = self.find_elements(
            By.CSS_SELECTOR, "a[href^='https://store.steampowered.com/app/']"
        )
        # Cap at 12 links, same as the original early-break loop.
        return [a.get_attribute("href") for a in anchors[:12]]

    def link_tester(self, link_list):
        """Visit each game URL and scrape its metadata.

        Args:
            link_list: Iterable of Steam app-page URLs.

        Returns:
            list[list[str]]: one row per game in the order
            [Title, Genre, Developers, Publisher, Release Date,
             Recent Reviews, Total Reviews, Price].
        """
        games = []
        for url in link_list:
            self.get(url)

            # Some games sit behind an age gate; fill it in and continue.
            try:
                dropdown = self.find_element(By.XPATH, "//select[@id='ageYear']")
                Select(dropdown).select_by_value("1990")
                self.find_element(By.XPATH, "//a[@id='view_product_page_btn']").click()
            except Exception:
                # No age gate present. NOTE(review): this drops the implicit
                # wait to 0 for the REST of the session, which looks like a
                # deliberate speed-up — confirm before changing.
                self.implicitly_wait(0)

            # Review summaries: the element's text splits into lines where
            # index 1 is the recent-review blurb and index 3 the overall one.
            recent_review = "null"
            full_review = "null"
            try:
                lines = self.find_element(By.CSS_SELECTOR, "#userReviews").text.split("\n")
                recent_review = lines[1]
                full_review = lines[3]
            except Exception:
                recent_review = "null"
                full_review = "null"

            # Title / genre / developer / publisher / release-date block.
            title = genre = developer = publisher = release_date = ""
            try:
                lines = self.find_element(
                    By.CSS_SELECTOR, "#genresAndManufacturer"
                ).text.split("\n")
                # Franchise rows shift every later field by one; drop them.
                lines = [x for x in lines if not x.startswith('FRANCHISE:')]
                title = clean(lines[0], fix_unicode=True, to_ascii=True).replace("title: ", "")
                genre = clean(lines[1], fix_unicode=True, to_ascii=True).replace("genre: ", "")
                developer = clean(lines[2], fix_unicode=True, to_ascii=True).replace("developer: ", "")
                publisher = clean(lines[3], fix_unicode=True, to_ascii=True).replace("publisher: ", "")
                release_date = clean(lines[4], fix_unicode=True, to_ascii=True).replace("release date: ", "")
            except Exception:
                title = genre = developer = publisher = release_date = ""

            # Purchase box text minus the "add to cart" label leaves the price.
            base_price = ""
            try:
                element = self.find_element(By.CLASS_NAME, "game_purchase_action_bg")
                base_price = clean(element.text, no_line_breaks=True).replace("add to cart", "")
            except Exception:
                base_price = ""

            games.append([
                title, genre, developer, publisher, release_date,
                recent_review, full_review, base_price,
            ])
        return games

    def load_data(self, Gameslist, output_path=r'PATH'):
        """Write the scraped rows to CSV and print a preview.

        Args:
            Gameslist: Rows as produced by :meth:`link_tester`.
            output_path: Destination CSV path (generalized from the original
                hard-coded path; default preserves old behavior).
        """
        df = pd.DataFrame(
            Gameslist,
            columns=["Title", "Genre", "Developers", "Publisher",
                     "Release Date", "Recent Reviews", "Total Reviews", "Price"],
        )
        df.to_csv(output_path)
        print(df.head(10))

    def run(self):
        """Execute the whole scrape pipeline and print the elapsed seconds."""
        start_time = time.time()
        self.land_first_page()
        self.scroll_down()
        link_list = self.get_links()
        games_list = self.link_tester(link_list=link_list)
        self.load_data(games_list)
        print(time.time() - start_time)