I am currently using Selenium WebDriver in Python to scrape this webpage (https://startup-map.berlin/companies.startups/f/all_locations/allof_Berlin/data_type/allof_Verified) and extract all startup URLs. I tried all the relevant methods mentioned in this post: "How can I scroll a web page using selenium webdriver in python?", as well as other suggestions found online.
However, none of them worked for this website: only the first 25 startups are ever loaded. Here is my code:
"""Scrape all startup profile URLs from the Berlin startup map and write them to a CSV."""
from time import sleep
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Bind the driver to its own name instead of shadowing the `webdriver` module.
# NOTE(review): `executable_path` is deprecated in Selenium 4 — if you are on
# Selenium 4, pass a Service object instead; confirm your Selenium version.
driver = webdriver.Chrome(executable_path='chromedriver')

url = "https://startup-map.berlin/companies.startups/f/all_locations/allof_Berlin/data_type/allof_Verified"
driver.get(url)
sleep(3)  # `time` was never imported — use the `sleep` name imported above

# Scroll until the document height stops growing.
# NOTE(review): if only the first 25 results ever load, the result list is
# most likely inside an inner scrollable container, so scrolling
# `document.body` never triggers the lazy loader. Locate that container and
# scroll it instead, e.g.:
#   box = driver.find_element(By.CSS_SELECTOR, "<container selector>")
#   driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", box)
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to the bottom of the page.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the next batch of results to load.
    sleep(3)
    # Compare the new scroll height with the previous one; stop when stable.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

page_soup = BeautifulSoup(driver.page_source, "html.parser")
# `find_all` is the current bs4 spelling; `findAll` is the deprecated alias.
startups = page_soup.find_all("div", {"class": "type-element type-element--h3 hbox entity-name__name entity-name__name--black"})

# `BLD` was undefined in the original; use the plain filename. `with` closes
# the file even if an exception is raised mid-write.
filename = "startups_urls.csv"
with open(filename, "w") as f:
    f.write("startups_urls\n")
    if startups:
        for startup in startups:
            # Each result div wraps an <a> whose href is a site-relative path.
            startups_href = startup.a["href"]
            startups_url = "https://startup-map.berlin" + startups_href
            f.write(startups_url + "\n")  # the handle is `f`, not `open_file`
    else:
        print("NaN.")

# `quit()` shuts down the whole browser session (the original called
# `driver.close()` on a name that was never defined).
driver.quit()
Any suggestions? Thank you very much.