This might be the stupidest question i asked yet but this is driving me nuts...
Basically i want to get all links from profiles but for some reason selenium gives different amounts of links most of the time ( sometimes all sometimes only a tenth)
I experimented with time.sleep and i know its affecting the output somehow but i dont understand where the problem is. (but thats just my hypothesis maybe thats wrong)
I have no other explanation why i get incosistent output. Since i get all profile links from time to time the program is able to find all relevant profiles.
heres what the output should be (for different gui input)
input:anlagenbau output:3070
Fahrzeugbau output:4065
laserschneiden output:1311
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from urllib.request import urlopen
from datetime import date
from datetime import datetime
import easygui
import re
from selenium.common.exceptions import NoSuchElementException
import time
#input window suchbegriff
suchbegriff = easygui.enterbox("Suchbegriff eingeben | Hinweis: suchbegriff sollte kein '/' enthalten")
#get date and time
now = datetime.now()
current_time = now.strftime("%H-%M-%S")
today = date.today()
date = today.strftime("%Y-%m-%d")
def get_profile_url(label_element):
# get the url from a result element
onlick = label_element.get_attribute("onclick")
# some regex magic
return re.search(r"(?<=open\(\')(.*?)(?=\')", onlick).group()
def load_more_results():
# load more results if needed // use only on the search page!
button_wrapper = wd.find_element_by_class_name("loadNextBtn")
button_wrapper.find_element_by_tag_name("span").click()
#### Script starts here ####
# Set some Selenium Options
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Webdriver
wd = webdriver.Chrome(options=options)
# Load URL
wd.get("https://www.techpilot.de/zulieferer-suchen?"+str(suchbegriff))
# lets first wait for the timeframe
iframe = WebDriverWait(wd, 5).until(
EC.frame_to_be_available_and_switch_to_it("efficientSearchIframe")
)
# the result parent
result_pane = WebDriverWait(wd, 5).until(
EC.presence_of_element_located((By.ID, "resultPane"))
)
#get all profilelinks as list
time.sleep(5)
href_list = []
wait = WebDriverWait(wd, 15)
while True:
try:
#time.sleep(1)
wd.execute_script("loadFollowing();")
#time.sleep(1)
try:
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
except TimeoutException:
break
#time.sleep(1) # beeinflusst in irgeneiner weise die findung der ergebnisse
result_elements = wd.find_elements_by_class_name("fancyCompLabel")
#time.sleep(1)
for element in result_elements:
url = get_profile_url(element)
href_list.append(url)
#time.sleep(2)
while True:
try:
element = wd.find_element_by_class_name('fancyNewProfile')
wd.execute_script("""var element = arguments[0];element.parentNode.removeChild(element);""", element)
except NoSuchElementException:
break
except NoSuchElementException:
break
wd.close #funktioniert noch nicht
print("####links secured: "+str(len(href_list)))