Apologies in advance if this long question seems quite basic!
Given:
search query link in a library website:
url = 'https://digi.kansalliskirjasto.fi/search?query=economic%20crisis&orderBy=RELEVANCE'
I'd like to extract all useful information for each individual search result (total 20 in 1 page) of this specific query as depicted by red rectangles in this figure:
currently, I have the following code:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def run_selenium(URL):
options = Options()
options.add_argument("--remote-debugging-port=9222"),
options.headless = True
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(URL)
pt = "//app-digiweb/ng-component/section/div/div/app-binding-search-results/div/div"
medias = driver.find_elements(By.XPATH, pt) # expect to obtain a list with 20 elements!!
print(medias) # >>>>>> result: []
print("#"*100)
for i, v in enumerate(medias):
print(i, v.get_attribute("innerHTML"))
if __name__ == '__main__':
url = 'https://digi.kansalliskirjasto.fi/search?query=economic%20crisis&orderBy=RELEVANCE'
run_selenium(URL=url)
Problem:
Having a look at part of the inspect in chrome:
I have tried several xpath generated by Chrome Extensions XPath Helper
and SelectorsHub
to produce XPath and use it as pt
variable in my python code this library search engine, but the result is []
or simply nothing.
Using SelectorsHub
and hovering the mouse over Rel XPath
, I get this warning: id & class both look dynamic. Uncheck id & class checkbox to generate rel xpath without them if it is generated with them.
Question:
Assuming selenium
as a tool for web scraping of a page containing dynamic attributes instead of BeautifulSoup
as recommended here and here, shouldn't driver.find_elements()
, return a list of 20 elements each of which containing all info and to be extracted?
>>>>> UPDATE <<<<< Working Solution (although time inefficient)
As recommended by @JaSON in the solution, I now use WebDriverWait
in try except
block as follows:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
def get_all_search_details(URL):
st_t = time.time()
SEARCH_RESULTS = {}
options = Options()
options.headless = True
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-extensions")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver =webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(URL)
print(f"Scraping {driver.current_url}")
try:
medias = WebDriverWait(driver,timeout=10,).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-row')))
for media_idx, media_elem in enumerate(medias):
outer_html = media_elem.get_attribute('outerHTML')
result = scrap_newspaper(outer_html) # some function to retrieve results
SEARCH_RESULTS[f"result_{media_idx}"] = result
except exceptions.StaleElementReferenceException as e:
print(f"Selenium: {type(e).__name__}: {e.args}")
return
except exceptions.NoSuchElementException as e:
print(f"Selenium: {type(e).__name__}: {e.args}")
return
except exceptions.TimeoutException as e:
print(f"Selenium: {type(e).__name__}: {e.args}")
return
except exceptions.WebDriverException as e:
print(f"Selenium: {type(e).__name__}: {e.args}")
return
except exceptions.SessionNotCreatedException as e:
print(f"Selenium: {type(e).__name__}: {e.args}")
return
except Exception as e:
print(f"Selenium: {type(e).__name__} line {e.__traceback__.tb_lineno} of {__file__}: {e.args}")
return
except:
print(f"Selenium General Exception: {URL}")
return
print(f"\t\tFound {len(medias)} media(s) => {len(SEARCH_RESULTS)} search result(s)\tElapsed_t: {time.time()-st_t:.2f} s")
return SEARCH_RESULTS
if __name__ == '__main__':
url = 'https://digi.kansalliskirjasto.fi
get_all_search_details(URL=url)
This approach works but seems to be very time consuming and inefficient:
Found 20 media(s) => 20 search result(s) Elapsed_t: 15.22 s