I'm trying to scrape a website: I collect links from the main category pages and then scrape every linked page individually. This error occurs sometimes, but not always. I've tried using WebDriverWait and scrolling to the end of the page, but I still get this error occasionally.
How can I ensure Selenium will only click the button after it has loaded?
init.py
import scraper
urls_globo = [
# 'https://g1.globo.com/brasil', #TODO PAGINA INICIAL, MANTER?
'https://g1.globo.com/ciencia',
'https://g1.globo.com/mundo',
'https://g1.globo.com/politica',
'https://g1.globo.com/saude',
'https://g1.globo.com/tecnologia'
]
for url in urls_globo:
print('\nCATEGORIA: ' + url.split('/')[-1])
navegador = scraper.GetPages(url)
links = scraper.GetLinksFromPage(navegador, 20)
for link in links:
print(link)
scraper.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
opcoes = Options()
opcoes.add_argument('--ignore-certificate-errors')
opcoes.add_argument('--ignore-ssl-errors')
# opcoes.headless = True
#desabiltando anuncios = instalar bloqueador do navegador
navegador = webdriver.Chrome('C:/Users/julia/repos/pos/novo/chromedriver.exe', options=opcoes)
# espera = WebDriverWait(navegador, 10)
def GetPages(url):
try:
navegador.get(url)
except Exception as e:
raise SystemExit(e)
return navegador
def GetLinksFromPage(navegador, itens_meta):
espera = WebDriverWait(navegador, 20)
links = []
#itens_meta = 15
while itens_meta > len(links):
#1 - desce a página até o botão aparecer pela 1a vez
navegador.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#3 - Clique no botão para mais conteudo
espera.until(EC.element_to_be_clickable((By.CLASS_NAME, 'load-more'))).click()
#2 - pega todos links de todos os elementos carregados
espera.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'feed-post-body')))
#elementos = navegador.find_elements(by=By.CLASS_NAME, value='feed-post-link')
elementos = espera.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'feed-post-link')))
temp_links = []
temp_links = [elem.get_attribute('href') for elem in elementos]
# links.append(temp_links)
#4 - atualiza a lista que controla o loop
links.append(temp_links)
# print(links)
# print(len(links))
#navegador.quit()
return links
stacktrace