I have a script that should take around 360 hours to complete, mostly because of how slow the servers of the website I'm trying to scrape are. But when I watch the website and the Python console side by side, I can see that the elements I need have already loaded while Selenium is still waiting for useless ads and other things I don't care about. So I was wondering: is there any way to start scraping as soon as the needed elements are loaded?
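What I have in mind is something like the sketch below: tell Selenium not to wait for the full page load, then explicitly wait for just the one element I need. I'm not sure this is the right approach (the 30-second timeout is a guess, and the element ID is taken from my code further down):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
# 'eager' lets get() return once the DOM is ready instead of waiting for ads/images
options.page_load_strategy = 'eager'
browser = webdriver.Chrome(options=options)
browser.get('https://senf.ir/ListCompany/75483/%D8%A2%D9%87%D9%86-%D8%A2%D9%84%D8%A7%D8%AA-%D9%88-%D8%B6%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA')
# block only until the first company link exists, at most 30 seconds
first_link = WebDriverWait(browser, 30).until(
    EC.presence_of_element_located((By.ID, 'ContentPlaceHolder2_grdProduct_HpCompany_0'))
)
first_link.click()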
Another way of doing this would be to start scraping even though the page has not finished loading, and then use time.sleep with a delay I time by hand. That approach has already been asked about and answered on Stack Overflow, so if it really is the only way, just let me know in the comments. Otherwise, the better way would be to wait only for the elements that need to be scraped, which would make this much easier.
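If hand-timed sleeps really are the only option, I imagine it would look something like this (the 'none' page load strategy makes get() return immediately, and the 5 seconds is just a guess I would tune by hand):

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
# 'none' makes get() return without waiting for the page at all
options.page_load_strategy = 'none'
browser = webdriver.Chrome(options=options)
browser.get('https://senf.ir/ListCompany/75483/%D8%A2%D9%87%D9%86-%D8%A2%D9%84%D8%A7%D8%AA-%D9%88-%D8%B6%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA')
time.sleep(5)  # hand-timed: too short and find_element throws, too long and it's slow again
browser.find_element(By.ID, 'ContentPlaceHolder2_grdProduct_HpCompany_0').click()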
I don't think my code will help you answer my question, but I'll put it here just in case.
code:
#C:\Users\keibo\PycharmProjects\emergency ahanonline project
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
t = time.localtime()
current_time = time.strftime("%H:%M:%S", t)
print(f'[{current_time}] Started.')
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
#options.add_argument("--headless")
# header row; data rows are appended in the same column order inside tir()
output = 'State, City, Group, Sub_Group, Address, Website, Description, Views'
browser = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))
def grab(by, locator):
    # Return the stripped text of an element, or None if it is missing or blank.
    try:
        text = browser.find_element(by, locator).text
    except NoSuchElementException:
        return None
    return text.strip() or None

def tir():
    global output
    browser.get(
        'https://senf.ir/ListCompany/75483/%D8%A2%D9%87%D9%86-%D8%A2%D9%84%D8%A7%D8%AA-%D9%88-%D8%B6%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA')
    browser.find_element(By.ID, "ContentPlaceHolder2_rptPager_lnkPage_11").click()
    # .text is a string; convert it so the arithmetic below works
    pages = int(browser.find_element(By.ID, "ContentPlaceHolder2_rptPager_lnkPage_9").text)
    print(f'There are {pages} pages of 20 names, which means there are {pages * 20} people to save.')
    for page in range(pages - 1):
        for person in range(20):  # 20 entries per page, indexed 0 to 19
            browser.get(
                'https://senf.ir/ListCompany/75483/%D8%A2%D9%87%D9%86-%D8%A2%D9%84%D8%A7%D8%AA-%D9%88-%D8%B6%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA')
            browser.find_element(By.ID, f"ContentPlaceHolder2_grdProduct_HpCompany_{person}").click()
            # Each field: the element's text, or None if it is absent or blank.
            state = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_rpParent_lblheaderCheild_0"]')
            city = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_rpParent_lblheaderCheild_1"]')
            group = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_rpParent_lblheaderCheild_2"]')
            sub_group = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_rpParent_lblheaderCheild_3"]')
            Address = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_txtAddress"]')
            ceo = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_LblManager"]')
            # print(grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_ImgEmail"]'))
            website = grab(By.XPATH, './/a[@id = "ContentPlaceHolder2_hfWebsait"]')
            Description = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_lblDesc"]')
            views = grab(By.XPATH, './/span[@id = "ContentPlaceHolder2_lblVisit"]')
            # ceo is collected but not yet written; column order matches the header above
            output += f'\n{state}, {city}, {group}, {sub_group}, {Address}, {website}, {Description}, {views}'
        print(output)
        print('--------------------------------------------')
        browser.find_element(By.ID, "ContentPlaceHolder2_rptPager_lnkPage_12").click()
try:
    tir()
    print("End")
    # dump the accumulated rows, then convert the file to a proper CSV
    with open('Program Files\\CSV pre built.txt', 'w') as f1:
        f1.write(output)
    read_file1 = pd.read_csv('Program Files\\CSV pre built.txt')
    read_file1.to_csv('Output.csv', index=False)
except Exception as e:
    browser.close()
    print('something went wrong ):')
    sees = input('Press enter to leave or press 1 and then enter to see the error: ')
    if sees == '1':
        input(e)