At this time, my script will check multiple url if some 5 different type of keywords are present in the webpage. Depending of which keyword is found or not, it will output "ok" or "no".
I use set_page_load_timeout(30)
to avoid infinite load of a url.
Problem : some webpages doesn't load fully before timeout (even if it's a "very" long timeout). But I can see visually (no headless) that the page is loaded. At least it could check the keywords in the webpage but it doesn't and after timeout, it display "fail" and the scrape saying "no" doesn't show to the final output.
So I don't want to put an except after 30 seconds but I want to stop loading the page after 30 seconds and takes what it can be taken.
My code :
# coding=utf-8
import re
sites=[]
keywords_1=[]
keywords_2=[]
keywords_3=[]
keywords_4=[]
keywords_5=[]
import sys
from selenium import webdriver
import csv
import urllib.parse
from datetime import datetime
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
def reader3(filename):
with open(filename, 'r') as csvfile:
# creating a csv reader object
csvreader = csv.reader(csvfile)
# extracting field names through first row
# extracting each data row one by one
for row in csvreader:
sites.append(str(row[0]).lower())
try:
reader3("data/script/filter_domain_OUTPUT.csv")
except Exception as e:
print(e)
sys.exit()
exc=[]
def reader3(filename):
with open(filename, 'r') as csvfile:
# creating a csv reader object
csvreader = csv.reader(csvfile)
# extracting field names through first row
# extracting each data row one by one
for row in csvreader:
exc.append(str(row[0]).lower())
try:
reader3("data/script/checking_EXCLUDE.csv")
except Exception as e:
print(e)
sys.exit()
def reader2(filename):
with open(filename, 'r') as csvfile:
# creating a csv reader object
csvreader = csv.reader(csvfile)
# extracting field names through first row
# extracting each data row one by one
for row in csvreader:
keywords_1.append(str(row[0]).lower())
keywords_2.append(str(row[1]).lower())
keywords_3.append(str(row[2]).lower())
keywords_4.append(str(row[3]).lower())
keywords_5.append(str(row[4]).lower())
try:
reader2("data/script/checking_KEYWORD.csv")
except Exception as e:
print(e)
sys.exit()
chrome_options = Options()
chrome_options.page_load_strategy = 'none'
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--lang=en')
chrome_options.add_argument('--disable-notifications')
#chrome_options.headless = True
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('enable-automation')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-browser-side-navigation')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)
for site in sites:
try:
status_1 = "no"
status_2 = "no"
status_3 = "no"
status_4 = "no"
status_5 = "no"
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
today = date.today()
print("[" + current_time + "] " + str(site))
if 'http' in site:
driver.get(site)
else:
driver.get("http://" + site)
r=str(driver.page_source).lower()
driver.set_page_load_timeout(30)
for keyword_1 in keywords_1:
if keyword_1 in r:
status_1="ok"
print("home -> " +str(keyword_1))
break
for keyword_2 in keywords_2:
if keyword_2 in r:
status_2="ok"
print("home -> " +str(keyword_2))
break
for keyword_3 in keywords_3:
if keyword_3 in r:
status_3="ok"
print("home -> " +str(keyword_3))
break
for keyword_4 in keywords_4:
if keyword_4 in r:
status_4="ok"
print("home -> " +str(keyword_4))
break
for keyword_5 in keywords_5:
if keyword_5 in r:
status_5="ok"
print("Home ->" +str(keyword_5))
break
with open('data/script/checking_OUTPUT.csv', mode='a') as employee_file:
employee_writer = csv.writer(employee_file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL,lineterminator='\n')
write=[site,status_1,status_2,status_3,status_4,status_5]
employee_writer.writerow(write)
except Exception as e:
#driver.delete_all_cookies()
print("Fail")
driver.quit()