I've been trying to scrape this website for 2 days now. I'm completely stuck. The problem is that it detects me as a bot.
I have a list of URLs that I need to crawl, but every file in the results folder contains the message "Access to this page has been denied... To continue, please prove you are not a robot...", etc.
Below is my current code:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import sys
# Python 2 only: force the implicit str/unicode conversion codec to UTF-8.
# `reload` is not a builtin on Python 3 and `sys.setdefaultencoding` does not
# exist there, so running these two calls unguarded raises NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')
# Log in with headless Chrome, then visit every URL listed in links.txt and
# save each page's HTML source under ./results/.
CHROMEDRIVER_PATH = './chromedriver'
LOGIN_PAGE = "https://www.seekingalpha.com/login"
ACCOUNT = "Account"
PASSWORD = "Password"

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

ua = UserAgent()
userAgent = ua.random
# The original passed the literal text "user-agent={userAgent}"; interpolate
# the value. str.format is used because the file still targets Python 2.
chrome_options.add_argument('user-agent={}'.format(userAgent))

# NOTE(review): `executable_path` / `chrome_options` are the keyword names of
# older Selenium releases - confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=chrome_options)
try:
    # Register the navigator.webdriver patch for every new document. Running
    # it once with execute_script only touches the initial blank page and is
    # discarded by the first navigation.
    driver.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )
    # This override replaces the user agent given on the command line above.
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
    wait = WebDriverWait(driver, 30)

    # Log in.
    driver.get(LOGIN_PAGE)
    time.sleep(1)
    wait.until(EC.element_to_be_clickable((By.NAME, "email"))).send_keys(ACCOUNT)
    wait.until(EC.element_to_be_clickable((By.ID, "signInPasswordField"))).send_keys(PASSWORD)
    wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Sign in']"))).click()
    time.sleep(1)

    with open("links.txt", "r") as inArticle:
        articles = inArticle.read().splitlines()

    for article in articles:
        article = article.strip()
        if not article:
            # A blank line would make driver.get() fail on an empty URL.
            continue
        # Output name: last path segment up to its first "-". A trailing "/"
        # is dropped first so the last segment is never empty.
        outName = article.rstrip("/").split("/")[-1]
        outName = outName.split("-")[0]
        driver.get(article)
        time.sleep(1)
        html_source = driver.page_source
        # Encode once and write bytes: this works the same on Python 2 and 3
        # and does not depend on the interpreter's default encoding.
        with open("./results/" + outName, "wb") as outFile:
            outFile.write(html_source.encode("utf-8"))
finally:
    # Always shut the browser down, even when login or a page load fails.
    driver.quit()
Is there a better way to do this? And is there a way to get past this bot check?