I've written a script in scrapy, in combination with selenium, to make proxied requests using freshly scraped proxies produced by the get_proxies() method. I use the requests module to fetch the proxies so they can be reused in the script. What I'm trying to do is parse all the post links from the landing page and then fetch the title of each post from its target page.
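For context, get_proxies() returns the proxies as plain ip:port strings, something like this (made-up addresses, just to show the format):

proxies = ['1.2.3.4:8080', '5.6.7.8:3128']  # illustrative values only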
My script only works intermittently: when get_random_proxy() happens to produce a usable proxy, the script runs fine; otherwise it fails miserably.
How can I make the script keep trying with different proxies until it runs successfully?
This is what I've written so far:
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from selenium import webdriver
from scrapy.crawler import CrawlerProcess
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
def get_proxies():
    response = requests.get("https://www.sslproxies.org/")
    soup = BeautifulSoup(response.text, "lxml")
    proxies = [
        ':'.join([item.select_one("td").text, item.select_one("td:nth-of-type(2)").text])
        for item in soup.select("table.table tr") if "yes" in item.text
    ]
    return proxies
def get_random_proxy(proxy_vault):
    # pick one proxy at random (shuffle the list, then take the first item of a fresh cycle)
    random.shuffle(proxy_vault)
    proxy_url = next(cycle(proxy_vault))
    return proxy_url
def start_script():
    proxy = get_proxies()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument(f'--proxy-server={get_random_proxy(proxy)}')
    driver = webdriver.Chrome(options=chrome_options)
    return driver
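# Note: the driver above is created once, with whichever single proxy
# get_random_proxy() happens to pick - if that proxy is dead, the whole crawl fails.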
class StackBotSpider(scrapy.Spider):
    name = "stackoverflow"
    start_urls = [
        'https://stackoverflow.com/questions/tagged/web-scraping'
    ]

    def __init__(self):
        self.driver = start_script()
        self.wait = WebDriverWait(self.driver, 10)

    def parse(self, response):
        self.driver.get(response.url)
        for elem in self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".summary .question-hyperlink"))):
            yield scrapy.Request(elem.get_attribute("href"), callback=self.parse_details)

    def parse_details(self, response):
        self.driver.get(response.url)
        for elem in self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "h1[itemprop='name'] > a"))):
            yield {"post_title": elem.text}
c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(StackBotSpider)
c.start()
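To make the goal concrete, the behaviour I'm after is roughly the following (only a sketch; get_working_driver() and max_attempts are made-up names, not something I already have working):

def get_working_driver(proxies, max_attempts=10):
    # hypothetical helper: keep picking random proxies until one of them
    # can actually load a test page, then return that driver
    for proxy in random.sample(proxies, min(max_attempts, len(proxies))):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument(f'--proxy-server={proxy}')
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(10)
        try:
            driver.get("https://stackoverflow.com")  # simple reachability check
            return driver
        except Exception:
            driver.quit()  # dead or too slow, try the next proxy
    raise RuntimeError("no working proxy found")

and then something like self.driver = get_working_driver(get_proxies()) inside __init__, but I'm not sure whether this is the right way to plug it into scrapy, or whether the retry should live somewhere else (a downloader middleware, for example), which is why I'm asking.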