Geckodriver: 0.24.0
Firefox: installed via the buitron buildpack. I don't know how to check the installed version; I tried heroku run bash and then firefox --version, without luck (see the version-check sketch below the version list).
Python: 3.6.8
Selenium: 3.141.0
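As a version check, I assume the Firefox binary itself can be asked directly; a minimal sketch, using the same placeholder path as in the code below:

import subprocess
# Ask the Firefox binary for its version (the path is a placeholder)
out = subprocess.run(['Your_firefox_binary_path', '--version'],
                     stdout=subprocess.PIPE, universal_newlines=True)
print(out.stdout)  # prints something like 'Mozilla Firefox 68.0'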
I've been struggling with this scraping code recently. I'm trying to write an efficient scraper and deploy it to Heroku, but locally it always takes too long, and on Heroku it throws one error after another.
Right now the error I want to solve is:
selenium.common.exceptions.WebDriverException: Message: invalid argument: can't kill an exited process
I have also run into other errors, like:
Geckodriver executable needs to be in PATH
Or this one, from a previous question of mine (as you can see, I have tried Chrome as well).
After reading Ryan Mitchell's book on web scraping with Python and googling around, I still haven't found a solution to these errors.
Can anyone help me solve this "can't kill an exited process" error, please? Or at least point me to a resource?
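In case it helps, here is the stripped-down launch I would use to isolate the failure. The paths are placeholders, and service_log_path only makes geckodriver write a log that should say why Firefox exited:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options,
                           executable_path='Your_firefox_geckodriver_executable_path_here',
                           service_log_path='geckodriver.log')
print(driver.capabilities.get('browserVersion'))          # Firefox version, if it starts
print(driver.capabilities.get('moz:geckodriverVersion'))  # geckodriver version
driver.quit()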
Code below:
import logging
import os
import random

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# The code below uses `logger` without defining it; a module logger is assumed
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
TYPE_COLUMN_IDX = 4
IS_HTTPS_COLUMN_IDX = 6
MY_TIMEOUT = 4.5
user_agent_list = [
# Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
# Goes on...
]
# Replace here with your browser paths and binaries !!!
def get_chromedriver_exec_path():
return 'Your_chromedriver_path_here'
def get_chrome_path():
return 'Your_chrome_path_here'
def get_firefox_exec_path():
return 'Your_firefox_geckodriver_executable_path_here'
def get_firefox_path():
return 'Your_firefox_browser_path'
def get_firefox_binpath():
return 'Your_firefox_binary_path'
def get_url(url, **kwargs):
"""
    Visits the url to get the HTML response
    :param url: the url to get the HTML response from
    :param timeout: the max time to wait (default = MY_TIMEOUT)
:return: the raw HTML response
"""
timeout = kwargs.pop('timeout', MY_TIMEOUT)
return requests.get(url, timeout=timeout, **kwargs)
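# Usage sketch: extra kwargs (headers, proxies, ...) are forwarded to requests.get, e.g.
#   get_url('https://example.com', headers={'User-Agent': 'Mozilla/5.0'})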
def user_agent_generator():
"""
    Yields a random user agent string on each next() call
    :return: a user agent
"""
browsers = []
while True:
try:
if not len(browsers):
browsers = user_agent_list[:]
random.shuffle(browsers)
yield browsers.pop()
except Exception as e:
logger.error('UA generator error', exc_info=e)
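# Usage sketch: the generator reshuffles a fresh copy of user_agent_list
# whenever it runs out, e.g.
#   ua_gen = user_agent_generator()
#   ua = next(ua_gen)  # one of the strings from user_agent_list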
def proxy_generator():
"""
    Returns a generator that, when advanced with next() or
    a for loop, yields a new proxy
:return: a proxy generator
"""
proxy_url = 'https://free-proxy-list.net/anonymous-proxy.html'
types = ('elite', 'anonymous')
proxies = set()
while True:
try:
if not len(proxies):
response = get_url(proxy_url)
b = BeautifulSoup(response.content, 'html.parser')
                for row in b.select('table tbody tr'):
                    elements = row.find_all('td')
                    # keep only elite/anonymous proxies that support HTTPS
                    is_anonymous = any(t in elements[TYPE_COLUMN_IDX].text for t in types)
                    is_https = 'yes' in elements[IS_HTTPS_COLUMN_IDX].text
                    if is_anonymous and is_https:
                        # the first two columns hold IP and PORT
                        proxy = ':'.join(t.text for t in elements[:2])
                        proxies.add(proxy)
yield proxies.pop()
except Exception as e:
logger.error('Proxy generator error', exc_info=e)
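# Usage sketch: the proxy set is refilled from the page whenever it runs dry, e.g.
#   proxy_gen = proxy_generator()
#   proxy = next(proxy_gen)  # something like '203.0.113.7:8080'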
class AlfredProxy(object):
proxy_gen = proxy_generator()
user_agent_gen = user_agent_generator()
@staticmethod
def get_proxy():
"""
Returns a new proxy
:return: a new proxy
"""
return next(AlfredProxy.proxy_gen)
@staticmethod
def get_user_agent():
"""
Returns a new user agent
:return: a new user agent
"""
return next(AlfredProxy.user_agent_gen)
@staticmethod
def get_chrome_options():
"""
        Returns a ChromeOptions object with a new proxy and user agent
        :return: the configured ChromeOptions
"""
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = os.environ.get('GOOGLE_CHROME_SHIM', get_chrome_path()) # '/app/.apt/usr/bin/google-chrome'
chrome_options.add_argument(f'--user-agent={AlfredProxy.get_user_agent()}')
chrome_options.add_argument(f'--proxy-server={AlfredProxy.get_proxy()}')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--headless')
return chrome_options
@staticmethod
def get_firefox_options():
"""
        Returns the keyword arguments for creating a headless Firefox driver
        :return: a dict of kwargs meant to be unpacked into webdriver.Firefox
"""
capabilities = webdriver.DesiredCapabilities().FIREFOX
#capabilities['marionette'] = False
options = Options()
options.headless = True
#options.binary_location = get_firefox_path()
os.environ['MOZ_HEADLESS'] = '1'
binary = FirefoxBinary(get_firefox_binpath())
return {
'capabilities': capabilities,
'options': options,
'firefox_binary': binary,
'executable_path': get_firefox_exec_path(),
}
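    # The dict above is meant to be unpacked straight into the constructor:
    #   driver = webdriver.Firefox(**AlfredProxy.get_firefox_options())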
@staticmethod
    def get_webdriver(firefox=True):
        if firefox:
            driver = webdriver.Firefox(**AlfredProxy.get_firefox_options())
        else:
            driver = webdriver.Chrome(executable_path=get_chromedriver_exec_path(),
                                      options=AlfredProxy.get_chrome_options())
        return driver
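    # Callers are expected to quit() the returned driver themselves, e.g.
    #   driver = AlfredProxy.get_webdriver()
    #   try:
    #       driver.get('https://example.com')
    #   finally:
    #       driver.quit()  # otherwise every call leaks a browser process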
@staticmethod
def get(url, attempts=0):
"""
        Gets the contents of a URL, rotating the proxy and the User-Agent header
        :param url: the url to visit
        :param attempts: number of attempts made so far
:return: the URL contents
"""
        if attempts > 10:
            raise Exception('Too many failed attempts through proxies')
        proxy = AlfredProxy.get_proxy()
        proxies = {scheme: f'{scheme}://{proxy}' for scheme in ('http', 'https')}
        user_agent = AlfredProxy.get_user_agent()
headers = {'User-Agent': user_agent}
try:
logger.info(f'UA: {user_agent}')
result = get_url(url, timeout=MY_TIMEOUT, headers=headers, proxies=proxies)
if result.status_code != 200:
raise ConnectionError()
return result
        except Exception:
            # ConnectionError falls through here too; retry with a fresh proxy
            return AlfredProxy.get(url, attempts + 1)
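# Usage sketch for the class above, assuming the module logger is configured:
#   response = AlfredProxy.get('https://www.example.com')
#   html = response.content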
def get_url_through_proxy(url, selector=None, num_attempts=0, **kwargs):
"""
    Visits the url to get the HTML response,
    using a proxy and changing the user-agent if necessary
:param url: the url
:return: the raw HTML response
"""
    if num_attempts == 5:
        # ConnectionAttemptsError is a custom exception defined elsewhere in my project
        raise ConnectionAttemptsError(msg=['Too many attempts through proxies'])
user_agent_driver = AlfredProxy.get_webdriver()
    try:
        if selector:
            wait = WebDriverWait(user_agent_driver, MY_TIMEOUT)
            user_agent_driver.get(url)
            condition = EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            wait.until(condition)
        else:
            user_agent_driver.implicitly_wait(MY_TIMEOUT)
            user_agent_driver.get(url)
        return user_agent_driver.page_source
    except TimeoutException:
        logger.error('Timeout error')
        return get_url_through_proxy(url, selector, num_attempts=num_attempts + 1)
    except Exception as e:
        logger.error(f'Proxy error: {type(e)}')
        return get_url_through_proxy(url, selector, num_attempts=num_attempts + 1)
    finally:
        # quit the driver on every path so retries do not pile up browser processes
        user_agent_driver.quit()
# Some test code
if __name__ == '__main__':
get_url_through_proxy('https://www.amazon.es/dp/B07HR7RRWQ/ref=gbps_tit_s-5_c44f_2b9ee16f', '#centerCol')
print('-'*40)
get_url_through_proxy('https://www.amazon.es/dp/B07HR7RRWQ/ref=gbps_tit_s-5_c44f_2b9ee16f')