I have working code that fetches details from a URL using Selenium and Python. But I am facing an issue: after searching 50-plus URLs, Google Chrome shows an "I'm not a Robot" prompt and asks me to select the checkbox.
After that I am unable to get the results, and thereafter the results are either inconsistent or false.
So is there a way to avoid this "I'm not a Robot" captcha and get consistent results? Or is there anything I need to modify in this code to make it more optimized?
Also, is it possible to open 50 or 100 tabs in the Chrome driver at the same time and then search the loaded tabs for the results?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import datetime
# Accumulators shared by the crawl loop at the bottom of the script.
final_results = []
positions = []

# Launch Chrome in incognito mode.
option = webdriver.ChromeOptions()
# BUG FIX: the original flag was "—-incognito" (em dash + hyphen), which
# Chrome silently ignores as an unknown switch; the real switch is
# "--incognito" (two ASCII hyphens).
option.add_argument("--incognito")
browser = webdriver.Chrome(executable_path='/users/user_123/downloads/chrome_driver/chromedriver', chrome_options=option)
try:
    # Database connection string (DSN) for the local Postgres instance.
    DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5432'"
    # DWH table to which data is ported.
    TABLE_NAME = 'staging.search_url'
    # Connect and prepare a cursor used by the insert loop below.
    conn = psycopg2.connect(DSN)
    print("Database connected...")
    cur = conn.cursor()
    # German date style (dd.mm.yyyy) expected by the staging tables.
    cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
    print('database connection failed')
    # BUG FIX: the caught exception was discarded, making connection
    # failures impossible to diagnose; print the underlying reason.
    print(error)
    # Nothing below can run without a database connection.
    quit()
def get_products(url):
    """Load *url* in the shared ``browser`` and return visible product names.

    Returns a list of the text contents of the ``span.pymv4e`` elements,
    keeping only entries longer than 2 characters after stripping.
    """
    browser.get(url)
    # Use the By-based locator API (By is already imported at the top of the
    # file); find_elements_by_xpath is the legacy spelling of the same call.
    elements = browser.find_elements(By.XPATH, "//span[@class='pymv4e']")
    # BUG FIX: the original filter(None, elements) was a no-op — WebElement
    # objects are always truthy.  Filter on the extracted *text* instead,
    # which is what was actually intended.
    texts = [element.text for element in elements]
    return [text for text in texts if len(text.strip()) > 2]
##################################
# Pull the batch of URLs to crawl (ids 65-68) from the staging table.
fetch_cursor = conn.cursor()
fetch_cursor.execute(
    "select url_to_be_searched from staging.search_url where id in(65,66,67,68)"
)
# List of 1-tuples, one per URL; consumed by the crawl loop below.
serach_url_list = fetch_cursor.fetchall()
print('Fetched DB values')
##################################
# Crawl each fetched URL and persist the scraped product data.
for row in serach_url_list:
    # BUG FIX: each fetchall() row is a 1-tuple, so index it directly.  The
    # original round-tripped through str(row) and a chain of replace()/slice
    # calls, which corrupts any URL containing parentheses, commas or quotes.
    new_url = str(row[0])
    print('Passed URL :' + new_url)
    print("\n")

    filtered = get_products(new_url)
    if not filtered:
        # Retry with a purchase-intent keyword ("kaufen" = "buy") appended.
        new_url = new_url + '+kaufen'
        # BUG FIX: the original discarded this second result set, so the
        # retry never had any effect.
        filtered = get_products(new_url)
        print('Modified URL :' + new_url)

    if filtered:
        print(filtered)

        # 1-based result positions, stored as strings, one per product.
        positions.clear()
        for idx in range(1, len(filtered) + 1):
            positions.append(str(idx))
        gobal_position = len(positions)
        print('global postion first: ' + str(gobal_position))
        print("\n")

        # Company names shown next to each product listing.
        company_name_list = browser.find_elements_by_xpath("//div[@class='LbUacb']")
        company = [x.text for x in company_name_list]
        print('Company Name:')
        print(company, '\n')

        # Price labels for the listings.
        price_list = browser.find_elements_by_xpath("//div[@class='e10twf T4OwTb']")
        price = [x.text for x in price_list]
        print('Price:')
        print(price)
        print("\n")

        # Click-through URLs for the listings.
        find_href = browser.find_elements_by_xpath(
            "//a[@class='plantl pla-unit-single-clickable-target clickable-card']")
        urls = [my_href.get_attribute("href") for my_href in find_href]
        print('URLS:')
        print(urls)
        print("\n")

        print('Final Result: ')
        # zip() truncates to the shortest input list, so every record is a
        # complete (position, name, url, company, price) 5-tuple.
        records = list(zip(positions, filtered, urls, company, price))
        final_results.clear()
        final_results.append(tuple(records))
        print(final_results)
        print("\n")
        print('global postion end :' + str(gobal_position))

        # BUG FIX: the original walked the records with
        # "while i <= gobal_position", an off-by-one that over-runs the tuple
        # (IndexError) whenever zip() yielded fewer than gobal_position
        # items, and the index i was never reset between URLs.  Insert each
        # zipped record exactly once instead.
        for record in records:
            cur.execute(
                """INSERT into staging.pla_crawler_results(position, product_name, url,company,price) VALUES (%s, %s, %s,%s, %s)""",
                record)
            print('Inserted succesfully')
            conn.commit()