I am trying to scrape the price of allergy products on Target. For each product, I want to enter every US ZIP code to see how changing the ZIP code affects the price, and I use Selenium to type the ZIP code in for each product. However, I have more than 40,000 ZIP codes and about 200 products to scrape. If I run my code as it is, the total run time will be far too long (almost 90 days), because Selenium needs about 2 seconds to enter each ZIP code. What should I do to reduce the running time? My current code is below; at the end I've also sketched one idea (parallel browser instances) that I'm not sure about.
from datetime import datetime

import pandas as pd
import pytz
from selenium import webdriver
from selenium.webdriver.common.by import By

while True:
    priceArray = []
    nameArray = []
    zipCodeArray = []
    GMTArray = []
    wait_imp = 10

    CO = webdriver.ChromeOptions()
    CO.add_experimental_option('useAutomationExtension', False)
    CO.add_argument('--ignore-certificate-errors')
    CO.add_argument('--start-maximized')
    wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe', options=CO)

    for url in urlList:
        wd.get(url)
        wd.implicitly_wait(wait_imp)
        for zipcode in zipCodeList:
            try:
                # click the delivery address
                address = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[1]/button[2]")
                address.click()
                # click the Edit location button
                editLocation = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[2]/button")
                editLocation.click()
            except Exception:
                # the address button is not there, so click Edit location directly
                editLocation = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div[1]/div/div[1]/button")
                editLocation.click()

            # input the ZIP code
            inputZipCode = wd.find_element(by=By.XPATH, value="//*[@id='enter-zip-or-city-state']")
            inputZipCode.clear()
            inputZipCode.send_keys(zipcode)

            # click submit
            clickSubmit = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[2]/div/div/div[3]/div/button[1]")
            clickSubmit.click()

            # scrape the product name and price for this ZIP code
            name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
            nameArray.append(name)
            price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
            priceArray.append(price)

            zipCodeArray.append(zipcode)

            # record the timestamp
            tz = pytz.timezone('Europe/London')
            GMT = datetime.now(tz)
            GMTArray.append(GMT)

    data = {'prod-name': nameArray,
            'Price': priceArray,
            'currentZipCode': zipCodeArray,
            'GMT': GMTArray}
    df = pd.DataFrame(data, columns=['prod-name', 'Price', 'currentZipCode', 'GMT'])
    df.to_csv(r'C:\Users\12987\PycharmProjects\Network\priceingAlgoriCoding\export_Target_dataframe.csv',
              mode='a', index=False, header=True)
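One idea I have been wondering about is splitting zipCodeList across several Chrome instances running in parallel with multiprocessing. Below is a rough, untested sketch of what I mean; scrape_chunk and n_workers are just placeholder names I made up, it reuses urlList and zipCodeList from my code above, and the worker body would contain the same ZIP-code entry and scraping steps as my current loop. Would something like this be a reasonable way to cut the time, or is there a better approach?

from multiprocessing import Pool

from selenium import webdriver


def scrape_chunk(zip_chunk):
    # One browser per worker process; each worker handles its own slice of the ZIP codes.
    opts = webdriver.ChromeOptions()
    opts.add_argument('--ignore-certificate-errors')
    wd = webdriver.Chrome(options=opts)
    rows = []
    for url in urlList:
        wd.get(url)
        for zipcode in zip_chunk:
            # ... same ZIP-code entry and scraping steps as in my code above ...
            rows.append((url, zipcode))  # placeholder for the scraped name/price/timestamp
    wd.quit()
    return rows


if __name__ == '__main__':
    n_workers = 4
    # split zipCodeList into n_workers roughly equal chunks
    chunks = [zipCodeList[i::n_workers] for i in range(n_workers)]
    with Pool(n_workers) as pool:
        results = pool.map(scrape_chunk, chunks)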