I'm not sure why, but my script always stops crawling once it hits page 9. There are no errors, exceptions, or warnings, so I'm kind of at a loss.
Can somebody help me out?
P.S. Here is the full script in case anybody wants to test it for themselves!
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# create_webdriver_instance() and already_scraped_product_titles are defined elsewhere in my project

def initiate_crawl():
    def refresh_page(url):
        ff = create_webdriver_instance()
        ff.get(url)
        # Sort the deals by "Discount - High to Low"
        ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
        ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
        items = WebDriverWait(ff, 15).until(
            EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
        )
        print(len(items))
        for count, item in enumerate(items):
            slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
            active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
            # Only scrape deals that are discounted and still available
            if len(slashed_price) > 0 and len(active_deals) > 0:
                product_title = item.find_element(By.ID, 'dealTitle').text
                if product_title not in already_scraped_product_titles:
                    already_scraped_product_titles.append(product_title)
                    url = ff.current_url
                    ff.quit()
                    refresh_page(url)
                    break
            # Last item on the page: move on to the next page
            if count+1 is len(items):
                try:
                    next_button = WebDriverWait(ff, 15).until(
                        EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
                    )
                    ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
                    url = ff.current_url
                    ff.quit()
                    refresh_page(url)
                except Exception as error:
                    print(error)
                    ff.quit()

    refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8')

initiate_crawl()
Printing the length of items exhibits some strange behaviour too. Instead of always returning 32, which would correspond to the number of items on each page, it prints 32 for the first page, 64 for the second, 96 for the third, and so forth. I fixed this by using //div[contains(@id, "100_dealView_")]/div[contains(@class, "dealContainer")] instead of //div[contains(@id, "100_dealView_")] as the XPath for the items variable. I'm hoping this is the reason why it runs into issues on page 9; I'm running tests right now.

Update: It is now scraping page 10 and beyond, so the issue is resolved.
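For anyone who wants to apply the same change, this is roughly what the items assignment inside refresh_page looks like afterwards (only the XPath differs; everything else in the function stays the same):

# Match the per-deal containers rather than the parent dealView divs, so
# len(items) stays at 32 per page instead of growing with each page
items = WebDriverWait(ff, 15).until(
    EC.visibility_of_all_elements_located(
        (By.XPATH, '//div[contains(@id, "100_dealView_")]/div[contains(@class, "dealContainer")]')
    )
)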