I am trying to get the time of the last snapshot of the Washington Post front page from the Internet Archives for every day in a range of dates. The problem is that Selenium is not always selecting the correct date, although I have collected a list of date objects and it seems to be error-free. For example, Selenium will jump from January 31 to February 11, instead of to February 1.
Print output:
Moving to 31
Date: 2020-01-31 00:00:00
Last snapshot taken at 22:59:03
Moving to 1
Date: 2020-02-11 00:00:00
Last snapshot taken at 23:53:32
Moving to 2
Date: 2020-02-02 00:00:00
Last snapshot taken at 23:59:56
In this output we can see that it was supposed to go to 1, and that the date object's text said 1, meaning that the date object was extracted from the page correctly. Selenium is not hovering over it correctly though?
Complete code:
urls = ['https://web.archive.org/web/20190901000000*/washingtonpost.com',
'https://web.archive.org/web/*/washingtonpost.com']
browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver') # brew install chromedriver then see in terminal where it was installed to and paste this
data = {}
for j in range(0, len(urls)):
browser.get(urls[j])
calendar_grid = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'calendar-grid')))
if j == 0: # 2019
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(10)
start = len(dates) - 1
end = 0
step = -1
elif j == 1: # 2020
start = 0
end = len(dates)
step = 1
dates = calendar_grid.find_elements_by_css_selector('.calendar-day')
print('Dates on page: ' + str(len(dates)))
for i in range(start, end, step):
if j==0 and len(data) == len(desired_ranges[j]): # to end 2019
break
# Hover over the date, let popup appear, wait for loader to disappear, select scroll area
print('Moving to ' + dates[i])
hov = ActionChains(browser).move_to_element(dates[i])
hov.perform()
popup = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.popup-of-day-content')))
WebDriverWait(browser, 20).until(EC.invisibility_of_element_located((By.TAG_NAME, 'svg')))
scroll_area = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.popup-of-day-content > ul > div')))
# Get date and check that it is in our range
date = popup.find_element_by_class_name('day-tooltip-title')
date_formatted = datetime.datetime.strptime(date.text, '%B %d, %Y')
print('Date: ' + str(date_formatted))
if date_formatted not in desired_ranges[j]:
continue # skip if it is not
else:
attempts = 0
while attempts < 5:
try:
browser.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_area)
snapshots = popup.find_elements_by_tag_name('a')
last_snapshot = snapshots[len(snapshots) - 1]
print('Last snapshot taken at ' + last_snapshot.text)
data[date_formatted] = {'link': last_snapshot.get_attribute('href'),
'time': last_snapshot.text,
'headlines': []}
break
except StaleElementReferenceException:
attempts += 1
Extra code to set date range:
start_day = datetime.date(2019, 12, 8)
end_day = datetime.date.today()
days = (end_day - start_day).days
desired_range = pd.date_range(start_day, periods=days).tolist()
print('Range: ' + str(start_day) + ' to ' + str(end_day))
print('Days: ' + str(days))
desired_range
def time_in_range(start, end, x):
'''Return true if x is in the range [start, end]'''
if start <= end:
return start <= x <= end
else:
return start <= x or x <= end
#Get 2019 date range
desired_range_in_2019 = [x for x in desired_range if time_in_range(datetime.date(2019, 1, 1), datetime.date(2019, 12, 31), x)]
desired_range_in_2020 = [x for x in desired_range if time_in_range(datetime.date(2020, 1, 1), datetime.date(2020, 12, 31), x)]
desired_ranges = [desired_range_in_2019, desired_range_in_2020]
print('Dates in 2019: ' + str(len(desired_range_in_2019)))
print('Dates in 2020: ' + str(len(desired_range_in_2020)))