There is a web page that I want to run my scraping script on. However, because the page loads additional content as you scroll down, I need to add a function to my script that scrolls the page all the way to the bottom before the scraping runs.
In an attempt to achieve this, here is my entire script, which seems to stop once the reported page height reaches 5287.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
# Initialize a Chrome browser
driver = webdriver.Chrome("C:.............chromedriver_win32/chromedriver.exe")

# Go to the page we want to scrape
page_url = 'https://icodrops.com/category/ended-ico/'
driver.get(page_url)

# Open the csv file to write into (newline='' prevents blank rows on Windows)
csv_file = open('icodrops_ended_icos.csv', 'w', newline='')
writer = csv.writer(csv_file)
writer.writerow(['Project_Name', 'Interest', 'Category', 'Received', 'Goal', 'End_Date', 'Ticker'])
# Although there is only one page to scrape, we need to scroll to the bottom to load all the data
lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
print('lastHeight', lastHeight)

while True:
    driver.execute_script(f"window.scrollTo(0, {lastHeight});")
    time.sleep(15)
    newHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('newHeight', newHeight)
    if newHeight == lastHeight:
        # The height stopped growing, so assume we have reached the bottom
        break
    lastHeight = newHeight
try:
    # Print the url that we are scraping
    print('Scraping this url: ' + page_url)
    # Extract a list where each element is one row of the table
    rows = driver.find_elements(By.XPATH, '//div[@class="col-md-12 col-12 a_ico"]')
    # Extract the detail columns from each row
    for row in rows:
        # Initialize a dictionary for each row
        row_dict = {}
        # Use relative xpaths to locate the desired data
        project_name = row.find_element(By.XPATH, './/div[@class="ico-row"]/div[2]/h3/a').text
        interest = row.find_element(By.XPATH, './/div[@class="interest"]').text
        category = row.find_element(By.XPATH, './/div[@class="categ_type"]').text
        received = row.find_element(By.XPATH, './/div[@id="new_column_categ_invisted"]/span').text
        goal = row.find_element(By.XPATH, './/div[@id="categ_desctop"]').text
        end_date = row.find_element(By.XPATH, './/div[@class="date"]').text
        ticker = row.find_element(By.XPATH, './/div[@id="t_tikcer"]').text
        # Add the extracted data to the dictionary
        row_dict['project_name'] = project_name
        row_dict['interest'] = interest
        row_dict['category'] = category
        row_dict['received'] = received
        row_dict['goal'] = goal
        row_dict['end_date'] = end_date
        row_dict['ticker'] = ticker
        writer.writerow(row_dict.values())
except Exception as e:
    print(e)
finally:
    # Always release the file and the browser, even on success
    csv_file.close()
    driver.close()
Without being able to scroll to the bottom of the page, my script only scrapes data from the initially loaded page, which is only about 10% of all that is available.
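For reference, this is the kind of standalone function I am trying to end up with: the same height-comparison loop from my script, factored out so it can run once before the scraping starts. It is only a sketch; the name scroll_to_bottom is mine and the pause length is an arbitrary guess:

def scroll_to_bottom(driver, pause=15):
    # Keep jumping to the current bottom of the page and waiting,
    # until the reported page height stops growing
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

The idea would be to call scroll_to_bottom(driver) right after driver.get(page_url), so that all rows are loaded before the try block runs.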