I'm making a Craigslist scraper to scrape the titles, prices, dates, and URLs and export that info to a CSV. Now, I want Selenium to click on each post's URL to navigate to the actual page, parse the page to get the "odometer" span tag (to get the mileage), and add that to my CSV file.
Here's my code so far:
import csv
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
#import schedule
class CraigslistScaper(object):
    """Scrape Craigslist car listings: titles, prices, dates, post URLs,
    and odometer readings, for export to CSV.

    NOTE(review): the class name has a typo ("Scaper" vs "Scraper"); it is
    kept unchanged so existing callers keep working.
    """

    def __init__(self, query, location, max_price, transmission):
        """
        :param query: search terms, e.g. "Mazda Miata"
        :param location: craigslist subdomain, e.g. "sfbay"
        :param max_price: maximum price filter
        :param transmission: craigslist auto_transmission flag (1 = automatic)
        """
        self.query = query
        self.location = location
        self.max_price = max_price
        # BUG FIX: the original assigned the module-level global
        # `auto_transmission` instead of the constructor parameter, so the
        # argument passed in was silently ignored.
        self.transmission = transmission
        # Example target:
        # https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
        # quote_plus() URL-encodes the query ("Mazda Miata" -> "Mazda+Miata").
        self.url = (
            "https://{}.craigslist.org/search/cta?"
            "query={}&sort=rel&max_price={}&auto_transmission={}"
        ).format(
            self.location,
            urllib.parse.quote_plus(str(self.query)),
            self.max_price,
            self.transmission,
        )
        self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
        self.delay = 5  # seconds to wait for pages to load

    def load_craigslist_url(self):
        """Open the search URL and wait for the results page to render."""
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            print("page is ready")
        # BUG FIX: selenium raises TimeoutException, not the builtin
        # TimeoutError, so the original except clause could never fire.
        except TimeoutException:
            print('Loading took too much time')

    def extract_post_information(self):
        """Parse the loaded results page into [[price, title, date], ...].

        Each result row's text contains the price prefixed with '$' and a
        line of the form "<month> <day> <title...>"; the splits below
        isolate those pieces.
        """
        all_posts = self.driver.find_elements_by_class_name('result-row')
        post_info_list = []
        for post in all_posts:
            # Splitting on '$' isolates the price whether it leads or
            # trails the rest of the row text.
            parts = post.text.split('$')
            text = parts[1] if parts[0] == '' else parts[0]
            lines = text.split("\n")
            price = lines[0]
            # The last line holds "<month> <day> <title words...>".
            words = lines[-1].split(' ')
            date = words[0] + " " + words[1]
            title = ' '.join(words[2:])
            post_info_list.append([price, title, date])
        return post_info_list

    def save_post_info_and_urls_to_csv(self, post_info, post_urls):
        """Append each post's URL to its info row (in place) and write all
        rows to miata_prices.csv. Returns the combined rows.
        """
        # BUG FIX: the original indexed post_urls[i] and raised IndexError
        # whenever the two lists differed in length; zip() stops at the
        # shorter list instead.
        for row, url in zip(post_info, post_urls):
            row.append(url)
        df = pd.DataFrame(post_info)
        df.to_csv('miata_prices.csv', index=False, header=False)
        return post_info

    def extract_post_urls(self):
        """Return the href of every result-title link on the current page.

        BUG FIX: the original also clicked links while iterating, which
        navigates away from the results page and leaves stale element
        references (and find_element_by_link_text('Miata') raised when no
        such link existed). The clicking experiment has been removed; this
        method now only collects URLs.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        return [a.get('href')
                for a in soup.findAll('a', {'class': "result-title hdrlnk"})]

    def extract_odometer_readings(self, post_urls):
        """Visit each post URL and return its odometer reading (or None).

        Craigslist renders the mileage in the attribute group as e.g.
        <span>odometer: <b>98000</b></span> — presumably stable markup, but
        verify against a live listing. Call this AFTER extract_post_urls()
        and extract_post_information(): it navigates away from the results
        page.
        """
        readings = []
        for url in post_urls:
            self.driver.get(url)
            try:
                WebDriverWait(self.driver, self.delay).until(
                    EC.presence_of_element_located((By.ID, "titletextonly")))
            except TimeoutException:
                readings.append(None)
                continue
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            odometer = None
            for span in soup.find_all('span'):
                text = span.get_text(strip=True)
                if text.lower().startswith('odometer'):
                    # "odometer: 98000" -> "98000"
                    odometer = text.split(':', 1)[-1].strip()
                    break
            readings.append(odometer)
        return readings

    def click_next_page(self):
        """Advance to the next page of search results.

        BUG FIX: the original referenced an undefined `driver` variable and
        called a nonexistent function, raising NameError on every call; it
        now clicks Craigslist's "next" pagination button.
        """
        self.driver.find_element_by_css_selector('a.button.next').click()

    def quit(self):
        """Shut the browser down. driver.quit() (unlike close()) ends the
        whole WebDriver session, not just the current window.
        """
        self.driver.quit()
if __name__ == "__main__":
    # Search parameters for the sfbay Craigslist cars+trucks listings.
    location = "sfbay"
    max_price = "5000"
    auto_transmission = 1  # 1 = automatic transmission filter
    query = "Mazda Miata"

    scraper = CraigslistScaper(query, location, max_price, auto_transmission)
    try:
        scraper.load_craigslist_url()
        post_info = scraper.extract_post_information()
        post_urls = scraper.extract_post_urls()
        scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
    finally:
        # Always shut the browser down, even if scraping fails midway —
        # the original leaked the Chrome process on any exception.
        scraper.quit()
I managed to get everything into the CSV file, but I'm stuck on how to get Selenium to open every link in a new tab, get the odometer information, and then close the tab.
I'm using this to build a dataset and eventually do some analysis with it!