
I'm making a Craigslist scraper that scrapes the titles, prices, dates, and URLs and exports that info to a CSV. Now, I want Selenium to click on each post's URL to navigate to the actual page, parse the page to get a span tag "odometer" (to get mileage), and return that to my CSV file.

Here's my code so far:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
#import schedule

from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd




class CraigslistScraper(object):
    def __init__(self,query,location,max_price,transmission): 
        self.query = query
#        self.sort=sort
        self.location = location
#        self.postal = postal
        self.max_price = max_price
        self.transmission = transmission


#https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
        self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
        self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
        self.delay = 5

    def load_craigslist_url(self): 
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID,"searchform")))              
            print("page is ready")
        except TimeoutException:
            print('Loading took too much time')

#extracting the post information such as titles, dates, and prices    
    def extract_post_information(self): 
        all_posts = self.driver.find_elements_by_class_name('result-row')
        titles = []
        dates = []
        prices = []

        post_info_list = []

        for i in range(len(all_posts)): 
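            # the visible text of each result row contains the price, date, and title,
            # so the splits below on '$', newlines, and spaces pull those fields apart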
            post = all_posts[i]
            title = post.text.split('$')

            if title[0] == '':
                title = title[1]
            else:
                title = title[0]

            title = title.split("\n")
            price = title[0]
            title = title[-1]

            title = title.split(' ')
            month = title[0]
            day = title[1]
            date = month + " " + day
            title = ' '.join(title[2:])

            #print('PRICE: ' + (price))

            #print('TITLE: ' + (title))
            #print('DATE: ' + date)  

            lst = [price, title, date]
            post_info_list.append(lst)

        #f=open("miata_prices.csv", "a+")
        #f.write(post_info_list)

        #print(post_info_list)

        #df = pd.DataFrame(post_info_list)
        #df.to_csv('miata_prices.csv', index=False, header=False)
        print(post_info_list)
        return post_info_list        

    def save_post_info_and_urls_to_csv(self, post_info, post_urls):
        for i in range(len(post_info)):
            post_info[i].append(post_urls[i])
        #print(post_info)
        df = pd.DataFrame(post_info)
        df.to_csv('miata_prices.csv', index=False, header=False)
        return post_info

#extracting post URLs    
    def extract_post_urls(self):
        url_list = []
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        # each result title is an <a class="result-title hdrlnk"> whose href is the post URL
        for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
            url_list.append(link.get('href'))
        # return after the loop so every URL is collected, not just the first one
        return url_list
    # to click through to the next page of results
    def click_next_page(self):
        # the "next page" link in the paging controls (the selector may need adjusting)
        next_button = self.driver.find_element_by_css_selector('a.button.next')
        next_button.click()




    def quit(self): 
        self.driver.close()

location = "sfbay" 
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"

scraper = CraigslistScraper(query, location, max_price, auto_transmission)

scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()

I managed to get everything into the CSV file, but I'm stuck on how to get Selenium to open every link in a new tab, get the odometer information, and then close the tab.

I'm using this to build a dataset and eventually do some analysis with it!

  • Possible duplicate of [Open web in new tab Selenium + Python](https://stackoverflow.com/questions/28431765/open-web-in-new-tab-selenium-python) – Ankit Jaiswal Jul 09 '18 at 06:52

1 Answer


Here's an example of how to get Selenium to open every link and get the odometer information. I used a wrapper around Selenium (SeElements) to keep the code shorter. I hope you can work out how it works. So:

It opens your link, scrapes all the links from the titles into a list, then opens every link and tries to get the odometer info.

from elementium.drivers.se import SeElements
from selenium import webdriver


browser = webdriver.Chrome()

url = 'https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1'
browser.get(url)
se = SeElements(browser)
titles = se.xpath('//p[@class="result-info"]/a', wait=True, ttl=5)
try:
    links = []
    for link in titles:
        links.append(link.attribute('href'))
    for link in links:
        print(link)
        browser.get(link)
        try:
            odometer = se.xpath('//span[contains(text(), "odometer")]',wait=True, ttl=2).text()
        except Exception:
            continue
        print(odometer)
except Exception as e:
    browser.quit()
    raise e
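
If you specifically want each post opened in a new tab and closed again (as the question asks), the same idea works with plain Selenium window handles instead of the SeElements wrapper. This is only a rough sketch assuming the links list and the browser instance from above; the odometer XPath just mirrors the one used above, and the output filename is a placeholder:

import csv

odometer_data = []
main_window = browser.current_window_handle

for link in links:
    # open the post in a new tab and switch to it
    browser.execute_script("window.open(arguments[0]);", link)
    browser.switch_to.window(browser.window_handles[-1])
    try:
        # the mileage sits in a span whose text contains "odometer"
        odometer = browser.find_element_by_xpath(
            '//span[contains(text(), "odometer")]').text
    except Exception:
        odometer = ""
    odometer_data.append([link, odometer])
    # close the tab and go back to the search results window
    browser.close()
    browser.switch_to.window(main_window)

# write the collected mileage next to each URL so it can be merged with the existing data
with open("miata_odometer.csv", "w", newline="") as f:
    csv.writer(f).writerows(odometer_data)

You can then join these rows on the URL column with the miata_prices.csv rows you already write in save_post_info_and_urls_to_csv.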
Oleksandr Makarenko