How to export all pages scraped from site to Excel

Question

I'm trying to export data scraped from a site to Excel, but my code overwrites the data already in the Excel file with the data from the last page scraped. This is my first attempt at scraping and at using Pandas. Please help me understand the correct logic for the export. This is my code:

import requests
import lxml.html
import time
import sys
import pandas as pd

# Switch the standard input and output streams to UTF-8 — presumably so that
# scraped text containing non-ASCII characters prints without encoding errors.
sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')


def parse_data(url):
    """Download one listing page and extract the title and price of each item.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A ``(titles, prices)`` tuple of two equally long lists of strings.
        Both lists are empty when the page could not be downloaded, so the
        result can always be unpacked by the caller.
    """
    titles = []
    prices = []
    try:
        response = requests.get(url)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "nothing scraped";
        # anything else (including Ctrl+C) is allowed to propagate.
        return titles, prices

    tree = lxml.html.document_fromstring(response.text)
    for item in tree.xpath('//*[contains(@class, "listing-item")]'):
        title = item.xpath(".//h2/a/text()")
        if not title:
            # contains() is a substring match on the class attribute, so it
            # can also select elements that carry no heading link; skip them
            # instead of failing with IndexError.
            continue
        price = item.xpath('.//*[contains(@class, "price")]/text()')
        titles.append(title[0])
        prices.append(price[0] if price else "N/A")

    return titles, prices


def output(titles, prices):
    """Write the scraped titles and prices to ``avbuyer.com.xlsx``.

    The workbook is created from scratch on every call, so any file written
    by an earlier call is replaced. Call this once with the complete data
    rather than once per scraped page.

    Args:
        titles: Values for the "Make" column.
        prices: Values for the "Price" column; must be as long as ``titles``.
    """
    # A distinct local name keeps the function itself from being shadowed.
    frame = pd.DataFrame({"Make": titles,
                          "Price": prices,
                          })
    # The context manager saves and closes the workbook on exit; without it
    # nothing is ever flushed to disk.
    with pd.ExcelWriter('avbuyer.com.xlsx', engine='xlsxwriter') as writer:
        frame.to_excel(writer, sheet_name='Sheet1')


def main():
    """Visit listing pages 1 and 2 in order, pausing two seconds after each.

    Each page URL is printed and then handed to ``parse_data``.
    NOTE: the value returned by ``parse_data`` is not used here, so nothing
    scraped is kept or exported by this function.
    """
    base_url = 'https://www.avbuyer.com/aircraft/private-jets/page-'
    for page_number in (1, 2):
        page_url = f'{base_url}{page_number}'
        print(page_url)
        parse_data(page_url)
        # Be polite to the server between requests.
        time.sleep(2)


if __name__ == "__main__":
    main()

maybe you want to check out this answer https://stackoverflow.com/a/38075046/6060982 — zap, Jun 29 '21 at 15:44

score 1 · Accepted Answer · answered Jun 29 '21 at 15:51

On every iteration you overwrite the previously created lists (titles and prices), which is why you only ever get the result of the last iteration. I suggest doing something like this:

def main():
    """Scrape listing pages 1 and 2 and return all items in one DataFrame.

    One DataFrame is built per page and the pieces are combined at the end,
    so the rows of earlier pages are kept instead of being overwritten.

    Returns:
        A DataFrame with the columns "Make" and "Price" holding the items of
        every page that was downloaded; it has no rows if nothing was
        downloaded.
    """
    frames = []

    for i in range(1, 3):
        url = 'https://www.avbuyer.com/aircraft/private-jets/page-' + str(i)

        titles = []
        prices = []

        try:
            response = requests.get(url)
        except requests.RequestException:
            # Stop scraping on a network failure, but keep the pages that
            # were already collected.
            break
        tree = lxml.html.document_fromstring(response.text)
        for item in tree.xpath('//*[contains(@class, "listing-item")]'):
            title = item.xpath(".//h2/a/text()")
            if not title:
                # Matched element without a heading link: nothing to record.
                continue
            price = item.xpath('.//*[contains(@class, "price")]/text()')
            titles.append(title[0])
            prices.append(price[0] if price else "N/A")

        frames.append(pd.DataFrame({"Make": titles,
                                    "Price": prices,
                                    }))

        time.sleep(2)

    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=["Make", "Price"])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. ignore_index gives the combined rows one continuous index.
    return pd.concat(frames, ignore_index=True)


dfFinal = main()
dfFinal.to_excel(r'your path')

That works for me, and the logic of writing such code has become clearer. Thank you! — Kopatych, Jun 30 '21 at 15:54

How to export all pages scraped from site to Excel

1 Answer