0

I'm trying to export scraped data from site to excel. But my code overwrites previous data in excel file with the last scraped. This is my first try with scraping and Pandas. Please help me to understand the logic of correct export. This is my code:

import requests
import lxml.html
import time
import sys
import pandas as pd

sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')


def parse_data(url):
    titles = []
    prices = []
    try:
        response = requests.get(url)
    except:
        return
    tree = lxml.html.document_fromstring(response.text)
    for item in tree.xpath('//*[contains(@class, "listing-item")]'):
            title = item.xpath(".//h2/a/text()")[0]
            price = item.xpath('.//*[contains(@class, "price")]/text()')
            price = price[0] if price else "N/A"
            titles.append(title)
            prices.append(price)
            
    return titles, prices


def output(titles, prices):
    output = pd.DataFrame({"Make": titles,
                           "Price": prices,
                           })
    writer = pd.ExcelWriter('avbuyer.com.xlsx', engine='xlsxwriter')
    output.to_excel(writer, sheet_name='Sheet1')

    output(titles, prices)


def main():
    for i in range(1, 3):
        url = 'https://www.avbuyer.com/aircraft/private-jets/page-' + str(i)
        print(url)
        parse_data(url)
        i += 1
        time.sleep(2)


if __name__ == "__main__":
    main()
Kopatych
  • 27
  • 6
  • 1
    maybe you want to check out this answer https://stackoverflow.com/a/38075046/6060982 – zap Jun 29 '21 at 15:44

1 Answers1

1

During iteration you always overwriting previous created lists (titles and price), that's why you will always get result only from last iteration. I suggest you to do something like this:

dfFinal = pd.DataFrame()
def main():
    
    for i in range(1, 3):
        url = 'https://www.avbuyer.com/aircraft/private-jets/page-' + str(i)

        titles = []
        prices = []
        
        try:
            response = requests.get(url)
        except:
            return
        tree = lxml.html.document_fromstring(response.text)
        for item in tree.xpath('//*[contains(@class, "listing-item")]'):
            title = item.xpath(".//h2/a/text()")[0]
            price = item.xpath('.//*[contains(@class, "price")]/text()')
            price = price[0] if price else "N/A"
            titles.append(title)
            prices.append(price)

        output = pd.DataFrame({"Make": titles,
                               "Price": prices,
                               })
        dfFinal = dfFinal.appedn(output)    
        
        i += 1
        time.sleep(2)

dfFinal.to_excel(r'your path')
choka
  • 30
  • 5