Web scraping a list of pages from the same website using python

Question

I have a Python script that extracts data from a website and writes it to a CSV file. The code works fine, but now I would like to iterate over a list of web pages to collect more data with the same structure.

My code is:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime

# page = requests.get('https://www.pccomponentes.com/procesadores/amd/socket-am4')
# page = requests.get('https://www.pccomponentes.com/placas-base/amd-x570/atx')
# Listing pages to scrape.
# NOTE(review): the string literals below are not separated by commas, so
# Python joins the adjacent literals into one long string and url_list ends
# up with a single element instead of ten URLs.
url_list = [
    'https://www.pccomponentes.com/procesadores/socket-am4'
    'https://www.pccomponentes.com/discos-duros/500-gb/conexiones-m-2/disco-ssd/internos'
    'https://www.pccomponentes.com/discos-duros/1-tb/conexiones-m-2/disco-ssd/internos'
    'https://www.pccomponentes.com/placas-base/amd-b550/atx'
    'https://www.pccomponentes.com/placas-base/amd-x570/atx'
    'https://www.pccomponentes.com/memorias-ram/16-gb/kit-2x8gb'
    'https://www.pccomponentes.com/ventiladores-cpu'
    'https://www.pccomponentes.com/fuentes-alimentacion/850w/fuente-modular'
    'https://www.pccomponentes.com/fuentes-alimentacion/750w/fuente-modular'
    'https://www.pccomponentes.com/cajas-pc/atx/con-ventana/sin-ventana'
    ]


# Download every page in url_list and locate its product cards.
# NOTE(review): `items` is reassigned on each pass, so after the loop only
# the cards of the last page fetched are still available.
for link in url_list:
    r = requests.get(link)
   # r.encoding = 'utf-8'

    # html_content = r.text
    # soup = BS(html_content, 'lxml')

    # table = soup.find('table', class_='bigborder')

    # NOTE(review): no live assignment to `page` is visible in this snippet
    # (only commented-out ones); the response fetched above is bound to `r`.
    soup = BeautifulSoup(page.content,'html.parser')
#print(soup)
    # Container element that holds the product listing.
    product = soup.find(id = 'articleListContent')
#print(product)
    # One element per product card inside the listing container.
    items = product.find_all(class_='c-product-card__content')
#print(items[0])

# print(items[0].find(class_ = 'c-product-card__header').get_text())
# print(items[0].find(class_ = 'c-product-card__prices cy-product-price').get_text())
# print(items[0].find(class_ = 'c-product-card__availability disponibilidad-inmediata cy-product-availability-date').get_text())
# print(items[0].find(class_ = 'c-star-rating__text cy-product-text').get_text())
# print(items[0].find(class_ = 'c-star-rating__text cy-product-rating-result').get_text())

# Build one column per field from the product cards in `items`, then write
# everything to a timestamped CSV file.
# NOTE(review): these statements are at module level, outside the `for`
# loop, so they run once and only see the `items` of the last iteration.
# .find() returns None when a card lacks the element, and calling
# .get_text() on None raises AttributeError.
product_name = [item.find(class_ = 'c-product-card__header').get_text() for item in items]
price = [item.find(class_ = 'c-product-card__prices cy-product-price').get_text() for item in items]
# availability = [item.find(class_ = 'c-product-card__availability disponibilidad-inmediata cy-product-availability-date').get_text() for item in items]
rating = [item.find(class_ = 'c-star-rating__text cy-product-text').get_text() for item in items]
opinion = [item.find(class_ = 'c-star-rating__text cy-product-rating-result').get_text() for item in items]

# print(product_name)
# print(price)
# print(availability)
# print(rating)
# print(opinion)

store = 'PCComponentes'
# Timestamp stored in every row and reused below for the file name.
extraction_date = datetime.datetime.now() 
# The list-valued columns set the row count; the scalar values (`store`,
# `extraction_date`) are repeated on every row.
data_PCCOMP = pd.DataFrame (
    { 
        'product_name' : product_name,
        'price' : price,
        # 'availability' : availability,
        'rating' : rating,
        'opinion' : opinion,
        'store' : store,
        'date_extraction' : extraction_date,
    })

# site = ‘mysite’
# NOTE(review): "\P" and "\p" are not recognised escape sequences, so the
# backslashes are kept literally (recent Python versions warn about this);
# the trailing "\\" is a single backslash. A raw string would be clearer.
path = "C:\PriceTracking\pccomp\\"
# now = datetime.datetime.now()
# File name pattern: <path><store>_<YYYYMMDD>_<HHMMSS>.csv
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = path+store+'_'+mydate+'_'+mytime+".csv"

# to_csv is called with its defaults, so the DataFrame index is written too.
data_PCCOMP.to_csv(filename)

#print(data_PCCOMP)

How can I iterate over the URLs so that the data from all of them ends up in the same CSV file?

Any help would be much appreciated.

score 0 · Accepted Answer · answered Jan 02 '21 at 03:08

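I rearranged the top part of the code; once you have the final dataframe you can write it to CSV as you did before. Note that I changed a couple of the list comprehensions to handle missing elements, which were raising errors for me. Also, the entries in url_list need to be separated by commas; without them Python concatenates the adjacent string literals into a single string.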

store = 'PCComponentes'


def _text_or_none(item, css_class):
    """Return the text of the first element under *item* whose class matches
    *css_class*, or None when the card has no such element."""
    node = item.find(class_=css_class)
    return node.get_text() if node is not None else None


# Scrape every listing page in url_list and collect one DataFrame per page.
# Pages that cannot be fetched or that contain no product listing are
# reported and skipped so the remaining pages are still processed.
df_hold_list = []  # one DataFrame per page that could be parsed
for link in url_list:
    extraction_date = datetime.datetime.now()
    print(link)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(link, timeout=30)
    except requests.RequestException as exc:
        # One unreachable page should not throw away the pages already done.
        print('request failed:', exc)
        continue
    print(r.status_code)
    soup = BeautifulSoup(r.content, 'html.parser')
    product = soup.find(id='articleListContent')
    if product is None:
        # find() returns None when the container is absent; skip the page
        # rather than failing on None.find_all().
        print('no articleListContent element, skipping')
        continue
    items = product.find_all(class_='c-product-card__content')

    # Each field is looked up once per card; a card without the element
    # contributes None so all columns keep the same length.
    product_name = [_text_or_none(item, 'c-product-card__header') for item in items]
    price = [_text_or_none(item, 'c-product-card__prices cy-product-price') for item in items]
    rating = [_text_or_none(item, 'c-star-rating__text cy-product-text') for item in items]
    opinion = [_text_or_none(item, 'c-star-rating__text cy-product-rating-result') for item in items]

    # `store` and `extraction_date` are scalars and are repeated on each row.
    df = pd.DataFrame(
        {
            'product_name': product_name,
            'price': price,
            'rating': rating,
            'opinion': opinion,
            'store': store,
            'date_extraction': extraction_date,
        })
    df_hold_list.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the same columns when no page could be parsed.
# Row labels restart at 0 for each page; pass ignore_index=True to concat
# if a continuous index is wanted.
if df_hold_list:
    data_PCCOMP = pd.concat(df_hold_list, axis=0)  # stack the per-page frames
else:
    data_PCCOMP = pd.DataFrame(
        columns=['product_name', 'price', 'rating', 'opinion',
                 'store', 'date_extraction'])

Output:

                                         product_name         price rating         opinion          store            date_extraction
0                         AMD Ryzen 5 3600 3.6GHz BOX       219,91€    4.8  3003 Opiniones  PCComponentes 2021-01-01 21:03:57.007233
1             AMD Ryzen 5 1600 Stepping AF 3.6GHz BOX       129,91€    4.8   799 Opiniones  PCComponentes 2021-01-01 21:03:57.007233
2                        AMD Ryzen 7 3700X 3.6GHz BOX       319,90€    4.8  1178 Opiniones  PCComponentes 2021-01-01 21:03:57.007233
3                            AMD Ryzen 7 5800X 3.8GHz       489,90€    4.8   176 Opiniones  PCComponentes 2021-01-01 21:03:57.007233
4                Procesador AMD Ryzen 5 2600X 3.6 Ghz       159,90€    4.8   898 Opiniones  PCComponentes 2021-01-01 21:03:57.007233
..                                                ...           ...    ...             ...            ...                        ...
19  Corsair iCUE 220T RGB Airflow Cristal Templado...        99,98€    4.7    90 Opiniones  PCComponentes 2021-01-01 21:04:15.405459
20  Thermaltake H200 TG RGB Snow Cristal Templado ...  59,99€79,99€    4.7    85 Opiniones  PCComponentes 2021-01-01 21:04:15.405459
21             Tempest Spook White RGB USB 3.0 Blanca        39,99€    4.1    65 Opiniones  PCComponentes 2021-01-01 21:04:15.405459
22  Bitfenix Pack Nova Mesh TG 4ARGB Cristal Templ...       158,99€    4.3     4 Opiniones  PCComponentes 2021-01-01 21:04:15.405459
23  Nfortec Draco V2 Cristal Templado USB 3.0 RGB ...        67,99€    4.5   551 Opiniones  PCComponentes 2021-01-01 21:04:15.405459

[213 rows x 6 columns]

Web scraping a list of pages from the same website using python

1 Answer