I am trying to build a web scraper for Trustpilot. However, the code keeps returning an empty DataFrame, and I can't figure out why. Could someone please help me? Thank you so much. The output I get is shown below. I am a beginner in Python, so any help is much appreciated.
Empty DataFrame Columns: [Title, Body, Rating, Date] Index: []
code:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from time import sleep
from random import randint
# Optional: record the start time so the total run time can be measured later
then = time.time()

# One list per output column; each scraped review appends one value to each
reviews, headings, stars, dates = [], [], [], []

# Page numbers to request. np.arange excludes the stop value, so
# np.arange(1, 10) yields pages 1 through 9 (the step defaults to 1).
# Check the site for the real page count and set the stop value to
# (last page + 1) to cover all of them.
pages = np.arange(1, 10)
# Walk every results page and collect one entry per review into the four
# parallel lists (reviews, headings, stars, dates). Each review appends exactly
# one value to every list -- '-' when a field is missing -- so the lists always
# stay the same length and can be combined into a DataFrame afterwards.
for page in pages:
    # Keep the page number and the HTTP response under separate names
    # (previously `page` was rebound to the response object).
    response = requests.get(
        f"https://www.trustpilot.com/review/www.dugood.org?page={page}",
        timeout=30,  # do not hang forever on a stalled connection
    )
    if not response.ok:
        # Report failed requests instead of silently parsing an error page
        print(f"Page {page}: request failed with HTTP {response.status_code}, skipping")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Parent tag of each review; everything else is looked up below it
    review_div = soup.find_all('div', class_="review-content")
    if not review_div:
        # Zero matches here is what produces an empty DataFrame at the end,
        # so make it visible rather than silently moving on.
        print(f"Page {page}: no 'review-content' containers found; "
              "the page markup may differ from what this script expects")

    for container in review_div:
        # Body of the review. The site may omit the tag when the user left no
        # text, in which case '-' is stored. The text is read from the matched
        # element itself; `len(matches) == True` was only true for exactly one
        # match, and `container.p` is not guaranteed to be the matched tag.
        body_tag = container.find('p', attrs={'class': 'review-content__text'})
        reviews.append(body_tag.text if body_tag is not None else '-')

        # Title of the review: prefer the link inside the matched heading,
        # fall back to the heading's own text, or '-' when there is no heading.
        title_tag = container.find('h2', attrs={'class': 'review-content__title'})
        if title_tag is None:
            heading = '-'
        else:
            title_link = title_tag.find('a')
            heading = (title_link if title_link is not None else title_tag).text
        headings.append(heading)

        # Star rating, taken from the alt text of the rating image. Each lookup
        # is guarded so a missing element yields '-' instead of AttributeError.
        star_box = container.find("div", {"class": "star-rating star-rating--medium"})
        star_img = star_box.find('img') if star_box is not None else None
        stars.append(star_img.get('alt', '-') if star_img is not None else '-')

        # Publication date, read from the JSON held in the review's <script> tag
        script_tag = container.find('script')
        date = '-'
        if script_tag is not None:
            try:
                date = json.loads(script_tag.text)['publishedDate']
            except (ValueError, KeyError, TypeError):
                # Invalid/empty JSON, missing key, or a non-dict payload
                date = '-'
        dates.append(date)
# Combine the four collected lists into one table (one row per review) and show it
TrustPilot = pd.DataFrame(
    data={
        'Title': headings,
        'Body': reviews,
        'Rating': stars,
        'Date': dates,
    }
)
print(TrustPilot)