I am trying to scrape data from Google results using BeautifulSoup in Google Colab. My code returns relevant data, but it seems to ignore the start date/end date element of the URL and just brings up the newest 100 headlines. I have had issues setting up Selenium in Colab, so I was wondering whether there is an alternative way of searching only within a specific date range, other than just modifying the URL, or whether there is another fix. Any advice would be appreciated. Thanks.
class Scrape:
    """Fetch Google News headlines for a search term and save them to a CSV file.

    ``start_date`` and ``end_date`` are indexable values read as
    ``[0]`` = day, ``[1]`` = month, ``[2]`` = year. They are placed in the
    ``tbs=cdr:1,cd_min:M/D/Y,cd_max:M/D/Y`` query parameter of the search URL.

    NOTE(review): whether Google honours the ``tbs`` date filter for plain
    ``requests`` clients is decided server-side and cannot be verified from
    this code -- confirm against live responses.
    """

    def __init__(self, search_term, start_date, end_date):
        """Store the search parameters and build the request URL and filename.

        Args:
            search_term: Text to search for; URL-encoded before being placed
                in the query string.
            start_date: Indexable ``(day, month, year)`` for the range start.
            end_date: Indexable ``(day, month, year)`` for the range end.
        """
        # Local import keeps the class self-contained without touching the
        # file's (unseen) import section.
        from urllib.parse import quote_plus

        self.search_term = search_term
        self.start_date = start_date
        self.start_day = start_date[0]
        self.start_month = start_date[1]
        self.start_year = start_date[2]
        self.end_day = end_date[0]
        self.end_month = end_date[1]
        self.end_year = end_date[2]
        # Encode the term so spaces, '&', '#', etc. cannot break or truncate
        # the query string. Google expects the dates in month/day/year order.
        self.url = 'https://www.google.com/search?q={0}&biw=1053&bih=1138&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{1}%2F{2}%2F{3}%2Ccd_max%3A{4}%2F{5}%2F{6}&tbm=nws&num=100'.format(
            quote_plus(str(self.search_term)),
            self.start_month,
            self.start_day,
            self.start_year,
            self.end_month,
            self.end_day,
            self.end_year,
        )
        self.filename = '{0}{1}.csv'.format(self.search_term, self.start_date)
        self.behaviour_index = 0

    def run(self):
        """Download the results page and write one CSV row per headline.

        The CSV has a ``text, sentiment`` header; every headline is written
        with a placeholder sentiment of ``0``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # A timeout prevents the call from hanging indefinitely.
        response = requests.get(self.url, timeout=30)
        # Fail loudly on an error status (e.g. rate limiting) instead of
        # silently writing a CSV that contains only the header.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        headlines = soup.find_all('div', {'class': "BNeawe vvjwJb AP7Wnd"})

        # newline='' is required by the csv module; the context manager
        # guarantees the file is flushed and closed even if a write fails.
        with open(self.filename, 'w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(['text', 'sentiment'])
            for headline in headlines:
                # Each headline is written exactly once.
                csv_writer.writerow([headline.get_text(), 0])