Here is my code. It is very slow even though I am not scraping a lot of data; the input file is only 188 KB. I think the problem is that I collect all of the internal links from each website and then scrape every one of them to find certain keywords and count them. Each internal URL needs its own request call (plus a time.sleep(5) in get_web_page_content and another one in get_response_text), so I end up fetching far more pages than there are rows in the file.
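To put rough numbers on it (20 internal links per site is just a guess, but the two time.sleep(5) calls per link are what the code below actually does):

# illustrative only: assumes ~20 internal links per site
links_per_site = 20
sleep_per_link = 5 + 5                      # get_web_page_content and get_response_text both sleep
seconds_per_site = links_per_site * sleep_per_link
print(seconds_per_site)                     # 200 seconds per site spent sleeping, before any parsing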
Is the answer just to deploy this to a virtual machine such as an AWS instance, or is there anything I can change in the code itself to make it more efficient?
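For example, would something like the sketch below be a sensible direction? It fetches the internal links of one site concurrently with a thread pool instead of one at a time. fetch_page_text is only a stand-in for my get_web_page_content / extract_text_data_from_web_page / clean_text_data chain, and the worker count is arbitrary:

from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_page_text(link: str) -> str:
    # stand-in for the get_web_page_content -> extract -> clean pipeline below
    try:
        response = requests.get(link, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        pass
    return ''

def get_response_text_concurrently(internal_links: list) -> list:
    # fetch all internal links of one site in parallel instead of serially
    with ThreadPoolExecutor(max_workers=8) as pool:
        return list(pool.map(fetch_page_text, internal_links))

My current code is below.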
import re
# import xlwt
import time
import string
import requests
import pandas as pd
from bs4 import BeautifulSoup
from boilerpy3 import extractors
from urllib.parse import urlparse
# from urllib.parse import urljoin
# from sklearn.feature_extraction import text
# from sklearn.feature_extraction.text import CountVectorizer
# downloads web page content
def get_web_page_content(page_url: str) -> tuple:
    response = None
    connection_status = True
    try:
        response = requests.get(page_url)
        # wait 5 seconds after every single request
        time.sleep(5)
        if response.status_code != 200:
            raise Exception('Web page response code is not 200.')
    except Exception:
        connection_status = False
    return response, connection_status
# transforms HTML content into a BeautifulSoup object
def create_beautiful_soup_object(response:object) -> object:
    '''
    Takes a response object and returns a BeautifulSoup object
    '''
bs = BeautifulSoup(response.content, 'html.parser')
return bs
# get page title
def get_page_title(bs:object) -> str:
    '''
    Takes a BeautifulSoup object and returns the page title
    '''
try:
title = bs.title.text
except AttributeError as e:
return None
return title
# the subdirectory describes the document type
def extract_subdirectory(page_url:str) -> str:
    '''
    Takes a page URL and returns the subdirectory in the URL
    '''
    # Defines the regex to match the first subdirectory in the URL
    regex = re.compile(fr"^(?:{re.escape(page_url)}\/|\/)(?P<subdirectory>(\w+-?)*)\/")
match_obj = re.search(regex, page_url)
if match_obj:
subdirectory = match_obj.group('subdirectory')
# replace - with a space
rtn_value = subdirectory.replace('-', ' ')
else:
rtn_value = 'n/a'
return rtn_value
def extract_text_data_from_web_page(web_content: str) -> str:
    '''
    Takes web page content as an argument.
    Returns a text string without clutter.
    '''
    try:
        extractor = extractors.ArticleExtractor()
        return extractor.get_content(web_content)
    except Exception:
        # return an empty string instead of None so the join downstream stays clean
        return ''
# cleans text data. references: https://medium.com/nwamaka-imasogie/stand-up-comedy-and-nlp-c7d64002520c
def clean_text_data(raw_text: str) -> str:
    rtn_value = ''
    try:
        rtn_value = raw_text
        # 1. Make text all lower case
        rtn_value = rtn_value.lower()
        # 2. Remove punctuation
        rtn_value = re.sub(r'[{}©]'.format(re.escape(string.punctuation)), '', rtn_value)
        # 3. Remove numerical values
        rtn_value = re.sub(r'\w*\d+\w*', '', rtn_value)
        # 4. Collapse newlines and surrounding spaces into single spaces
        rtn_value = re.sub(r' *\n *', ' ', rtn_value)
    except Exception:
        # fall back to an empty string if the input is None
        rtn_value = ''
    return rtn_value
# returns True when the URL is absolute (i.e. it has a network location)
def is_absolute(url):
    return bool(urlparse(url).netloc)
# counts keyword occurrences in the cleaned text
def count_word_frequencies(clean_text, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website):
# print('clean_text', clean_text)
word_frequency_dict = dict()
list1 = clean_text.split() # this is your original list of words
print('list1', list1)
word_frequency_dict["name"] = name
word_frequency_dict["turnover"] = turnover
word_frequency_dict["size"] = size
word_frequency_dict["employees"] = employees
word_frequency_dict["address"] = address
word_frequency_dict["town"] = town
word_frequency_dict["postcode"] = postcode
word_frequency_dict["sic_code"] = sic_code
word_frequency_dict["directors"] = directors
word_frequency_dict["telephone"] = telephone
word_frequency_dict["email"] = email
word_frequency_dict["website"] = website
word_frequency_dict["sustainability"] = list1.count('sustainability')
word_frequency_dict["sustainable"] = list1.count('sustainable')
word_frequency_dict["buffet"] = list1.count('buffet')
word_frequency_dict["kitchen"] = list1.count('kitchen')
word_frequency_dict["ISO 14001"] = list1.count('ISO 14001')
word_frequency_dict["b corp"] = list1.count('b corp')
word_frequency_dict["brasserie"] = list1.count('brasserie')
word_frequency_dict["community"] = list1.count('community')
word_frequency_dict["social"] = list1.count('social')
word_frequency_dict["green credentials"] = list1.count('green credentials')
word_frequency_dict["environment"] = list1.count('environment')
word_frequency_dict["environmental"] = list1.count('environmental')
word_frequency_dict["food"] = list1.count('food')
word_frequency_dict["book a table"] = list1.count('book a table')
word_frequency_dict["planet"] = list1.count('planet')
word_frequency_dict["planet earth"] = list1.count('planet earth')
word_frequency_dict["compostable"] = list1.count('compostable')
word_frequency_dict["recyclable"] = list1.count('recyclable')
word_frequency_dict["eco friendly"] = list1.count('eco friendly')
word_frequency_dict["restaurant"] = list1.count('restaurant')
word_frequency_dict["bistro"] = list1.count('bistro')
word_frequency_dict["take away"] = list1.count('take away')
word_frequency_dict["climate emissions"] = list1.count('climate emissions')
word_frequency_dict["coffee"] = list1.count('coffee')
word_frequency_dict["tea"] = list1.count('tea')
word_frequency_dict["menu"] = list1.count('menu')
word_frequency_dict["just eat"] = list1.count('just eat')
word_frequency_dict["uber eat"] = list1.count('uber eat')
word_frequency_dict["deliveroo"] = list1.count('deliveroo')
word_frequency_dict["reservations"] = list1.count('reservations')
word_frequency_dict["corporate social responsibility"] = list1.count('corporate social responsibility')
word_frequency_dict["freshly squeezed juice"] = list1.count('freshly squeezed juice')
word_frequency_dict["breakfast"] = list1.count('breakfast')
word_frequency_dict["lunch"] = list1.count('lunch')
word_frequency_dict["dinner"] = list1.count('dinner')
word_frequency_dict["hotel"] = list1.count('hotel')
word_frequency_dict["csr"] = list1.count('csr')
word_frequency_dict["net zero"] = list1.count('net zero')
word_frequency_dict["carbon offsetting"] = list1.count('carbon offsetting')
word_frequency_dict["type"] = 'hospitality business'
return word_frequency_dict
def get_response_text(internal_links):
    content = []
    for link in internal_links:
        try:
            response, status = get_web_page_content(link)
            # note: get_web_page_content already sleeps for 5 seconds,
            # so this makes every internal link wait twice
            time.sleep(5)
            print("response", response)
            print("status", status)
            if status:
                text_data = extract_text_data_from_web_page(response.text)
                clean_text = clean_text_data(text_data)
                print("clean_text", clean_text)
                content.append(clean_text)
        except Exception:
            pass
    return content
# counts how often the keywords appear across all internal pages
def get_frequently_words(internal_links, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website):
contents = get_response_text(internal_links)
print("contents", contents)
all_strings = list(map(str, contents))
one_single_text = ' '.join(all_strings)
dict_word_frequency = count_word_frequencies(one_single_text, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website)
return dict_word_frequency
# finds a list of all internal links
def get_internal_links(soup, page_url):
    '''
    Takes a BeautifulSoup object and returns a list of all internal links
    '''
# Initialize the return value
rtn_value = list()
    # Defines the regex to find links that start with the base URL or with /
    regex = re.compile(fr'^((https:\/\/)?{re.escape(page_url)}\/|\/).+')
links = soup.find_all('a', {'href': regex})
if len(links) < 1:
rtn_value.append(page_url)
return rtn_value
else:
for link in links:
try:
href = link['href']
if is_absolute(href):
rtn_value.append(href)
else:
rtn_value.append(page_url + href)
except AttributeError as e:
pass
if page_url not in rtn_value:
rtn_value.append(page_url)
return rtn_value
if __name__ == '__main__':
df = pd.read_csv('./Endole.csv')
name = df['Name'].to_list()
turnover = df['Turnover'].to_list()
size = df['Size'].to_list()
employees = df['No. Of Employees'].to_list()
address = df['Address'].to_list()
town = df['Post Town'].to_list()
postcode = df['Postcode'].to_list()
sic_code = df['SIC Code'].to_list()
directors = df['Directors'].to_list()
telephone = df['Telephone'].to_list()
email = df['Email Address'].to_list()
website = df['Website'].to_list()
page_urls = df['Website'].to_list()
# print("NAMES", NAMES)
# print("LINKS", LINKS)
for i, page_url in enumerate(page_urls):
# page_url = 'https://www.hotelanacapri.co.uk'
response, status = get_web_page_content(page_url)
print("response", response)
print("status", status)
        if status:
soup = create_beautiful_soup_object(response)
page_title = get_page_title(soup)
            document_type = extract_subdirectory(page_url)
internal_links = get_internal_links(soup, page_url)
print("internal_links", internal_links)
high_frequency_words = get_frequently_words(internal_links, name[i], turnover[i], size[i], employees[i], address[i], town[i], postcode[i], sic_code[i], directors[i], telephone[i], email[i], website[i])
print("high_frequency_words", high_frequency_words)
new_df = pd.DataFrame(high_frequency_words, index=[0])
new_df.to_csv('testing.csv', mode='a', index=False, header=False)