I'm working on a web scraper for yellowpages.com, which seems to be working well overall. However, while iterating through the pagination of a long query, requests.get(url) will randomly return <Response [503]> or <Response [404]>. Occasionally, I receive worse exceptions, such as:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.yellowpages.com', port=80): Max retries exceeded with url: /search?search_terms=florists&geo_location_terms=FL&page=22 (Caused by NewConnectionError(': Failed to establish a new connection: [WinError 10053] An established connection was aborted by the software in your host machine',))
Using time.sleep() between requests seems to eliminate the 503 errors, but the 404s and connection exceptions remain an issue.
I'm trying to figure out how to "catch" these various responses so I can make changes (wait, change proxy, change user agent) and try again and/or move on. In pseudocode, something like this (a rough sketch of what I mean follows the pseudocode):
if error/exception with requests.get:
    wait and/or change proxy and user agent
    retry requests.get
else:
    pass
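To make the question concrete, here is roughly what I picture that wrapper looking like. This is only a sketch: fetch_with_retry and USER_AGENTS are placeholder names of my own, and I'm assuming raise_for_status() is the right way to turn a 404/503 response into an exception that the except clause can actually see:

import random
import time

import requests

# Placeholder values; in practice I'd use real browser user-agent strings.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
]

def fetch_with_retry(url, max_attempts=3, delay=5):
    # Retry the GET a few times, pausing and rotating the user agent after each failure.
    for attempt in range(max_attempts):
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()  # as I understand it, this raises HTTPError for 404/503 responses
            return r
        except requests.exceptions.RequestException as e:
            print("Attempt {} failed for {}: {}".format(attempt + 1, url, e))
            time.sleep(delay)  # wait before retrying (this is also where I'd swap in a new proxy)
    return None  # give up; the caller decides whether to skip this page or stop

I'd then call r = fetch_with_retry(page_url) from main() and skip the page (or stop) when it returns None, but I don't know whether this is the idiomatic way to handle it, hence the question.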
At this point, I can't even seem to catch any of these issues using:
try:
    r = requests.get(url)
except requests.exceptions.RequestException as e:
    print(e)
    import sys  # only added here, because it's not part of my stable code below
    sys.exit()
The full code I'm starting from is on GitHub and below:
import requests
from bs4 import BeautifulSoup
import itertools
import csv
# Search criteria
search_terms = ["florists", "pharmacies"]
search_locations = ['CA', 'FL']
# Structure for Data
answer_list = []
csv_columns = ['Name', 'Phone Number', 'Street Address', 'City', 'State', 'Zip Code']
# Turns list of lists into csv file
def write_to_csv(csv_file, csv_columns, answer_list):
    with open(csv_file, 'w') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n')
        writer.writerow(csv_columns)
        writer.writerows(answer_list)
# Creates url from search criteria and current page
def url(search_term, location, page_number):
    template = 'http://www.yellowpages.com/search?search_terms={search_term}&geo_location_terms={location}&page={page_number}'
    return template.format(search_term=search_term, location=location, page_number=page_number)
# Finds all the contact information for a record
def find_contact_info(record):
    holder_list = []
    name = record.find(attrs={'class': 'business-name'})
    holder_list.append(name.text if name is not None else "")
    phone_number = record.find(attrs={'class': 'phones phone primary'})
    holder_list.append(phone_number.text if phone_number is not None else "")
    street_address = record.find(attrs={'class': 'street-address'})
    holder_list.append(street_address.text if street_address is not None else "")
    city = record.find(attrs={'class': 'locality'})
    holder_list.append(city.text if city is not None else "")
    state = record.find(attrs={'itemprop': 'addressRegion'})
    holder_list.append(state.text if state is not None else "")
    zip_code = record.find(attrs={'itemprop': 'postalCode'})
    holder_list.append(zip_code.text if zip_code is not None else "")
    return holder_list
# Main program
def main():
    for search_term, search_location in itertools.product(search_terms, search_locations):
        i = 0
        while True:
            i += 1
            page_url = url(search_term, search_location, i)  # named page_url so it doesn't shadow the url() function
            r = requests.get(page_url)
            soup = BeautifulSoup(r.text, "html.parser")
            results = soup.find(attrs={'class': 'search-results organic'})
            page_nav = soup.find(attrs={'class': 'pagination'})
            records = results.find_all(attrs={'class': 'info'})
            for record in records:
                answer_list.append(find_contact_info(record))
            if not page_nav.find(attrs={'class': 'next ajax-page'}):
                csv_file = "YP_" + search_term + "_" + search_location + ".csv"
                write_to_csv(csv_file, csv_columns, answer_list)  # output data to csv file
                break
if __name__ == '__main__':
    main()
Thank you in advance for taking the time to read (and hopefully reply to) this long post :)