I am scrapping the website craiglist.com but after getting certain requests it keeps blocking my device. I tried out the solution in Proxies with Python 'Requests' module but didn't understand how to specify the headers
every time. Here's the code :
from bs4 import BeautifulSoup
import requests,json
list_of_tuples_with_given_zipcodes = []
id_of_apartments = []
params = {
'sort': 'dd',
'filter': 'reviews-dd',
'res_id': 18439027
}
http_proxy = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy = "ftp://10.10.1.10:3128"
proxies = {
"http" : http_proxy,
"https" : https_proxy,
"ftp" : ftp_proxy
}
for i in range(1,30):
content = requests.get('https://losangeles.craigslist.org/search/apa?s = ' + str(i),params = params) #https://losangeles.craigslist.org/search/apa?s=120
# content = requests.get('https://www.zillow.com/homes/for_rent/')
soup = BeautifulSoup(content.content, 'html.parser')
my_anchors = list(soup.find_all("a",{"class": "result-image gallery"}))
for index,each_anchor_tag in enumerate(my_anchors):
URL_to_look_for_zipcode = soup.find_all("a",{"class": "result-title"}) #taking set so that a page is not visited twice.
for each_href in URL_to_look_for_zipcode:
# content_href = requests.get(each_href['href']) #script id="ld_posting_data" type="application/ld+json">
content_href = requests.get(each_href['href']) #script id="ld_posting_data" type="application/ld+json">
# print(each_href['href'])
soup_href = BeautifulSoup(content_href.content, 'html.parser')
my_script_tags = soup_href.find("script",{"id": "ld_posting_data"})
# for each_tag in my_script_tags:
if my_script_tags:
res = json.loads(str(list(my_script_tags)[0]))
if res and 'address' in list(res.keys()):
if res['address']['postalCode'] == "90012": #use the input zipcode entered by the user.
list_of_tuples_with_given_zipcodes.append(each_href['href'])
I am still not sure about the value of the http_proxy
variable. I specified it as what was given but should it be the IP address of my device mapped to the localhost port number? It still keeps blocking the code.
Please help.