I am learning scrapy and am trying to scrape this realtor site in Quebec. I am using their API to collect homes and print the URLs to the screen. But my last function print_urls() won't run. I really am stuck here i tried debugging it and it just skips right over my whole function block.
class CentrishomesSpider(scrapy.Spider):
name = 'centrisHomes'
# allowed_domains = ['www.centris.ca']
# start_urls = ['http://www.centris.ca/']
def start_requests(self):
query = {...
}
yield scrapy.Request(
url='https://www.centris.ca/property/UpdateQuery',
method='POST',
body=json.dumps(query),
headers={
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
},
callback=self.get_inscriptions
)
...
def get_inscriptions(self, response):
resp, success = self.success(response)
if success == True:
print(Fore.GREEN + 'Query Updated' + Style.RESET_ALL)
else:
print(Fore.RED + 'Query Not Updated' + Style.RESET_ALL)
yield scrapy.Request(
url='https://www.centris.ca/Property/GetInscriptions',
method='POST',
body=json.dumps({"startPosition": 0}),
headers={
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
},
callback=self.handle_inscriptions
)
def handle_inscriptions(self, response):
homes, success = self.success(response)
if success == True:
print(Fore.GREEN + 'Count ' + str(homes['d']['Result']['count']) + Style.RESET_ALL)
# self.test()
self.html = Selector(text=homes['d']['Result']['html'])
self.print_urls()
# print(response.body)
...
def success(self, response):
my_dict = literal_eval(response.body.decode(
'utf-8').replace(':true}', ':True}'))
if my_dict['d']['Succeeded'] == True:
return my_dict, True
else:
return False
def print_urls(self):
print('try')
# page_html = Selector(resp['d']['Result']['html'])
page_html = self.html
homes = page_html.xpath('//div[contains(@class, "property-thumbnail-item")]')
for home in homes:
yield{
'home_url':home.xpath('.//a[@class="property-thumbnail-summary-link"]/@href').get()
}
...