I tried to find a way to do it without rendering the page:
from scrapy import Spider
import scrapy
import json
import logging
class IndustrystockSpider(Spider):
    """Page through industrystock.com company results via its AJAX endpoint.

    Instead of rendering the result page, the spider POSTs the same form
    the page's JavaScript sends to ``ajax_live.php`` and walks the result
    index upwards until the endpoint returns an empty overview.
    """

    name = "industry_stock"
    allowed_domains = ['industrystock.com']
    start_urls = ["https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html"]
    custom_settings = {'ROBOTSTXT_OBEY': False}
    ajax_url = 'https://www.industrystock.com/ajax/ajax_live.php'

    # Headers sent with every AJAX request (mimic the browser's XHR call).
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.industrystock.com/html/hydraulic-cylinder/product-result-uk-19931-0.html',
        'Origin': 'https://www.industrystock.com',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }

    # Form fields common to every request; treated as read-only. The
    # per-request 'JSONStr' field is added to a copy in ``_form_data``.
    data = {
        'lang': 'en',
        'beta': 'false',
        'action': 'RESULTPAGE_AJAX#getOverview',
        'content': 'resultpage',
        'subContent': 'result',
        'company_id': '0',
        'override_id': '0',
        'domain_id': '0',
        'user_id': '0',
        'keyword_id': '19931',
    }

    @staticmethod
    def construct_json_str(index):
        """Return the compact JSON string sent as the ``JSONStr`` form field.

        Args:
            index: Integer result index to request.

        Returns:
            A JSON document without whitespace between tokens, with
            ``index`` embedded as a JSON number.
        """
        payload = {
            "key": "company",
            "length": 9,  # presumably the number of results per page -- TODO confirm
            "keyword_id": None,
            "index": index,
            "filter": {},
            "override": {"key": "company"},
            "query": "Hydraulic Cylinder",
        }
        # Compact separators keep the output identical to the hand-built
        # string the endpoint was originally called with.
        return json.dumps(payload, separators=(',', ':'))

    def _form_data(self, index):
        """Return a fresh form-data dict for *index* without touching ``self.data``."""
        return {**self.data, 'JSONStr': self.construct_json_str(index)}

    def _page_request(self, index, formdata, *, dont_filter=False):
        """Build the AJAX POST request for result index *index*."""
        return scrapy.FormRequest(
            self.ajax_url,
            callback=self.parse_detail,
            method='POST',
            formdata=formdata,
            headers=self.headers,
            dont_filter=dont_filter,
            # Carry the index so parse_detail can compute the next one.
            meta={'index': index},
        )

    def parse(self, response):
        """Start pagination by requesting result index 0 from the AJAX endpoint.

        Args:
            response: Response for the start URL; its content is not used.

        Yields:
            The first ``scrapy.FormRequest`` to the AJAX endpoint.
        """
        index = 0
        formdata = self._form_data(index)
        self.logger.info("data is %s", formdata)
        yield self._page_request(index, formdata)

    def parse_detail(self, response):
        """Log the company ids of one AJAX result and request the next index.

        Args:
            response: AJAX response whose body is JSON; ``meta['index']``
                holds the index that was requested.

        Yields:
            A ``scrapy.FormRequest`` for the next index while the current
            overview is non-empty; nothing once it is empty or missing.
        """
        company_data = json.loads(response.body)
        # A missing/empty result is treated like an empty page rather than
        # raising KeyError, so the crawl ends cleanly.
        overview = (company_data.get('result') or {}).get('overview')
        if not overview:
            self.logger.info(
                "no overview returned for index %s; stopping pagination",
                response.meta['index'],
            )
            return

        for company in overview:
            self.logger.info("company_id %s", company['company_id'])

        next_index = response.meta['index'] + 1
        yield self._page_request(
            next_index,
            self._form_data(next_index),
            dont_filter=True,
        )