See the comments in the code. Read this and this.
Basically, when data is loaded with JavaScript, you want to get it from the API directly. If you open devtools in your browser you can see which request the page makes to fetch the data, recreate that request with Scrapy, and then parse what you need out of the JSON response.
Lose the request_header method: it's not one of Spider's methods, and you never call it anyway. You probably wanted start_requests.
import json

import scrapy


class IndiaSpider(scrapy.Spider):
    name = 'espace'
    allowed_domains = ['worldwide.espacenet.com']

    search_value = 'laptop'

    # browser devtools -> network tab -> JSON url -> headers
    headers = {
        "Accept": "application/json,application/i18n+xml",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "DNT": "1",
        "EPO-Trace-Id": "YOUR ID",  # <------ copy it from your browser
        "Host": "worldwide.espacenet.com",
        "Origin": "https://worldwide.espacenet.com",
        "Pragma": "no-cache",
        "Referer": "https://worldwide.espacenet.com/patent/search?q=laptop",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        "X-EPO-PQL-Profile": "cpci"
    }

    # the JSON endpoint the search page calls in the background
    api_url = f'https://worldwide.espacenet.com/3.2/rest-services/search?lang=en,de,fr&q={search_value}&qlang=cql&'

    def start_requests(self):
        # browser devtools -> network tab -> JSON url -> Request
        payload = {
            "filters": {
                "publications.patent": [
                    {
                        "value": [
                            "true"
                        ]
                    }
                ]
            },
            "query": {
                "fields": [
                    "publications.ti_*",
                    "publications.abs_*",
                    "publications.pn_docdb",
                    "publications.in",
                    "publications.inc",
                    "publications.pa",
                    "publications.pac",
                    "publications.pd",
                    "publications.pr_docdb",
                    "publications.app_fdate.untouched",
                    "publications.ipc_ic",
                    "publications.ipc_icci",
                    "publications.ipc_iccn",
                    "publications.ipc_icai",
                    "publications.ipc_ican",
                    "publications.ci_cpci",
                    "publications.ca_cpci",
                    "publications.cl_cpci",
                    "biblio:pa;pa_orig;pa_unstd;in;in_orig;in_unstd;pac;inc;pd;pn_docdb;allKindCodes;",
                    "oprid_full.untouched",
                    "opubd_full.untouched"
                ],
                "from": 0,
                "highlighting": [
                    # one entry per field, exactly what the browser sends --
                    # written as a comprehension instead of eight copies
                    {
                        "field": field,
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    }
                    for field in [
                        "publications.ti_en", "publications.abs_en",
                        "publications.ti_de", "publications.abs_de",
                        "publications.ti_fr", "publications.abs_fr",
                        "publications.pn_docdb", "publications.pa",
                    ]
                ],
                "size": 20
            },
            "widgets": {}
        }
        yield scrapy.Request(url=self.api_url, headers=self.headers, method='POST', body=json.dumps(payload))

    def parse(self, response):
        # browser devtools -> network tab -> JSON url -> Response
        json_data = response.json()
        if json_data:
            for hit in json_data['hits']:
                # not every hit has an English title
                if 'publications.ti_en' in hit['hits'][0]['fields']:
                    title = hit['hits'][0]['fields']['publications.ti_en']
                    yield {'title': title}
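The payload only asks for the first page ("from": 0, "size": 20), and the naming suggests plain offset paging. If you need more results, you can bump the offset and re-send the request from parse. A minimal sketch, assuming the API keeps honoring larger offsets — the cap of 100 below is arbitrary, and the real response very likely carries a total-count field that would make a better stop condition:

    def parse(self, response):
        json_data = response.json()
        if json_data:
            for hit in json_data['hits']:
                if 'publications.ti_en' in hit['hits'][0]['fields']:
                    yield {'title': hit['hits'][0]['fields']['publications.ti_en']}

        # reuse the payload we sent, bump the offset, and fetch the next page
        payload = json.loads(response.request.body)
        payload['query']['from'] += payload['query']['size']
        if payload['query']['from'] < 100:  # arbitrary stop for this sketch
            yield scrapy.Request(url=self.api_url, headers=self.headers,
                                 method='POST', body=json.dumps(payload),
                                 callback=self.parse)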
Output (titles from the first page):
{'title': ['METHOD AND DEVICE FOR CHECKING THE DETERMINATION OF THE POSITION OF A MOBILE STATION CARRIED OUT BY A RADIO COMMUNICATION SYSTEM']}
{'title': ['Laptop']}
{'title': ['PRESENTATION LAPTOP']}
{'title': ['LAPTOP COMPUTER']}
{'title': ['Laptop comprises an integrated flat bed scanner containing a composite glass plate made from a mineral glass pane and a plastic layer']}
...
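The same fields dict holds the rest of the values requested in the payload's "fields" list, so the item can be made richer. The keys below are taken straight from that list, but check the actual JSON response for their exact shape (they may be lists rather than strings):

    fields = hit['hits'][0]['fields']
    yield {
        'title': fields.get('publications.ti_en'),
        'applicant': fields.get('publications.pa'),
        'published': fields.get('publications.pd'),
    }

To run it: scrapy crawl espace from inside a Scrapy project, or scrapy runspider <your file> -o titles.json if you keep the spider in a standalone file.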