
I am new to scrapy. I am getting None instead of an item. Here is my code:

class IndiaSpider(scrapy.Spider):
    name = 'espace'
    allowed_domains = ['worldwide.espacenet.com']
    search_value = 'laptop'
    start_urls = [f'https://worldwide.espacenet.com/patent/search?q={search_value}']

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    def request_header(self):
        yield scrapy.Request(url=self.start_urls, callback=self.parse, headers={'User-Agent':self.user_agent})

    def parse(self, response):
        title = response.xpath("//span[@class='h2--2VrrSjFb item__content--title--dYTuyzV6']/text()").extract_first()

        yield{
            'title':title
        }

I am getting

2023-01-17 15:58:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://worldwide.espacenet.com/patent/search?q=laptop> (referer: None)
2023-01-17 15:58:54 [scrapy.core.scraper] DEBUG: Scraped from <200 https://worldwide.espacenet.com/patent/search?q=laptop>
{'title': None}
2023-01-17 15:58:54 [scrapy.core.engine] INFO: Closing spider (finished)

Can anyone help me?

Sarfraz
  • The data is populated with JavaScript so you need to find another way. Why did you call the method `request_header` instead of `start_requests`? – SuperUser Jan 17 '23 at 11:09
  • I tried with title = response.xpath("(//section/header/span)[2]/text()").get() but I am getting the same result. – Sarfraz Jan 17 '23 at 11:19
  • I already told you the data is generated with JavaScript. You can get it from the JSON file. Open the network tab on your browser and look for it. Then you'll need to recreate the request. – SuperUser Jan 17 '23 at 11:27
  • I am sorry to bother you, but as I said, I am new to scrapy and I don't know how to do this. BTW, thanks for the reply. I hope I'll get it. – Sarfraz Jan 17 '23 at 11:35

1 Answer


See the comments in the code below and the Scrapy documentation.

Basically, when data is loaded with JavaScript, you'll want to get it from the underlying API. If you open devtools in your browser, you can see where the data is loaded from, recreate that request with scrapy, and then parse the data out of the JSON response.

Lose the request_header method: it isn't one of the Spider's methods, so Scrapy never calls it. You probably wanted start_requests.
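
Just to illustrate the start_requests point, a fixed version of your original spider would look like the sketch below. It still yields {'title': None}, because the titles are rendered client-side, which is why the API approach that follows is the way to go.

import scrapy


class IndiaSpider(scrapy.Spider):
    name = 'espace'
    allowed_domains = ['worldwide.espacenet.com']
    search_value = 'laptop'
    start_urls = [f'https://worldwide.espacenet.com/patent/search?q={search_value}']
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'

    def start_requests(self):
        # Scrapy calls start_requests itself; start_urls is a list,
        # so iterate over it instead of passing the whole list to one Request
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse,
                                 headers={'User-Agent': self.user_agent})

    def parse(self, response):
        # still None here: the title spans are added by JavaScript after page load
        title = response.xpath("//span[@class='h2--2VrrSjFb item__content--title--dYTuyzV6']/text()").get()
        yield {'title': title}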

import json
import scrapy


class IndiaSpider(scrapy.Spider):
    name = 'espace'
    allowed_domains = ['worldwide.espacenet.com']
    search_value = 'laptop'

    # browser devtools -> network tab -> JSON url -> headers
    headers = {
        "Accept": "application/json,application/i18n+xml",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "DNT": "1",
        "EPO-Trace-Id": "YOUR ID",          # <------ copy it from your browser
        "Host": "worldwide.espacenet.com",
        "Origin": "https://worldwide.espacenet.com",
        "Pragma": "no-cache",
        "Referer": "https://worldwide.espacenet.com/patent/search?q=laptop",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        "X-EPO-PQL-Profile": "cpci"
    }
    api_url = f'https://worldwide.espacenet.com/3.2/rest-services/search?lang=en,de,fr&q={search_value}&qlang=cql&'

    def start_requests(self):
        # browser devtools -> network tab -> JSON url -> Request
        payload = {
            "filters": {
                "publications.patent": [
                    {
                        "value": [
                            "true"
                        ]
                    }
                ]
            },
            "query": {
                "fields": [
                    "publications.ti_*",
                    "publications.abs_*",
                    "publications.pn_docdb",
                    "publications.in",
                    "publications.inc",
                    "publications.pa",
                    "publications.pac",
                    "publications.pd",
                    "publications.pr_docdb",
                    "publications.app_fdate.untouched",
                    "publications.ipc_ic",
                    "publications.ipc_icci",
                    "publications.ipc_iccn",
                    "publications.ipc_icai",
                    "publications.ipc_ican",
                    "publications.ci_cpci",
                    "publications.ca_cpci",
                    "publications.cl_cpci",
                    "biblio:pa;pa_orig;pa_unstd;in;in_orig;in_unstd;pac;inc;pd;pn_docdb;allKindCodes;",
                    "oprid_full.untouched",
                    "opubd_full.untouched"
                ],
                "from": 0,
                "highlighting": [
                    {
                        "field": "publications.ti_en",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.abs_en",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.ti_de",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.abs_de",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.ti_fr",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.abs_fr",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.pn_docdb",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    },
                    {
                        "field": "publications.pa",
                        "fragment_words_number": 20,
                        "hits_only": True,
                        "number_of_fragments": 3
                    }
                ],
                "size": 20
            },
            "widgets": {}
        }
        yield scrapy.Request(url=self.api_url, headers=self.headers, method='POST', body=json.dumps(payload))

    def parse(self, response):
        # browser devtools -> network tab -> JSON url -> Response
        json_data = response.json()

        if json_data:
            for hit in json_data['hits']:
                if 'publications.ti_en' in hit['hits'][0]['fields']:
                    title = hit['hits'][0]['fields']['publications.ti_en']
                    yield {'title': title}

Output:

{'title': ['METHOD AND DEVICE FOR CHECKING THE DETERMINATION OF THE POSITION OF A MOBILE STATION CARRIED OUT BY A RADIO COMMUNICATION SYSTEM']}
{'title': ['Laptop']}
{'title': ['PRESENTATION LAPTOP']}
{'title': ['LAPTOP COMPUTER']}
{'title': ['Laptop comprises an integrated flat bed scanner containing a composite glass plate made from a mineral glass pane and a plastic layer']}
...
...
...
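
If you want to run the spider as a standalone script rather than from inside a Scrapy project, a minimal runner sketch (assuming the spider class above is saved as espace_spider.py; that filename and the titles.json feed are just examples) would be:

from scrapy.crawler import CrawlerProcess

from espace_spider import IndiaSpider  # hypothetical module holding the spider above

process = CrawlerProcess(settings={
    # write every yielded item to a JSON feed
    'FEEDS': {'titles.json': {'format': 'json'}},
})
process.crawl(IndiaSpider)
process.start()

From inside a project, scrapy crawl espace -o titles.json does the same thing.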
SuperUser