
I'm currently using a combination of Scrapy and Selenium to quickly search the USPTO trademark database. The pages I need to scrape have a session token attached.

The approaches I've tried and read about don't seem to be integrated enough: Selenium can pass the URLs it finds to Scrapy, but Scrapy then makes a new request to each page, which invalidates the session token. What I need is for Selenium to deliver the rendered HTML directly to Scrapy for parsing. Is this possible? My current spider is below; the commented section near the end is where I'm stuck.

# -*- coding: utf-8 -*-
# from terminal run: scrapy crawl trademarks -o items.csv -t csv

import time
import scrapy
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

from selenium import webdriver

class TrademarkscrapeItem(scrapy.Item):
    category = Field()
    wordmark = Field()
    registrant = Field()
    registration_date = Field()
    description = Field()

class TradeMarkSpider(CrawlSpider):
    name = "trademarks"
    allowed_domains = ["uspto.gov"]
    start_urls = ['http://www.uspto.gov']

    def __init__(self, *args, **kwargs):
        super(TradeMarkSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        # Navigate through the site to get to the page I want to scrape
        self.driver.get(response.url)
        next = self.driver.find_element_by_xpath("//*[@id='menu-84852-1']/a")
        next.click()
        time.sleep(2) # Let any js render in page
        next = self.driver.find_element_by_xpath("//*[@id='content']/article/ul[1]/li[1]/article/h4/a")
        next.click()
        time.sleep(2)

        # How do I point this next part at the Selenium-delivered HTML?
        TradeDict = {}
        SelectXpath = Selector(SeleniumHTML).xpath  # SeleniumHTML is pseudocode
        TradeDict['description'] = SelectXpath("//*[@id='content']/article/div/p/text()").extract()

        self.driver.close()
        return TradeDict
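
For what it's worth, here is roughly what I imagine that commented section would need to look like, assuming a Scrapy Selector can be built from a raw HTML string via its text argument (that assumption is the part I'm unsure about):

        # Sketch only -- assumes Selector(text=...) accepts a raw HTML string.
        # The idea is to parse the page Selenium has already loaded, so Scrapy
        # never makes a second request and the session token stays valid.
        html = self.driver.page_source           # HTML of the page Selenium is currently on
        SelectXpath = Selector(text=html).xpath  # build a Selector from that HTML, no new request
        TradeDict['description'] = SelectXpath("//*[@id='content']/article/div/p/text()").extract()

Alternatively, I've seen scrapy.http.HtmlResponse mentioned, where you would build a response yourself with HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8') and run the usual parsing against that, but I haven't been able to confirm that either approach works.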