I'm currently using a combination of Scrapy and Selenium to quickly search the USPTO TradeMark database. These pages have a session token attached.
The approaches I've tried and read about don't seem integrated enough — while Selenium can pass found URLs to Scrapy, Scrapy then makes a fresh request to that page, which invalidates the session token. So I need Selenium to deliver the rendered HTML directly to Scrapy for parsing. Is this possible?
# -*- coding: utf-8 -*-
# from terminal run: scrapy crawl trademarks -o items.csv -t csv
import time
import scrapy
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from selenium import webdriver
class TrademarkscrapeItem(scrapy.Item):
    """Container for one scraped trademark record.

    Exported via: scrapy crawl trademarks -o items.csv -t csv
    """

    category = scrapy.Field()
    wordmark = scrapy.Field()
    registrant = scrapy.Field()
    registration_date = scrapy.Field()
    description = scrapy.Field()
class TradeMarkSpider(CrawlSpider):
    """Spider that drives a real browser (Selenium) through the USPTO site,
    then hands the browser-rendered HTML to Scrapy's Selector for parsing.

    This keeps the session token valid: Scrapy never re-requests the page —
    it parses exactly the HTML the Selenium-driven browser is showing.
    """

    name = "trademarks"
    allowed_domains = ["uspto.gov"]
    start_urls = ['http://www.uspto.gov']

    def __init__(self, *args, **kwargs):
        # CrawlSpider does real work in its __init__ (rule compilation etc.);
        # it must be called or the spider is mis-initialized.
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Navigate through the site with Selenium, then parse the rendered page.

        Returns a dict (Scrapy accepts plain dicts as items) with the
        'description' text extracted from the final page.
        """
        self.driver.get(response.url)

        # Click through to the page we want; avoid shadowing the builtin `next`.
        link = self.driver.find_element_by_xpath("//*[@id='menu-84852-1']/a")
        link.click()
        time.sleep(2)  # let any js render in page

        link = self.driver.find_element_by_xpath(
            "//*[@id='content']/article/ul[1]/li[1]/article/h4/a")
        link.click()
        time.sleep(2)

        # The answer to the question: feed Selenium's rendered HTML to Scrapy.
        # driver.page_source is the DOM as the browser currently sees it, so
        # the session token is never invalidated by a second HTTP request.
        selector = Selector(text=self.driver.page_source)

        trade_dict = {}
        trade_dict['description'] = selector.xpath(
            "//*[@id='content']/article/div/p/text()").extract()

        # quit() (not close()) ends the whole WebDriver session; close() only
        # closes the current window and would leak the browser process.
        self.driver.quit()
        return trade_dict