I'm trying to crawl ACRIS in a similar way to what I found in one of the answers here (look at the UPDATE answer). I slightly altered the code in order to remove deprecated stuff.
At first I had problems with robots.txt
restricting me, so I found I can set ROBOTSTXT_OBEY=False
in settings.py and indeed it seems to ignore it but for some reason the spider isn't getting to the parse
method anymore.
This is my spider
from scrapy.http import Request, FormRequest
from scrapy.item import Item, Field
from scrapy.spiders import Spider
import logging
class AcrisItem(Item):
borough = Field()
block = Field()
class AcrisSpider(Spider):
name = "acris"
allowed_domains = ["a836-acris.nyc.gov"]
start_urls = ['https://a836-acris.nyc.gov/DS/DocumentSearch/PartyName']
def start_requests(self):
return [(Request(url, meta={'dont_redirect': True}, callback=self.parse)) for url in self.start_urls]
def parse(self, response):
form_token = response.selector.xpath('//input[@name="__RequestVerificationToken"]/@value').extract_first()
logging.debug('THE FORM TOKEN IS: %s\n\n' % form_token)
formdata = {
"__RequestVerificationToken": form_token,
"hid_last": "SMITH",
"hid_first": "JOHN",
"hid_ml": "",
"hid_suffix": "",
"hid_business": "",
"hid_selectdate": "To Current Date",
"hid_datefromm": "",
"hid_datefromd": "",
"hid_datefromy": "",
"hid_datetom": "",
"hid_datetod": "",
"hid_datetoy": "",
"hid_partype": "",
"hid_borough": "All Boroughs/Counties",
"hid_doctype": "All Document Classes",
"hid_max_rows": "10",
"hid_page": "1",
"hid_partype_name": "All Parties",
"hid_doctype_name": "All Document Classes",
"hid_borough_name": "All Boroughs/Counties",
"hid_ReqID": "",
"hid_SearchType": "PARTYNAME",
"hid_ISIntranet": "N",
"hid_sort": ""
}
if form_token:
yield FormRequest(url="https://a836-acris.nyc.gov/DS/DocumentSearch/PartyNameResult",
method="POST",
formdata=formdata,
meta={'dont_redirect': True},
callback=self.parse_page)
def parse_page(self, response):
rows = response.selector.xpath('//form[@name="DATA"]/table/tbody/tr[2]/td/table/tbody/tr')
for row in rows:
item = AcrisItem()
borough = row.xpath('.//td[3]/div/font/text()').extract_first()
block = row.xpath('.//td[4]/div/font/text()').extract_first()
if borough and block:
item['borough'] = borough
item['block'] = block
yield item
And this is the output (minus the init messages)
2017-01-04 17:06:12 [scrapy.core.engine] INFO: Spider opened
2017-01-04 17:06:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-01-04 17:06:12 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-01-04 17:06:41 [scrapy.core.engine] DEBUG: Crawled (301) <GET https://a836-acris.nyc.gov/DS/DocumentSearch/PartyName> (referer: None)
2017-01-04 17:06:41 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <301 https://a836-acris.nyc.gov/DS/DocumentSearch/PartyName>: HTTP status code is not handled or not allowed
2017-01-04 17:06:41 [scrapy.core.engine] INFO: Closing spider (finished)
2017-01-04 17:06:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 243,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 414,
'downloader/response_count': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 1, 4, 15, 6, 41, 791647),
'log_count/DEBUG': 2,
'log_count/INFO': 8,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2017, 1, 4, 15, 6, 12, 467659)}
2017-01-04 17:06:41 [scrapy.core.engine] INFO: Spider closed (finished)