I am trying to crawl a site and collect all the links from its pages using Scrapy. When I run the spider from the terminal like this:

scrapy crawl crawl1 -o items.csv -t csv

I can see that it crawls and fetches links, as in the log below, but nothing is ever written to the output file:
2016-12-05 16:17:33 [scrapy] DEBUG: Crawled (200) <GET http://www.abof.com/men/new-in/footwear> (referer: http://www.abof.com/)
2016-12-05 16:17:33 [scrapy] DEBUG: Crawled (200) <GET http://www.abof.com/> (referer: http://www.abof.com/)
2016-12-05 16:17:33 [scrapy] DEBUG: Crawled (200) <GET http://www.abof.com/skult> (referer: http://www.abof.com/)
I also tried the suggestions from "Scrapy does not write data to a file", with no luck. As far as I understand, "Crawled (200)" only means the page was fetched; the CSV feed exporter writes only the items the callback returns or yields, so an empty file suggests parse_items never produces any items.
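One quick check (assuming the XPath itself is the problem) would be to try the selector from the spider in scrapy shell; response.xpath should be available on Scrapy 1.x:

scrapy shell "http://www.abof.com/"
>>> response.xpath('//span[@class="pl"]').extract()

If that returns an empty list, the spider has nothing to export. Here is the spider: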
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from crawl.items import CrawlItem
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst


class CrawlLoader(XPathItemLoader):
    default_output_processor = TakeFirst()


class MySpider(CrawlSpider):
    name = "crawl1"
    allowed_domains = ["www.abof.com"]
    start_urls = ["http://www.abof.com/"]

    # extract every link on each page and hand the response to parse_items
    rules = (Rule(SgmlLinkExtractor(allow=()), callback="parse_items"),)  # follow=True

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        l = CrawlLoader(CrawlItem(), hxs)
        for title in titles:
            item = CrawlItem()
            # l.add_value("url", response.url)
            # l.add_xpath("title", title.xpath("a/text()").extract())
            # l.add_xpath("link", title.xpath("a/@href").extract())
            item["title"] = title.xpath("a/text()").extract()
            item["url"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
        # return l.load_item()
items.py
import scrapy


class CrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
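In case the deprecated scrapy.contrib imports are part of the problem, this is my understanding of the same spider on the non-deprecated Scrapy 1.x APIs (scrapy.spiders and scrapy.linkextractors; extract_first() needs Scrapy 1.1+). It is only a sketch, not tested against the site:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from crawl.items import CrawlItem


class MySpider(CrawlSpider):
    name = "crawl1"
    allowed_domains = ["www.abof.com"]
    start_urls = ["http://www.abof.com/"]

    # follow=True so the spider keeps following links from matched pages
    rules = (Rule(LinkExtractor(allow=()), callback="parse_items", follow=True),)

    def parse_items(self, response):
        # response.xpath() replaces HtmlXPathSelector in Scrapy 1.x
        for title in response.xpath('//span[@class="pl"]'):
            item = CrawlItem()
            item["title"] = title.xpath("a/text()").extract_first()
            item["url"] = title.xpath("a/@href").extract_first()
            yield item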
Any help is appreciated.