I can't figure out how to make Scrapy crawl links in order. I've got a page with articles, and each one has a title, but the scraped article doesn't match the title. Also, in settings.py I added:
# Crawl-order overrides for settings.py.
# NOTE(review): per the Scrapy FAQ these three settings are meant to be used
# together to get breadth-first (FIFO) scheduling -- confirm for your version.
SCHEDULER_MEMORY_QUEUE = "scrapy.squeue.FifoMemoryQueue"  # in-memory request queue class
SCHEDULER_DISK_QUEUE = "scrapy.squeue.PickleFifoDiskQueue"  # on-disk request queue class
DEPTH_PRIORITY = 1  # depth-based priority adjustment
I've got something like this:
class Getgot(Spider):
    """Spider that records article links and the first iframe of each linked page.

    ``parse`` writes the first ``a/@href`` of every ``//article`` node to
    ``links.txt`` and requests that URL; ``parse_page`` appends one line per
    response to ``frames.txt``.
    """

    name = "getem"
    allowed_domains = ["somesite.us"]
    start_urls = ["file:local.html"]
    # Not referenced by the methods visible in this class.
    el = '//div[@article]'

    def parse(self, response):
        """Write each article link to ``links.txt`` and request the linked page.

        Yields one ``Request`` (callback ``parse_page``) per ``//article``
        node that has an ``a/@href``. Articles without a link are skipped.
        """
        hxs = HtmlXPathSelector(response)
        # ``with`` guarantees links.txt is flushed and closed once the
        # generator is exhausted (the original never closed the handle).
        with open("links.txt", "w") as links_file:
            for article in hxs.select('//article'):
                hrefs = article.select('a/@href').extract()
                if not hrefs:
                    # An article without a link used to raise IndexError on
                    # hrefs[0]; skip it instead of aborting the whole page.
                    continue
                url = str(hrefs[0])
                links_file.write(url + '\n')
                yield Request(url, callback=self.parse_page)

    def parse_page(self, res):
        """Append the first ``<iframe>`` of the page, or ``[]`` if none, to ``frames.txt``.

        NOTE(review): responses may be handled in a different order than the
        requests were yielded from ``parse``, so line N of frames.txt is not
        guaranteed to correspond to line N of links.txt -- confirm against
        Scrapy's scheduling documentation.
        """
        hxs = HtmlXPathSelector(res)
        frames = hxs.select('//iframe').extract()
        line = str(frames[0]) if frames else '[]'
        # Both outcomes go to the same relative file; the original wrote the
        # "[]" placeholder to "/frames.txt" (filesystem root) by mistake.
        with open("frames.txt", "a") as frames_file:
            frames_file.write(line + '\n')