I am trying to set up a simple example that traverses requests in priority order. I generate a random priority for each request and attach it to that request. The second function, get_link_title, prints the names so that I can check whether the priority order is respected. It is not, and I can't figure out what I am doing wrong:
from tutorial.items import TutorialItem
from scrapy.http import Request
import random
class DmozSpider(BaseSpider):
    """Spider that follows every link listed on one DMOZ directory page.

    Each outgoing request is given a random priority and carries the
    partially filled item in its ``meta`` so that ``get_link_title`` can
    complete the item and print the order in which pages are handled.
    """

    name = "dmoz"
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/"
    ]

    @staticmethod
    def _first(values, default=None):
        """Return the first element of ``values``, or ``default`` if empty."""
        return values[0] if values else default

    def parse(self, response):
        """Build one prioritised request per directory entry.

        Args:
            response: the downloaded directory page.

        Returns:
            A list of ``Request`` objects, one per entry that has both a
            link text and a link target. Each request carries its item in
            ``meta['item']`` and calls back into ``get_link_title``.
        """
        tree = HtmlXPathSelector(response)
        reqs = []
        for site in tree.xpath('//ul[@class="directory-url"]/li'):
            name = self._first(site.xpath('a/text()').extract())
            url = self._first(site.xpath('a/@href').extract())
            if name is None or url is None:
                # Without a link there is nothing to follow; skip the entry
                # instead of letting an IndexError abort the whole page.
                continue

            item = TutorialItem()
            item['name'] = name
            item['url'] = url
            # Raw string so the regex engine, not the Python string parser,
            # interprets \s and \n. An entry without a "- text" description
            # gets an empty description rather than raising.
            item['description'] = self._first(
                site.xpath('./text()').re(r'-\s([^\n]*?)\n'), default='')

            pty = random.randint(0, 100)
            print('From first page %s %s' % (item['name'], pty))
            # NOTE(review): per the Scrapy Request documentation, requests
            # with a HIGHER priority value are scheduled earlier. Priority
            # presumably only orders requests still waiting in the
            # scheduler; with several concurrent downloads the responses
            # may still reach the callback out of order - confirm against
            # the CONCURRENT_REQUESTS setting.
            reqs.append(Request(item['url'], meta={'item': item},
                                callback=self.get_link_title, priority=pty))
        return reqs

    def get_link_title(self, response):
        """Complete the item attached to ``response`` with the page title.

        Args:
            response: the page downloaded for a request created in
                ``parse``; its ``meta['item']`` holds the item to finish.

        Returns:
            The item with ``title_link`` set to the text of the first
            ``<title>`` element, or to an empty string if there is none.
        """
        item = response.meta['item']
        print('Title for %s' % (item['name'],))
        tree = HtmlXPathSelector(response)
        item['title_link'] = self._first(
            tree.xpath('//title/text()').extract(), default='')
        return item