I am trying to crawl many URLs in the same domain. I join the list of URLs into a single string, and I want to search that string with a regex to find the URLs. But re.match() always returns None, even though I tested my regex separately and it works. This is my code:
# -*- coding: UTF-8 -*-
import scrapy
import codecs
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import Request
from scrapy.selector import HtmlXPathSelector
from hurriyet.items import HurriyetItem
class hurriyet_spider(CrawlSpider):
    """Crawl hurriyet.com.tr '/gundem/' pages, scraping article title/body
    fields and collecting every URL linked from each page."""
    name = 'hurriyet'
    allowed_domains = ['hurriyet.com.tr']
    start_urls = ['http://www.hurriyet.com.tr/gundem/']
    # NOTE(review): the Scrapy docs warn against using 'parse' as a Rule
    # callback on a CrawlSpider (CrawlSpider uses parse() internally) --
    # consider renaming the callback, e.g. to 'parse_page'.
    rules = (Rule(SgmlLinkExtractor(allow=('\/gundem(\/\S*)?.asp$')),
                  'parse', follow=True),)

    # Compiled once at class-creation time.  The pattern MUST be a raw
    # string: in the original non-raw literal, Python parsed "\b" as a
    # backspace character (0x08), so the regex could never match any text
    # -- that is why re.match() always returned None.
    _url_re = re.compile(
        r"(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))",
        re.M)

    def parse(self, response):
        """Extract article fields and all URLs linked from the page.

        :param response: the Scrapy Response for a matched /gundem/ page.
        :returns: (HurriyetItem, list of href strings) -- same return
            shape as before.
        """
        image = HurriyetItem()
        text = response.xpath("//a/@href").extract()
        print(text)
        urls = ''.join(text)
        # re.match() only tests at the *start* of the string; to find every
        # URL anywhere in the joined href text, iterate all matches.
        # Group 1 is the full URL; the inner groups are implementation
        # detail of the pattern.
        page_links = [m.group(1) for m in self._url_re.finditer(urls)]
        image['title'] = response.xpath("//h1[@class = 'title selectionShareable'] | //h1[@itemprop = 'name']/text()").extract()
        image['body'] = response.xpath("//div[@class = 'detailSpot']").extract()
        image['body2'] = response.xpath("//div[@class = 'ctx_content'] ").extract()
        print(page_links)
        return image, text