I'm a beginner and stuck on a problem. I have 30 links to crawl, but the crawler should stop crawling further links once a certain condition is met (break_flag == True). As a dummy condition I stop crawling when count == 2, yet the spider always scrapes all 30 provided links. I raise a CloseSpider() exception, but it makes no difference: every provided link still gets scraped.

The other problem I'm facing is that the spider crawls the links in a random order; I want them crawled in the sequence given (I've put a sketch of what I'm considering after my spider code below).
My Spider
import scrapy
from scrapy.exceptions import CloseSpider
from datetime import datetime

# PropertiesLinkItem and GoogleSheet are defined elsewhere in my project.


class IkmanSpider(scrapy.Spider):
    name = 'ikman'
    allowed_domains = ['ikman.lk']
    start_urls = ['https://ikman.lk/en/ads/sri-lanka/property?page=' + str(i)
                  for i in range(1, 30)]
    main_url = 'https://ikman.lk'
    # Difference between the current date and the last scraped date
    days_diff = GoogleSheet().duration_from_last_run()
    count = 0

    def parse(self, response):
        self.count += 1
        break_flag = False
        objs = list()
        links = set()
        boxes = response.css('.list--3NxGO li')
        for box in boxes:
            l = box.css('a::attr(href)')[0].extract()
            try:
                time = box.css('.updated-time--1DbCk::text')[0].extract()
                print('time: ', time)
                if 'day' in time:
                    day = int(str(time).split(' ')[0].strip())
                    print('Posted day:', day)
                    if self.days_diff <= day:
                        break_flag = True
                        continue
            except:
                pass
            l = self.main_url + l
            if l not in links:
                obj = PropertiesLinkItem()
                obj['link'] = l
                obj['status'] = '0'
                # scraping date
                obj['s_date'] = str(datetime.now().day) + '-' + str(datetime.now().month) + '-' + str(
                    datetime.now().year)
                objs.append(obj)
                links.add(l)
        if break_flag or self.count == 2:
            print("Stop Scraping")
            raise CloseSpider('All newly added links have been scraped')
        yield {'data': objs}
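My guess is that requests for all the pages are already scheduled before CloseSpider is raised, but I'm not sure. For the ordering problem, this is the kind of thing I'm considering based on my reading of the Scrapy docs. It's a minimal sketch, assuming custom_settings and Request priority behave as documented; the spider name here is just a placeholder and not my real spider:

import scrapy


class SequentialTestSpider(scrapy.Spider):
    # Hypothetical spider just to test ordering, not my real one.
    name = 'sequential_test'
    # Only one request in flight at a time, so responses should
    # come back in the order the requests were made.
    custom_settings = {'CONCURRENT_REQUESTS': 1}

    def start_requests(self):
        urls = ['https://ikman.lk/en/ads/sri-lanka/property?page=' + str(i)
                for i in range(1, 30)]
        for i, url in enumerate(urls):
            # Higher priority values are dequeued first, so earlier
            # pages should keep their place in the queue.
            yield scrapy.Request(url, callback=self.parse,
                                 priority=len(urls) - i)

    def parse(self, response):
        self.logger.info('Crawled: %s', response.url)

I don't know whether this is the right approach, or whether it would also stop the remaining requests from being scraped once CloseSpider is raised. Any pointers appreciated.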