How can I restart the same spider once it has finished, so that it can fetch the next list of URLs to process? My database is too large to pass all the websites at once, so I need the spider to run in a loop: fetch 100 websites, process them, then fetch the next 100, and so on. Is there a way to call the spider again once it finishes processing 100 websites? Alternatively, is there an option to schedule the spider to run after a specified interval of time? Please help; I am new to Scrapy.
With the current code I can get the URLs from the domains and store them in the database, but I need the spider to keep running. Is there a way to start it once and have it run continuously until there is no website left to process?
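To make the batching concrete, this is roughly the kind of helper I picture for pulling the next 100 websites out of the table (the fetch_batch name and the LIMIT/OFFSET bookkeeping are only illustrations of the idea, not code I have working):

import MySQLdb
import MySQLdb.cursors


def fetch_batch(offset, size=100):
    # Pull the next `size` websites starting at `offset` and turn them into
    # start URLs; the offset would have to be remembered between runs.
    con = MySQLdb.connect(host="localhost", user="user", passwd="pwd", db="db")
    cur = con.cursor(MySQLdb.cursors.DictCursor)
    cur.execute("select website from table limit %s offset %s", (size, offset))
    rows = cur.fetchall()
    cur.close()
    con.close()
    return ["http://www.%s" % row["website"] for row in rows]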
import MySQLdb
import MySQLdb.cursors
import tldextract
from time import gmtime, strftime

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class MyItem(Item):
    url = Field()


class MySpider(CrawlSpider):
    name = "first"

    # Build the start list at class-definition time: read the first 100
    # websites from the database and turn them into start URLs.
    con = MySQLdb.connect(host="localhost", user="user",
                          passwd="pwd", db="db")
    cur = con.cursor(MySQLdb.cursors.DictCursor)
    cur.execute("select website from table limit 100")

    domains = []
    urls = []
    row = cur.fetchone()
    while row is not None:
        p = "%s" % (row["website"])
        domains.append(p)
        urls.append("http://www.%s" % p)
        row = cur.fetchone()
    cur.close()
    con.close()

    allowed_domains = domains
    start_urls = urls
    print(start_urls)

    rules = (Rule(SgmlLinkExtractor(), callback='parse_url', follow=True),)

    # A second connection, kept open for the callback to store crawled URLs.
    connection = MySQLdb.connect(host="localhost", user="user",
                                 passwd="pwd", db="db")
    cursor = connection.cursor(MySQLdb.cursors.DictCursor)

    def parse_url(self, response):
        item = MyItem()
        item['url'] = response.url
        topDomain = tldextract.extract(response.url)
        tld = topDomain.domain + '.' + topDomain.suffix
        add_url = "INSERT INTO crawl_db (url,tld,time) VALUES (%s,%s,%s)"
        add_url_data = (item['url'], tld, strftime("%Y-%m-%d %H:%M:%S", gmtime()))
        self.cursor.execute(add_url, add_url_data)
        self.connection.commit()
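And for the re-running / scheduling part, the wrapper below is the sort of thing I imagine, assuming the spider were changed to read its batch via a spider argument (for example an offset passed with -a) instead of building start_urls at class-definition time as it does now. This is only a sketch of the idea; I don't know whether launching scrapy crawl in a loop like this is the recommended approach, or whether something like Scrapyd or a cron job would be better for running it at a fixed interval:

# run_batches.py -- a hypothetical wrapper, not part of my current project.
import subprocess

import MySQLdb

BATCH_SIZE = 100


def total_websites():
    # Count how many websites there are in total, so the loop knows when to stop.
    con = MySQLdb.connect(host="localhost", user="user", passwd="pwd", db="db")
    cur = con.cursor()
    cur.execute("select count(*) from table")
    (count,) = cur.fetchone()
    cur.close()
    con.close()
    return count


offset = 0
total = total_websites()
while offset < total:
    # Each batch runs `scrapy crawl` in a fresh process, so the Twisted
    # reactor starts cleanly every time; the offset is passed as a spider
    # argument (the spider would need to be adapted to accept it).
    subprocess.call(["scrapy", "crawl", "first", "-a", "offset=%d" % offset])
    offset += BATCH_SIZE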
Thank you.