# I am following the code from these previous Stack Overflow posts:
- How to schedule Scrapy crawl execution programmatically
- Running Scrapy multiple times in the same process
## The following script works well when using a single spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
# from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode
# Load the product list from YAML; each value is expected to carry an 'M_title' key.
with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as catalogue:
    PList = yaml.load(catalogue, Loader=yaml.FullLoader)

# Build one Amazon search URL per product, with the title URL-encoded
# as the "k" query parameter.
Purl = [
    'https://www.amazon.com/s?{}'.format(urlencode({"k": product['M_title']}))
    for product in PList.values()
]
print(Purl)

# Set the log line format, then create the runner from the project settings.
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings=get_project_settings())
@inlineCallbacks
def loop_urls(urls):
    """Crawl each URL in ``urls`` with AmazonfeedSpider, one after another.

    Every ``yield`` hands the Deferred returned by ``runner.crawl`` to
    ``inlineCallbacks``, so the next crawl starts only after the previous
    one has finished. This function does not stop the reactor.
    """
    for search_url in urls:
        crawl_finished = runner.crawl(AmazonfeedSpider, search_url)
        yield crawl_finished
def _stop_reactor(result):
    """Stop the reactor and pass the crawl result (or failure) through."""
    reactor.stop()
    # Returning the result keeps a failure unhandled on the Deferred, so
    # Twisted can still report it instead of it being silently dropped.
    return result


def _start_crawls():
    """Start the sequential crawls and stop the reactor when they end."""
    # addBoth runs on success and on failure, so reactor.run() below returns
    # in either case instead of blocking forever after the last crawl.
    loop_urls(Purl).addBoth(_stop_reactor)


# Kick the crawls off only once the reactor is running: if the crawl Deferred
# fires immediately (e.g. an empty URL list), reactor.stop() is still valid.
reactor.callWhenRunning(_start_crawls)
reactor.run()
enter code here
## But this script doesn't even scrape successfully with the first spider, and it never reaches the second spider:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode
# def crawl_job():
# """
# Job to start spiders.
# Return Deferred, which will execute after crawl has completed.
# """
# settings = get_project_settings()
# runner = CrawlerRunner(settings)
# return runner.crawl(AmazonfeedSpider)
@inlineCallbacks
def CrawlProduct():
    """Run ProductfeedSpider once, then stop the reactor.

    The body contains ``yield``, so without ``@inlineCallbacks`` calling this
    function merely creates a generator object and executes none of it: the
    crawl would never start and the reactor would never be stopped. With the
    decorator, calling the function drives the generator and returns a
    Deferred.
    """
    try:
        settings = get_project_settings()
        runner2 = CrawlerRunner(settings)
        # Wait here until the ProductfeedSpider crawl has finished.
        yield runner2.crawl(ProductfeedSpider)
    finally:
        # Stop the reactor even when the crawl fails, so reactor.run() returns.
        reactor.stop()
def schedule_next_crawl(null, sleep_time):
    """Arrange for CrawlProduct to be called after a delay.

    Args:
        null: Unused. When registered with ``addCallback`` this receives
            the result of the preceding step in the Deferred chain.
        sleep_time: Delay passed to ``reactor.callLater`` before
            CrawlProduct is called.
    """
    delay = sleep_time
    reactor.callLater(delay, CrawlProduct)
@inlineCallbacks
def loop_urls(urls):
    """Crawl every URL in ``urls`` with AmazonfeedSpider, sequentially.

    One CrawlerRunner, built from the project settings, is shared by all
    crawls. Because of ``inlineCallbacks`` the call returns a Deferred; each
    ``yield`` waits for the current crawl before the next one is started.
    The reactor is not stopped here.
    """
    runner = CrawlerRunner(get_project_settings())
    for search_url in urls:
        yield runner.crawl(AmazonfeedSpider, search_url)
def crawl(Purl):
    """Crawl all URLs in ``Purl`` and then schedule the follow-up crawl.

    loop_urls returns a Deferred. When it fires successfully,
    schedule_next_crawl is called with that result and a delay of 30;
    a failure raised earlier in the chain is passed to catch_error.
    """
    (
        loop_urls(Purl)
        .addCallback(schedule_next_crawl, 30)
        .addErrback(catch_error)
    )
def catch_error(failure):
    """Print the exception carried by ``failure``.

    Args:
        failure: Object exposing the underlying error as ``.value``
            (used here as a Deferred errback).
    """
    error = failure.value
    print(error)
def build_search_urls(products):
    """Return one Amazon search URL per product entry.

    Args:
        products: Mapping whose values are mappings with an ``'M_title'``
            key. ``None`` or an empty mapping (e.g. the result of loading
            an empty YAML file) yields an empty list.

    Returns:
        A list of ``https://www.amazon.com/s?k=...`` URLs, with each title
        URL-encoded, in the mapping's iteration order.

    Raises:
        KeyError: If an entry has no ``'M_title'`` key.
    """
    if not products:
        return []
    return [
        'https://www.amazon.com/s?{}'.format(urlencode({"k": entry['M_title']}))
        for entry in products.values()
    ]


if __name__ == "__main__":
    # NOTE(review): yaml.load with FullLoader is kept as in the original. If
    # this file only ever holds plain data, yaml.safe_load would be the safer
    # choice — confirm before switching.
    with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
        PList = yaml.load(file, Loader=yaml.FullLoader)

    Purl = build_search_urls(PList)
    print(Purl)

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    # crawl() only registers the work; nothing runs until the reactor starts.
    crawl(Purl)
    reactor.run()
# Is this because the inlineCallbacks function is not being executed properly? I would appreciate suggestions and solutions from experienced users. Please review the Stack Overflow questions and answers mentioned above before answering my question.