
# I am following code from these previous Stack Overflow posts:

  1. How to schedule Scrapy crawl execution programmatically
  2. Running Scrapy multiple times in the same process
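
For context, the core pattern both posts rely on (as I understand it) is chaining crawls through a single CrawlerRunner inside an @inlineCallbacks function, so the reactor is started only once. A stripped-down sketch with placeholder spiders, not my actual project code:

```python
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl_in_sequence():
    # each runner.crawl(...) returns a Deferred; yielding it waits for that
    # crawl to finish before the next line runs
    yield runner.crawl(FirstSpider)    # FirstSpider is a placeholder
    yield runner.crawl(SecondSpider)   # SecondSpider is a placeholder
    reactor.stop()

crawl_in_sequence()
reactor.run()
```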

## The following script works well when using a single spider:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
# from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode
with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
    PList = yaml.load(file, Loader=yaml.FullLoader)

Purl= []
for k, v in PList.items():
    arg = v['M_title']
    args = {"k": arg}
    amazon_url= 'https://www.amazon.com/s?{}'.format(urlencode(args))
    Purl.append(amazon_url)

print(Purl)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings = get_project_settings())

@inlineCallbacks
def loop_urls(urls):
    # runner.crawl() returns a Deferred; yielding it makes each crawl finish
    # before the next URL is started
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()

loop_urls(Purl)
reactor.run()


## But this script does not scrape successfully even with the first spider, and never reaches the second spider:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from AmazonScrap.spiders.Amazonfeed import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from twisted.internet.defer import inlineCallbacks
from urllib.parse import urlparse
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import yaml
from urllib.parse import urlencode

# def crawl_job():
#     """
#     Job to start spiders.
#     Return Deferred, which will execute after crawl has completed.
#     """
#     settings = get_project_settings()
#     runner = CrawlerRunner(settings)
#     return runner.crawl(AmazonfeedSpider)
def CrawlProduct():
    settings = get_project_settings()
    runner2 = CrawlerRunner(settings)
    yield runner2.crawl(ProductfeedSpider)
    reactor.stop()
    
def schedule_next_crawl(null, sleep_time):
    """
    Schedule the next crawl
    """
    reactor.callLater(sleep_time, CrawlProduct)

@inlineCallbacks
def loop_urls(urls):
    """
#     Job to start spiders.
#     Return Deferred, which will execute after crawl has completed.
#     """
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for url in urls:
        yield runner.crawl(AmazonfeedSpider, url)
    # reactor.stop()

def crawl(Purl):
    """
    A function that schedules a crawl 30 seconds after
    each successful crawl.
    """
    # loop_urls() returns a Deferred
    d = loop_urls(Purl)
    # call schedule_next_crawl(<scrapy response>, n) after crawl job is complete
    d.addCallback(schedule_next_crawl, 30)
    d.addErrback(catch_error)

def catch_error(failure):
    print(failure.value)

if __name__=="__main__":
    with open(r'C:\Users\Latitude\Desktop\Shadman\Scrapy_Projects\Product_List.yaml') as file:
        PList = yaml.load(file, Loader=yaml.FullLoader)

    Purl= []
    for k, v in PList.items():
        arg = v['M_title']
        args = {"k": arg}
        amazon_url= 'https://www.amazon.com/s?{}'.format(urlencode(args))
        Purl.append(amazon_url)

    print(Purl)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawl(Purl)
    reactor.run()

# Is the problem that the inlineCallbacks function is not being executed properly? I would appreciate suggestions and solutions from anyone with experience here; please have a look at the Stack Overflow questions linked above before answering my question.
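
One thing I noticed while re-reading the second script: CrawlProduct contains a yield but is not decorated with @inlineCallbacks, so I suspect reactor.callLater(sleep_time, CrawlProduct) only creates a generator object that is never iterated. A sketch of what I think the decorated version should look like (untested, please correct me if this guess is wrong):

```python
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from AmazonScrap.spiders.Productfeed import ProductfeedSpider

@inlineCallbacks
def CrawlProduct():
    settings = get_project_settings()
    runner2 = CrawlerRunner(settings)
    # with the decorator, this yield actually waits for the crawl's Deferred
    yield runner2.crawl(ProductfeedSpider)
    reactor.stop()
```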

  • Are you calling loop_urls from the second code block, or from the first code block? – Fallenreaper Jan 23 '20 at 17:11
  • First I call the crawl() function from the main block; after that, crawl() calls the loop_urls method to start the crawling process, running the first spider on the URLs one after another – Shadman Nashif Jan 25 '20 at 05:05
  • loop_urls is a generator though, so are you properly looping over its contents? I see you calling crawl, then I see loop_urls, but I don't see you looping over the resulting 'd' to get the results for all the URLs. What does list(d) do after `d.addErrback`? Try it to see if it will loop the generator and turn it into a list. If it runs, that was your issue and you should look into how you want the generator to be processed. list(d) isn't necessarily the right answer, but it might show you that the generator isn't being processed. – Fallenreaper Jan 27 '20 at 17:23
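
For reference on the last comment: with the @inlineCallbacks decorator applied, loop_urls(Purl) returns a Deferred rather than a plain generator, so list(d) would not iterate crawl results. A minimal, project-independent sketch of the difference:

```python
from twisted.internet import defer

def plain_gen():
    # without the decorator, calling this only builds a generator object
    yield defer.succeed(1)

@defer.inlineCallbacks
def decorated():
    # the decorator drives the generator and wraps the outcome in a Deferred
    result = yield defer.succeed(1)
    return result

print(type(plain_gen()))   # <class 'generator'> -- nothing has run
print(type(decorated()))   # <class 'twisted.internet.defer.Deferred'>
```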

0 Answers