I have about 100 spiders on a server. Every morning all of the spiders start scraping and write their output to their log files. Sometimes a couple of them give me errors. When a spider throws an error I have to log in to the server and read its log file, but I would rather receive the logs by email.

I have already set up a dynamic mail sender as follows:

from scrapy import signals
from django.conf import settings as djsettings
from django.core.mail import send_mail


class FirstBotSpiderMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.stats)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider, reason):
        error_count = self.stats.get_value('log_count/ERROR')
        counts = self.stats.get_value('item_scraped_count')
        count_403 = self.stats.get_value('downloader/response_status_count/403')
        count_404 = self.stats.get_value('downloader/response_status_count/404')
        robots_404 = self.stats.get_value('robotstxt/response_status_count/404')
        robots_403 = self.stats.get_value('robotstxt/response_status_count/403')
        duplicate_count = self.stats.get_value('item_dropped_count')

        # I want to read all logs here

        content = "some stat string"

        self.mailSender(spider.name, content, logs)

    def mailSender(self, spider, content, logs):
        send_mail(
            "Scrapy " + spider + " done",
            content,
            djsettings.EMAIL_HOST_USER,
            ['xxx@xxx.com'],
        )

I couldn't figure out how to read the error log dynamically in the middleware's spider_closed handler. Do you have any suggestions?
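
One possible approach (a minimal sketch using only Python's standard logging module; the CollectingHandler name and the way it is attached are assumptions, not part of the setup above) is to buffer ERROR-level records in memory and read them back in spider_closed:

import logging

class CollectingHandler(logging.Handler):
    """Illustrative handler that keeps formatted ERROR records in memory."""

    def __init__(self):
        super().__init__(level=logging.ERROR)
        self.records = []

    def emit(self, record):
        # Store the formatted message instead of writing it anywhere.
        self.records.append(self.format(record))

# In spider_opened:
#     self.log_handler = CollectingHandler()
#     logging.getLogger().addHandler(self.log_handler)
# In spider_closed:
#     logs = "\n".join(self.log_handler.records)
#     logging.getLogger().removeHandler(self.log_handler)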

1 Answer

I have implemented a similar mechanism in my own web scraping module.

Below is the implementation; you can use it as a reference and adapt it to your project.

import gzip
import datetime

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3
 

def format_size(size):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)

        size /= 1024.0


class GzipCompressor(gzip.GzipFile):
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        super(GzipCompressor, self).__init__(
            fileobj=PlainCompressor(), mode='w')
        self.read = self.fileobj.read


class PlainCompressor(StringIO):
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        self.seek(0)

        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        return len(self.getvalue())


class StatusMailer(object):
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder()
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0
        self.start_time = datetime.datetime.now()

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped,
                                signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error,
                                signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed,
                                signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.num_items += 1
        self.files[spider.name + '.log'].write(
            '%d %s\n' % (self.num_items, response.url))
        self.files[spider.name + '-items.json'].write(
            self.encoder.encode(item))

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def spider_closed(self, spider, reason):
        files = []
        for name, compressed in self.files.items():
            files.append((name + compressed.extension,
                          compressed.mimetype, compressed))

        try:
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0

        body = '''Crawl statistics:

            - Spider name: {0}
            - Spider started at: {1}
            - Spider finished at: {2}
            - Number of items scraped: {3}
            - Number of errors: {4}
            - Size of scraped items: {5}'''.format(
            spider.name,
            self.start_time,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
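
For completeness, here is how such an extension could be wired up in settings.py (a minimal sketch, assuming the class above is saved as myproject/statusmailer.py; the module path and all values below are placeholders):

# settings.py -- example wiring, adjust paths and credentials to your project
EXTENSIONS = {
    'myproject.statusmailer.StatusMailer': 80,
}

STATUSMAILER_RECIPIENTS = ['xxx@xxx.com']
STATUSMAILER_COMPRESSION = 'gzip'   # leave unset for plain-text attachments

# MailSender.from_settings() picks up Scrapy's standard mail settings
MAIL_HOST = 'smtp.example.com'
MAIL_PORT = 587
MAIL_FROM = 'scrapy@example.com'
MAIL_USER = 'scrapy@example.com'
MAIL_PASS = 'secret'
MAIL_TLS = True

With this in place, the crawl statistics plus the per-spider log, traceback, and item attachments are emailed automatically whenever a spider closes.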