I am writing my own Scrapy - Item Pipeline, in order to output individual JSON files into S3. This is my code so far, but I can't work out how to serialize each item into JSON.
NOTE: This is a question on how to serialize a scrapy.Item object not a general question on how to serialize an object.
def process_item(self, item, spider):
s3_conn = boto.connect_s3(spider.settings.get('AWS_ACCESS_KEY_ID'), spider.settings.get('AWS_SECRET_ACCESS_KEY'))
bucket = s3_conn.get_bucket(spider.settings.get('AWS_S3_BUCKET'))
url_path = item['path']
key = boto.s3.key.Key(bucket, "crawls/" base64.b64encode(url_path) + ".json")
serialized = json.dumps(item)
key.set_contents_from_string(serialized)
return item
However, the above code gives me:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 651, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/cetinick/Projects/cmlsocialbot/lib/spider/spider/pipelines.py", line 23, in process_item
serialized = json.dumps(item)
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 244, in dumps
return _default_encoder.encode(obj)
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py", line 207, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py", line 270, in iterencode
return _iterencode(o, 0)
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py", line 184, in default
raise TypeError(repr(o) + " is not JSON serializable")
TypeError: {'description': None,
'h1s': [u'Example Domain'],
'h2s': [],
'h3s': [],
'h4s': [],
'h5s': [],
'images': [],
'inbound_links': [],
'keywords': [(u'domain', 2),
(u'examples', 2),
(u'established', 1),
(u'documents', 1),
(u'permission', 1),
(u'prior', 1),
(u'coordination', 1),
(u'illustrative', 1)],
'keywords_count': 10,
'outbound_links': [{'nofollow': False,
'text': 'More information...',
'url': 'http://www.iana.org/domains/example'}],
'path': '',
'title': u'Example Domain',
'url': 'http://example.com',
'words_count': 28} is not JSON serializable
items.py
class ItemLink(scrapy.Item):
url = scrapy.Field()
text = scrapy.Field()
nofollow = scrapy.Field()
class ItemImage(scrapy.Item):
src = scrapy.Field()
alt = scrapy.Field()
title = scrapy.Field()
class SpiderPage(scrapy.Item):
url = scrapy.Field()
path = scrapy.Field()
title = scrapy.Field()
description = scrapy.Field()
h1s = scrapy.Field()
h2s = scrapy.Field()
h3s = scrapy.Field()
h4s = scrapy.Field()
h5s = scrapy.Field()
keywords_count = scrapy.Field()
words_count = scrapy.Field()
keywords = scrapy.Field()
outbound_links = scrapy.Field(serializer=ItemLink)
inbound_links = scrapy.Field(serializer=ItemLink)
images = scrapy.Field(serializer=ItemImage)