I just need to understand How can I detect whether scrapy saved and item in spider ? I'm fetching items from a site and after that I'm fetching comments on that item. So first I have to save the item after that I'll save comments. But when I'm writing code after yield it's giving me this error.
save() prohibited to prevent data loss due to unsaved related object ''.
And this is my code
def parseProductComments(self, response):
name = response.css('h1.product-name::text').extract_first()
price = response.css('span[id=offering-price] > span::text').extract_first()
node = response.xpath("//script[contains(text(),'var utagData = ')]/text()")
data = node.re('= (\{.+\})')[0] #data = xpath.re(" = (\{.+\})")
data = json.loads(data)
barcode = data['product_barcode']
objectImages = []
for imageThumDiv in response.css('div[id=productThumbnailsCarousel]'):
images = imageThumDiv.xpath('img/@data-src').extract()
for image in images:
imageQuality = image.replace('/80/', '/500/')
objectImages.append(imageQuality)
company = Company.objects.get(pk=3)
comments = []
item = ProductItem(name=name, price=price, barcode=barcode, file_urls=objectImages, product_url=response.url,product_company=company, comments = comments)
yield item
print item["pk"]
for commentUl in response.css('ul.chevron-list-container'):
url = commentUl.css('span.link-more-results::attr(href)').extract_first()
if url is not None:
for commentLi in commentUl.css('li.review-item'):
comment = commentLi.css('p::text').extract_first()
commentItem = CommentItem(comment=comment, product=item.instance)
yield commentItem
else:
yield scrapy.Request(response.urljoin(url), callback=self.parseCommentsPages, meta={'item': item.instance})
And this is my pipeline.
def comment_to_model(item):
model_class = getattr(item, 'Comment')
if not model_class:
raise TypeError("Item is not a `DjangoItem` or is misconfigured")
def get_comment_or_create(model):
model_class = type(model)
created = False
# Normally, we would use `get_or_create`. However, `get_or_create` would
# match all properties of an object (i.e. create a new object
# anytime it changed) rather than update an existing object.
#
# Instead, we do the two steps separately
try:
# We have no unique identifier at the moment; use the name for now.
obj = model_class.objects.get(product=model.product, comment=model.comment)
except model_class.DoesNotExist:
created = True
obj = model # DjangoItem created a model for us.
obj.save()
return (obj, created)
def get_or_create(model):
model_class = type(model)
created = False
# Normally, we would use `get_or_create`. However, `get_or_create` would
# match all properties of an object (i.e. create a new object
# anytime it changed) rather than update an existing object.
#
# Instead, we do the two steps separately
try:
# We have no unique identifier at the moment; use the name for now.
obj = model_class.objects.get(product_company=model.product_company, barcode=model.barcode)
except model_class.DoesNotExist:
created = True
obj = model # DjangoItem created a model for us.
obj.save()
return (obj, created)
def update_model(destination, source, commit=True):
pk = destination.pk
source_dict = model_to_dict(source)
for (key, value) in source_dict.items():
setattr(destination, key, value)
setattr(destination, 'pk', pk)
if commit:
destination.save()
return destination
class ProductItemPipeline(object):
def process_item(self, item, spider):
if isinstance(item, ProductItem):
item['cover_photo'] = item['files'][0]['path']
item_model = item.instance
model, created = get_or_create(item_model)
#update_model(model, item_model)
if created:
for image in item['files']:
imageItem = ProductImageItem(image=image['path'], product=item.instance)
imageItem.save()
# for comment in item['comments']:
# commentItem = CommentItem(comment=comment, product= item.instance)
# commentItem.save()
return item
if isinstance(item, CommentItem):
comment_to_model = item.instance
model, created = get_comment_or_create(comment_to_model)
if created:
print model
else:
print created
return item