I am completely stumped on this bug. I'm trying to pull all of the menu items from this page: https://www.alloresto.fr/restaurant-livraison-a-domicile/restaurant/pizza-mia/angers-centre-ville/particuliers/carte. When the spider reaches the innermost for loop, it goes through only one iteration and then moves on to the next menu section. This is very unexpected and I don't know what's causing it. The following is the function dedicated to parsing this page.
def get_menu(self, response):
    image_url = response.urljoin(response.xpath('//span/img/@src').extract_first())
    for menu_section in response.xpath("//div[@id = 'contenu_choixplats']/div"):
        menu_section_name = menu_section.xpath('dl/dt/text()').extract_first()
        for menu_item in menu_section.xpath('ul/li'):
            item = Restaurant()
            item['restaurant_url'] = response.url
            item['restaurant_name'] = response.request.meta['restaurant_name']
            item['street_name'] = response.request.meta['street_name']
            item['street_number'] = response.request.meta['street_number']
            item['city'] = response.request.meta['city']
            item['zip_code'] = response.request.meta['zip_code']
            item['food_type'] = response.request.meta['food_type']
            item['image_urls'] = [image_url]
            item['menu_category'] = menu_section_name
            item['menu_item_title'] = menu_item.xpath('div/h3/text()').extract()
            item['menu_item_details'] = menu_item.xpath('div/p/text()').extract_first()
            item['menu_item_price'] = menu_item.xpath('div').css('div.product-price-with-offer').xpath('p/text()').extract_first()
            yield item
Do you see anything that I am missing? Thank you for your time.
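In case it helps, here is a minimal check that could be run in scrapy shell against the carte URL above, just to count how many nodes each of those XPaths actually matches; the XPaths are copied from get_menu, the variable names are only for the shell session, and this assumes the site serves the same HTML to Scrapy as it does to a browser:

# scrapy shell "https://www.alloresto.fr/restaurant-livraison-a-domicile/restaurant/pizza-mia/angers-centre-ville/particuliers/carte"
sections = response.xpath("//div[@id = 'contenu_choixplats']/div")
len(sections)                               # sections the outer loop would iterate over
[len(s.xpath('ul/li')) for s in sections]   # li nodes per section, i.e. inner-loop iterations

If that second expression comes back as all 1s or as an empty list, then the ul/li selection itself only matches one node per section; if the counts look right there, the problem is somewhere else.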
---UPDATE---
This is the complete code; I am supplying it in case the problem lies outside of the get_menu function. Note that get_menu is only reached after following links two pages deep from the site's index.

spiders/alloresto_spider.py

import scrapy
import re

from french_scraping.items import Restaurant

class DmozSpider(scrapy.Spider):
    name = "alloresto"
    allowed_domains = ['alloresto.fr']
    start_urls = ["https://www.alloresto.fr/livraison/villes/"]

    def parse(self, response):
        # follow every city link on the index of cities
        for sel in response.xpath('//ul/li/a/@href'):
            url = response.urljoin(sel.extract())
            yield scrapy.Request(url, callback=self.restaurants_for_this_city)

    def restaurants_for_this_city(self, response):
        for restaurant in response.xpath('//article/div'):
            restaurant_url = response.urljoin(restaurant.xpath('a/@href').extract_first())
            restaurant_name = restaurant.xpath('div/section[@class="restaurantDetails"]/h3/a/text()').extract_first()
            full_address = restaurant.xpath('div/section[@class="restaurantDetails"]/address/text()').extract_first()
            extracts = re.search(r'^([\d-]*?)\W(.*?),\W(.*?)\W(\d\d\d\d\d)', full_address)
            try:
                street_number = extracts.group(1)
            except AttributeError:
                # the address did not match the expected pattern; skip this restaurant
                continue
            street_name = extracts.group(2)
            city = extracts.group(3)
            zip_code = extracts.group(4)
            food_type = restaurant.xpath('div/section/p').css('.restaurantCuisines').xpath('text()').extract()
            meta_data = {
                'restaurant_url': restaurant_url,
                'restaurant_name': restaurant_name,
                'street_number': street_number,
                'street_name': street_name,
                'city': city,
                'zip_code': zip_code,
                'food_type': food_type}
            yield scrapy.Request(restaurant_url, meta=meta_data, callback=self.get_menu)
        # get info on next page
        next_page = response.css('.next').xpath('a/@href').extract()
        if len(next_page) > 0:
            url = response.urljoin(next_page[0])
            yield scrapy.Request(url, callback=self.restaurants_for_this_city)

    def get_menu(self, response):
        image_url = response.urljoin(response.xpath('//span/img/@src').extract_first())
        for menu_section in response.xpath("//div[@id = 'contenu_choixplats']/div"):
            menu_section_name = menu_section.xpath('dl/dt/text()').extract_first()
            for menu_item in menu_section.xpath('ul/li'):
                item = Restaurant()
                item['restaurant_url'] = response.url
                item['restaurant_name'] = response.request.meta['restaurant_name']
                item['street_name'] = response.request.meta['street_name']
                item['street_number'] = response.request.meta['street_number']
                item['city'] = response.request.meta['city']
                item['zip_code'] = response.request.meta['zip_code']
                item['food_type'] = response.request.meta['food_type']
                item['image_urls'] = [image_url]
                item['menu_category'] = menu_section_name
                item['menu_item_title'] = menu_item.xpath('div/h3/text()').extract()
                item['menu_item_details'] = menu_item.xpath('div/p/text()').extract_first()
                item['menu_item_price'] = menu_item.xpath('div').css('div.product-price-with-offer').xpath('p/text()').extract_first()
                yield item
items.py
import scrapy


class Restaurant(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    restaurant_url = scrapy.Field()
    street_number = scrapy.Field()
    restaurant_name = scrapy.Field()
    street_name = scrapy.Field()
    city = scrapy.Field()
    zip_code = scrapy.Field()
    food_type = scrapy.Field()
    menu_category = scrapy.Field()
    menu_item_title = scrapy.Field()
    menu_item_details = scrapy.Field()
    menu_item_price = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
BOT_NAME = 'french_scraping'
SPIDER_MODULES = ['french_scraping.spiders']
NEWSPIDER_MODULE = 'french_scraping.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/Users/drew/Desktop/frenchscraping/french_scraping'
pipelines.py
class FrenchScrapingPipeline(object):
    def process_item(self, item, spider):
        return item