0

So I tried to apply the examples in this link but my output is strange:

My item:

class Artiste(scrapy.Item):
    url = scrapy.Field()
    nom = scrapy.Field()
    styles = scrapy.Field()

My Scrapy class:

class AnnuSpider(scrapy.Spider):
    name = "annu"
    start_urls = [
        'https://www.livetonight.fr/groupe-musique-dj',
    ]

    def parse(self, response):
        doc = Artiste()
        for artiste in response.css('.card-musician'):
            details_partial_link = artiste.css('a::attr(href)').get()
            doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
            doc['url'] = details_partial_link
            details_link = response.urljoin(details_partial_link)
            request = scrapy.Request(details_link, callback=self.parse_details)
            request.meta['item'] = doc
            print "NOM", doc['nom']
            yield request

    def parse_details(self, response):
        doc = response.meta['item']
        doc['styles'] = response.css('.show-overview-info').xpath('normalize-space(./p/text())')[0].get()
        return doc

So, instead of giving me 21 lines with each having their own nom,url,stylesI get 21 lines with the same (which is the last of the list) nomand urland the right styles.

Here is the full output:

[
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Folk / Rock"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Folk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Soul / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Rock / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Rock / Jazz"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Pop / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Blues / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Rock / Blues / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Funk / Soul / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Folk / Soul"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Jazz / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Jazz / Funk"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Musique du monde"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Guinguette / Swing"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Guinguette / Swing"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Jazz / Swing / Pop"},
{"url": "/groupe-musique-dj/4123-remi-dugue-trio", "nom": "R\u00e9mi Dugu\u00e9 Trio", "styles": "Pop / Funk / Dj"}
]

What's strange to me is the fact that if I take out the request, my output is perfect. Like with this code:

class AnnuSpider(scrapy.Spider):
    name = "annu"
    start_urls = [
        'https://www.livetonight.fr/groupe-musique-dj',
    ]

    def parse(self, response):
        doc = Artiste()
        for artiste in response.css('.card-musician'):
            details_partial_link = artiste.css('a::attr(href)').get()
            doc['nom'] = artiste.css('.card-musician-title-wrapper').xpath('normalize-space(./h4/text())').get()
            doc['url'] = details_partial_link
            details_link = response.urljoin(details_partial_link)
            yield doc
Max atton
  • 131
  • 10

1 Answers1

1

Try changing the doc declaration to inside the loop:

def parse(self, response):
        for artiste in response.css('.card-musician'):
           doc = Artiste()
           ...
Namba
  • 116
  • 1
  • 3