0

I am trying to crawl some data as a side project, but I am having a problem gathering it. I have been trying for two days without much luck.

First problem: When I crawl the post from the main page, I get the wrong token.

Second problem: I have read the Scrapy docs on requests and tried to implement one to get the phone number, and I have also tried this Stack Overflow answer, but in vain.

Third problem: How would I go about implementing the next page (the commented-out code inside gumtree.py)?

Fourth problem: I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]

I would really appreciate it if anyone could give me a direction. My main goal is to crawl posts that have phone numbers. I have tried searching Stack Overflow, but I couldn't find a relevant post. Many thanks.

settings.py

# Scrapy project settings for the "crawler" bot.
BOT_NAME = 'crawler'

# Modules where Scrapy looks for spiders, and where `genspider` creates new ones.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

# Browser-like User-Agent string sent with the crawler's requests.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

# Disable the telnet console extension.
TELNETCONSOLE_ENABLED = False

gumtree.py [UPDATED]

# -*- coding: utf-8 -*-
import re
import json
import scrapy

from scrapy import Request, Item, Field, Selector

def complete_link(string):
    """Return ``string`` exactly as given; no transformation is applied."""
    unchanged = string
    return unchanged

class MyItem(Item):
    """Item holding the values the Gumtree spider collects for one post."""
    # Value sent as the X-GUMTREE-TOKEN header when requesting the phone number.
    token = Field()
    # Trailing digits of the post URL.
    post_id = Field()
    # URL of the post page.
    post_url = Field()
    # 'data' value of the phone-reveal JSON response.
    phone_num = Field()
    # Declared but not assigned anywhere in the visible spider code.
    phone_url = Field()


class GumtreeSpider(scrapy.Spider):
    """Crawl Gumtree car listings and collect each post's phone number.

    Flow: search page (``parse``) -> post page (``parse_post``) ->
    phone-reveal AJAX endpoint (``parse_phone``), which emits the item.
    """

    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Yield one request per listing on a search page, then the next page.

        Each request carries the post URL (``meta['domain']``) and its own
        item (``meta['item']``) for ``parse_post`` to fill in.
        """
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = response.urljoin(href)
            # One item per post. Callbacks run later and interleaved, so a
            # single item shared by every request would be overwritten by
            # whichever callback ran last, producing duplicated/mismatched rows.
            item = MyItem()
            yield Request(
                domain,
                callback=self.parse_post,
                meta={'domain': domain, 'item': item},
            )

        # Follow pagination, if the page has a "next" link.
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            yield Request(response.urljoin(next_page), callback=self.parse)

    def parse_post(self, response):
        """Extract the post id and reveal token, then request the phone number.

        Yields nothing when the URL has no trailing numeric id or the token
        cannot be located in the page's scripts.
        """
        item = response.meta['item']
        item['post_url'] = response.meta['domain']

        # The post id is the run of digits at the very end of the post URL.
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if not post_id:
            return
        item['post_id'] = post_id.group(1)

        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        # Collect every double-quoted string from the matching script(s).
        arr_token = re.findall(r'"([^"]*)"', str(token))
        # NOTE(review): the count of 15 and the second-to-last position are
        # tied to the page markup at the time of writing - confirm they still hold.
        if len(arr_token) != 15:
            return
        item['token'] = arr_token[-2]

        yield Request(
            'https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'],
            headers={'X-GUMTREE-TOKEN': item['token']},
            callback=self.parse_phone,
            meta={'item': item},
        )

    def parse_phone(self, response):
        """Store the phone number from the JSON response and return the item."""
        item = response.meta['item']
        # response.text is the decoded body; body_as_unicode() is deprecated
        # in newer Scrapy releases.
        phone = json.loads(response.text)
        item['phone_num'] = phone['data']
        return item

results: [scrapy crawl gumtree -o ..\result.json]

{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Jonathan Hall
  • 75,165
  • 16
  • 143
  • 189
ombra23
  • 1
  • 1

2 Answers2

0

Have you checked that meta['item'] is actually being passed to parse_token()?

I'd do the following:

    meta = {'item': item}
    request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
    yield request

andjelx
  • 199
  • 1
  • 13
  • Hello andjelx, thank you for your reply, but I was able to fix the token part; that's why it had a "line-through" (First problem). – ombra23 Feb 06 '17 at 11:15
  • What did it have? Please give me an example – andjelx Feb 07 '17 at 12:01
  • Basically I need: "I am now able to get the phone numbers but I am getting repeated requests to the same url with different values." As you can see from the results on the main thread I got three requests to the same url but with different phones' numbers. I don't understand why. Many Thanks – ombra23 Feb 07 '17 at 15:53
0

I have found the solution.

# -*- coding: utf-8 -*-
import re, json, scrapy

from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector

# Site root; prefixed to listing hrefs and sent as the Referer header.
gumtree = 'https://www.gumtree.com'
# Phone-reveal endpoint prefix; the post id is appended to it.
getphone = gumtree + '/ajax/account/seller/reveal/number/'

class GumtreeSpider(scrapy.Spider):
    """Crawl Gumtree car listings and collect each post's phone number.

    Flow: search page (``parse``) -> post page (``parse_post``) ->
    phone-reveal AJAX endpoint (``parse_phone``), which emits the item.
    """

    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Yield one request per listing with a numeric id, then the next page.

        Each request carries the post URL (``meta['url']``), the post id
        (``meta['pid']``) and its own item (``meta['item']``).
        """
        for href in response.css('a.listing-link::attr(href)').extract():
            href = href.strip()
            # The post id is the run of digits at the very end of the href;
            # listings without one are skipped.
            matched = re.match('.*?([0-9]+)$', href)
            if not matched:
                continue

            # Keep the URL as text: encoding it to bytes breaks both the
            # str-pattern regex above and Request() on Python 3.
            url = gumtree + href
            # One item per post. Callbacks run later and interleaved, so a
            # single item shared by every request would have its fields
            # overwritten by whichever callback ran last.
            item = CrawlerItem()
            yield Request(
                url,
                callback=self.parse_post,
                meta={'url': url, 'item': item, 'pid': matched.group(1)},
                headers={'Referer': gumtree},
            )

        # Follow pagination, if the page has a "next" link.
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            yield Request(response.urljoin(next_page), callback=self.parse)

    def parse_post(self, response):
        """Extract the reveal token and return the phone-number request.

        Returns ``None`` when the token cannot be located in the page's scripts.
        """
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']

        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        # Collect every double-quoted string from the matching script(s).
        arr_token = re.findall(r'"([^"]*)"', str(token))
        # NOTE(review): the count of 15 and the second-to-last position are
        # tied to the page markup at the time of writing - confirm they still hold.
        if len(arr_token) != 15:
            return None
        item['token'] = arr_token[-2]

        return Request(
            getphone + item['post_id'],
            callback=self.parse_phone,
            headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': item['post_url']},
            meta={'url': response.meta['url'], 'item': item},
        )

    def parse_phone(self, response):
        """Store the phone number from the JSON response and return the item."""
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        # response.text is the decoded body; body_as_unicode() is deprecated
        # in newer Scrapy releases.
        phone = json.loads(response.text)
        # presumably 'data' is a string; the join is kept from the original
        # and would also concatenate a list of strings - verify.
        item['phone_num'] = u''.join(phone['data']).strip()
        return item
ombra23
  • 1
  • 1