0

Some parent pages contain anchors to child pages that I need. I want to crawl all the parent pages, parse them, extract each child anchor, follow it, and collect the result. But when I wrote the code, I found that inside the callback of the follow-up request the anchor URL does not change from one request to the next. Here's my code:

var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')


/**
 * Crawl one listing page and request every child link found on it.
 *
 * Requests the listing URL built from `index`. If the request succeeds with
 * status 200 and the body does not contain the marker text 暂无内容, the
 * children of the first <div> are walked, the first `li>a` href of each child
 * is requested, and the function calls itself again with `index + 20`.
 * On an error, a non-200 status, or when the marker is present, nothing more
 * is requested and the chain simply ends.
 *
 * NOTE: this version shows the closure problem discussed in the question —
 * see the comments at TAG 1 / TAG 2.
 *
 * @param {number} index Value substituted into the `divid=reload_<index>` query parameter.
 */
function callnext(index){
    var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
    // The value returned by req.get is stored but never read.
    var result = req.get(url, function(error, response, body){
        if (!error && response.statusCode == 200) {
            var patt = /暂无内容/g;
            // This `result` is local to the callback and shadows the outer `result`.
            var result = patt.test(body);
            if(result){
                // Marker found: stop crawling.
                return;
            }
            // `$` is assigned without var/let/const, so it becomes an implicit global.
            $ = cheerio.load(body);
            var children = $('div').first().children();
            for(var i=0;i<children.length;i++){
                var item = $(children[i]);
                // `var` is function-scoped, not block-scoped: there is exactly ONE
                // `anchor` variable for this whole callback, and each iteration
                // overwrites it.
                var anchor = $(item.find('li>a')[0]).attr('href');
                // labelText is computed but not used in this version.
                var labelText = $(item.find('label')[0]).text();
                //TAG 1
                // Here `anchor` still holds the value of the current iteration.
                req.get(anchor, function(error, response, body){
                    //TAG 2
                    // This callback closes over the shared `anchor` variable, not a
                    // copy of its value. It runs when the response arrives, which
                    // (as observed in the question) is after the loop has finished,
                    // so every callback logs whatever value the loop assigned last.
                    console.log(anchor);
                    //here's my result
                })
            }
            // Move on to the next listing page.
            index = index+20;
            callnext(index)
        }
    })
}
// Kick off the crawl with index 1.
callnext(1);

In this code, if I console.log() the anchor URL at TAG 1 and at TAG 2, I get different results. At TAG 1 it is the result I expect, but at TAG 2 it seems to print only the first anchor of the parent page.

I tried changing the code by extracting the sub-request into its own function, and then it gives the right result. Why?

var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')

/**
 * Fetch one child page and read the text of its `#text_long` element.
 *
 * Because `url` is a parameter, every call has its own binding, so the
 * callback logs the URL that was actually requested even though it runs
 * after the caller's loop has moved on.
 *
 * @param {string} url  Address of the child page to request.
 * @param {string} text Label taken from the parent page; not used yet.
 */
function crawlItem(url, text){
    req.get(url, function(error, response, body){
        // Same success test as callnext: without it a failed request would hand
        // an undefined body to cheerio.load.
        if (error || !response || response.statusCode != 200) {
            console.error('crawlItem: request failed for ' + url, error || (response && response.statusCode));
            return;
        }
        console.log(url);
        var inner = cheerio.load(body);
        // Named `description` so it no longer shadows the `text` parameter.
        var description = inner('#text_long').text();
        // model.Talk.create({ id: la, video: hr, youku_desc:description }).complete(function(err, album) {
        //  console.log(err);
        // });
    });
}

/**
 * Crawl one listing page, hand every child link to crawlItem, then continue
 * with the next listing page.
 *
 * The crawl ends when a request fails, returns a non-200 status, or the body
 * contains the marker text 暂无内容.
 *
 * @param {number} index Value substituted into the `divid=reload_<index>` query
 *                       parameter; each follow-up call uses `index + 20`.
 */
function callnext(index){
    var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
    req.get(url, function(error, response, body){
        if (error || !response || response.statusCode != 200) {
            // Report the failure instead of ending the crawl silently.
            console.error('callnext: request failed for ' + url, error || (response && response.statusCode));
            return;
        }
        // Marker text in the body ends the crawl (presumably sent once no
        // entries remain — confirm against the site).
        if (/暂无内容/.test(body)) {
            return;
        }
        // Declared locally: assigning to an undeclared `$` creates an implicit
        // global and throws a ReferenceError in strict mode / ES modules.
        var $ = cheerio.load(body);
        var children = $('div').first().children();
        for (var i = 0; i < children.length; i++) {
            var item = $(children[i]);
            var anchor = $(item.find('li>a')[0]).attr('href');
            var labelText = $(item.find('label')[0]).text();
            // A child without a link has nothing to follow.
            if (!anchor) {
                continue;
            }
            // Passing the values as arguments gives each request its own copy,
            // so the asynchronous callback inside crawlItem sees the right URL.
            crawlItem(anchor, labelText);
        }
        callnext(index + 20);
    });
}

// Kick off the crawl with index 1.
callnext(1);
yudun1989
  • 996
  • 9
  • 23
  • 1
    Looks like this common problem http://stackoverflow.com/questions/750486/javascript-closure-inside-loops-simple-practical-example – elclanrs Aug 30 '14 at 01:03
  • @elclanrs In that question, as I understand it, `i` acts like a pointer: its value no longer changes after the loop, and the anonymous functions reference `i`, so they all produce the same result. In my first program, `anchor` is set to a new value on each iteration, so why does it produce the same result? – yudun1989 Aug 30 '14 at 01:36
  • @elclanrs Are there any documents that explain this closure problem? – yudun1989 Aug 30 '14 at 01:40
  • @osrpt 感谢,我的意思是,刚那个问题,那个same variable是i,但是我这个例子里面,每次循环实际上我都重新赋值了一个新变量anchor,那为什么req.get的时候还会出现相同的结果。换句话说,为什么TAG2,那里,req.get总是在循环结束之后才会执行? – yudun1989 Aug 30 '14 at 03:50
  • 是一样的,匿名函数将引用同一个anchor,如果anchor在后面的循环中被修改了,那么之前的匿名函数也将受到影响。i的作用域和anchor一样,都在整个 for 循环中。如果一定要这样写在循环里面,可以使用 `(function(anchor){console.log(anchor);})(anchor);` 这种方式。不建议在循环中写function,function本质上也是Object,这样将导致产生大量的Object,对性能不利。 – yangsibai Aug 30 '14 at 04:30

0 Answers0