In some parent pages, there are some child page anchors I need.I want to crawl all the parent pages, parse them, then get the child anchor, follow the anchor, and get the result.but when i write the code, i found, before i follow the anchor, the anchor url didn't change.here's my code:
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
//TAG 1
req.get(anchor, function(error, response, body){
//TAG 2
console.log(anchor);
//here's my result
})
}
index = index+20;
callnext(index)
}
})
}
callnext(1);
In this code, if i console.log() the anchor url at TAG1 place and TAG2 place, it cames different result. in TAG 1, it's my expected result, but at TAG 2,it seems only printout the first anchor of the parent page.
i tried to changed the code and extract the sub request function, the cames the right result.why?
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function crawlItem(url, text){
req.get(url, function(error, response, body){
console.log(url)
var inner = cheerio.load(body);
var text = inner('#text_long').text();
// model.Talk.create({ id: la, video: hr, youku_desc:text }).complete(function(err, album) {
// console.log(err);
// });
})
}
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
// console.log(anchor);
crawlItem(anchor, labelText);
}
index = index+20;
callnext(index)
}
})
}
callnext(1);