The following code is a modification of the soupselect demo example. It basically fetches some html and prints a list of links and stores them in a variable:
crawl = function(host)
var select = require('soupselect').select,
htmlparser = require("htmlparser"),
http = require('http'),
sys = require('sys');
// fetch some HTML...
var http = require('http');
var client = http.createClient(80, host);
var request = client.request('GET', '/',{'host': host});
var newPages = []
request.on('response', function (response) {
response.setEncoding('utf8');
var body = "";
response.on('data', function (chunk) {
body = body + chunk;
});
response.on('end', function() {
// now we have the whole body, parse it and select the nodes we want...
var handler = new htmlparser.DefaultHandler(function(err, dom) {
if (err) {
sys.debug("Error: " + err);
} else {
// soupselect happening here...
var titles = select(dom, 'a.title');
sys.puts("Top stories from reddit");
titles.forEach(function(title) {
sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
newPages.push(title.attribs.href);
})
}
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(body);
});
});
request.end();
}
what i really want is for this function to return newPages
i want to be able to say newPages = crawl(host)
; Trouble is im not sure if this makes sense or where to put the return statement. I see that newPages exists before the request is ended but is empty after the request is over.
How do i make that function have a return value that is newPages
?