Sorry for the long code. This is a spider that recursively downloads all the HTML pages of a site. My question is: why is the callback in the `iterate`
function always this function:
(err) => {
if (err) {
console.error(err);
}
console.log("All files downloaded");
}
which is what actually allows it to exit the recursion? I thought the callback would be the following, but of course that would create an infinite recursion:
(err) => {
if (err) {
return callback(err);
}
iterate(index + 1);
}
const request = require("request");
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
/**
 * Maps a page URL to the relative path of the local file it is stored under.
 *
 * The directory is the hostname without its last dot-suffix
 * ("example.com" -> "example"). The file is the URL path with the extension of
 * its last segment replaced by ".html"; the site root is stored as
 * "<site>/<site>.html". Query string and fragment are ignored.
 *
 * @param {string} url - Absolute URL. `new URL(url)` throws a TypeError when
 *   the value cannot be parsed.
 * @returns {string} Relative file path, e.g. "example/docs/page.html".
 */
function urlToFilename(url) {
  // Removes a trailing ".ext" from the last "/"-separated segment only.
  // A name without a dot is returned unchanged (a bare slice(0, lastIndexOf("."))
  // would cut off its last character), and a segment that merely starts with a
  // dot is not treated as an extension.
  const stripExtension = (name) => {
    const lastDot = name.lastIndexOf(".");
    const lastSlash = name.lastIndexOf("/");
    return lastDot > lastSlash + 1 ? name.slice(0, lastDot) : name;
  };

  const { hostname, pathname } = new URL(url);
  const site = stripExtension(hostname);

  // Dropping empty segments removes the leading/trailing slashes, so the
  // result never contains "//" and "/docs/" maps to the same file as "/docs".
  const segments = pathname.split("/").filter((segment) => segment !== "");
  const page = stripExtension(segments.join("/"));

  return page === "" ? `${site}/${site}.html` : `${site}/${page}.html`;
}
/**
 * Collects the links in an HTML page that point to the same host as the page.
 *
 * Every `<a href>` is resolved against the page URL, so relative links
 * ("/about", "next.html") are followed as well as absolute ones. Fragments are
 * removed and duplicates are dropped; first-seen order is kept.
 *
 * @param {string} url - Absolute URL of the page `body` was loaded from.
 * @param {string} body - HTML source of the page.
 * @returns {string[]} Absolute URLs (normalized by `URL`) on the page's host.
 */
function getPageLinks(url, body) {
  const $ = cheerio.load(body);
  const pageUrl = new URL(url);
  const links = new Set();

  $("a").each((i, elem) => {
    const href = $(elem).attr("href");
    if (!href) {
      // Anchor without a target: nothing to follow.
      return;
    }

    let target;
    try {
      // Without a base, `new URL` throws on every relative href and the link
      // would be lost; resolving against the page keeps it.
      target = new URL(href, pageUrl);
    } catch (err) {
      // Best effort: an href that cannot be parsed is deliberately skipped.
      return;
    }

    if (target.hostname !== pageUrl.hostname) {
      return;
    }

    // "#section" variants of a page are the same document; store it once.
    target.hash = "";
    links.add(target.href);
  });

  return [...links];
}
/**
 * Writes `contents` to `filename`, creating any missing parent directories.
 *
 * @param {string} filename - Path of the file to write.
 * @param {string|Buffer} contents - Data handed to `fs.writeFile`.
 * @param {(err: Error|null) => void} callback - Called with the mkdir error,
 *   or otherwise used directly as the `fs.writeFile` callback.
 */
function saveFile(filename, contents, callback) {
  const targetDir = path.dirname(filename);

  const onDirectoryReady = (mkdirError) => {
    if (!mkdirError) {
      fs.writeFile(filename, contents, callback);
      return;
    }
    callback(mkdirError);
  };

  fs.mkdir(targetDir, { recursive: true }, onDirectoryReady);
}
/**
 * Fetches `url` with `request` and stores the response body via `saveFile`.
 *
 * @param {string} url - Address to fetch.
 * @param {string} filename - Path the body is saved to.
 * @param {(err: Error|null, body?: string) => void} callback - Receives the
 *   request or save error, or `null` and the downloaded body on success.
 */
function download(url, filename, callback) {
  console.log(`Downloading ${url}`);

  const onResponse = (requestError, response, body) => {
    if (requestError) {
      callback(requestError);
      return;
    }

    const onSaved = (saveError) => {
      if (saveError) {
        callback(saveError);
        return;
      }
      console.log(`Downloaded and saved: ${url}`);
      callback(null, body);
    };

    saveFile(filename, body, onSaved);
  };

  request(url, onResponse);
}
/**
 * Spiders the links of a page one after another.
 *
 * @param {string} currentUrl - URL of the page `body` belongs to.
 * @param {string} body - HTML of the page.
 * @param {number} nesting - Remaining depth; at 0 no link is followed and
 *   `callback` is scheduled with `process.nextTick`.
 * @param {(err?: Error) => void} callback - Called without arguments once every
 *   link has been processed, or with the first error reported for a link.
 */
function spiderLinks(currentUrl, body, nesting, callback) {
  if (nesting === 0) {
    process.nextTick(callback);
    return;
  }

  const links = getPageLinks(currentUrl, body);
  const childNesting = nesting - 1;

  // Handles links[position]; its completion callback moves on to the next
  // link, so the links are processed strictly in sequence.
  const visit = (position) => {
    if (position === links.length) {
      callback();
      return;
    }

    spiderNested(links[position], childNesting, (err) => {
      if (err) {
        callback(err);
        return;
      }
      visit(position + 1);
    });
  };

  visit(0);
}
/**
 * Spiders one page: uses the local copy when it exists, downloads the page
 * otherwise, then continues with the page's links.
 *
 * @param {string} url - Absolute URL of the page.
 * @param {number} nesting - Depth passed on to `spiderLinks`.
 * @param {(err?: Error) => void} callback - Passed on to `spiderLinks`; called
 *   directly with the error when reading (other than ENOENT) or downloading
 *   fails.
 */
function spiderNested(url, nesting, callback) {
  const filename = urlToFilename(url);
  const followLinks = (html) => spiderLinks(url, html, nesting, callback);

  fs.readFile(filename, "utf-8", (readError, savedBody) => {
    if (!readError) {
      // Already on disk: no download needed.
      followLinks(savedBody);
      return;
    }

    if (readError.code !== "ENOENT") {
      callback(readError);
      return;
    }

    // The file does not exist yet, so fetch the page first.
    download(url, filename, (downloadError, downloadedBody) => {
      if (downloadError) {
        callback(downloadError);
        return;
      }
      followLinks(downloadedBody);
    });
  });
}
// Entry point: spider the URL given as the first command-line argument, one
// level deep.
if (process.argv[2] === undefined) {
  // Without this check urlToFilename() would throw a bare "Invalid URL" TypeError.
  console.error("Usage: node <script> <start-url>");
  process.exitCode = 1;
} else {
  // This is the outermost callback. spiderNested passes it to spiderLinks for
  // the start page, which calls it once all links of that page are done or as
  // soon as one fails. Pages reached through a link get spiderLinks' own
  // "continue with the next link" callback instead.
  spiderNested(process.argv[2], 1, (err) => {
    if (err) {
      console.error(err);
      // Report failure through the exit status and do not claim success.
      process.exitCode = 1;
      return;
    }
    console.log("All files downloaded");
  });
}