0

Ok, sorry for the long code. This is a spider that recursively downloads all the HTML pages from a site. My question is: why is the callback in the `iterate` function always this function:

(err) => {
  if (err) {
    console.error(err);
  }

  console.log("All files downloaded");
}

which actually allows it to exit the recursion. I thought the callback would be like this, but of course it would create an infinite recursion:

(err) => {
      if (err) {
        return callback(err);
      }
      iterate(index + 1);
}
const request = require("request");
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");

/**
 * Maps a page URL to the relative path of the local `.html` file it is saved as.
 *
 * The directory is the hostname without its last dot-separated label
 * (`www.example.com` -> `www.example`). The site root is stored as
 * `<dir>/<dir>.html`; any other page keeps its path, with the extension of the
 * final segment (if it has one) replaced by `.html`.
 *
 * @param {string} url - Absolute URL of the page.
 * @returns {string} Relative file path using "/" separators.
 * @throws {TypeError} If `url` cannot be parsed by `URL`.
 */
function urlToFilename(url) {
  const { hostname, pathname } = new URL(url);

  // Strip the last ".suffix" only when one exists. lastIndexOf returns -1 for
  // names without a dot, and slice(0, -1) would silently drop a character.
  const stripSuffix = (name) => {
    const dot = name.lastIndexOf(".");
    return dot > 0 ? name.slice(0, dot) : name;
  };

  const siteDir = stripSuffix(hostname);

  // Drop leading/trailing slashes so the result never contains "//".
  const relativePath = pathname.replace(/^\/+|\/+$/g, "");
  if (relativePath === "") {
    return `${siteDir}/${siteDir}.html`;
  }

  // Only the final segment may carry an extension; dots in directory names
  // (e.g. "v1.2/guide") must not truncate the path.
  const lastSlash = relativePath.lastIndexOf("/");
  const parentDirs = relativePath.slice(0, lastSlash + 1);
  const pageName = stripSuffix(relativePath.slice(lastSlash + 1));

  return `${siteDir}/${parentDirs}${pageName}.html`;
}

/**
 * Collects the links on a page that point to the same host as the page itself.
 *
 * Every `href` is resolved against `url`, so relative links ("/about",
 * "../x.html") are recognised as same-host links and returned in absolute
 * form, ready to be requested.
 *
 * @param {string} url - Absolute URL the page was loaded from.
 * @param {string} body - HTML source of the page.
 * @returns {string[]} Absolute URLs of the same-host links, in document order.
 * @throws {TypeError} If `url` itself cannot be parsed by `URL`.
 */
function getPageLinks(url, body) {
  const $ = cheerio.load(body);
  const pageUrl = new URL(url);
  const links = [];

  $("a").each((i, elem) => {
    const href = $(elem).attr("href");

    // An anchor without an href would be stringified to "undefined" and
    // resolved as a relative path, so skip it explicitly.
    if (typeof href !== "string" || href.trim() === "") {
      return;
    }

    let linkUrl;
    try {
      linkUrl = new URL(href, pageUrl);
    } catch (err) {
      // Best effort: an href that still cannot be parsed is not followable.
      return;
    }

    if (linkUrl.hostname === pageUrl.hostname) {
      links.push(linkUrl.href);
    }
  });

  return links;
}

/**
 * Writes `contents` to `filename`, creating any missing parent directories first.
 *
 * @param {string} filename - Path of the file to write.
 * @param {string|Buffer} contents - Data to store in the file.
 * @param {function(?Error): void} callback - Called with the error from
 *   creating the directory or writing the file, or as `fs.writeFile` calls it
 *   on success.
 */
function saveFile(filename, contents, callback) {
  const targetDir = path.dirname(filename);
  const writeContents = () => fs.writeFile(filename, contents, callback);
  const onDirReady = (mkdirErr) =>
    mkdirErr ? callback(mkdirErr) : writeContents();

  fs.mkdir(targetDir, { recursive: true }, onDirReady);
}

/**
 * Fetches `url` and stores the response body in `filename`.
 *
 * @param {string} url - Address to request.
 * @param {string} filename - Path the body is saved to (via `saveFile`).
 * @param {function(?Error, string=): void} callback - Receives the request or
 *   save error, or `null` and the downloaded body once the file is written.
 */
function download(url, filename, callback) {
  console.log(`Downloading ${url}`);

  request(url, (requestErr, _response, body) => {
    if (requestErr) {
      callback(requestErr);
      return;
    }

    const onSaved = (saveErr) => {
      if (saveErr) {
        callback(saveErr);
        return;
      }

      console.log(`Downloaded and saved: ${url}`);
      callback(null, body);
    };

    saveFile(filename, body, onSaved);
  });
}

/**
 * Spiders, one after another, every same-host link found in `body`.
 *
 * Each call creates its own `visitFrom` closure bound to this call's
 * `callback`, so nested invocations never share a completion handler.
 *
 * @param {string} currentUrl - URL of the page `body` belongs to.
 * @param {string} body - HTML source of the page.
 * @param {number} nesting - Remaining depth; at 0 no link is followed.
 * @param {function(Error=): void} callback - Called with the first error, or
 *   with no arguments once all links have been processed.
 */
function spiderLinks(currentUrl, body, nesting, callback) {
  if (nesting === 0) {
    // Deferred so the callback is not invoked synchronously on this path.
    return process.nextTick(callback);
  }

  const links = getPageLinks(currentUrl, body);
  const childDepth = nesting - 1;

  const visitFrom = (position) => {
    if (position === links.length) {
      return callback();
    }

    // Move on to the next link only after the current one has completed.
    spiderNested(links[position], childDepth, (err) =>
      err ? callback(err) : visitFrom(position + 1)
    );
  };

  visitFrom(0);
}

/**
 * Processes one page: reuses the local copy when it exists, downloads it
 * otherwise, then follows its links through `spiderLinks`.
 *
 * @param {string} url - Absolute URL of the page.
 * @param {number} nesting - Depth passed on to `spiderLinks`.
 * @param {function(Error=): void} callback - Called with the first error, or
 *   as `spiderLinks` calls it on completion.
 */
function spiderNested(url, nesting, callback) {
  const filename = urlToFilename(url);
  const followLinks = (pageBody) =>
    spiderLinks(url, pageBody, nesting, callback);

  fs.readFile(filename, "utf-8", (readErr, cachedBody) => {
    if (!readErr) {
      return followLinks(cachedBody);
    }

    // Any failure other than "file does not exist" is a real error.
    if (readErr.code !== "ENOENT") {
      return callback(readErr);
    }

    return download(url, filename, (downloadErr, freshBody) => {
      if (downloadErr) {
        return callback(downloadErr);
      }

      followLinks(freshBody);
    });
  });
}

// Entry point: spider the URL given as the first command-line argument,
// following links one level deep.
const startUrl = process.argv[2];

if (!startUrl) {
  // Without this guard the script crashes inside `new URL(undefined)`.
  console.error("Usage: node <script> <url>");
  process.exitCode = 1;
} else {
  spiderNested(startUrl, 1, (err) => {
    if (err) {
      console.error(err);
      process.exitCode = 1;
      // Do not fall through to the success message after a failure.
      return;
    }

    console.log("All files downloaded");
  });
}
Danko
  • 55
  • 2
  • 6
  • *why the callback in the `iterate` function will always be this function* - I don't think it will. Try putting `console.log(callback.toString());` at the beginning of `iterate()`. Probably the resulting output will be tamer if you put it at the beginning of `spiderLinks()` though. – tevemadar Apr 16 '23 at 14:44
  • Perhaps I understand now. When `iterate` is created, the first version of the callback is saved inside it, and when `iterate` is later called, the reference is still the one captured at creation – Danko Apr 16 '23 at 14:49
  • 1
    Every time `spiderLinks()` is called, its `callback` is captured in a "closure", which then the newly defined `iterate()` method uses. So as the recursion goes deeper, there will be multiple closures with multiple `iterate()` functions, using multiple `callback`s. – tevemadar Apr 16 '23 at 16:11

2 Answers

1

Inside spiderLinks, the recursion happens when it calls spiderNested to descend into the linked-to page:

spiderNested(links[index], nesting - 1, (err) => {
  if (err) {
    return callback(err);
  }
  iterate(index + 1);
});

This is a different callback, so when that page has been dealt with, iterate is called so the next page will be similarly analysed.

Only the root of the recursion tree uses the callback that outputs the completion message.

Daniel Earwicker
  • 114,894
  • 38
  • 205
  • 284
1

I still think that learning the closures thing should be enough, but here is a snippet showing how there are multiple variants of the iterate() function existing at the same time:

// Demonstrates that every call of doSomething() defines its own iterate()
// closure, bound to the `callback` argument of that particular call.
//
// callback: optional function. When omitted, doSomething() recurses exactly
// once, passing its own iterate() as the callback of the inner call.
function doSomething(callback) {
  // NOTE: no comments are placed inside iterate(); it is logged as a value
  // below, and a console may print a function as its source text.
  function iterate() {
    console.log("This is iterate with callback", callback);
    if (callback) {
      console.log("Calling callback");
      callback();
    }
  }
  console.log("(1) callback is", callback);
  iterate();
  // Only a call made without a callback recurses; the inner call receives this
  // call's iterate(), so its `callback` is truthy and it does not recurse again.
  if (!callback) {
    console.log("Recursion here");
    doSomething(iterate);
    console.log("Back from recursion");
  }
  // `callback` is unchanged here: the inner call worked with its own binding.
  console.log("(2) callback is", callback);
  iterate();
}

// Start the demo without a callback so the single recursion step is taken.
doSomething();
.as-console-wrapper {
  max-height: 100% !important
}

The most relevant part of the output is perhaps this one:

This is iterate with callback  function iterate() {
    console.log("This is iterate with callback", callback);
    if (callback) {
      console.log("Calling callback");
      callback();
    }
  }
-----------------------
Calling callback
-----------------------
This is iterate with callback undefined

We are inside iterate() with a defined callback, and thus calling it, and it reports that it's iterate() with an undefined callback. It comes from the recursion parameter.
If you carefully go through the code and its output up to this point, I think you will understand the original code too.

tevemadar
  • 12,389
  • 3
  • 21
  • 49