
I'm trying to read a big JSON file that contains news in Node.js, but I get this error:

RangeError [ERR_FS_FILE_TOO_LARGE]: File size (3472064213) is greater than 2 GB
    at new NodeError (node:internal/errors:371:5)
    at FSReqCallback.readFileAfterStat [as oncomplete] (node:fs:335:11) {
  code: 'ERR_FS_FILE_TOO_LARGE'
}

The code:

var fs = require("fs");

// Note: `client` used below is an Elasticsearch client instance created elsewhere.
fs.readFile("GOV.json", { encoding: "utf-8" }, function (err, data) {
  if (err) {
    throw err;
  }

  // Build up a giant bulk request for elasticsearch.
  var bulk_request = data.split("\n").reduce(function (bulk_request, line) {
    var obj, tweets;

    try {
      obj = JSON.parse(line);
    } catch (e) {
      console.log("Done reading 1");
      return bulk_request;
    }

    // Rework the data slightly

    tweets = {
      id: obj.id,
      username: obj.username,
      tweet: obj.tweet,
      date: new Date(obj.date),
      url: obj.url,
    };

    bulk_request.push({
      index: { _index: "tweets_index", _type: "tweets", _id: tweets.id },
    });
    bulk_request.push(tweets);
    return bulk_request;
  }, []);

  // A little voodoo to simulate synchronous insert
  var busy = false;
  var callback = function (err, resp) {
    if (err) {
      console.log(err);
    }

    busy = false;
  };

  // Recursively whittle away at bulk_request, 1000 at a time.
  var perhaps_insert = function () {
    if (!busy) {
      busy = true;
      client.bulk(
        {
          body: bulk_request.slice(0, 1000),
        },
        callback
      );
      bulk_request = bulk_request.slice(1000);
      console.log(bulk_request.length);
    }

    if (bulk_request.length > 0) {
      setTimeout(perhaps_insert, 100);
    } else {
      console.log("Inserted all records.");
    }
  };

  perhaps_insert();
});

1 Answer


According to this answer on GitHub, 2 GB is the limit:

That is the max buffer size in node. To import larger files, the code will need to change the imports to streams instead of putting the whole file in a buffer (...).
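
In practice that means replacing fs.readFile with a read stream and processing the file line by line. Below is a minimal sketch, assuming the file is newline-delimited JSON (one object per line, which the split("\n") in the question suggests) and assuming the 7.x @elastic/elasticsearch client with a local node URL; the index name and batch size are carried over from the question, and the deprecated _type field is dropped:

const fs = require("fs");
const readline = require("readline");
const { Client } = require("@elastic/elasticsearch");

const client = new Client({ node: "http://localhost:9200" });

async function run() {
  // Stream the file line by line instead of buffering all 3+ GB at once.
  const rl = readline.createInterface({
    input: fs.createReadStream("GOV.json", { encoding: "utf-8" }),
    crlfDelay: Infinity,
  });

  let body = [];
  for await (const line of rl) {
    let obj;
    try {
      obj = JSON.parse(line);
    } catch (e) {
      continue; // skip lines that are not valid JSON
    }

    // Same action/document pairs as in the question (without the deprecated _type).
    body.push({ index: { _index: "tweets_index", _id: obj.id } });
    body.push({
      id: obj.id,
      username: obj.username,
      tweet: obj.tweet,
      date: new Date(obj.date),
      url: obj.url,
    });

    // Flush a batch of 1000 documents (2000 body entries) at a time.
    if (body.length >= 2000) {
      await client.bulk({ body });
      body = [];
    }
  }

  if (body.length > 0) {
    await client.bulk({ body });
  }
  console.log("Inserted all records.");
}

run().catch(console.error);

This keeps memory usage bounded by the batch size rather than the file size, and awaiting each bulk call also replaces the busy-flag/setTimeout polling from the question.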

  • Some possible solutions are mentioned here: https://stackoverflow.com/a/53464795/3734141 – Gerard Jaryczewski Jan 29 '22 at 11:08
  • BTW, this cannot be solved by --max-old-space-size option, because the purpose of this option is: "Sets the max memory size of V8's old memory section. As memory consumption approaches the limit, V8 will spend more time on garbage collection in an effort to free unused memory." https://nodejs.org/api/cli.html#useful-v8-options – Gerard Jaryczewski Jan 29 '22 at 11:17
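
If the file is instead one large JSON array rather than newline-delimited objects, a streaming JSON parser is needed. For example, with the JSONStream package (one of the options discussed in the answer linked above), each top-level array element can be handled as soon as it is parsed, roughly like this:

const fs = require("fs");
const JSONStream = require("JSONStream");

fs.createReadStream("GOV.json", { encoding: "utf-8" })
  // "*" emits each element of the top-level array as a parsed object.
  .pipe(JSONStream.parse("*"))
  .on("data", (obj) => {
    // Collect objects into bulk batches here, as in the sketch above.
    console.log(obj.id);
  })
  .on("end", () => console.log("Done reading."))
  .on("error", (e) => console.error(e));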