
I am using Node.js to read JSON objects from a really large JSON file (1GB+). The JSON file has the format [{field1: x, field2: x, field3: x},{...},...,{...}]. There is no line separation between objects. To avoid memory problems, I am using fs.createReadStream and processing each chunk of data in sequence. This works and I get valid JSON objects, but the reader stops after reading only one data chunk. Why is it not reading the rest of the file?

My solution was inspired by the accepted answer in this question: Parse large JSON file in Nodejs

Here is the code:

// Get the JSON file
var fs = require('fs');
var stream = fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'});
var buf = '';
var count = 0;

stream.on('data', function(chunk) {
    console.log("Stream on data!"); // ONLY EXECUTED ONCE
    buf += chunk.toString(); // when data is read, stash it in a string buffer
    process(); // process the buffer
});
stream.on('error', function(err) {
    // NEVER EXECUTED
    console.log(err);
});
stream.on('end', function() {
    // NEVER EXECUTED
    console.log("Count: " + count);
});

function process() {
    var posStart = buf.indexOf('{');
    var posEnd = buf.indexOf('}');

    while (posStart >= 0 || posEnd >= 0) { // keep going until the start or end of the json object in the string
        // IF the start bracket is before the end, skip to the start
        if((posStart < posEnd || posEnd < 0) && posStart >= 0){ 
            buf = buf.slice(posStart);
        }
        if(posStart == 0 && posEnd >= 0){ // IF the end bracket is next
            processObjectString(buf.slice(0, posEnd+1)); // Process the complete object string
            buf = buf.slice(posEnd+1); // Remove the processed string from the buffer
        }
        // Update the positions
        posStart = buf.indexOf('{');
        posEnd = buf.indexOf('}');
    }
}

function processObjectString(objectString) {
    count++;
    var obj = JSON.parse(objectString); // parse the JSON
    console.log(obj.id); // Print object ID (works)
}

EDIT: After fixing the error that caused the infinite while loop (the loop never returned when the buffer ended in the middle of an object, i.e. it contained a '{' with no matching '}' yet), the following is a working solution that iterates through all the objects in the JSON file. It might not be very elegant, but at least it works (for anyone who runs into a similar problem).

// Get the JSON file
var fs = require('fs');
var stream = fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'});
var buf = '';
var count = 0;

stream.on('data', function(chunk) {
    buf += chunk.toString(); // when data is read, stash it in a string buffer
    process(); // process the buffer
});
stream.on('error', function(err) {
    console.log(err);
});
stream.on('end', function() {
    console.log("Count: " + count);
});

function process() {
    var posStart = buf.indexOf('{');
    var posEnd = buf.indexOf('}');

    while (posStart >= 0 || posEnd >= 0) { // keep going until the start or end of the json object in the string
        // IF the start bracket is before the end, skip to the start
        if((posStart < posEnd || posEnd < 0) && posStart >= 0){ 
            buf = buf.slice(posStart);
        }
        if(posStart == 0 && posEnd >= 0){ // IF the end bracket is next
            processObjectString(buf.slice(0, posEnd+1)); // Process the complete object string
            buf = buf.slice(posEnd+1); // Remove the processed string from the buffer
        }else if(posStart < 0 || posEnd < 0){ // Return to get a new chunk
            return;
        }
        // Update the positions
        posStart = buf.indexOf('{');
        posEnd = buf.indexOf('}');
    }
}

function processObjectString(objectString) {
    count++;
    var obj = JSON.parse(objectString); // parse the JSON
    console.log(obj.id); // Print object ID (works)
}
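
For reference, the brace matching above only copes with flat objects like the ones here; a nested `{}` or a `}` inside a string value would throw the positions off. A streaming parser such as the JSONStream package (along the lines of the linked question) avoids manual buffering entirely. A minimal sketch, assuming JSONStream is installed (`npm install JSONStream`):

// Sketch only: assumes the JSONStream package; 'Results.json' as above
var fs = require('fs');
var JSONStream = require('JSONStream');

var count = 0;
fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'})
    .pipe(JSONStream.parse('*'))   // '*' matches every element of the root array
    .on('data', function(obj) {    // obj is one fully parsed object
        count++;
        console.log(obj.id);
    })
    .on('end', function() {
        console.log("Count: " + count);
    });

Each 'data' event then delivers an already-parsed object, so there is no brace counting or string slicing to get wrong.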
  • Can you please comment out the `buf += chunk.toString();` and `process();` lines, leaving only the `console.log` in `stream.on('data', ...)`, and check whether you still receive just a single chunk? Very likely some code in `process` is halting the node process itself. – alandarev Oct 14 '14 at 11:26
  • You are right! When I comment out `process();`, it looks like I am receiving all the chunks. I was a bit blinded by my own code and didn't notice I had an infinite while loop in `process()`. I will try to fix it and update the post. – krisaulie Oct 14 '14 at 11:44
  • cool, to keep it *formal* I'll submit a short answer, reputation counts :D – alandarev Oct 14 '14 at 11:51

1 Answer


Some Theory

Node.js is asynchronous, but it runs your JavaScript on a single thread. If that thread gets stuck processing the data it has already received, it will never get a second chunk: the 'data' callback has to return before the event loop can deliver the next event.

Meaning

If the `process();` call inside the `'data'` handler is stuck in an infinite loop, you will never receive a second chunk, so it looks as though the stream has simply stopped reading.
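
A minimal sketch of the effect, using the same file as above (the endless loop stands in for a `process()` call that never returns):

var fs = require('fs');
var stream = fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'});

stream.on('data', function(chunk) {
    console.log("Stream on data!"); // printed exactly once
    while (true) {}                 // the handler never returns, so the event loop
});                                 // never emits another 'data', 'error' or 'end' event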


For the future: try to always isolate the problem to ensure you are looking in the right place.

P.S. It really is easy to get yourself into an infinite loop while processing text; I feel your pain here.
