2

It's a program that reads data from the text file 'IN.txt' and writes it to the 'copy.json' file in JSON format.
In each line of the text file, words are separated by tabs, and I split each line into an array on the tab character.

I think implementing the readable stream this way overwrites the same pieces of data again and again, which is not efficient for a large file.
I tried many different approaches, but I kept getting errors such as memory leaks and "_read method not defined".

const fs = require('fs');
const readLine = require('readline');
const { Readable } = require('stream');
// Accumulates one record object per input line; serialized to copy.json.
const dataArray = [];

//creating readline interface
// Streams IN.txt line by line instead of loading the whole file into memory.
const lineReader = readLine.createInterface({
    input: fs.createReadStream(__dirname + '/IN.txt'),
});

// Column names for each tab-separated value, in file order.
const fields = ['country', 'pin', 'place', 'state', 'code', 'division', 'admin', 'mandal', 'xxx', 'lat', 'long'];

//reading data from text file line by line and spliting each line into array
//reading data from text file line by line and spliting each line into array
lineReader.on('line', function (line) {
    let words = line.split('\t');
    writeToFile(fields, words);
});

lineReader.on('close', function () {
    // NOTE: do not call process.exit() here. The file writes triggered by
    // writeToFile() complete asynchronously, and exiting immediately can
    // truncate copy.json. Once all pending I/O drains, Node exits on its own.
    console.log('***Finished***');
});

//words array will be like ["IN","744301", "Mus Andaman & Nicobar Islands", "01 Nicobar 638 Carnicobar" , "9.2333", "92.7833","4"]
//creating obj with fields and words array and pushing into array
//words array will be like ["IN","744301", "Mus Andaman & Nicobar Islands", "01 Nicobar 638 Carnicobar" , "9.2333", "92.7833","4"]
//creating obj with fields and words array and pushing into array
// Builds a record from the field names and the line's values, appends it to
// dataArray, and rewrites copy.json with the full snapshot.
function writeToFile(fields, words) {
    const obj = {};
    for (let i = 0; i < fields.length; i++) {
        obj[fields[i]] = words[i];
    }
    dataArray.push(obj);
    // Overwrite copy.json atomically in one call. The original version spun up
    // a new Readable AND a new WriteStream on every line, leaving many write
    // streams racing on the same file and leaking file handles. A single
    // synchronous write per line is still O(n^2) overall -- the real fix is to
    // serialize once on 'close' -- but it is at least correct and leak-free.
    fs.writeFileSync(__dirname + '/copy.json', JSON.stringify(dataArray, null, 4));
}

here is small snapshot of IN.txt file

IN.txt file

1 Answers1

0

On every call to writeToFile (basically on reading every line) you are creating a readStream and copying dataArray to it, piping to a write stream. You don't need this if you already have a read stream open to the file.

Nice text to read: https://medium.freecodecamp.org/node-js-streams-everything-you-need-to-know-c9141306be93

Give the following a try: process.memoryUsage().heapUsed / 1024 / 1024 reported roughly 147 MB of heap used for an IN.txt file of around 14 MB.

const fs = require('fs');
const readLine = require('readline');
const { Readable } = require('stream');
// Single write stream for copy.json, created once and reused.
const output = fs.createWriteStream(__dirname + '/copy.json');
// Accumulates one record per input line; serialized once on 'close'.
const dataArray = [];

//creating readline interface
const lineReader = readLine.createInterface({
    input: fs.createReadStream(__dirname + '/IN.txt')
});

// Column names for each tab-separated value, in file order.
const fields = ['country', 'pin', 'place', 'state', 'code', 'division',     'admin', 'mandal', 'xxx', 'lat', 'long'];

//reading data from text file line by line and pushing it to an array
//reading data from text file line by line and pushing it to an array
lineReader.on('line', function (line) {
    let words = line.split('\t');
    dataArray.push(getLineContent(fields, words));
});

lineReader.on('close', function () {
    // Serialize the whole array once, after every line has been read.
    output.write(JSON.stringify(dataArray, null, 4));
    output.end();
    // Writes flush asynchronously: calling process.exit() right after
    // write()/end() can truncate copy.json. Exit only once the stream
    // signals 'finish' (all data flushed to the OS).
    output.on('finish', function () {
        console.log('***Finished***');
        process.exit(0);
    });
});

//words array will be like ["IN","744301", "Mus Andaman & Nicobar Islands", "01 Nicobar 638 Carnicobar" , "9.2333", "92.7833","4"]
//creating obj with fields and words
//words array will be like ["IN","744301", "Mus Andaman & Nicobar Islands", "01 Nicobar 638 Carnicobar" , "9.2333", "92.7833","4"]
// Zip the field names with the line's values into one record object.
// If the line has fewer columns than fields, trailing keys hold undefined.
function getLineContent(fields, words) {
    const record = {};
    fields.forEach(function (field, idx) {
        record[field] = words[idx];
    });
    return record;
}

A more efficient solution:

process.memoryUsage().heapUsed / 1024 / 1024 gave me memory heap used as around 5-7 MB (a significant improvement as compared to above approach) for an IN.txt file of around 14 MB.

More Reference text:

  1. Stream highWaterMark misunderstanding
  2. Pausing readline in Node.js
  3. https://www.valentinog.com/blog/memory-usage-node-js/

Following might help give you a kickstart:

const fs = require('fs');
const readLine = require('readline');
const { Readable } = require('stream');
// Single write stream for copy.json; records are appended incrementally.
const output = fs.createWriteStream(__dirname + '/copy.json');

//creating readline interface
const lineReader = readLine.createInterface({
    input: fs.createReadStream(__dirname + '/IN.txt')
});

// Column names for each tab-separated value, in file order.
const fields = ['country', 'pin', 'place', 'state', 'code', 'division',         'admin', 'mandal', 'xxx', 'lat', 'long'];

// Backpressure state shared by the 'line' handler and writeMe():
let lineCount = 0; // lines seen so far; used for '[' vs ',' placement
let writeAllowed = true; // false after output.write() reports backpressure
let paused = false; // true while lineReader is paused
let buffstr = ""; // data buffered while the write stream is backpressured

//reading data from text file line by line and pushing it to an array
//reading data from text file line by line and pushing it to an array
// Streams each line's JSON object to the output, honoring backpressure:
// while lineReader is paused, records accumulate in buffstr and are
// flushed by writeMe's 'drain' handler.
lineReader.on('line', function (line) {
    lineCount++;
    let words = line.split('\t');
    let lineJson = getLineContent(fields, words);

    if (paused) {
      // Backpressured: buffer instead of writing directly.
      // NOTE(review): lineCount > 1 appears to always hold here, since a
      // pause can only happen after the first write -- confirm.
      if(lineCount > 1) {
        buffstr = buffstr + ",";
      }
      buffstr = buffstr + JSON.stringify(lineJson, null, 4);
    }
    else {
      if(!writeAllowed) {
        // Previous write reported backpressure; stop readline before more
        // lines arrive. The current line still goes through writeMe, which
        // buffers it because writeAllowed is false.
        lineReader.pause();
      }
      // First line opens the JSON array; later lines are comma-separated.
      lineCount === 1 ? writeMe('[') : writeMe(",");
      writeMe(JSON.stringify(lineJson, null, 4));
    }
});

// Mirror readline's paused state into a flag so the 'line' handler knows
// whether to buffer into buffstr instead of writing directly.
lineReader.on('pause', () => {
    paused = true;
});

lineReader.on('resume', () => {
    paused = false;
});

lineReader.on('close', function () {
    // An empty input file never fires the 'line' handler, so the opening
    // bracket was never written; emit it here to keep the JSON valid.
    if (lineCount === 0) {
        output.write('[');
    }
    output.write(buffstr); // flush anything buffered while backpressured
    output.write(']');
    output.end();
    console.log(`***Finished*** Memory heap used:     ${process.memoryUsage().heapUsed / 1024 / 1024} MB`);
});

// Write str to the output stream with backpressure tracking. When write()
// returns false, exactly ONE 'drain' listener is armed; until it fires,
// subsequent strings accumulate in buffstr. (The previous version registered
// a new once('drain') listener on every buffered call, so listeners piled up
// and each one re-wrote and cleared buffstr.)
function writeMe(str){
   if(writeAllowed){
      writeAllowed = output.write(str);
      if(!writeAllowed){
         // Transition into the backpressured state: arm a single drain
         // handler that flushes the buffer and resumes reading.
         output.once('drain', function() {
            const pending = buffstr;
            buffstr = "";
            writeAllowed = true;
            if (pending !== "") {
               writeMe(pending); // may hit backpressure again and re-arm
            }
            lineReader.resume();
         });
      }
   }
   else{
      buffstr += str;
   }
}

//words array will be like ["IN","744301", "Mus Andaman & Nicobar     Islands", "01 Nicobar 638 Carnicobar" , "9.2333", "92.7833","4"]
//creating obj with fields and words
//words array will be like ["IN","744301", "Mus Andaman & Nicobar     Islands", "01 Nicobar 638 Carnicobar" , "9.2333", "92.7833","4"]
// Pair each field name with the value at the same position in words.
// Trailing keys hold undefined when a line has fewer columns than fields.
function getLineContent(fields, words) {
    const entry = {};
    let idx = 0;
    for (const field of fields) {
        entry[field] = words[idx];
        idx += 1;
    }
    return entry;
}
amangpt777
  • 525
  • 5
  • 10
  • it is not producing proper json data. Since every time it writes just an object.[here is how output looks like ](https://drive.google.com/file/d/1k8Tl1hVy2tChnDC0_B6suEXuL1tEyzaK/view) – Amrendra Kumar Jun 23 '18 at 14:35
  • oops I made a mistake there. I have edited my answer, please see if it solves your problem? – amangpt777 Jun 23 '18 at 20:29
  • It is necessary to listen for drain and .write return and make it able to write in control manner like writable.write return true when its ready to write and vice- versa. – Amrendra Kumar Jun 25 '18 at 15:30
  • I tried this on a large file, but it is not an efficient method because it buffers all the data into memory uncontrollably, consuming all available memory and freezing the system – Amrendra Kumar Jun 25 '18 at 15:37
  • Yes you are right, I have re-edited my answer. Pls see if it helps you give a kickstart. – amangpt777 Jun 25 '18 at 20:46