13

I have a large UTF-8 file. I know fs.createReadStream can create a stream to read a large file, but it is not synchronous. So I tried to use fs.readSync, but the text I read back is broken, like "迈�".

// Minimal reproduction from the question: synchronously read the first few
// bytes of a UTF-8 text file and print what came back.
var fs = require('fs');
var util = require('util');
// The text file is looked up in the same directory as this script.
var textPath = __dirname + '/people-daily.txt';   
var fd = fs.openSync(textPath, "r");
// Asks readSync for 4 bytes starting at position 0, decoded as "utf8".
// NOTE(review): this looks like the legacy string-returning call form
// readSync(fd, length, position, encoding) - confirm it is supported by the
// Node version in use.
// UTF-8 is a variable-length encoding, so a fixed 4-byte window can end in the
// middle of a character; that is consistent with the broken trailing character
// reported in the question.
var text = fs.readSync(fd, 4, 0, "utf8");
// Dump the result (showHidden = true, depth = null, i.e. unlimited nesting).
console.log(util.inspect(text, true, null));
hippietrail
  • 15,848
  • 18
  • 99
  • 158
nroe
  • 163
  • 1
  • 1
  • 6

5 Answers5

12

For large files, readFileSync can be inconvenient, as it loads the whole file in memory. A different synchronous approach is to iteratively call readSync, reading small bits of data at a time, and processing the lines as they come. The following bit of code implements this approach and synchronously processes one line at a time from the file 'test.txt':

// Synchronously reads 'test.txt' in fixed-size chunks and logs it one line at
// a time, without ever holding the whole file in memory.
var fs = require('fs');
var StringDecoder = require('string_decoder').StringDecoder;
var filename = 'test.txt';

var fd = fs.openSync(filename, 'r');
var bufferSize = 1024;
var buffer = Buffer.alloc(bufferSize);
// A chunk boundary can fall in the middle of a multi-byte UTF-8 character.
// StringDecoder keeps the incomplete trailing bytes back and prepends them to
// the next chunk, so no character is ever decoded from partial bytes.
var decoder = new StringDecoder('utf8');

var leftOver = '';
var read, line, idxStart, idx;
try {
  // position = null: each call continues from the current file position.
  while ((read = fs.readSync(fd, buffer, 0, bufferSize, null)) !== 0) {
    leftOver += decoder.write(buffer.subarray(0, read));
    idxStart = 0;
    while ((idx = leftOver.indexOf("\n", idxStart)) !== -1) {
      line = leftOver.substring(idxStart, idx);
      console.log("one line read: " + line);
      idxStart = idx + 1;
    }
    // Keep the unterminated tail; it is completed by the next chunk.
    leftOver = leftOver.substring(idxStart);
  }
  // Flush anything the decoder still holds, then emit the last line when the
  // file does not end with a newline.
  leftOver += decoder.end();
  if (leftOver !== '') {
    console.log("one line read: " + leftOver);
  }
} finally {
  // Always release the descriptor, even if reading or logging throws.
  fs.closeSync(fd);
}
Peace Makes Plenty
  • 856
  • 10
  • 13
8

use https://github.com/nacholibre/node-readlines

// Synchronously iterates over './textFile.txt' line by line using the
// third-party n-readlines package and logs every line with its index.
var lineByLine = require('n-readlines');
var liner = new lineByLine('./textFile.txt');

var line;
var lineNumber = 0;
// The extra parentheses mark the assignment in the condition as intentional;
// the loop ends when next() returns a falsy value.
while ((line = liner.next())) {
    // Decode as UTF-8: decoding as 'ascii' garbles every non-ASCII character,
    // and plain ASCII text decodes identically under UTF-8.
    console.log('Line ' + lineNumber + ': ' + line.toString('utf8'));
    lineNumber++;
}

console.log('end of line reached');
Divam Gupta
  • 228
  • 4
  • 6
3

Use readFileSync:

fs.readFileSync(filename, [encoding]) Synchronous version of fs.readFile. Returns the contents of the filename.

If encoding is specified then this function returns a string. Otherwise it returns a buffer.

On a side note, since you are using node, I'd recommend using asynchronous functions.

Tom
  • 8,536
  • 31
  • 133
  • 232
  • 1
    @nroe, then why are you asking for a synchronous read? Of course that won't work with a large file. – Tom Sep 25 '11 at 19:13
  • 2
    @Tom, nroe likely wishes to be able to receive the lines as return values to the read calls. Some code implementing this is available in [this blog post](http://blog.jaeckel.com/2010/03/i-tried-to-find-example-on-using-node.html) (not mine). – Peace Makes Plenty Jan 19 '14 at 12:37
2

I built a simpler version of JB Kohn's answer that uses split() on the buffer. It works on the larger files I tried.

/*
 * Synchronously call fn(text, lineNum) on each line read from file descriptor fd.
 *
 * fd - file descriptor open for reading. Reading continues from its current
 *      position; the descriptor is not closed here.
 * fn - callback invoked as fn(text, lineNum) for every line, lineNum starting at 0.
 *
 * Lines are split on '\n' only, so a preceding '\r' stays part of the text.
 * A final line without a trailing newline is reported too, unless it is empty.
 *
 * The data is split at the byte level and a line is decoded only once all of
 * its bytes are available, so a multi-byte UTF-8 character that straddles two
 * reads is never torn apart. (The byte 0x0A cannot occur inside a multi-byte
 * UTF-8 sequence, which makes the byte-level split safe.)
 *
 * Relies on `fs` being available in the enclosing scope.
 */
function forEachLine (fd, fn) {
    var NEWLINE = 0x0a;
    var bufSize = 64 * 1024;
    var buf = Buffer.alloc(bufSize);
    var pending = [];                       // byte chunks of the current, still unterminated line
    var lineNum = 0;
    var n, start, end, tail;

    while ((n = fs.readSync(fd, buf, 0, bufSize, null)) !== 0) {
        start = 0;
        end = buf.indexOf(NEWLINE, start);
        // `end < n` ignores stale bytes left behind by an earlier, longer read
        while (end !== -1 && end < n) {
            pending.push(buf.subarray(start, end));
            fn(Buffer.concat(pending).toString('utf8'), lineNum);
            lineNum++;
            pending = [];
            start = end + 1;
            end = buf.indexOf(NEWLINE, start);
        }
        if (start < n) {
            // copy the fragment: buf is overwritten by the next read
            pending.push(Buffer.from(buf.subarray(start, n)));
        }
    }

    tail = Buffer.concat(pending);
    if (tail.length > 0) {                  // process any remaining line
        fn(tail.toString('utf8'), lineNum);
    }
}
srkleiman
  • 607
  • 8
  • 16
  • 1
    Maybe change `split('\n')` to `split(/\r?\n/)` to support possible windows line endings, as well? – rob3c Dec 21 '17 at 22:56
1

Two potential problems:

  1. There may be a 3-byte BOM at the beginning of the file, which you did not skip.
  2. The first 4 bytes may not form complete UTF-8 characters (UTF-8 is a variable-length encoding), so the last character can be cut off.
user943702
  • 956
  • 6
  • 12