The accepted answer is great and attempted to cover all the important aspects of this problem.
- Reading the CSV file as a stream of lines
- Writing the documents in batches to MongoDB
- Synchronization between reading and writing
While it did well with the first two aspects, the approach it took to address the synchronization, using async.series(), won't work as expected.
stream.on("line",function(line) {
    async.series(
        [
            function(callback) {
                var row = line.split(","); // split the lines on delimiter
                var obj = {};
                // other manipulation

                bulk.insert(obj); // Bulk is okay if you don't need schema
                                  // defaults. Or can just set them.

                counter++;

                if ( counter % 1000 == 0 ) {
                    bulk.execute(function(err,result) {
                        if (err) throw err; // or do something
                        // possibly do something with result
                        bulk = Entry.collection.initializeOrderedBulkOp();
                        callback();
                    });
                } else {
                    callback();
                }
            }
        ],
        function (err) {
            // each iteration is done
        }
    );
});
Here bulk.execute() is a mongodb write operation and it's an asynchronous IO call. This allows node.js to proceed with the event loop before bulk.execute() is done with its db writes and calls back.
So the stream may go on emitting more 'line' events, queueing more documents with bulk.insert(obj), and can hit the next modulo to trigger bulk.execute() again.
Let's have a look at this example.
var async = require('async');

var bulk = {
    execute: function(callback) {
        setTimeout(callback, 1000);
    }
};

async.series(
    [
        function (callback) {
            bulk.execute(function() {
                console.log('completed bulk.execute');
                callback();
            });
        },
    ],
    function(err) {
    }
);

console.log("!!! proceeding to read more from stream");
Its output:
!!! proceeding to read more from stream
completed bulk.execute
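That ordering is exactly the problem: the "proceeding" step runs while the mock write is still pending. For contrast, here is a minimal sketch (same mock bulk object, nothing else assumed) where we only move on from async.series' final callback, i.e. only after execute() has called back:

var async = require('async');

var bulk = {
    execute: function(callback) {
        setTimeout(callback, 1000); // simulate an asynchronous db write
    }
};

async.series(
    [
        function (callback) {
            bulk.execute(function() {
                console.log('completed bulk.execute');
                callback();
            });
        }
    ],
    function(err) {
        // only now is it safe to move on to the next batch
        console.log("!!! proceeding to read more from stream");
    }
);

This prints 'completed bulk.execute' first. With a real stream, though, the next 'line' event fires whenever more data is available, regardless of where our own code has got to, so the only way to get this ordering is to pause the stream itself, as in the code below.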
To really ensure that only one batch of N documents is being processed at any given time, we need to enforce flow control on the file stream using stream.pause() and stream.resume():
var LineInputStream = require("line-input-stream"),
    fs = require("fs"),
    mongoose = require("mongoose"),
    Schema = mongoose.Schema;

var entrySchema = new Schema({},{ strict: false });
var Entry = mongoose.model( "Entry", entrySchema );

var stream = LineInputStream(fs.createReadStream("data.txt",{ flags: "r" }));

stream.setDelimiter("\n");

mongoose.connection.on("open",function(err,conn) {

    // lower level method, needs connection
    var bulk = Entry.collection.initializeOrderedBulkOp();
    var counter = 0;

    stream.on("error",function(err) {
        console.log(err); // or otherwise deal with it
    });

    stream.on("line",function(line) {
        var row = line.split(","); // split the lines on delimiter
        var obj = {};
        // other manipulation

        bulk.insert(obj); // Bulk is okay if you don't need schema
                          // defaults. Or can just set them.

        counter++;

        if ( counter % 1000 === 0 ) {
            stream.pause(); // stop reading from the file until this batch is written to the db
            bulk.execute(function(err,result) {
                if (err) throw err; // or do something
                // possibly do something with result
                bulk = Entry.collection.initializeOrderedBulkOp();
                stream.resume(); // continue reading from the file
            });
        }
    });

    stream.on("end",function() {
        if ( counter % 1000 !== 0 ) {
            bulk.execute(function(err,result) {
                if (err) throw err; // or something
                // maybe look at result
            });
        }
    });

});
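One thing to note: the "open" handler above only runs once a connection is actually being made, so a standalone script also needs a mongoose.connect() call somewhere (the connection string below is just a placeholder for your own):

mongoose.connect("mongodb://localhost/test"); // replace with your actual connection string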