I currently have a Node.js script that reads data from CSV files and then writes to a number of different CSVs based on the data in each row.
There are 300 CSVs (around 40 GB worth) to process, so I added the async library to my script, expecting it to read and write the data simultaneously across all cores.
async.mapLimit(filePaths, 4, streamZip, function (err, results) {
console.log('finished');
});
But it turns out that is not what async does: it only interleaves callbacks on Node's single event loop, so everything still runs on one core. This code actually takes more time to complete than processing each file individually.
There seem to be many different ways to use more cores: cluster, child_process, web workers and worker-farm.
There have also been other questions asked like this one, but they all seem to want to use HTTP or Express and run as a server, or they call an external program like 'ls', rather than just running a multiprocessing pool like I would in Python.
Can anyone provide an example of, or help with, using threads or processes to read multiple CSV files in parallel while all of them write to the same fs.createWriteStreams?
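To make it concrete, something along these lines is roughly what I am imagining with child_process (the worker.js file and the message shape are made up). The problem is that the shared output streams would live in the parent process, so I don't see how the workers could write to them. That is the part I am stuck on:

// pool-sketch.js -- roughly the kind of pool I mean (worker.js is hypothetical)
var childProcess = require('child_process');
var os = require('os');

var filePaths = ['one.zip', 'two.zip', 'three.zip']; // really ~300 file paths
var numWorkers = Math.min(os.cpus().length, filePaths.length);
var running = numWorkers;

function startWorker() {
    var worker = childProcess.fork('./worker.js'); // worker.js would run streamZip()

    worker.on('message', function () {
        // the worker reports it has finished its file; hand it the next one
        if (filePaths.length > 0) {
            worker.send({ filePath: filePaths.pop() });
        } else {
            worker.kill();
            if (--running === 0) console.log('finished');
        }
    });

    worker.send({ filePath: filePaths.pop() }); // first file
}

for (var i = 0; i < numWorkers; i++) {
    startWorker();
}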
Thanks
More of my code is here:
// Modules inferred from the calls below: 'unzip' for .Parse(), 'fast-csv' for csv()
var fs = require('fs');
var unzip = require('unzip');
var csv = require('fast-csv');

function streamZip(filePath, callback) {
    // Pipe every CSV entry inside the zip into the parser defined below.
    // (The 'entry' events only fire after csvStream has been assigned.)
    var stream = fs.createReadStream(filePath)
        .pipe(unzip.Parse())
        .on('entry', function (entry) {
            var fileName = entry.path;
            entry.pipe(csvStream);
        });

    // Route each parsed row to one of the shared output streams
    // (csv10, csv11 and csv15 are created once, outside this function).
    var csvStream = csv()
        .on('data', function (data) {
            var identifier = data[0];
            if (identifier === '10') {
                csv10.write(data);
            } else if (identifier === '11') {
                csv11.write(data);
            } else if (identifier === '15') {
                csv15.write(data);
            }
        })
        .on('end', function () {
            callback(null, filePath + 'Processed');
        });
}
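For reference, the shared output streams used above (csv10, csv11 and csv15) are created once at the top of the script. A rough sketch of that setup, assuming the csv() parser is fast-csv and using placeholder file names:

var fs = require('fs');
var csv = require('fast-csv'); // assumption: the csv() parser above is fast-csv

// Shared output streams: fast-csv formatter streams piped into the real files,
// created once and used by every call to streamZip().
var csv10 = csv.createWriteStream();
csv10.pipe(fs.createWriteStream('10.csv'));

var csv11 = csv.createWriteStream();
csv11.pipe(fs.createWriteStream('11.csv'));

var csv15 = csv.createWriteStream();
csv15.pipe(fs.createWriteStream('15.csv'));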