
I have a JSON file with about 20,000 to 100,000 links in it. It looks like this:

[{
    "file_name": "Blessed_Titles.jpg",
    "url": "https://i.imgur.com/FRDibHa.jpg",
    "downloadId": "6r44r4k340rvvr"
}]

Is there any way to download them in parallel, about 100 at a time? Will I get any warnings or errors while downloading thousands of links? Right now I'm using sequential download, but I'm not sure it's suitable for such a large number of links.

Here's how I'm downloading currently:

async function downloadALL(ultUrls) {
  let i = 1;
  const len = ultUrls.length;
  for (const ult of ultUrls) {
    // Download one file at a time; errors are logged and the loop continues.
    await downloadFile(ult, i, len)
      .then(() => i++)
      .catch(err => console.log(err));
  }
}



// `https`, `http`, `utility`, `dir` and `mainWindow` are required/defined elsewhere in the app.
function downloadFile(ult, i, len) {
  return new Promise((resolve, reject) => {
    console.log('Downloading file:', ult.file_name);
    const download = {
      file: {},
    };

    let percentage = 0;
    const percentage2 = ((i / len) * 100).toFixed(0);

    download.file.name = ult.file_name;
    download.file.percentage = percentage;
    download.file.downloadId = ult.downloadId;
    download.percentage = percentage2;

    (ult.url.substr(0, 5) === 'https' ? https : http)
      .get(ult.url, function(response) {
        const lent = parseInt(response.headers['content-length'], 10);
        let cur = 0;

        // Track progress from the chunk sizes; the data is written to disk via pipe below.
        response.on('data', function(chunk) {
          cur += chunk.length;
          percentage = ((100.0 * cur) / lent).toFixed(0);
          download.file.percentage = percentage;
          mainWindow.webContents.send('download-info', download);
        });

        const file = utility.writeFile(ult.file_name, dir);
        response.pipe(file);
        file.on('error', function(err) {
          console.log(`ERROR: ${err}`);
          return reject(err);
        });
        file.on('finish', function() {
          console.log('File downloaded');
          // close() is async; resolve once the file has been closed.
          file.close(() => resolve());
        });
      })
      .on('error', function(err) {
        // Handle request errors
        return reject(err);
      });
  });
}
Sai Krishna

2 Answers


I recommend using Bluebird. This Promise library has a built-in solution for running a batch of promises with limited concurrency.

This is the link to the relevant documentation: http://bluebirdjs.com/docs/api/promise.map.html

And here is a code solution with Bluebird for your case:

// don't forget to run `npm install bluebird` first
const Promise = require('bluebird');

async function downloadAll(ultUrls) {
  // The `concurrency` option is the number of promises allowed to run at the same time.
  // You can surround this line with a try/catch block if you want to.
  await Promise.map(ultUrls, downloadFile, { concurrency: 100 });
}

// Here you no longer need the i and len parameters
function downloadFile(ult) {
  // Code change needed here: stop using the i and len parameters
}
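
For completeness, here is one possible (untested) way the existing downloadFile could be adapted for Promise.map: overall progress comes from a shared completed counter instead of the old i/len parameters. The counter, and the reuse of the question's https, http, utility, dir and mainWindow objects, are assumptions about your app, not part of Bluebird's API.

// Sketch only: assumes `https`, `http`, `utility`, `dir` and `mainWindow`
// exist as in the question's code.
let completed = 0;
let total = 0;

async function downloadAll(ultUrls) {
  total = ultUrls.length;
  await Promise.map(ultUrls, downloadFile, { concurrency: 100 });
}

function downloadFile(ult) {
  return new Promise((resolve, reject) => {
    const proto = ult.url.startsWith('https') ? https : http;
    proto.get(ult.url, (response) => {
      const file = utility.writeFile(ult.file_name, dir);
      response.pipe(file);
      file.on('error', reject);
      file.on('finish', () => {
        completed += 1;
        // Report overall progress (files finished / total) instead of the per-index value.
        mainWindow.webContents.send('download-info', {
          file: { name: ult.file_name, downloadId: ult.downloadId, percentage: 100 },
          percentage: ((completed / total) * 100).toFixed(0),
        });
        file.close(() => resolve());
      });
    }).on('error', reject);
  });
}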
Rami Loiferman

So since you mentioned parallel, the usual way in Node.js is to use child processes and spawn multiple parallel workers based on the number of computing resources available.

Here is some pseudo-code that you can refer to when creating a solution.

// parent.js
var child_process = require('child_process');
var _ = require('lodash');

var numchild = require('os').cpus().length;
var done = 0;
var filesListJSON = [{
    "file_name": "Blessed_Titles.jpg",
    "url": "https://i.imgur.com/FRDibHa.jpg",
    "downloadId": "6r44r4k340rvvr"
}, {
    "file_name": "Blessed_Titles2.jpg",
    "url": "https://i.imgur.com/FRDibHa2.jpg",
    "downloadId": "6r44r4k340rvvr"
}, {
    "file_name": "Blessed_Titles3.jpg",
    "url": "https://i.imgur.com/FRDibHa3.jpg",
    "downloadId": "6r44r4k340rvvr"
}];

// split the array into one chunk per available core
// (_.chunk takes a chunk size, so divide the list length by the number of cores)
var chunks = _.chunk(filesListJSON, Math.ceil(filesListJSON.length / numchild));

for (var i = 0; i < chunks.length; i++) {
    var child = child_process.fork('./child');
    // send the chunk of the list to the respective child process
    child.send(chunks[i]);
    // ps: please check the count and logic for yourself, I have not tested this.
    child.on('message', function (message) {
        console.log('[parent] received message from child:', message);
        done++;
        if (done === chunks.length) {
            console.log('[parent] received all results');
            ...
        }
    });
}

// child.js
process.on('message', function (list) {
    console.log('[child] received list from parent:', list);
    downloadFiles(list, function (done) {
        console.log("Done downloading files: " + list.length);
        process.send({
            child: process.pid,
            result: done
        });
        process.disconnect();
    });
});

function downloadFiles(list, cb) {
    //loop over list
    //logic to download files
    //cb(true)
}
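
If it helps, here is one possible (untested) sketch of that downloadFiles stub, using only Node's built-in modules. The './downloads' destination directory and the assumption that every URL is HTTPS are mine, not part of the answer above.

// child.js (continued) — a rough, untested sketch of downloadFiles.
// Assumes every URL is HTTPS and writes into './downloads'.
const fs = require('fs');
const path = require('path');
const https = require('https');

function downloadFiles(list, cb) {
    fs.mkdirSync('./downloads', { recursive: true });
    let remaining = list.length;
    if (remaining === 0) return cb(true);

    // Start every download in this chunk; call cb once all have finished or failed.
    list.forEach(function (item) {
        const dest = path.join('./downloads', item.file_name);
        const file = fs.createWriteStream(dest);

        https.get(item.url, function (response) {
            response.pipe(file);
            file.on('finish', function () {
                file.close(function () {
                    if (--remaining === 0) cb(true);
                });
            });
        }).on('error', function (err) {
            console.log('Failed to download ' + item.url + ': ' + err.message);
            if (--remaining === 0) cb(true);
        });
    });
}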

Refer to this link for more details about the logic used.

Also, I have used the chunk function from the lodash library to split the array for processing. https://lodash.com/docs/3.10.1#chunk
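
One thing to note (my addition, not part of the original answer): _.chunk takes a chunk size rather than a chunk count, so to get one chunk per core you divide the list length by the number of cores.

const _ = require('lodash');

// _.chunk(array, size) splits the array into groups of `size` elements...
_.chunk(['a', 'b', 'c', 'd', 'e'], 2);
// => [['a', 'b'], ['c', 'd'], ['e']]

// ...so for one chunk per core, divide the length by the core count:
const cores = 2;
const list = ['a', 'b', 'c', 'd', 'e'];
_.chunk(list, Math.ceil(list.length / cores));
// => [['a', 'b', 'c'], ['d', 'e']]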

damitj07
  • I would prefer clustering though, and you should use as many forks as the number of CPUs you have, so that all cores work at the same time (distribute the work between your processor's cores). But that doesn't seem to be in your answer; you can't just fork 1000 processes when your CPU has only 4 cores, as that will eventually end up making your system super slow. – Shubham Dixit Nov 18 '19 at 06:33
  • Well, I am not suggesting that. The logic above takes into account the number of cores available, divides the total amount of work, and delegates it to the spawned processes. Please read the logic carefully and let me know if you are led to believe otherwise. – damitj07 Nov 18 '19 at 06:36
  • That depends on the case; if you're writing a desktop app you can't scale to 1k threads, so it's a valid answer as well. – LeonanCarvalho Jul 16 '22 at 12:14
  • Nowadays, using worker threads would fit better. – LeonanCarvalho Jul 16 '22 at 12:14