
I am running a server using Node.js and need to request data from another server that I am running (localhost:3001). I need to make many requests (~200) to the data server and collect the data (response sizes vary from ~20 KB to ~20 MB). Each request is independent, and I would like to save the responses as one giant array of the form:

[{"urlAAA": responseAAA}, {"urlCCC": responseCCC}, {"urlBBB": responseBBB}, etc ]

Notice that the order of the items is unimportant; ideally they should fill the array in the order that the data becomes available.

var express = require('express');
var router = express.Router();
var async = require("async");
var papa = require("papaparse");
var sync_request = require('sync-request');
var request = require("request");

var pinnacle_data = {};
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    lookup_list.push(i);
}

function write_delayed_files(object, key, value) {
    object[key] = value;
    return;
}

var show_file = function (file_number) {
    var file_index = Math.round(Math.random() * 495) + 1;
    var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index.toString();
    var response_json = sync_request('GET', pinnacle_file_index);
    var pinnacle_json = JSON.parse(response_json.getBody('utf8'));
    var object_key = "file_" + file_number.toString();
    pinnacle_data[object_key] = pinnacle_json;
    console.log("We've handled file:    " + file_number);
    return;
};

async.each(lookup_list, show_file, function (err) {});



console.log(pinnacle_data);

/* GET contact us page. */
router.get('/', function (req, res, next) {
    res.render('predictionsWtaLinks', {title: 'Async Trial'});
});

module.exports = router;

Now when this program is run it displays:

We've handled file:    0
We've handled file:    1
We've handled file:    2
We've handled file:    3
We've handled file:    4
We've handled file:    5
etc

Now as the files vary so much in size I was expecting this to perform the requests "in parallel", but it seems to perform them sequentially, which is what I was trying to avoid by using async.each(). Currently it takes about 1-2 s to connect to the data server, so performing this over many files takes far too long.

I realise I am using synchronous requests, so ideally I would like to replace:

var response_json = sync_request('GET', pinnacle_file_index);

with something similar to

request(pinnacle_file_index, function (error, response, body) {
    if (!error && response.statusCode == 200) {
        pinnacle_data[object_key] = JSON.parse(body);
    }
});

Any help would be much appreciated.

Additionally I have looked at trying:

  • Converting the list of URLs into a list of anonymous functions and using async.parallel(function_list, function (err, results) { /* add results to pinnacle_data */ });. (I have encountered problems trying to define a unique function for each element in the array; see the sketch below.)
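
Roughly, the idea was something like the following sketch, using Array.prototype.map() so each element closes over its own URL (a sketch only, assuming the request module and the same localhost:3001 endpoint):

var async = require("async");
var request = require("request");

// Build one task per URL; each task closes over its own url.
var urls = [];
for (var i = 0; i < 20; i++) {
    urls.push('http://localhost:3001/generate?file=' + (Math.round(Math.random() * 495) + 1));
}

var function_list = urls.map(function (url) {
    // async.parallel() expects each task to take only a callback
    return function (callback) {
        request(url, function (error, response, body) {
            if (error || response.statusCode !== 200) {
                return callback(error || response.statusCode);
            }
            callback(null, JSON.parse(body));
        });
    };
});

async.parallel(function_list, function (err, results) {
    // results[i] corresponds to urls[i]; add them to pinnacle_data here
});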

Similarly I have looked at other related questions on this topic.

EDIT - WORKING SOLUTION


The following code now does the task (taking ~80 ms per request, including having to make repeated requests via npm requestretry). It also scales very well, with the average time per request staying at ~80 ms whether making 5 requests in total or 1,000.

var performance = require("performance-now");
var time_start = performance();
var async = require("async");
var request_retry = require('requestretry');

var lookup_list = [];
var total_requests = 50;
for (var i = 0; i < total_requests; i++) {
    lookup_list.push(i);
}

var pinnacle_data = {};
async.map(lookup_list, function (item, callback) {
        var file_index = Math.round(Math.random() * 495) + 1;
        var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index;
        request_retry({
                url: pinnacle_file_index,
                maxAttempts: 20,
                retryDelay: 20,
                retryStrategy: request_retry.RetryStrategies.HTTPOrNetworkError
            },
            function (error, response, body) {
                if (!error && response.statusCode == 200) {
                    body = JSON.parse(body);
                    var data_array = {};
                    data_array[file_index.toString()] = body;
                    callback(null, data_array);
                } else {
                    console.log(error);
                    callback(error || response.statusCode);
                }
            });
    },
    function (err, results) {
        var time_finish = performance();
        console.log("It took " + (time_finish - time_start).toFixed(3) + "ms to complete " + total_requests + " requests.");
        console.log("This gives an average rate of " + ((time_finish - time_start) / total_requests).toFixed(3) + " ms/request");
        if (!err) {
            for (var i = 0; i < results.length; i++) {
                for (var key in results[i]) {
                    pinnacle_data[key] = results[i][key];
                }
            }
            var length_array = Object.keys(pinnacle_data).length.toString();
            console.log("We've got all the data, totalling " + length_array + " unique entries.");
        } else {
            console.log("We had an error somewhere.");
        }
    });

Thanks for the help.

oliversm

3 Answers


As you have discovered, async.parallel() can only parallelize operations that are themselves asynchronous. If the operations are synchronous then, because of the single-threaded nature of node.js, they will run one after another, not in parallel. But if the operations are themselves asynchronous, then async.parallel() (or other async methods) will start them all at once and coordinate the results for you.

Here's a general idea using async.map(). I used async.map() because the idea there is that it takes an array as input and produces an array of results in the same order as the original, but runs all the requests in parallel which seems to line up with what you want:

var async = require("async");
var request = require("request");

// create list of URLs
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    var index = Math.round(Math.random() * 495) + 1;
    var url = 'http://localhost:3001/generate?file=' + index;
    lookup_list.push(url);
}

async.map(lookup_list, function(url, callback) {
    // iterator function
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            body = JSON.parse(body);
            // do any further processing of the data here
            callback(null, body);
        } else {
            callback(error || response.statusCode);
        }
    });
}, function(err, results) {
    // completion function
    if (!err) {
        // process all results in the array here
        console.log(results);
        for (var i = 0; i < results.length; i++) {
            // do something with results[i]
        }
    } else {
        // handle error here
    }
});

And, here's a version using Bluebird promises and somewhat similarly using Promise.map() to iterate the initial array:

var Promise = require("bluebird");
var request = Promise.promisifyAll(require("request"), {multiArgs: true});

// create list of URLs
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    var index = Math.round(Math.random() * 495) + 1;
    var url = 'http://localhost:3001/generate?file=' + index;
    lookup_list.push(url);
}

Promise.map(lookup_list, function(url) {
    return request.getAsync(url).spread(function(response, body) {
        if (response.statusCode !== 200) {
            throw response.statusCode;
        }
        return JSON.parse(body);
    });
}).then(function(results) {
    console.log(results);
    for (var i = 0; i < results.length; i++) {
        // process results[i] here
    }
}, function(err) {
    // process error here
});
jfriend00
  • The solution template you made in the first block of code works a charm, and gave a great improvement in performance. I have modified it, though, to use `requestretry` instead of `request`, as my other server kept crashing or having too many files open with such a bombardment of requests, although most of these errors vanished upon a slight delay and another request attempt. Thanks @jfriend00. – oliversm Sep 08 '15 at 09:43
    @oliversm - if you are doing so many parallel requests that you are overwhelming your server, then you may want to use `async.mapLimit()` where you can specify how many requests are allowed to be in-flight at the same time. This is useful for still running requests in parallel, but protecting against flooding the server and `.mapLimit()` will do all the work for you. You just pass an extra argument for how many simultaneous requests to allow. This would be much better than flooding the server, causing errors, then retrying. – jfriend00 Sep 08 '15 at 20:19
  • Thanks for the advice, currently I am hosting the data server myself and it isn't very 'heavy duty', so this restriction had reduced the number of errors the data server generates. However, when I get to the full production version I will ideally be bombarding someone else's server, which should be considerably more "heavy duty", and hopefully be able to cope with 200 simultaneous requests easily. – oliversm Sep 09 '15 at 15:07
  • @oliversm - You really should not assault some other server with 200 simultaneous requests. Many servers designed for big scale will actively prevent a single endpoint from doing that by rate limiting to protect their quality of service for others. Since there's no way that really any single server can actually process 200 requests at literally the same time, there's little point in your bombarding a server that way as most requests will just be queued anyway or you may trigger rate limiting. I'd suggest you back it off to 10-20 at a time. – jfriend00 Sep 09 '15 at 21:54
  • Thanks for the info, will do. – oliversm Sep 10 '15 at 14:41
  • @ningappa - I incorporated your `multiArgs` suggestion. Bluebird changed their default for that in one of their releases which probably created the issue you saw. – jfriend00 Oct 11 '16 at 19:14
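
A minimal sketch of the async.mapLimit() approach suggested in these comments, assuming the same localhost:3001 endpoint (the limit of 10 is illustrative):

var async = require("async");
var request = require("request");

var lookup_list = [];
for (var i = 0; i < 200; i++) {
    lookup_list.push('http://localhost:3001/generate?file=' + (Math.round(Math.random() * 495) + 1));
}

// At most 10 requests are in flight at any one time; results still
// come back in the same order as lookup_list.
async.mapLimit(lookup_list, 10, function (url, callback) {
    request(url, function (error, response, body) {
        if (error || response.statusCode !== 200) {
            return callback(error || response.statusCode);
        }
        callback(null, JSON.parse(body));
    });
}, function (err, results) {
    // process results here
});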

Sounds like you're just trying to download a bunch of URLs in parallel. This will do that:

var request = require('request');
var async = require('async');

var urls = ['http://microsoft.com', 'http://yahoo.com', 'http://google.com', 'http://amazon.com'];

var loaders = urls.map( function(url) {
  return function(callback) {
        request(url, callback);
  }
});

async.parallel(loaders, function(err, results) {
        if (err) throw(err); // ... handle appropriately
        // results will be an array of the results, in 
        // the same order as 'urls', even though the operation
        // was done in parallel
        console.log(results.length); // == urls.length
});

or even simpler, using async.map:

var request = require('request');
var async = require('async');

var urls = ['http://microsoft.com', 'http://yahoo.com', 'http://google.com', 'http://amazon.com'];

async.map(urls, request, function(err, results) {
        if (err) throw(err);          // handle error 
        console.log(results.length);  // == urls.length
});
caasjj
  • Can we push more URLs into the "urls" array after passing it to async.map? – Udit Kumawat May 25 '17 at 16:06
  • No need. You just map your new URLs from a new array. async.map just creates an array of functions that execute asynchronously, so you just repeat the process for whatever URLs you want to add. Even if you modify the urls array, it will have no impact on whatever functions you already created using async.map. – caasjj Jun 08 '17 at 03:17
  • Can I map each response to its respective URL using the second way? – TGW Nov 02 '17 at 05:13
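
For the last comment above: one way to pair each response with its URL (matching the [{url: response}, …] shape from the question) is to have the iterator wrap the body in an object keyed by the URL. A minimal sketch, not part of the original answer:

var async = require("async");
var request = require("request");

var urls = ['http://microsoft.com', 'http://yahoo.com', 'http://google.com', 'http://amazon.com'];

async.map(urls, function (url, callback) {
    request(url, function (error, response, body) {
        if (error) return callback(error);
        var entry = {};
        entry[url] = body; // pair the URL with its response body
        callback(null, entry);
    });
}, function (err, results) {
    if (err) throw(err);
    // results is e.g. [{ 'http://microsoft.com': '...' }, ...]
});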

Try this:

var async = require("async");
var request = require("request");

// pinnacle_data and lookup_list as defined in the question
var pinnacle_data = {};
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    lookup_list.push(i);
}

var show_file = function (file_number, cb) {
    // Synchronous setup: build the URL for a random file
    var file_index = Math.round(Math.random() * 495) + 1;
    var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index.toString();
    // Asynchronous request (the request module) -- this is what lets
    // async.each run the iterations in parallel
    request(pinnacle_file_index, function (error, response, body) {
        if (error)
            return cb(error);
        var object_key = "file_" + file_number.toString();
        pinnacle_data[object_key] = JSON.parse(body);
        return cb();
    });
};

async.each(
    lookup_list,
    show_file,
    function (err) {
        if (err) {
            console.log("Error", err);
        } else {
            console.log("It's ok");
            console.log(pinnacle_data);
        }
    });
cshion