I am currently using the piece of code below to connect to a massive list of links (2458 in total, dumped at https://pastebin.com/2wC8hwad), fetch feeds from numerous sources, and deliver them to users of my program.
It splits one massive array into batches (smaller arrays), then forks a process that requests every link in a batch and checks for a 200 status code. Only when a batch is complete is the next batch sent for processing, and when it's all done the forked process is disconnected. However, I'm seeing apparently inconsistent behavior from this logic, particularly in the part that makes the requests.
const childProcess = require('child_process')
const linkList = require('./links.json')
let processor
console.log(`Total length: ${linkList.length}`) // 2458 links
const batchLength = 400
const batchList = [] // Contains batches (arrays) of links
for (let i = 0; i < linkList.length; i += batchLength) {
  batchList.push(linkList.slice(i, i + batchLength)) // slice stops at the end, so the last batch is just shorter
}
console.log(`Batch list length by default is ${batchList.length}`)
// cutDownBatchList(1)
console.log(`New batch list length is ${batchList.length}`)
let failCount = 0
const startTime = new Date()
getBatchIsolated(0)
function getBatchIsolated (batchNumber) {
  console.log('Starting batch #' + batchNumber)
  let completedLinks = 0
  const currentBatch = batchList[batchNumber]
  if (!processor) processor = childProcess.fork('./request.js')
  processor.removeAllListeners('message') // drop the previous batch's listener so failures aren't counted once per stale listener
  processor.on('message', function (linkCompletion) {
    if (linkCompletion === 'failed') failCount++
    if (++completedLinks === currentBatch.length) {
      if (batchNumber !== batchList.length - 1) setTimeout(getBatchIsolated, 500, batchNumber + 1)
      else finish()
    }
  })
  for (const link of currentBatch) processor.send(link)
}
function finish () {
  console.log(`Completed, time taken: ${((new Date() - startTime) / 1000).toFixed(2)}s. (${failCount}/${linkList.length} failed)`)
  processor.disconnect()
}
function cutDownBatchList (maxBatches) {
  batchList.splice(maxBatches) // keep only the first maxBatches batches
  return batchList
}
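For reference, the batch math works out like this (just a sanity check with the literal sizes, not program output):

// 2458 links split into batches of 400:
const total = 2458
const perBatch = 400
console.log(Math.ceil(total / perBatch)) // 7 batches
console.log(total - 6 * perBatch) // the final batch holds the remaining 58 links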
Below is request.js, using needle. (For some strange reason it may completely hang up on a particular site indefinitely; in that case I fall back on a watchdog-style workaround, sketched after the code.)
const needle = require('needle')
function connect (link, callback) {
  const options = {
    timeout: 10000,
    read_timeout: 8000,
    follow_max: 5,
    rejectUnauthorized: true
  }
  let done = false // guard so a late socket error after a 200 header can't fire the callback twice
  const finish = err => {
    if (done) return
    done = true
    callback(err, link)
  }
  const request = needle.get(link, options)
    .on('header', (statusCode, headers) => {
      if (statusCode === 200) finish(null)
      else request.emit('err', new Error(`Bad status code (${statusCode})`))
    })
    .on('err', finish)
}
process.on('message', function (linkRequest) {
  connect(linkRequest, function (err, link) {
    if (err) {
      console.log(`Couldn't connect to ${link} (${err})`)
      process.send('failed')
    } else process.send('success')
  })
})
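The workaround I mentioned above is essentially a watchdog timer around connect(): if the request settles neither way within a deadline, the link is reported as failed and whatever arrives later is ignored. This is only a sketch of the idea (the function name and the deadline parameter are mine, not the exact code I linked):

function connectWithWatchdog (link, deadline, callback) {
  let settled = false
  const timer = setTimeout(() => {
    if (settled) return
    settled = true // nothing arrived within the deadline
    callback(new Error(`Watchdog timeout after ${deadline}ms`), link)
  }, deadline)
  connect(link, (err, result) => {
    if (settled) return // the watchdog already reported this link
    settled = true
    clearTimeout(timer)
    callback(err, result)
  })
}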
In theory, I think this should perform fine: it spawns a separate process to handle the dirty work in sequential batches, so nothing is overloaded and it should scale. However, when using the full list of 2458 links (7 batches), I get massive numbers of "socket hang up" errors on random batches on almost every trial, similar to what happens when I request all the links at once.
If I cut the number of batches down to 1 using cutDownBatchList, it performs fine on almost every trial. This is all happening on a Debian Linux VPS from OVH with two 3.1 GHz vCores and 4 GB RAM, running Node v6.11.2.
One more thing I noticed: if I increase the timeout in request.js to 30000 (30 s), the 7-batch run works as intended, yet a much lower timeout is enough when I cut it down to 1 batch. Doing all 2458 links at once with the higher timeout also causes no issues, which makes this mini batching algorithm pointless if it doesn't let me lower the timeout. It all comes back to the inconsistent behavior.
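A debug variant of the request.js message handler makes it easier to see which links actually approach the timeout; this is my own instrumentation layered on the handler shown earlier, not part of the program:

process.on('message', function (linkRequest) {
  const started = Date.now()
  connect(linkRequest, function (err, link) {
    const elapsed = Date.now() - started // how long this link took to settle
    if (err) {
      console.log(`Couldn't connect to ${link} after ${elapsed}ms (${err})`)
      process.send('failed')
    } else process.send('success')
  })
})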
The best TL;DR I can manage: requesting a bunch of links in sequential batches in a forked child process succeeds almost every time with a small number of batches, but fails consistently with the full number of batches, even though the behavior should be identical since each batch is handled in isolation.
Any help in solving this would be greatly appreciated, as I cannot for the life of me figure it out!