Here's a scheme that runs a user-controllable number of `getFile()` operations in parallel. You set the `maxInFlight` constant to however many pages you want running at once; the right value is probably governed by your memory usage and any rate limiting that Facebook might apply, so you'll have to settle on it through experimentation. I've set it initially to 10, which allows 10 pages to be "in flight" at the same time.

The general idea is that `getFile()` increments/decrements `inFlightCntr` as a measure of how many pages are open at once, and the `csvPipe` stream is paused or resumed based on that counter.
```js
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const maxInFlight = 10;     // set this value to control how many pages run in parallel
    let inFlightCntr = 0;
    let paused = false;

    async function getFile(rowId, path) {
        let page;
        try {
            ++inFlightCntr;
            page = await browser.newPage();
            await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
            let url = 'https://www.facebook.com/ads/library/?id=' + rowId;
            await page.goto(url, { waitUntil: 'networkidle2' });
            await page.waitFor(3000);   // on newer puppeteer versions, use page.waitForTimeout()
            const body = await page.$('body');
            await body.screenshot({ path });
        } catch (e) {
            console.log(e);
        } finally {
            --inFlightCntr;
            if (page) {
                await page.close();
            }
        }
    }

    let fname = 'ids.csv';
    const csvPipe = fs.createReadStream(fname).pipe(csv());

    csvPipe.on('data', (row) => {
        let id = row.ad_id;
        console.log(id);
        let path = './images/' + id + '.png';
        getFile(id, path).finally(() => {
            // a slot just freed up - resume the stream if it was paused
            if (paused && inFlightCntr < maxInFlight) {
                csvPipe.resume();
                paused = false;
            }
        });
        // too many pages in flight - pause the stream until one finishes
        if (!paused && inFlightCntr >= maxInFlight) {
            csvPipe.pause();
            paused = true;
        }
    }).on('end', () => {
        // the whole CSV has been read here (screenshots may still be finishing)
        console.log('CSV file successfully processed');
    });
})();
```
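Note that in this version the `end` event only tells you the CSV itself has been fully read; the last few screenshots may still be in flight, and the browser is never closed. If you want a true "all done" signal, one option (a minimal sketch layered on the code above; the `pending` array is my own addition) is to collect each `getFile()` promise and wait on them all in the `end` handler:

```js
// sketch: track outstanding getFile() promises so we know when everything is done
const pending = [];

csvPipe.on('data', (row) => {
    // ...same pause/resume logic as above...
    pending.push(getFile(row.ad_id, './images/' + row.ad_id + '.png'));
}).on('end', async () => {
    await Promise.allSettled(pending);      // wait for every screenshot to settle
    await browser.close();
    console.log('all screenshots finished');
});
```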
The code could be a bit simpler if you just ran the `csvPipe` to collect all the rows into an array before processing any of them. Then you could use any of a number of promise concurrency functions to process the array while controlling how many run in parallel. See this answer from yesterday for several functions that let you manage concurrency when processing an array in parallel. Here's how that implementation would look:
```js
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const maxInFlight = 10;     // set this value to control how many pages run in parallel
    const fname = 'ids.csv';
    const csvPipe = fs.createReadStream(fname).pipe(csv());
    const rowIDs = [];

    async function getFile(rowId, path) {
        let page;
        try {
            page = await browser.newPage();
            await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
            let url = 'https://www.facebook.com/ads/library/?id=' + rowId;
            await page.goto(url, { waitUntil: 'networkidle2' });
            await page.waitFor(3000);   // on newer puppeteer versions, use page.waitForTimeout()
            const body = await page.$('body');
            await body.screenshot({ path });
        } catch (e) {
            console.log(e);
        } finally {
            if (page) {
                await page.close();
            }
        }
    }

    csvPipe.on('data', row => {
        rowIDs.push(row.ad_id);
    }).on('end', () => {
        // all rowIDs are in the array now
        pMap(rowIDs, (id) => {
            let path = './images/' + id + '.png';
            return getFile(id, path);
        }, maxInFlight).then(() => {
            console.log("all items processed");     // all done now
        }).catch(err => {
            console.log(err);
        });
    });
})();
// utility function for processing an array asynchronously with
// no more than limit items "in flight" at the same time
function pMap(array, fn, limit) {
    return new Promise((resolve, reject) => {
        let index = 0, cnt = 0, stop = false;
        const results = new Array(array.length);

        function run() {
            // launch more operations until we hit the limit or run out of items
            while (!stop && index < array.length && cnt < limit) {
                const i = index++;
                ++cnt;
                fn(array[i]).then(data => {
                    results[i] = data;
                    --cnt;
                    // see if we are done or should run more requests
                    if (cnt === 0 && index === array.length) {
                        resolve(results);
                    } else {
                        run();
                    }
                }, err => {
                    // set stop flag so no more requests will be sent
                    stop = true;
                    --cnt;
                    reject(err);
                });
            }
        }
        run();
    });
}
```
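Since `pMap()` is generic, it can be exercised on its own. Here's a quick hypothetical usage sketch, where `fetchOne()` is just a stand-in for any promise-returning operation:

```js
// hypothetical stand-in for any promise-returning operation
function fetchOne(id) {
    return new Promise(resolve => setTimeout(() => resolve('result-' + id), 100));
}

// process 50 ids, never more than 5 at a time
pMap([...Array(50).keys()], fetchOne, 5).then(results => {
    console.log(results.length);    // 50, with results in original array order
}).catch(err => {
    console.log(err);
});
```

If you'd rather not roll your own, the `p-map` package on npm offers similar functionality with a configurable concurrency option.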