I'm trying to create a Node.js web scraper. The overall operation of this scraper is:
- Grab an array of URLs from the database. Return it in a promise.
- Send a request to each URL from the database and scrape the data. Return the results in a promise.
- Insert the scraped data into the database.
I want to be able to compose my steps like so.
getUrls()
.then(scrapeData)
.then(insertData);
However, I'm finding that in order to do this, I must wait for ALL of the data from every URL to resolve within step 2 (using Promise.all) before the chain can proceed to the next step.
This could pose problems because I could be sending requests to thousands of URLs, and if one of them fails during Promise.all, all of the data gathered so far is lost.
I would much rather have each function operate like so:
getUrls() //grab array of all urls (could be thousands)
.then(scrapeData) // for each url scrape data and immediately proceed to chained function
.then(insertData);
In short, is there a procedural way to iterate through a promise chain and control when the data is waited for?
My Code:
var express = require('express');
var app = express();
var request = require('request');
var cheerio = require('cheerio');
app.get('/', (req, res) => {
// Database driver; loaded when the route handler runs.
const sql = require("mssql");

// Connection settings handed to sql.connect(); fill in the blanks for your database.
const config = {
  user: '',
  password: '',
  server: '',
  database: '',
  options: {
    // Use this if you're on Windows Azure
    encrypt: false,
  },
};
/**
 * Fetch the product rows from the database.
 *
 * Connects with `sql.connect(config, ...)`, runs the query on a fresh
 * `sql.Request`, and settles the returned promise from the callbacks.
 *
 * @returns {Promise<*>} Resolves with the `recordset` value handed to the
 *   query callback; rejects with the connection error or the query error.
 */
const getSkus = () => {
  return new Promise((resolve, reject) => {
    sql.connect(config, (connectErr) => {
      if (connectErr) {
        // Without a connection the query below cannot succeed, so fail the
        // promise here instead of only logging and carrying on.
        console.log(connectErr);
        reject(connectErr);
        return;
      }

      // Named dbRequest so it does not shadow the HTTP `request` module.
      const dbRequest = new sql.Request();

      // query to the database and get the records
      dbRequest.query('SELECT URL FROM PRODUCTS', (err, recordset) => {
        if (err) {
          console.log("There was an error executing the SQL statement: " + err);
          reject(err);
        } else {
          resolve(recordset);
        }
      });
    });
  });
};
/**
 * Build one search URL per SKU record.
 *
 * Each record's `MPN` value is appended to the search base URL as the `q`
 * query parameter. The value is percent-encoded so characters such as
 * spaces, `&` or `#` cannot truncate or split the query string.
 *
 * @param {Array<{MPN: *}>} skus - Records to build URLs for.
 * @returns {string[]} One URL per input record, in input order.
 */
const urlGen = (skus) => {
  const base_url = 'http://somesite.com/search/?q=';
  return skus.map((sku) => base_url + encodeURIComponent(sku.MPN));
};
/**
 * Request every URL and scrape the response.
 *
 * One promise is created per URL. A promise rejects when the request
 * reports an error, when the response status is not 200, or when the
 * scraping step throws; otherwise it resolves with the scraped value.
 *
 * The combined promise comes from Promise.all, which is fail-fast: the first
 * rejection rejects the whole batch. A caller that needs the successful
 * results despite individual failures would need Promise.allSettled.
 *
 * @param {string[]} urls - URLs to request.
 * @returns {Promise<Array>} Resolves with one scraped value per URL, in
 *   input order.
 */
const makeRequests = (urls) => {
  const promises = urls.map((url) => new Promise((resolve, reject) => {
    request(url, (err, response, html) => {
      if (err || !response || response.statusCode !== 200) {
        // `err` is null for a plain HTTP error status, so build a real Error
        // rather than rejecting with null.
        const status = response ? response.statusCode : 'unknown';
        reject(err || new Error('Request to ' + url + ' failed with status ' + status));
        return;
      }

      try {
        // Placeholder for the scraped result; stays undefined until the
        // scraping step below is filled in.
        let jsontemp;
        //do scraping here
        resolve(jsontemp);
      } catch (scrapeErr) {
        console.log('Error occured during data scraping:');
        reject(scrapeErr);
      }
    });
  }));

  return Promise.all(promises);
};
// Run the pipeline for this request: load the rows, build the URLs, scrape
// them, then answer the client. A response is sent on both the success and
// the failure path so the HTTP request never hangs waiting for a reply.
getSkus()
  .then(urlGen)
  .then(makeRequests)
  .then((results) => {
    res.json(results);
  })
  .catch((e) => {
    console.log(e);
    // Skip the error reply if a response has already started.
    if (!res.headersSent) {
      res.status(500).send('Scraping failed');
    }
  });
});
// Start the HTTP server on port 5000 and log once it is listening.
const server = app.listen(5000, () => {
  console.log('Server is running..');
});