I want to scrape some webpages and extract data from them in Node.js. My code works, but it takes almost a minute to finish scraping and return all the data. I've used an async function for each website and promises to gather all the information. There are at most 100 links to process, and I think the running time is too long for that. Is there an issue in my code's structure (the usage of request-promise, promises, async/await, etc.) that causes the delay?

All the functions can run in parallel/asynchronously, but my constraint is that I need to wait until all the results come back from every website. I've limited the timeout of each request to 10 seconds; if I decrease it much more, the ETIMEDOUT, ECONNRESET, and ESOCKETTIMEDOUT errors I'm already getting (and still couldn't get rid of) increase.
Here is one of my scraping functions:
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const Promise = require('bluebird');

async function ntv() {
    var posts = [];
    try {
        const baseUrl = 'http://www.ntv.com';
        const mainHtml = await rp({ uri: baseUrl, timeout: 10000 });
        const $ = cheerio.load(mainHtml);
        // Collect the article links from the front-page slider
        const links = $(".swiper-slide")
            .children("a")
            .map((i, el) => {
                return baseUrl + $(el).attr("href");
            })
            .get();
        // Fetch every article and pull its Open Graph metadata
        posts = await Promise.map(links, async (link) => {
            try {
                const newsHtml = await rp({ uri: link, timeout: 10000 });
                const $ = cheerio.load(newsHtml);
                return {
                    title: $("meta[property='og:title']").attr("content"),
                    image: $("meta[property='og:image']").attr("content"),
                    summary: $("meta[property='og:description']").attr("content")
                };
            } catch (err) {
                // Note: on error nothing is returned, so posts can contain undefined entries
                if (err.message === 'Error: ETIMEDOUT') console.log('TIMEOUT error ' + link);
                else if (err.message === 'Error: read ECONNRESET') console.log('CONNECTION RESET error ' + link);
                else if (err.message === 'Error: ESOCKETTIMEDOUT') console.log('SOCKET TIMEOUT error ' + link);
                else console.log(err);
            }
        });
    } catch (e) {
        console.log(e);
    }
    return posts;
}
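Every request in every function is configured the same way. If it's relevant, the shared setup is equivalent to this sketch (not my actual code; it assumes request-promise's .defaults(), which it inherits from request):

const rpWithTimeout = require('request-promise').defaults({
    timeout: 10000  // the same per-request limit used in every call above
});
// Each fetch then becomes: await rpWithTimeout({ uri: link });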
My main function that runs all these scraping functions is this:
var Promise = require("bluebird");
var fs = require("fs");

async function getData() {
    const sourceFunc = [func1(), func2(), ... , func10()];
    var news = [];
    // Wait for every source and collect whatever each one returns
    await Promise.map(sourceFunc, async (getNews) => {
        try {
            const currentNews = await getNews;
            news = news.concat(currentNews);
        } catch (err) {
            console.log(err);
        }
    }, { concurrency: 10 });
    // Newest first
    news.sort(function (a, b) {
        return new Date(b.time) - new Date(a.time);
    });
    fs.writeFile('./news.json', JSON.stringify(news, null, 3), (err) => {
        if (err) throw err;
    });
    return news;
}
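For completeness, this is roughly how I call it and how I measured the ~1 minute (a hypothetical entry point; console.time is just my stopwatch):

// Hypothetical entry point, used only to time the full run.
console.time('scrape');
getData()
    .then(news => {
        console.timeEnd('scrape'); // consistently close to one minute
        console.log('collected ' + news.length + ' items');
    })
    .catch(err => console.error(err));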