The .txt file contains 30,000 URLs to scrape. While developing the program I tested it with 10 URLs and everything worked fine, but as soon as I switched to the 30,000-URL .txt file, it crashes after a few minutes. I think it starts reading the .txt file and then crashes due to memory issues. Here are the console output and my code. What is the best way to handle such a file?
FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory 1: 0x100ba0c4a node::Abort() (.cold.1) [/usr/local/bin/node] 2: 0x100084961 node::FatalError(char const*, char const*) [/usr/local/bin/node] 3: 0x100084a89 node::OnFatalError(char const*, char const*) [/usr/local/bin/node] 4: 0x10017fa4d v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node] 5: 0x10017f9f7 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node] 6: 0x100299baf v8::internal::Heap::FatalProcessOutOfMemory(char const*) [/usr/local/bin/node] 7: 0x10029af4c v8::internal::Heap::MarkCompactPrologue() [/usr/local/bin/node] 8: 0x100298b04 v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector, v8::GCCallbackFlags) [/usr/local/bin/node] 9: 0x1002975ab v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace, v8::internal::GarbageCollectionReason, v8::GCCallbackFlags) [/usr/local/bin/node] 10: 0x100296a2a v8::internal::Heap::HandleGCRequest() [/usr/local/bin/node] 11: 0x10026d9a5 v8::internal::StackGuard::HandleInterrupts() [/usr/local/bin/node] 12: 0x1004e1383 v8::internal::Runtime_StackGuard(int, unsigned long*, v8::internal::Isolate*) [/usr/local/bin/node] 13: 0x1007502f9 Builtins_CEntry_Return1_DontSaveFPRegs_ArgvOnStack_NoBuiltinExit [/usr/local/bin/node] 14: 0x10073c5fb Builtins_StringPrototypeMatch [/usr/local/bin/node] 15: 0x267b75f209cb zsh: abort node scrape.js
const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");
// Module-level setup: a user-agent generator, request options, and the MySQL pool.

// User agent restricted to the 'desktop' device category, created once at load time.
let userAgent = new UserAgent({ deviceCategory: 'desktop' });
// NOTE(review): the shorthand below creates a header literally named "userAgent"
// whose value is the UserAgent object, not a 'User-Agent' string. `options` is not
// referenced by any code visible in this file — confirm whether it is still needed.
let options = {
headers: { userAgent }
};
// NOTE(review): `exec` is not referenced by any code visible in this file.
let exec = require('child_process').exec;
// Promise-based mysql2 API, so `con.query(...)` can be awaited.
const mysql = require('mysql2/promise');
// Connection pool used for the product inserts; host/user/password/database
// are placeholders here and must be replaced with real connection settings.
let con = mysql.createPool({
host: "xxx.xxx.xxx.xxx",
user: "xxx",
password: "xxxx",
database: "xxx"
});
/**
 * Expands every base URL into its paginated variants by appending
 * "&page=1" .. "&page=<pagesPerUrl>".
 *
 * @param {string[]} baseUrls - Base URLs, one entry per URL.
 * @param {number} [pagesPerUrl=5] - Number of pages to generate per base URL.
 * @returns {string[]} The paginated URLs, grouped by base URL in input order.
 */
function buildPagedUrls(baseUrls, pagesPerUrl = 5) {
  const pagedUrls = [];
  for (const baseUrl of baseUrls) {
    for (let page = 1; page <= pagesPerUrl; page++) {
      pagedUrls.push(`${baseUrl}&page=${page}`);
    }
  }
  return pagedUrls;
}

/**
 * Reads the base URLs from "urls.txt" (one per line), expands each into five
 * paginated URLs and scrapes them with `parseUrl`, at most 10 at a time.
 *
 * The returned promise settles only after every URL has been processed, so
 * callers can rely on it to know when the whole job is finished.
 *
 * @returns {Promise<void>}
 * @throws If "urls.txt" cannot be read, or if `Promise.map` rejects.
 */
async function run() {
  const contents = fs.readFileSync('urls.txt', 'utf8');
  // Accept both CRLF and LF line endings and drop blank lines, so a trailing
  // newline does not produce a bogus empty URL.
  const urls = contents
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  // Only the count is logged; dumping the full list is unhelpful for large inputs.
  console.log("numeroUrl : " + urls.length);

  // Every URL in the file is expanded (the previous loop indexed by the page
  // counter, so only the first five URLs were ever used).
  const urlsArray = buildPagedUrls(urls);

  // Awaiting keeps the job's completion and failures visible to the caller.
  await Promise.map(urlsArray, parseUrl, { concurrency: 10 });
  // all done here
  console.log("Done!!!");
}
/**
 * Appends a "<url> - <status>" line to the error log file (`fileUrlErrors`).
 * A failure to write is reported on stderr and not rethrown, so a logging
 * problem never aborts the scrape.
 *
 * @param {string} url - The URL that produced the unexpected status.
 * @param {number} status - The HTTP status code received.
 * @returns {Promise<void>}
 */
async function logUrlError(url, status) {
  try {
    await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + status);
    console.log('Saved Url error to ' + fileUrlErrors);
  } catch (error) {
    console.error(`Could not save the url status error to a file: ${error}`);
  }
}

/**
 * Downloads one page, extracts every ".item" product (title and price) and
 * upserts each one into the `Items` table.
 *
 * Responses with a status of 201 or above, and failed requests that carry a
 * response, are recorded in the error log file. Any other failure (network
 * error, database error, ...) is reported on stderr. The function never
 * rejects, so one bad URL does not stop the remaining ones.
 *
 * @param {string} url - Fully-qualified page URL to scrape.
 * @returns {Promise<void>}
 */
async function parseUrl(url) {
  try {
    const response = await axios.get(url, {
      headers: {
        // Send the user-agent as a string; the header value must not be an object.
        'User-Agent': new UserAgent().toString()
      }
    });
    console.log(url + " " + response.status);

    if (response.status >= 201) {
      await logUrlError(url, response.status);
    } else if (response.status === 200) {
      const $ = cheerio.load(response.data);
      // Locally scoped; this used to leak as an implicit global.
      const items = $(".item").get();
      for (const item of items) {
        // Fall back to the alternative selectors when the primary one is empty.
        const title = $(".title", item).text() || $(".title2", item).text();
        const price = $(".price", item).text() || $(".price2", item).text();
        if (title) {
          // Nested array: mysql2 expands `VALUES ?` into one row per inner array.
          const prodotto = [
            [title, price]
          ];
          await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }
      }
    }
  } catch (error) {
    if (error.response) {
      // Request made and server responded
      await logUrlError(url, error.response.status);
    } else {
      // No HTTP response (network failure, DB error, ...): report it instead
      // of dropping it silently.
      console.error(`Failed to process ${url}: ${error && error.message ? error.message : error}`);
    }
  }
}
// Script entry point: start the scrape, then report completion or the failure.
const reportFinished = () => {
  console.log("Done!");
};
const reportFailure = (err) => {
  console.log(err);
};
run().then(reportFinished).catch(reportFailure);