
Inside the .txt file there are 30,000 URLs to scrape. When I wrote the program I tested it with 10 URLs and everything was fine, but as soon as I switched to the 30k-URL .txt file it crashes after a few minutes. I think it starts to read the .txt file and then crashes due to memory issues. Here are the console output and my code. What's the best way to handle such a file?

FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
 1: 0x100ba0c4a node::Abort() (.cold.1) [/usr/local/bin/node]
 2: 0x100084961 node::FatalError(char const*, char const*) [/usr/local/bin/node]
 3: 0x100084a89 node::OnFatalError(char const*, char const*) [/usr/local/bin/node]
 4: 0x10017fa4d v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
 5: 0x10017f9f7 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
 6: 0x100299baf v8::internal::Heap::FatalProcessOutOfMemory(char const*) [/usr/local/bin/node]
 7: 0x10029af4c v8::internal::Heap::MarkCompactPrologue() [/usr/local/bin/node]
 8: 0x100298b04 v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector, v8::GCCallbackFlags) [/usr/local/bin/node]
 9: 0x1002975ab v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace, v8::internal::GarbageCollectionReason, v8::GCCallbackFlags) [/usr/local/bin/node]
10: 0x100296a2a v8::internal::Heap::HandleGCRequest() [/usr/local/bin/node]
11: 0x10026d9a5 v8::internal::StackGuard::HandleInterrupts() [/usr/local/bin/node]
12: 0x1004e1383 v8::internal::Runtime_StackGuard(int, unsigned long*, v8::internal::Isolate*) [/usr/local/bin/node]
13: 0x1007502f9 Builtins_CEntry_Return1_DontSaveFPRegs_ArgvOnStack_NoBuiltinExit [/usr/local/bin/node]
14: 0x10073c5fb Builtins_StringPrototypeMatch [/usr/local/bin/node]
15: 0x267b75f209cb
zsh: abort      node scrape.js

const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");

let userAgent = new UserAgent({ deviceCategory: 'desktop' });
let options = {
  headers: { userAgent }
};
let exec = require('child_process').exec;

const mysql = require('mysql2/promise');
let con = mysql.createPool({
      host: "xxx.xxx.xxx.xxx",
      user: "xxx",
      password: "xxxx",
      database: "xxx"
    });


async function run() {

    let file = fs.readFileSync('urls.txt');
    let urls = file.toString().split('\r\n');

    console.log(urls);

    const numeroUrl = urls.length;
    let urlsArray = [];
    console.log("numeroUrl : " + numeroUrl);
    for (let i = 1; i < numeroUrl; i++) {
      for (let y = 1; y < 6; y++) {
        urlsArray.push(urls[y-1] + '&page=' + y);
      }
    }
    Promise.map(urlsArray, parseUrl, {concurrency: 10}).then(function(data) {
        // all done here
        console.log("Done!!!");
    });
}


async function parseUrl(url) {
  try {
    let response = await axios.get(url, {
      headers: { 
        'User-Agent': new UserAgent() 
      }  
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {

      fs.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status, (error) => {
        if (error) {
            console.error(`Could not save the url status error to a file: ${error}`);
            return;
        }

        console.log('Saved Url error to ' + fileUrlErrors);
        });  
      
    } else if (response.status == 200) {

      let $ = cheerio.load(response.data);

      prodotti = $(".item");

      let items = $(prodotti).get();

      for (let item of items) {

        let title = $(".title", item).text();
        if (!title) {
          title = $(".title2", item).text();
        }
        
        let price = $(".price", item).text();
        if (!price) {
          price = $(".price2", item).text();
        }
        

        if (title) {
          const prodotto = [
          [
          title,
          price]
          ];
          let result = await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }

        }
    } 

    } catch (error) {
        //console.error(error);
        if (error.response) {
          // Request made and server responded
            await fs.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status, (error) => {
            if (error) {
                console.error(`Could not save the url status error to a file: ${error}`);
                return;
            }
            console.log('Saved Url error to ' + fileUrlErrors);
            });  
        }

    }
}

run().then(() => {
    console.log("Done!");
}).catch(err => {
    console.log(err);
});
vamp
  • I bet it depends on what's happening in parseUrl. – ThisIsNoZaku Jul 01 '20 at 21:46
  • Show us `parseUrl()`. If that contains asynchronous operations, then you're running too many of them concurrently and you need to use something like `mapConcurrent()` [here](https://stackoverflow.com/questions/46654265/promise-all-consumes-all-my-ram/46654592#46654592) to process all your URLs N at a time instead of all at a time. – jfriend00 Jul 01 '20 at 21:48
  • big giant loop seems like a bad idea. A queue type of system seems better. – epascarello Jul 01 '20 at 21:48
  • I’m voting to close this question because this belongs on [Code Review](https://codereview.stackexchange.com), another site in the Stack Exchange network. – AndrewL64 Jul 01 '20 at 21:48
  • @AndrewL64 - Code review is for improving working code. This doesn't work - it runs out of memory because it's the wrong code for a problem like this. – jfriend00 Jul 01 '20 at 21:49
  • `testing it with 10 urls and everything was fine, as soon as I made the 30k url file .txt it crashes after few minutes` -- I think his code works but he wants to improve it so it won't run out of memory. – AndrewL64 Jul 01 '20 at 21:50
  • @AndrewL64 [Don't use the existence of Code Review as a reason to close a question](//meta.stackoverflow.com/q/287400). Evaluate the question and use a reason like; needs focus, primarily opinion-based, etc. – Peilonrayz Jul 01 '20 at 21:52
  • @Peilonrayz I guess I made a wrong call but he stated that his code works with x amount of data but he wants it to work with xx amount of data so I thought that counts as "improving/optimising current **working code**". – AndrewL64 Jul 01 '20 at 21:54
  • @jfriend00 if `parseUrl` is async, could he use `await parseUrl()` in a `for of` loop, or would it still have a memory leak? – Nico Jul 01 '20 at 21:55
  • @AndrewL64 I have not contested that. – Peilonrayz Jul 01 '20 at 21:56
  • @Peilonrayz I wasn't implying that you are tbh. Just stating the reason from evaluating the question as you mentioned. – AndrewL64 Jul 01 '20 at 21:58
  • @Nico - If he wants to process only one at a time, he could use `await` with promises. But, typically, one gets much better performance processing N at a time, where N is tuned to the operation/environment and is often something like 5-10. – jfriend00 Jul 01 '20 at 22:15
  • We asked to see the code for `parseUrl()` and you added code for `parseCategories()`. I'm confused. – jfriend00 Jul 01 '20 at 22:16
  • @jfriend00 Sorry if I posted it in the wrong section; anyway, I've updated the code with the missing function. – vamp Jul 01 '20 at 22:17
  • `parseUrl()` is flawed because you've wrapped regular asynchronous callbacks in an `async` function which means you don't have any way to communicate back when that function is actually done. So you can't use `parseUrl()` successfully with `await` or any of the other management tools that work off promises. Do not mix promises and plain callbacks - it will not get you what you want. – jfriend00 Jul 01 '20 at 22:19
  • @jfriend00 I started with JavaScript 2 days ago, and the asynchronous mechanism is a bit confusing for me since I'm not used to it. So in my case the best thing is to just use promises? Thanks for your help! – vamp Jul 01 '20 at 22:27

2 Answers


As discussed in the comments, your parseUrl() function has a mix of promises and plain asynchronous callbacks, and that's a disaster. You really cannot mix them. The best solution is to do all your asynchronous control flow with promises, and if you have some asynchronous callbacks that don't return promises, promisify them: either manually, with util.promisify(), by using the promisified version of the API, or by getting a version of the library that contains promise support.
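
For example, here is a minimal sketch of the util.promisify() route, using fs.appendFile as the callback-based API and an illustrative wrapper name (appendFileAsync):

const util = require('util');
const fs = require('fs');

// wrap the callback-based fs.appendFile into a function that returns a promise
const appendFileAsync = util.promisify(fs.appendFile);

async function logError(line) {
    // the append can now be awaited, and failures propagate as promise rejections
    await appendFileAsync('UrlsWithErrors.txt', line + '\n');
}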

Once you've converted everything to promise-based control flow, you can use async and await and other promise control-flow tools. Only then will your parseUrl() function return a promise that resolves when all the underlying asynchronous operations are done, and only then will you get proper error propagation too.

Here's an example that fixes parseUrl() to properly use promises for all asynchronous operations and then uses await in your for loop to process your urls one at a time, so you won't have all of them in flight at the same time taking up too much memory:

const fs = require('fs');

async function run() {
    const file = fs.readFileSync('urls.txt');
    const urls = file.toString().split('\r\n');

    // count the number of urls inside .txt file
    const numberOfUrls = urls.length;
    console.log("There are : " + numberOfUrls + " urls");

    // Add page to url and use the scrape function
    for (let i = 1; i < numberOfUrls; i++) {
        for (let y = 1; y < 6; y++) {
            let url = urls[i - 1] + '&page=' + y;
            await parseUrl(url);
        }
    }

    async function parseUrl(url) {
        try {
            const response = await axios.get(url, {
                headers: {
                    'User-Agent': new UserAgent()
                }
            });
            if (response.status >= 201) {
                await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
            }  else if (response.status == 200) {
                const $ = cheerio.load(response.data);
                const prodotti = $(".result");

                // get items into a normal array so we can use a normal for loop
                const items = $(prodotti).get();
                for (let item of items) {
                    const title = $("title", item).text();
                    const code = $(".code", item).text();
                    if (title) {
                        const prodotto = [
                            [title, code]
                        ];
                        // promise support in your mysql database requires the mysql2 module
                        const result = await con.query("INSERT INTO Items (title, code) VALUES ? ON DUPLICATE KEY UPDATE code=VALUES(code)", [prodotto]);
                        console.log('Prodotto ' + code + ' inserito nel DB.');
                        console.log(prodotto);
                    }

                }
            }
        } catch (error) {
            console.error(error);
            throw error;          // propagate error back to caller
        }
    }
}

run().then(() => {
    console.log("all done");
}).catch(err => {
    console.log(err);
});

Note: the best place to get promise support for your mysql database (I'm guessing that's the database you're using) is the mysql2 module on NPM. You will also probably have to change your other code that sets up the database connection to use promises (code which you don't show here).
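
For reference, a minimal sketch of what that promise-based setup can look like with mysql2 (the connection details here are placeholders):

const mysql = require('mysql2/promise');

// createPool() from mysql2/promise returns a pool whose query() returns a promise
const pool = mysql.createPool({
    host: 'localhost',
    user: 'user',
    password: 'password',
    database: 'mydb'
});

async function saveItem(title, price) {
    // mysql2/promise resolves with [rows, fields], so destructure the first element
    const [result] = await pool.query(
        'INSERT INTO Items (title, price) VALUES (?, ?)',
        [title, price]
    );
    return result;
}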

Summary of Changes:

  1. Switch to fs.promises.appendFile() to get promise support.
  2. Change all variable declarations to let or const. No use of var.
  3. Declare all variables properly so there are no accidental globals.
  4. Switch to the mysql2 module so you can use promise support for con.query().
  5. Change the two separate if statements to if/else if, since that's really what your logic is.
  6. file.toString().split('\r\n') is already an array so there's no need to use Object.keys() on it to get the length. You can just directly reference the .length property on the array.

For Better Performance

The cause of your memory issue was that you were running ALL the requests in flight at the same time which uses a maximum amount of memory. The solution above goes all the way to the other side and runs ONE request at a time. This will use the minimum amount of memory, but it also sacrifices some end-to-end performance. For maximal performance, while using a bit more memory, you can often gain a lot by running N requests in flight at the same time where N depends upon your circumstances but can often be 3-10. This keeps the network busy fetching data while your CPU is busy working on previous responses.

To do that, you need some code that manages your asynchronous operations to run N at a time. One such function to do that is mapConcurrent() and is illustrated here. It iterates your array for you, firing up N requests and then each time one of those N finishes, it fires off the next one, gathering results in order into a final array for you.
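
If you'd rather not pull in a library (Bluebird's Promise.map() with a concurrency option does the same job), here is a minimal sketch of such a function, assuming fn returns a promise for each item:

// run fn(item) over array with at most maxConcurrent operations in flight;
// resolves with the results in the original array order
function mapConcurrent(array, maxConcurrent, fn) {
    return new Promise((resolve, reject) => {
        const results = new Array(array.length);
        let nextIndex = 0;   // next item to start
        let inFlight = 0;    // operations currently running
        let doneCount = 0;   // operations finished

        if (array.length === 0) {
            resolve(results);
            return;
        }

        function runMore() {
            // start items until we hit the concurrency limit or run out of items
            while (inFlight < maxConcurrent && nextIndex < array.length) {
                const i = nextIndex++;
                inFlight++;
                Promise.resolve(fn(array[i])).then(result => {
                    results[i] = result;
                    inFlight--;
                    doneCount++;
                    if (doneCount === array.length) {
                        resolve(results);
                    } else {
                        runMore();
                    }
                }).catch(reject);
            }
        }
        runMore();
    });
}

// usage: at most 10 urls in flight at any time
// mapConcurrent(urlsArray, 10, parseUrl).then(() => console.log("all done"));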

jfriend00
  • Thank you very much for your reply, it's a very thorough answer! I just have an issue: when I run the script it says "await parseUrl(url); ^^^^^ SyntaxError: await is only valid in async function". I used the code you posted; isn't async function parseUrl(url) an async function? I have no idea what I'm doing wrong. – vamp Jul 02 '20 at 15:53
  • @vamp - I modified the code in my answer. Since the code was trying to use `await` at the top level, we just had to wrap that top level in an `async` function and then call it. – jfriend00 Jul 02 '20 at 20:17
  • thanks! I'm using mysql2 now but I'm having issues with "const result = await con.query... ... SyntaxError: await is only valid in async function". I think the problem is that the mysql query is inside the loop "$(prodotti).each(function(i, item)"; if I place the mysql query outside that loop it says prodotto is not declared. Is there a workaround? – vamp Jul 02 '20 at 22:56
  • @vamp - Yep, another place where you had a non-async callback function that you can't use `await` inside of. I've rewritten that part of the code to use a regular `for` loop so you can use `await`. The cheerio/jQuery methods like `.each()` are not async-aware so you tend to want to avoid them if using asynchronous operations on the iteration. Much better to use a native `for` loop. – jfriend00 Jul 02 '20 at 23:05
  • I've added "Promise.map(urlsArray, parseUrl, {concurrency: 10}).then(function(data) { // all done here console.log("Done!"); }); " inside the function run () . It's partially working because it parses all urls and starts to put items inside my database, the Issue is that it fetches all urls (so it prints "done!") and it stops before all items are placed inside the database. I think the issue is that I'm using promise.map which I guess it isn't async, is there a similar function for async code? – vamp Jul 05 '20 at 18:50
  • @vamp - It sounds like you're not properly using promises with your database code. So, the code is not waiting for the database operations to complete. Not much we can do to help you with that without seeing exactly what your code is. – jfriend00 Jul 05 '20 at 19:55
  • I've added my full code as an answer; I couldn't update the original one. – vamp Jul 05 '20 at 21:15

Code updated

const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");

let userAgent = new UserAgent({ deviceCategory: 'desktop' });
let options = {
  headers: { userAgent }
};
let exec = require('child_process').exec;

const mysql = require('mysql2/promise');
let con = mysql.createPool({
      host: "xxx.xxx.xxx.xxx",
      user: "xxx",
      password: "xxxx",
      database: "xxx"
    });

async function run() {

    let file = fs.readFileSync('urls.txt');
    let urls = file.toString().split('\r\n');

    console.log(urls);

    const numeroUrl = urls.length;
    let urlsArray = [];
    console.log("numeroUrl : " + numeroUrl);
    for (let i = 1; i < numeroUrl; i++) {
      for (let y = 1; y < 6; y++) {
        urlsArray.push(urls[i-1] + '&page=' + y);
      }
    }
    Promise.map(urlsArray, parseUrl, {concurrency: 10}).then(function(data) {
        // all done here
        console.log("Done!");

    });
}


async function parseUrl(url) {
  try {
    let response = await axios.get(url, {
      headers: { 
        'User-Agent': new UserAgent() 
      }  
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {

      await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
        console.log('Saved Url error to ' + fileUrlErrors); 
      
    } else if (response.status == 200) {

      let $ = cheerio.load(response.data);

      prodotti = $(".item");

      let items = $(prodotti).get();

      for (let item of items) {

        let title = $(".title", item).text();
        if (!title) {
          title = $(".title2", item).text();
        }
        
        let price = $(".price", item).text();
        if (!price) {
          price = $(".price2", item).text();
        }
        

        if (title) {
          const prodotto = [
          [
          title,
          price]
          ];
          let result = await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }

        }
    } 

    } catch (error) {
        //console.error(error);
        if (error.response) {
          // Request made and server responded
            await fs.promises.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status);
            console.log('Saved Url error to ' + fileUrlErrors);
        }

    }
}

run().then(() => {
    console.log("Done!");
}).catch(err => {
    console.log(err);
});

vamp
  • As I showed in my answer, you can't use `await` with regular `fs.appendFile()`. You can use it with `fs.promises.appendFile()` and don't pass a callback to it. – jfriend00 Jul 05 '20 at 21:26
  • @jfriend00 Thank you!! Didn't notice :) For the mysql query I've done this: "let result = await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);". There's an await inside the async function, but it doesn't wait for the db to complete all queries. – vamp Jul 05 '20 at 22:31
  • @jfriend00 I'm sorry to bother you, but do you think it doesn't wait for the mysql queries due to network speed? Maybe I send too many requests? Right now it's set at 10 concurrent operations, and the code seems to be fine. – vamp Jul 06 '20 at 20:35
  • Without being able to run your code, I'm not able to see what else might be wrong. A concurrency of 10 should be OK, but you can try dropping it to 2 as an experiment. – jfriend00 Jul 06 '20 at 20:52