0

I'm using cheerio and axios to get all links from a web page and making GET requests with axios with each link to check that it is a not broken link (i.e. the status code is 200).

Then, I want to put each broken link URL into an array. I can't figure out how this works in term of async and await.

const cheerio = require("cheerio");

// returns an array of broken link urls
async function getAllBrokenLinks(url) {
    const res = await axios.get(url) // get webpage

    // if there was an error getting the data, return null
    if (!res || res.status != 200) { 
        return null;
    }

    let body = res.data.body.value; // get body html from res
    let results = [];

    $ = cheerio.load(body); // parse body html into cheerio
    let links = $('a'); // get all the links in the body

    $(links).each( (i, link) => {
        let linkText = $(link).text();
        let linkHref = $(link).attr('href');

        // check if the link is dead
        axios.get(linkHref)
        .then( (response) => {
            if (response.status != 200) {
                results.push(linkHref);
            }
        })
        .catch( (error) => {
            results.push(linkHref);
        });
    })

    return results; // this returns an empty array instead of a populated array

}

I know that axios.get() is an async function but I'm not sure how I can wait for each GET request in the each loop to finish running before I return the populated array results.

EDIT: This is different from Using async/await with a forEach loop because $(links) does not have a .forEach function.

EDIT: I tried this:

    const reqs = $(links).map( async (i, link) => {
        let linkText = $(link).text();
        let linkHref = $(link).attr('href');

        // check if the link is dead
       axios.get(linkHref)
       .then( (response) => {
           if (response.status != 200) {
               results.push(linkHref);
           }
       })
       .catch( (error) => {
           results.push(linkHref);
       });

       return axios.get(linkHref)
    })

    await Promise.all(reqs); // ERROR

    return results;

but it is now giving me this error:

(node:77480) UnhandledPromiseRejectionWarning: TypeError: object is not iterable (cannot read property Symbol(Symbol.iterator))

EDIT: I figured it out. Here is my code:

    // put each link text and href in the linkUrls array
    $(links).each( (i, link) => {
        linkUrls[String($(link).text())] = $(link).attr('href');
    })

    let results = {};
    results['title'] = pageTitle;

    let num = 0;

    for (const linkText in linkUrls) {

        const response = await axios.get(linkUrls[linkText])
        .then( (r) => {
            if (r.status != 200) {
                num += 1;
                results[ 'broken link ' + String(num)] = {
                    'text': linkText,
                    'url': linkUrls[linkText],
                    'status code': r.status
                }
            }

        })
        .catch( (err) => {
            num += 1;
            results[ 'broken link ' + String(num)] = {
                'text': linkText,
                'url': linkUrls[linkText],
                'status code': err.response.status
            }
        })
    }

    console.log('results: ', results);

    return results;

viviansplatoon
  • 148
  • 1
  • 16
  • 1
    Try converting `.each` to a regular `for (const element of elements)` loop – Kevin Pastor Aug 06 '19 at 20:14
  • If there is a `.map` function instead of `.each`? – lonewarrior556 Aug 06 '19 at 20:15
  • @lonewarrior556 a `.map` function exists but I'm not sure how I'd use especially considering I want a `results` array that is likely not the same size as the `lists` array – viviansplatoon Aug 06 '19 at 20:30
  • if you do `const reqs = $(links).map(` and return the `axios.get(linkHref)` your last two lines can be `await Promise.all(reqs); return results;` – lonewarrior556 Aug 06 '19 at 20:36
  • @vivianyoung i guess you'll need to use the `.toArray()` method to convert the Cheerio wrapper into something that can be consumed by `Promise.all` – Bergi Aug 06 '19 at 20:59
  • @Bergi I figured it out and yes, you're correct. I retrieved each `linkText` and `linkHref ` from `$(links)` and put them into a new dictionary. From there, I used a for loop and I was able to populate and return the `results` array. – viviansplatoon Aug 06 '19 at 21:01
  • `$(links).each` and `[].forEach` are the same, at least in terms of the duplicate. They do have some differences, but the reason it doesn't work (and the solution) is identical. – Kevin B Aug 06 '19 at 21:31
  • @KevinB I see. That answer did help me in the end but there was an intermediate step that I had to resolve on my own before that answer proved useful. Thanks for your response. – viviansplatoon Aug 06 '19 at 21:34

0 Answers0