1

I would like to know how to ensure I crawl a website with all products, crawl each products unique page, build an array and write the end result to an xml file.

The below code will hit the main site url, crawl all links within a certain container, request each link inside the main loop and prepare the var for writing once in the end. How can I ensure all loops are finished before we write the XML to a file at the end? The crawler might follow links on the 2nd and 3rd requests also creating more loops within each other. Is there a way to setup one final call once everything is done, or perhaps a different approach here?

function doRequest(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(error, res, body) {
            if (!error && res.statusCode == 200) {
                resolve(body);
            } else {
                reject(error);
            }
        });
    });
}

async function getHTML(url) {
    const response = await doRequest(url);
    return response;
}

getHTML("https://www.example.com")
    .then(function(data) {
        const $ = cheerio.load(data);

        $(".col-md-6.col-lg-4.container").each(function(i, elem) {
            var productURL = $(this)
                .find(".product-section")
                .find("a")
                .attr("href");
            getHTML(productURL)
                .then(function(data) {
                    const $ = cheerio.load(data);
                    var productName = $(".product-details__name")
                        .first()
                        .text();

                    productCounter += 1;
                    doc.ele("product")
                        .ele("title")
                        .txt(productName)
                        .up()
                        .ele("order")
                        .txt(productCounter)
                        .up()
                        .ele("expiry")
                        .txt("2020-01-31");
                })
                .catch(err => console.log(err));
        });

        console.log("this should be processed last or not?");
        const theXML = '<?xml version="1.0"?>' + doc.toString({ pretty: true });
        writeStream.write(theXML);
    })
    .catch(err => console.log(err));

Skobbejak
  • 108
  • 1
  • 8
  • 1
    Your problem here is that `$(".col-md-6.col-lg-4.container").each` is synchronous. It traverses all elements one after another and doesn't wait for the `getHTML` promise to resolve before proceeding to the next element. It's probably best to use `for ( a in b )` with `await` instead as jQuery doesn't support async work. – Tony Bogdanov Jan 30 '20 at 22:41
  • Thanks, you pointed me in the right direction other than just closing the post and link it to an irrelevant post somewhere else. I found the perfect solution here: https://github.com/41x3n/GoalKicker-Books-Script/blob/master/app.js – Skobbejak Jan 31 '20 at 01:26

0 Answers0