I would like to know how to ensure I crawl a website with all products, crawl each product's unique page, build an array, and write the end result to an XML file.
The code below hits the main site URL, crawls all links within a certain container, requests each link inside the main loop, and prepares the variable for writing once at the end. How can I ensure all loops are finished before we write the XML to a file at the end? The crawler might follow links on the 2nd and 3rd requests as well, creating more loops nested within each other. Is there a way to set up one final call once everything is done, or perhaps a different approach here?
/**
 * Adapt the callback-style `request` API to a Promise.
 *
 * @param {string} url - URL to fetch.
 * @returns {Promise<string>} Resolves with the response body on HTTP 200;
 *   rejects with the transport error, or with an Error describing the
 *   status code for non-200 responses.
 */
function doRequest(url) {
  return new Promise(function(resolve, reject) {
    request(url, function(error, res, body) {
      if (error) {
        reject(error);
      } else if (res.statusCode === 200) {
        resolve(body);
      } else {
        // The original rejected with `error` here, which is null for a
        // non-200 response — callers then received an undefined rejection
        // reason. Reject with a real Error so the failure is diagnosable.
        reject(new Error(`Request to ${url} failed with status ${res.statusCode}`));
      }
    });
  });
}
/**
 * Fetch a URL and resolve with its raw HTML body.
 *
 * @param {string} url - Page to download.
 * @returns {Promise<string>} The HTML of the page.
 */
async function getHTML(url) {
  const html = await doRequest(url);
  return html;
}
// Crawl the index page, then every product page, and only write the XML
// file once ALL product requests have completed.
//
// BUG FIXED: the original wrote the XML synchronously inside the first
// .then(), before any of the inner getHTML(productURL) promises had
// resolved — so the file was written empty/partial. The fix is to collect
// one promise per product and gate the write behind Promise.all.
getHTML("https://www.example.com")
  .then(function(data) {
    const $ = cheerio.load(data);

    // cheerio's .map() returns a cheerio wrapper; .get() unwraps it into
    // a plain array of the promises returned from the callback.
    const productRequests = $(".col-md-6.col-lg-4.container")
      .map(function(i, elem) {
        const productURL = $(this)
          .find(".product-section")
          .find("a")
          .attr("href");

        return getHTML(productURL)
          .then(function(productData) {
            const $product = cheerio.load(productData);
            const productName = $product(".product-details__name")
              .first()
              .text();
            productCounter += 1;
            doc.ele("product")
              .ele("title")
              .txt(productName)
              .up()
              .ele("order")
              .txt(productCounter)
              .up()
              .ele("expiry")
              .txt("2020-01-31");
          })
          // Swallow per-product failures (as the original did) so one bad
          // page doesn't abort the whole crawl or reject Promise.all.
          .catch(err => console.log(err));
      })
      .get();

    // Runs only after every product page has been fetched and folded
    // into `doc` — this is the "one final call once everything is done".
    return Promise.all(productRequests).then(function() {
      console.log("this should be processed last or not?");
      const theXML = '<?xml version="1.0"?>' + doc.toString({ pretty: true });
      writeStream.write(theXML);
    });
  })
  .catch(err => console.log(err));