I'm using Robots.js to fetch the sitemaps from the robots.txt files of a list of websites. Basically, I use a QueryStream to read each site URL from a MongoDB collection and pass it to the Robots.js parser to process.
var stream = Urls.find().stream();

stream.on('data', function(data) {
    // Fetch the URL from the domain pool
    var url = data;
    // Get the sitemaps listed in robots.txt
    parser.setUrl('http://' + url + '/robots.txt' || 'https://' + url + '/robots.txt', function(parser, success) {
        if (success) {
            parser.getSitemaps(function(sitemaps) {
                // The array keeps growing after each URL is passed in
                console.log(sitemaps);
                // Tried resetting it, but it didn't work
                sitemaps.length = 0;
            });
        }
    });
}).on('error', function(err) {
    // Handle the error
    console.log(err);
}).on('close', function() {
    // The stream is closed
    console.log('End of database!');
});
The problem is that it returns an array which keeps growing each time a URL is passed in and processed, like:
[url1/sitemap.xml],
[url1/sitemap.xml, url2/sitemap.xml],
[url1/sitemap.xml, url2/sitemap.xml, url3/sitemap.xml]
while what I actually want is
[url1/sitemap.xml],
[url2/sitemap.xml],
[url3/sitemap.xml]
for each corresponding URL passed in.
So my question is: is there any way to get only the sitemap URLs that belong to the URL currently being processed? I've tried filtering them out with sitemaps.filter, but no luck.
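For reference, my filter attempt looked roughly like this, in place of the getSitemaps call above (just a sketch; it assumes every sitemap URL contains the current domain, which robots.txt doesn't guarantee):

parser.getSitemaps(function(sitemaps) {
    // Keep only the sitemap URLs that mention the current domain.
    // Assumption: each sitemap URL contains the host name being processed.
    var matching = sitemaps.filter(function(sitemap) {
        return sitemap.indexOf(url) !== -1;
    });
    console.log(matching); // still not the per-URL output I want
});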