So I'm building a web scraper, and I need (want) to loop over my axios request.
most of the code is here... the rest is data and functions to return pieces.
When I run this, it's not waiting for anything, it just flies over it. Where am I going wrong?
I dont' know if you need this info, but the page checker seems to think you do, so....
I'm trying to scrape google for LinkedIn pages that contain a plainly visible email address. The code at this point is unfinished, but the gist is this. I've got some proxies. If I get a captcha page, I change proxies. If I get search results, I parse them for the data I want. If there is a next link, I go to the next page. If everything is right, I should be zipping through google in no time.
while( (task == "new") || (task == "next") || (task=="proxy"))
{
switch(task) {
case "new":
url = getNextUrl(1);
if(url == "end") {
console.log("Search completed!");
break;
}
break;
case "next":
url = getNextUrl(pageNumber)
break;
case "proxy":
break;
}
proxy = getNextProxy();
(async () => {
let proxy = getNextProxy().toString().split(":");
console.log(proxy);
console.log(url);
await axios.get(url,{
headers: {
'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36"
},
proxy: false,
httpsAgent: new HttpsProxyAgent.HttpsProxyAgent(`http://${proxy[0]}:${proxy[1]}`)
},
{timeout:5000})
.then((response) =>{
const $ = cheerio.load(response.data);
let captcha = $("#captcha-form");
if(captcha.length > 0)
{
task = "proxy";
return;
}
let searchitems = $(".jtfYYd");
for(let i=0; i < searchitems.length; i++)
{
let element = searchitems[i];
let c = $(element).attr("class");
let link = $(element).find(".yuRUbf").find("a").attr('href');
let title = $(element).find("h3").text();
let details = $(element).find(".MUxGbd.wuQ4Ob.WZ8Tjf").find("span");
let stub = $(element).find(".VwiC3b.yXK7lf.MUxGbd.yDYNvb.lyLwlc.lEBKkf").find("span").text();
console.log();
console.log("Title: " + title);
console.log("Link: " + link);
console.log("Details: " + details);
console.log("Stub: " + stub);
}
let botstuff = $("#botstuff");
console.log("botstuff: " + botstuff.text());
if(botstuff.text().indexOf("Next") > 1)
{
task = "next";
}
else
{
if(searchitems.length == 0) {
task = "proxy";
}
else
{
task="new"
}
}
//botstuff: Page Navigation123Next
})
.catch((error) => {
task = "proxy";
//console.log(error);
});
})();
}