I am trying to scrape data from an array of website URLs that are given to me. I only want to fetch the `title`, the `image`, and the `body` of each HTML page. These are the steps that I am following:
- I am trying to get the HTML page (in string format) of each of the URLs and store them in an array:
/**
 * Downloads the HTML of a page through the codetabs proxy.
 *
 * @param {string} value - URL of the page to fetch; it must contain "https".
 * @returns {Promise<string>} The response body as text, "not found" when the
 *   proxy answers with a non-OK status, or "invalid/null url" when `value` is
 *   not a string containing "https". The promise rejects if `fetch` itself
 *   rejects, because that error is not caught here.
 */
async function getdata(value) {
  // null/undefined have no includes() method, so check the type first instead
  // of letting the call throw before the "invalid/null url" result is reached.
  if (typeof value !== "string" || !value.includes("https")) {
    return "invalid/null url";
  }

  const response = await fetch("https://api.codetabs.com/v1/proxy?quest=" + value);
  if (!response.ok) {
    return "not found";
  }
  return response.text();
}
// Pages to scrape (placeholder URLs).
let urls = ["https://.......","https://......","https://...."];
// Start one download per URL and keep the pending promises, in URL order, in
// `responses` (declared outside this snippet). for...of avoids the `var`
// loop index that would otherwise leak into the enclosing scope.
for (const url of urls) {
  responses.push(getdata(url));
}
- After fetching the documents, I parse each HTML document and retrieve its `title`, `image`, and `body` (the `image` can be any image, so I am just fetching the first image that I find in the page):
/**
 * Sends a "save image" request for the given image URL to /server.php.
 *
 * @param {string} url - Address of the image, posted as the `url` field.
 * @returns The value returned by `$.ajax` for the POST request.
 */
function saveimagetosystem(url) {
  const payload = { "input": "save image", "url": url };
  const request = {
    url: '/server.php',
    type: 'POST',
    data: payload,
    cache: false,
  };
  return $.ajax(request);
}
/**
 * Parses one downloaded page and stores its title, first image and paragraph
 * texts in the shared `details` object (declared outside this function).
 *
 * Keys written: "title <index>", "imeg <index>" and "body <index> <n>".
 *
 * @param {string} html_data - Page HTML, or one of the sentinel strings
 *   "not found" / "invalid/null url", in which case nothing is stored.
 * @param {number} index - Position of the page, used in the keys.
 * @returns {Promise<void>} Settles once the image request has finished and
 *   every key has been written; rejects if saving the image fails.
 */
async function processhtmldata(html_data, index) {
  // Compare against the exact sentinel values: a substring test would also
  // skip every real page whose text merely contains "not found".
  if (
    typeof html_data !== "string" ||
    html_data === "not found" ||
    html_data === "invalid/null url"
  ) {
    return;
  }

  const htmldoc = new DOMParser().parseFromString(html_data, "text/html");

  // A page may have no <title>; querySelector returns null in that case.
  const titleElement = htmldoc.querySelector("title");
  if (titleElement !== null) {
    details["title" + " " + index] = titleElement.innerText.trim();
  }

  // getElementsByTagName never returns null, so test the first element itself.
  const firstImage = htmldoc.getElementsByTagName("img")[0];
  const imageUrl = firstImage === undefined ? "" : firstImage.src.trim();
  if (imageUrl !== "") {
    // The image is saved into a folder and the response is the file location.
    const response = await saveimagetosystem(
      "https://api.codetabs.com/v1/proxy?quest=" + imageUrl
    );
    details["imeg" + " " + index] = response;
  }

  // Store every paragraph, including the last one.
  const paragraphs = htmldoc.getElementsByTagName("p");
  for (let l = 0; l < paragraphs.length; l++) {
    details["body" + " " + index + " " + l] = paragraphs[l].innerText.trim();
  }
}
// Running page index; incremented once per page handed to processhtmldata.
let sl_no = 0;
// Promise.all resolves with the page contents in the same order as the
// promises in `responses`, so the counter lines up with that order.
Promise.all(responses)
  .then((htmlfiles) =>
    // Return the combined promise so the chain only settles after every page
    // has been processed, instead of leaving those promises floating.
    Promise.all(htmlfiles.map((file) => processhtmldata(file, sl_no++)))
  )
  .catch((error) => {
    // Report a failed download or processing step instead of leaving the
    // rejection unhandled.
    console.error("Failed to fetch or process the pages:", error);
  });
Now, my objective here is that my `details` object should look like this:
{"title 0": Should contain the title of the HTML document page of the FIRST url,
"imeg 0": Should contain the image of the HTML document page of the FIRST url,
"body 0 0": Should contain the first body of the HTML document page of the FIRST url,
"body 0 1": Should contain the second body of the HTML document page of the FIRST url,...
"title 1": Should contain the title of the HTML document page of the SECOND url,
"imeg 1": Should contain the image of the HTML document page of the SECOND url,
"body 1 0": Should contain the first body of the HTML document page of the SECOND url,
"body 1 1": Should contain the second body of the HTML document page of the SECOND url,...
"title 2": Should contain the title of the HTML document page of the THIRD url,
"imeg 2": Should contain the image of the HTML document page of the THIRD url,
"body 2 0": Should contain the first body of the HTML document page of the THIRD url,
"body 2 1": Should contain the second body of the HTML document page of the THIRD url,...
}
But instead, my `details` object looks like this:
{"title 0": Contains the title of the HTML document page of the SECOND url,
"imeg 0": Contains the image of the HTML document page of the THIRD url,
"body 0 0": Contains the first body of the HTML document page of the FIRST url,
"body 0 1": Contains the second body of the HTML document page of the FIRST url, and so on....
}
Why am I not receiving the responses in a synchronous manner even after using `await`? Please help me.