0

I am trying to scrape data from an array of website URLs that are given to me. I only want to fetch the title, the image and the body of each HTML page. These are the steps that I am following:

  1. I am trying to get the HTML page (in string format) of each of the urls and store them in an array:
/**
 * Download the HTML of a page through the codetabs proxy.
 *
 * Failures are reported with sentinel strings rather than exceptions:
 *   - "invalid/null url" when `value` is not a string containing "https";
 *   - "not found" when the proxy answers with a non-OK HTTP status.
 * A rejected `fetch` (e.g. a network failure) still propagates to the caller.
 *
 * @param {string} value - URL of the page to download.
 * @returns {Promise<string>} The response body as text, or one of the sentinels.
 */
async function getdata(value){
    // Guard against null/undefined/non-string input: calling `.includes` on it
    // would throw instead of producing the "invalid/null url" sentinel.
    if (typeof value !== "string" || !value.includes('https')) {
        return "invalid/null url";
    }

    const response = await fetch('https://api.codetabs.com/v1/proxy?quest='+value);
    if (!response.ok) {
        return "not found";
    }
    return response.text();
}


// Pages to scrape.
let urls = ["https://.......","https://......","https://...."];
// Start every download right away; getdata is async, so each entry pushed
// onto `responses` is a promise, in the same order as `urls`.
urls.forEach(function (pageUrl) {
    responses.push(getdata(pageUrl));
});
  2. After fetching the documents, I parse each one as HTML and retrieve its title, image and body (the image can be any image, so I just take the first one I find on the page):

/**
 * POST a "save image" request for `url` to /server.php via jQuery.
 *
 * @param {string} url - Address of the image to be saved.
 * @returns {*} Whatever `$.ajax` returns for the request; the caller awaits it
 *     (presumably it resolves to the saved file's location — confirm against
 *     the server script).
 */
function saveimagetosystem(url){
    const payload = { "input": "save image", "url": url };
    const request = {
        url: '/server.php',
        type: 'POST',
        data: payload,
        cache: false,
    };
    return $.ajax(request);
}


/**
 * Parse one downloaded page and record its title, first image and paragraphs
 * in the `details` object defined outside this function.
 *
 * Keys written (each only when the corresponding element exists):
 *   "title <index>"    - trimmed text of the <title> element;
 *   "imeg <index>"     - value resolved by saveimagetosystem() for the first <img>;
 *   "body <index> <n>" - trimmed text of the n-th <p> element (every paragraph).
 *
 * @param {string} html_data - HTML text, or one of the failure sentinels
 *     "not found" / "invalid/null url", in which case nothing is recorded.
 * @param {number} index - Position of the page in the URL list; used in the keys.
 * @returns {Promise<void>} Resolves once the image request has completed and
 *     all keys have been written.
 */
async function processhtmldata(html_data,index){
    // Compare against the exact sentinels: a substring test would also skip
    // real pages that merely contain the words "not found" in their markup.
    if (typeof html_data !== "string" || html_data === "not found" || html_data === "invalid/null url") {
        return;
    }

    const htmldoc = new DOMParser().parseFromString(html_data, "text/html");

    // A document without a <title> must not abort the rest of the extraction.
    const titleElement = htmldoc.querySelector("title");
    if (titleElement !== null) {
        details["title"+" "+index] = titleElement.innerText.trim();
    }

    // getElementsByTagName never returns null, so test the first element itself.
    const firstImage = htmldoc.getElementsByTagName("img")[0];
    if (firstImage !== undefined) {
        const url = firstImage.src.trim();
        // The image is saved server-side; the resolved value is stored as-is.
        details["imeg"+" "+index] = await saveimagetosystem('https://api.codetabs.com/v1/proxy?quest='+url);
    }

    // Paragraphs are recorded whether or not the page has an image, and the
    // loop runs to the end so the last paragraph is no longer dropped.
    const paragraphs = htmldoc.getElementsByTagName("p");
    for (let l = 0; l < paragraphs.length; l++) {
        details["body"+" "+index+" "+l] = paragraphs[l].innerText.trim();
    }
}

// Running index of the page being handed to processhtmldata.
let sl_no = 0;
Promise.all(responses)
   .then(function (htmlfiles) {
       for (const file of htmlfiles) {
           // Fire-and-forget: the promise returned by processhtmldata is
           // discarded, so nothing here waits for the processing to finish.
           processhtmldata(file, sl_no);
           sl_no += 1;
       }
   })

Now, my objective here is that my details object should look like this:

{"title 0": Should contain the title of the HTML document page of the FIRST url,
 "imeg 0": Should contain the image of the HTML document page of the FIRST url,
 "body 0 0": Should contain the first body of the HTML document page of the FIRST url,
 "body 0 1": Should contain the second body of the HTML document page of the FIRST url,...
 
 "title 1": Should contain the title of the HTML document page of the SECOND url,
 "imeg 1": Should contain the image of the HTML document page of the SECOND url,
 "body 1 0": Should contain the first body of the HTML document page of the SECOND url,
 "body 1 1": Should contain the second body of the HTML document page of the SECOND url,...

 "title 2": Should contain the title of the HTML document page of the THIRD url,
 "imeg 2": Should contain the image of the HTML document page of the THIRD url,
 "body 2 0": Should contain the first body of the HTML document page of the THIRD url,
 "body 2 1": Should contain the second body of the HTML document page of the THIRD url,...
}

But instead, my details object looks like this:

{"title 0": Contains the title of the HTML document page of the SECOND url,
 "imeg 0": Contains the image of the HTML document page of the THIRD url,
 "body 0 0": Contains the first body of the HTML document page of the FIRST url,
 "body 0 1": Contains the second body of the HTML document page of the FIRST url, and so on....
}

Why am I not receiving the response in a synchronous manner even after using await? Please help me.

  • 1
    Why did you use `(async() => { … })();` instead of making the whole `processhtmldata` an `async` function? That way, you're only starting the asynchronous action, but immediately return nothing. – Bergi Nov 30 '22 at 09:13
  • I tried that, didn't make a difference. The response is still asynchronous –  Nov 30 '22 at 10:17
  • Well yes, the next step is to [remove the `forEach` loop](https://stackoverflow.com/q/37576685/1048572) and wait until all the `processhtmldata(file,sl_no);` calls are done – Bergi Nov 30 '22 at 10:27
  • I didn't get it. Can you please recreate the solution using my code? –  Nov 30 '22 at 10:43
  • It is not possible to make asynchronous code synchronous. You need to redesign your architecture to support asynchronous code instead. – slebetman Nov 30 '22 at 10:52

1 Answer

0

You'll never get the data synchronously. You can write sequential code though that produces the desired result at the end:

/**
 * Download every page, process them all, then log `details`.
 *
 * All downloads are started first; processing begins once every download has
 * resolved, and logging happens once every processhtmldata promise has resolved.
 *
 * @returns {Promise<void>} Settles after `details` has been logged.
 */
function run() {
    const urls = ["https://.......","https://......","https://...."];
    const responses = urls.map((url) => getdata(url));
    return Promise.all(responses)
        .then((htmlfiles) => {
            // The array position doubles as the serial number of each page.
            const processed = htmlfiles.map((file, sl_no) => processhtmldata(file, sl_no));
            return Promise.all(processed);
        })
        .then(() => {
            console.log(details);
        });
}

or better

/**
 * Download every page, process them all, then log `details`.
 *
 * getdata and processhtmldata are passed straight to `map`, so each receives
 * the element followed by its array index; processhtmldata uses that index as
 * the page's serial number.
 *
 * @returns {Promise<void>} Resolves after `details` has been logged.
 */
async function run() {
    const urls = ["https://.......","https://......","https://...."];
    // Wait for every download before any processing starts.
    const htmlfiles = await Promise.all(urls.map(getdata));
    // Wait for every page to be processed before reading `details`.
    await Promise.all(htmlfiles.map(processhtmldata));
    console.log(details);
}

You can also use a for … of loop that awaits each promise returned by getdata and processhtmldata.

Bergi
  • 630,263
  • 148
  • 957
  • 1,375