0
const puppeteer = require("puppeteer");
const fs = require("fs");
const chalk = require("chalk");
const outputFile = "./swatHotelsDetails.json";



/**
 * Entry point: reads the hotel list from "swat.json" and processes the
 * entries one at a time, in file order.
 *
 * The parsed JSON is iterated directly and each entry's `hotelUrl`
 * property is passed to HotelsDetails.
 */
(async function main() {
  const content = fs.readFileSync("swat.json", "utf-8");
  const data = JSON.parse(content);

  // Declare the loop variable: an undeclared `for (url of data)` creates an
  // implicit global and throws a ReferenceError in strict mode.
  for (const entry of data) {
    // Awaited inside the loop on purpose so only one page is handled at a time.
    await HotelsDetails(entry.hotelUrl);
  }
})();

/**
 * Opens a single hotel page in a fresh headless browser, collects text
 * from several sections of the page and logs the resulting record.
 *
 * The browser is always closed, even when navigation or scraping fails.
 *
 * @param {string} URL - Address of the hotel page to open.
 * @returns {Promise<void>} Resolves once the result is logged and the
 *   browser has been closed.
 */
async function HotelsDetails(URL) {
  const url = URL;
  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true,
    args: [`--window-size=1920,1080`],
    defaultViewport: {
      width: 1920,
      height: 1080,
    },
  });

  try {
    const page = await browser.newPage();

    // The two halves need a separating space; without it the product tokens
    // run together as "...x86_64)AppleWebKit/..." and the UA is malformed.
    const userAgent =
      "Mozilla/5.0 (X11; Linux x86_64) " +
      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36";
    await page.setUserAgent(userAgent);

    await page.goto(url);

    // Fixed pause to let the page settle before capturing anything.
    await new Promise((r) => setTimeout(r, 10000));

    await page.screenshot({ path: "example.png" });

    // Everything inside this callback runs in the page, not in Node, so it
    // can only use browser globals and must return serializable data.
    const hotelsDetails = await page.evaluate(async () => {
      // Text content of every element under `root` matching `selector`.
      const textOfAll = (root, selector) =>
        Array.from(root.querySelectorAll(selector)).map(
          (node) => node.textContent
        );

      const hotelName = textOfAll(
        document,
        '[data-stid="content-hotel-title"] > h1'
      );

      const property_Highlights = textOfAll(
        document,
        '[data-stid="hotel-amenities-list"] > div > ul > div > li'
      );

      const property_Highlights2 = textOfAll(
        document,
        ".uitk-spacing-margin-block-two li"
      );

      // The room list may be absent; fall back to an empty list instead of
      // throwing on `null.querySelectorAll`.
      const roomSection = document.querySelector(
        '[data-stid="section-room-list"]'
      );
      const Rooms = roomSection
        ? textOfAll(
            roomSection,
            ".uitk-layout-grid-item div.uitk-layout-flex-item > div.uitk-spacing-padding-blockstart-three , .uitk-layout-grid-item div.uitk-layout-flex-item > div > div div[data-stid='price-summary']"
          )
        : [];

      // The lower sections were observed to stay empty until they are brought
      // into view, so jump to the "Location" anchor (when the page has one)
      // before waiting and querying them.
      const locationLink = document.querySelector("[href='#Location']");
      if (locationLink) {
        locationLink.click();
      }

      await new Promise((r) => setTimeout(r, 10000));

      const about_this_area = textOfAll(
        document,
        ".uitk-layout-columns-minwidth-seventy_two li"
      );

      const about_this_property = textOfAll(
        document,
        "section > div.uitk-card-content-section > div"
      );

      const at_a_glance = textOfAll(
        document,
        "div#Amenities div.uitk-layout-columns div"
      );

      // Only About_This_Area is returned for now; the other collected values
      // are kept so their keys can be re-enabled below.
      const list = [];
      list.push({
        // hotelName: hotelName,
        // Property_Highlights: property_Highlights,
        // Property_Highlights2: property_Highlights2,
        // Rooms: Rooms,
        About_This_Area: about_this_area,
        // About_This_Property: about_this_property,
        // At_a_Glance: at_a_glance,
      });

      return list;
    });

    // The result is only logged; pass it to exportResults(hotelsDetails) to
    // write it to disk instead.
    console.log(hotelsDetails);
  } finally {
    // Runs on success and on failure so no browser process is left behind.
    await browser.close();
  }
}

/**
 * Writes the given results to `outputFile` as JSON indented by 3 spaces and
 * logs a coloured confirmation once the write has finished.
 *
 * The write is asynchronous; this function returns before it completes.
 *
 * @param {Array} parsedResults - Results to serialize; its `length` is
 *   reported in the confirmation message.
 */
const exportResults = (parsedResults) => {
  fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 3), (err) => {
    if (err) {
      console.log(err);
      // Stop here: the success message must not be printed for a failed write.
      return;
    }
    console.log(
      chalk.yellow.bgBlue(
        `\n ${chalk.underline.bold(
          parsedResults.length
        )} Results exported successfully to ${chalk.underline.bold(
          outputFile
        )}\n`
      )
    );
  });
};

The code above runs without errors but gives me an empty array. When I run the same query in DevTools it returns the expected result, but not when I run the script from VS Code. I tried setting the viewport to the maximum screen size, in case headless mode opens the page in a small window where the classes are different, but the issue still persists. I also tried setTimeout delays in case the document had not finished loading, but that did not resolve the issue either. I tried changing the selectors, with no success. I am stuck here: I have checked other questions and the best available solutions, but the issue remains. I also checked the data serialization docs, but found no help there either.

Johnfranklien
  • 525
  • 3
  • 5
  • 15
  • The same issue occurs with the About_This_Property and At_a_Glance arrays too. – Johnfranklien Dec 11 '22 at 10:33
  • One thing I tried: when I set the viewport to the maximum size, turn headless off, and manually scroll down to the section targeted by my selectors after Puppeteer launches the browser, the query picks it up and shows me the results. Why does it behave like this? Can anyone explain it? – Johnfranklien Dec 11 '22 at 10:35
  • 1
    [Advice for non-native English speakers](https://meta.stackoverflow.com/questions/291362/advice-for-non-native-english-speakers) might be helpful here. Consider using sentences instead of commas to separate your ideas. There's no need to blockquote yourself. Beyond that, this isn't a runnable [mcve] so there's no way to really help. VS Code is just an editor, so I don't see how that's relevant here. – ggorlen Dec 11 '22 at 16:32
  • @ggorlen it worked for me if you have any other method, please share. – Johnfranklien Dec 11 '22 at 17:21
  • As mentioned, I can't help without more info: "there's no way to really help". – ggorlen Dec 11 '22 at 18:52

1 Answers1

0

I solved the problem myself after trying different methods. The query was not matching the selector because the site is a very long single-page website. I therefore used a click function to click an anchor tag that jumps to that particular location, so that the section becomes visible, and then ran the query selector.

// Runs in the page context (it relies on the browser's `document`).
// Pause, jump to the "Location" section so it is brought into view, then
// pause again before reading the list.
await new Promise((r) => setTimeout(r, 5000));
// Guard the lookup: calling .click() on a missing anchor would throw.
const locationLink = document.querySelector("[href='#Location']");
if (locationLink) {
  locationLink.click();
}
await new Promise((r) => setTimeout(r, 5000));

// Text of every list item in the "about this area" columns.
const about_this_area = Array.from(
  document.querySelectorAll(".uitk-layout-columns-minwidth-seventy_two li")
).map((name) => name.textContent);
Johnfranklien
  • 525
  • 3
  • 5
  • 15
  • I tried running the script a couple of times and found that not all of the arrays contained data, so I set the timeout to 5 seconds. I also found that running the querySelector in DevTools gives me the required result only if I first scroll the DOM manually; if I do not scroll manually, it returns an empty array. So, after some careful thought, I used this method. It worked for me, but it still needs to be explained why: is it necessary to scroll to the particular location that needs to be crawled? – Johnfranklien Dec 11 '22 at 11:54
  • 1
    5 second delays are a [poor solution](https://serpapi.com/blog/puppeteer-antipatterns/#overusing-waitfortimeout). – ggorlen Dec 11 '22 at 16:34
  • @ggorlen it might be but it solved my problem and I desperately need it to be solved, so here it did worked, if you have something to add to it sure it will be appreciated – Johnfranklien Dec 11 '22 at 17:25
  • I'd love to provide a better solution but it's not answerable without more information. I'm not even sure which site we're working with or what its behavior is. – ggorlen Dec 11 '22 at 18:51
  • hotels.com is the website I want to scrape. First I collected the names and URLs of all the hotels for a particular location on hotels.com and exported them to a JSON file. Then, in the code above, I read that exported JSON file and, using the URL from each entry, open the corresponding hotels.com page and grab the required information. @ggorlen – Johnfranklien Dec 12 '22 at 07:42
  • Thanks, but that's all still pretty high-level. Which site specifically within hotels.com? It's not productive for me to have to guess which page you're scraping, or what the JSON looks like. Need clear details that show all relevant info to eliminate all guesswork. See also [puppeteer: wait N seconds before continuing to the next line](https://stackoverflow.com/questions/46919013/puppeteer-wait-n-seconds-before-continuing-to-the-next-line/73676564#73676564) for general suggestions. – ggorlen Dec 12 '22 at 19:02