I'm using puppeteer to scrape page that has contents that change periodically and use express to present data in rest api.
If I turn on headless chrome to see what is being shown in the browser, the new data
is there, but the data is not showing up in get()
and http://localhost:3005/api-weather
. The normal browser only shows the original data.
const express = require('express');
const server = new express();
const cors = require('cors');
const morgan = require('morgan');
const puppeteer = require('puppeteer');
server.use(morgan('combined'));
server.use(
cors({
allowHeaders: ['sessionId', 'Content-Type'],
exposedHeaders: ['sessionId'],
origin: '*',
methods: 'GET, HEAD, PUT, PATCH, POST, DELETE',
preflightContinue: false
})
);
const WEATHER_URL = 'https://forecast.weather.gov/MapClick.php?lat=40.793588904953985&lon=-73.95738513173298';
const hazard_url2 = `file://C:/Users/xdevtran/Documents/vshome/wc_api/weather-forecast-nohazard.html`;
(async () => {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on("request", request => {
console.log(request.url());
request.continue();
});
await page.goto(hazard_url2, { timeout: 0, waitUntil: 'networkidle0' });
hazard = {
"HazardTitle": "stub",
"Hazardhref": "stub"
}
let forecast = await page.evaluate(() => {
try {
let forecasts = document.querySelectorAll("#detailed-forecast-body.panel-body")[0].children;
let weather = [];
for (var i = 0, element; element = forecasts[i]; i++) {
period = element.querySelector("div.forecast-label").textContent;
forecast = element.querySelector("div.forecast-text").textContent;
weather.push(
{
period,
forecast
}
)
}
return weather;
} catch (err) {
console.log('error in evaluate: ', err);
res.end();
}
}).catch(err => {
console.log('err.message :', err.message);
});
weather = forecast;
server.get('/api-weather', (req, res) => {
try {
res.end(JSON.stringify(weather, null, ' '));
console.log(weather);
} catch (err) {
console.log('failure: ', err);
res.sendStatus(500);
res.end();
return;
}
});
} catch (err) {
console.log('caught error :', err);
}
browser.close();
})();
server.listen(3005, () => {
console.log('http://localhost:3005/api-weather');
});
I've tried several solutions WaitUntil, WaitFor, .then and sleep but nothing seems to work.
I wonder if it has something to do with express get()? I'm using res.end()
instead of res.send()
is because the json looks better when I use res.end()
. I don't really know the distinction.
I'm also open to using this reload solution. But I received errors and didn't use it. I also tried waitForNavigation(), but I don't know how it works, either.
Maybe I'm using the wrong search term to find the solution. Could anyone point me in the right direction? Thank you.