I am trying to scrape a music website with Puppetter. I want the audio "src" scraped from the website, but the website assigns the src dynamically when the users play a track. So, I have a script that plays each track and then, I want to grab the "src" from the audio tag. But, I have this error "page is not defined". I think the "Puppetter.page" is not defined inside callback fns. So, I need your help with it.
import puppeteer from 'puppeteer-core';
import appendJSONdata from './utils/appendJSONdata.js';
export function scrape() {
try {
(async () => {
// set some options (set headless to false so we can see this automated browsing experience)
let launchOptions = {
headless: true,
executablePath:
'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe', // because we are using puppeteer-core so we must define this option
args: ['--start-maximized'],
};
const browser = await puppeteer.launch(launchOptions);
const page = await browser.newPage();
// set viewport and user agent (just in case for nice viewing)
await page.setViewport({ width: 1366, height: 768 });
await page.setUserAgent(
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
);
// Go to the chillHop Albums Page
await page.goto('https://chillhop.com/releases/');
const albumLinks = await page.$$eval('.release > a', (list) =>
list.map((elm) => elm.href)
); // 12 Albums Load Initaially
// console.log(albumLinks);
let audioRef = await page.$('audio')
// console.log();
//getAudioSrc(111)
for (const albumURL of albumLinks) {
// console.log(albumURL);
await page.goto(albumURL);
// async function getAudioSrc() {
// return await page.$('audio').getAttribute('src')
// }
let numOfTracks = await page.$$eval('.track-single', (tracks) => {
// console.log(page);
// if (tracks.length >= 5) {
return tracks.map(track => {
track.querySelector(`a.track-${track.children[0].getAttribute("data-track")}`).click() // Plays the track
return {
"data-track": track.children[0].getAttribute("data-track"),
"title": track.querySelector("div.trackTitle").textContent,
"artists": track.querySelectorAll("div.trackArtists")[0].textContent,
"duration": track.querySelector("div.track-length").textContent,
"audio-src": page.querySelector('audio').getAttribute('src') // ! page is not defined
}
// let dataTrack = track.children[0].getAttribute("data-track")
})
// } else {
// return "Less than 5 tracks"
// }
});
console.log(numOfTracks);
// (numOfTracks > 5) ? (scrape the site) : (do not scrape)
}
// appendJSONdata("This is randome data")
// close the browser
await browser.close();
})();
} catch (error) {
console.log(error);
}
}