I'm trying to find the `src` of every script in the DOM. There are around 77 scripts on the page, but Puppeteer returns only 66. When I inspect the DOM, there are 12 scripts with the `async` attribute, and those are exactly the ones that are missing. How can I get them as well?
Analyzer.js
Next.js page which basically takes input from the user for the site to scrape.
import React from 'react'
import { useState , useEffect} from 'react';
const Test = () => {
const [websiteURL, setWebsiteURL] = useState('');
async function submitURL(){
const data = await fetch('api/scraper', {
method: 'POST',
headers: {
'Content-Type' : 'application/json'
},
body: JSON.stringify({
url : websiteURL
})
})
const response = await data.json();
console.log(response)
}
return (
<div>
<input type="text" value={websiteURL} onChange={(e) => setWebsiteURL(e.target.value)} placeholder = "Enter URL" />
<button onClick={submitURL}>Test</button>
</div>
)
}
export default Test
Scraper.js
Endpoint under the API folder that scrapes the scripts from the URL
/**
 * API route handler: opens `req.body.url` in a Puppeteer-controlled browser,
 * waits for the network to go idle, and reports the `src` of every <script>
 * element present in the rendered DOM.
 *
 * NOTE(review): relies on `puppeteer` being imported at the top of this module;
 * that import is not visible in this excerpt — confirm it exists.
 *
 * @param {object} req - Incoming request; `req.body.url` must be an absolute http(s) URL.
 * @param {object} res - Response object; receives a JSON body in every outcome.
 * @returns {Promise<void>} Resolves once the response has been sent and the browser closed.
 */
export default async function test (req, res) {
  const url = req.body?.url;

  // The URL comes straight from the client: accept only absolute http(s)
  // addresses so the browser is never pointed at file: or other schemes.
  let target = null;
  try {
    target = new URL(url);
  } catch {
    target = null;
  }
  if (target === null || (target.protocol !== 'http:' && target.protocol !== 'https:')) {
    res.status(400).json({ error: 'A valid http(s) URL is required' });
    return;
  }

  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(target.href, { waitUntil: 'networkidle0' });

    // Runs inside the page: one entry per <script>; inline scripts yield ''.
    const data = await page.evaluate(
      () => Array.from(document.querySelectorAll('script'), (elem) => elem.src)
    );

    console.log(data.length);
    // Always answer the request; the client awaits a JSON body.
    res.status(200).json({ count: data.length, scripts: data });
  } catch (err) {
    console.error(err);
    res.status(500).json({ error: 'Failed to scrape the requested URL' });
  } finally {
    // Close the browser even when navigation or evaluation failed.
    if (browser) {
      await browser.close();
    }
  }
}
Package.json
{
"name": "scraper",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"cheerio": "^1.0.0-rc.12",
"firebase": "^9.11.0",
"graphql": "^16.6.0",
"graphql-request": "^5.0.0",
"mobile-friendly-test-npm": "^1.0.4",
"moment": "^2.29.4",
"next": "12.2.5",
"puppeteer": "^18.2.1",
"puppeteer-extra": "^3.3.4",
"puppeteer-extra-plugin-stealth": "^2.11.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-firebase-hooks": "^5.0.3",
"react-share": "^4.4.0"
},
"devDependencies": {
"eslint": "8.23.0",
"eslint-config-next": "12.2.5"
}
}
This is a Next.js project that can be run with `npm run dev`.