6

I'm using puppeteer-extra and Node.js to iterate across multiple URLs.

I'm trying to intercept requests for certain resource types on each iteration, but I'm getting the following error.

PS C:\Users\someuser\Desktop\Project> node temp.js
-- running
C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\assert.js:26
        throw new Error(message);
              ^

Error: Request is already handled!
    at Object.exports.assert (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\assert.js:26:15)
    at HTTPRequest.continue (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\HTTPRequest.js:217:21)
    at PuppeteerBlocker.onRequest (C:\Users\someuser\node_modules\@cliqz\adblocker-puppeteer\dist\cjs\adblocker.js:225:33)
    at BlockingContext.onRequest (C:\Users\someuser\node_modules\@cliqz\adblocker-puppeteer\dist\cjs\adblocker.js:64:47)
    at C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:62
    at Array.map (<anonymous>)
    at Object.emit (C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:43)
    at Page.emit (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\EventEmitter.js:72:22)
    at C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\Page.js:143:100
    at C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:62

I'm having trouble understanding why the request would already be handled, as the actual request (page.goto) is made inside the for loop. Would anyone have any hints?

Here is the full project

// Load puppeteer through puppeteer-extra so that plugins can be registered via use().
const puppeteer = require( 'puppeteer-extra' );

// Register the stealth plugin.
const StealthPlugin = require( 'puppeteer-extra-plugin-stealth' );
puppeteer.use( StealthPlugin() );

// Register the adblocker plugin with tracker blocking enabled.
const AdblockerPlugin = require( 'puppeteer-extra-plugin-adblocker' );
puppeteer.use( AdblockerPlugin( { blockTrackers: true } ) );

// Launch a headless browser, visit 20 numbered shop pages one after another on a
// single page, log each page title (or false when the title contains '404'),
// and time both the whole run and every iteration.
puppeteer.launch( { headless: true } ).then( async browser => {

    console.log( '--\xa0running' );

    console.time( '--\xa0process' );

    const page = await browser.newPage();

    // Interception is enabled once on the page; the listener below stays attached
    // for every page.goto() in the loop.
    await page.setRequestInterception( true );
    
    // NOTE(review): indexOf() returns -1 when the type is not in the list and 0 for
    // 'image', so this condition is truthy for every type except 'image' — it is
    // not a membership test. A membership test would compare against -1.
    // NOTE(review): the reported stack trace shows the adblocker plugin's own
    // 'request' listener calling continue(); presumably both listeners try to
    // resolve the same request, which would explain "Request is already handled!"
    // — confirm.
    page.on( 'request', ( request ) => {
        if ( [ 'image', 'stylesheet', 'font', 'script' ].indexOf( request.resourceType() ) ) {
            request.abort();
        } else {
            request.continue();
        };
    } );

    for ( var i = 1; i <= 20; i++ ) {

        console.time( '--\xa0iteration\xa0' + i ); // ... timer start 
    
        // Sequential navigation: each goto() resolves on the 'load' event before the next starts.
        await page.goto( 'https://www.someurl.it/shop/s%2D' + i, { waitUntil: 'load' } );
    
        const title = await page.title();
    
        // Prints false for titles containing '404', otherwise the title itself.
        console.log( title.includes( '404' ) ? false : title );
    
        console.timeEnd( '--\xa0iteration\xa0' + i ); // ... timer end 
    
    };

    await browser.close();

    console.timeEnd( '--\xa0process' );
  
    console.log( '--\xa0ending' );

} );
amarinediary
  • 4,930
  • 4
  • 27
  • 45

3 Answers

5

Adding a return statement solved the issue on my end.

// Abort requests for images, stylesheets, fonts and scripts; continue everything else.
page.on( 'request', ( request ) => {
    const blockedTypes = [ 'image', 'stylesheet', 'font', 'script' ];
    const isBlocked = blockedTypes.includes( request.resourceType() );

    // Returning here guarantees continue() is never reached after abort().
    if ( isBlocked ) {
        return request.abort();
    }

    request.continue();
} );
K_Wainaina
  • 219
  • 3
  • 7
3

Resource interception must be set up on each new page.

Here is the full list of resources you can intercept: stylesheet, image, media, font, script, texttrack, xhr, fetch, eventsource, websocket, manifest, other.

Note:
Most of the time, intercepting ALL resources might negatively impact your scraper.

I would advise intercepting ONLY image, media and font. (In some cases, intercepting stylesheet might affect Puppeteer's click actions.)

Example

/**
 * Puppeteer, Headless Chrome Node.js API
 * 
 * @link https://github.com/puppeteer/puppeteer
 * 
 * @package npm install puppeteer
 */
const puppeteer = require( 'puppeteer' );

/**
 * Enable request interception on `page` and attach a listener that drops
 * requests for heavy assets.
 *
 * Requests whose resource type is image, media or font are aborted; every
 * other request is continued. 'stylesheet' is not blocked.
 *
 * @see https://stackoverflow.com/a/47166637/3645650
 *
 * @param page Object exposing setRequestInterception() and on(), as used below.
 * @returns A promise that resolves once interception is enabled and the
 *          'request' listener is attached.
 */
const brewery = async ( page ) => {

    const blockedTypes = [ 'image', 'media', 'font' ];

    // Resolve every intercepted request exactly once: abort or continue.
    const onRequest = ( r ) => {

        const isBlocked = blockedTypes.includes( r.resourceType() );

        if ( isBlocked ) {
            r.abort();
            return;
        }

        r.continue();

    };

    await page.setRequestInterception( true );

    page.on( 'request', onRequest );

};

/**
 * Entry point: launch a headless browser, enable resource blocking on a new
 * page via brewery(), open the GitHub login page, save a timestamped
 * screenshot, then close the browser and log the total runtime in seconds.
 */
( async () => {

    // ... start
    const start = new Date();
    console.log( '--\xa0process:\xa0start' );

    const browser = await puppeteer.launch( {
        headless: true
    } );

    try {

        const page = await browser.newPage();

        await brewery( page );

        await page.goto( 'https://github.com/login' );
        await page.screenshot( { path: Date.now() + '.png' } );
        console.log( '--\xa0process:\xa0screenshot' );

    } finally {

        // Close the browser even when navigation or the screenshot throws,
        // so a failed run does not leave a browser process behind.
        await browser.close();

    }

    // ... end (only reached when every step above succeeded)
    const end = ( new Date() - start ) / 1000;
    console.log( '--\xa0process:\xa0end,\xa0runtime\xa0' + end + '\xa0seconds' );

} ) ();
amarinediary
  • 4,930
  • 4
  • 27
  • 45
0
// Abort requests that match the block lists and continue the rest, but only
// when the request's interception has not already been resolved.
page.on("request", (request) => {
  const requestUrl = request.url();

  // The interception was already resolved elsewhere; touching it again would throw.
  if (request.isInterceptResolutionHandled()) {
    return;
  }

  // Block by resource type first, then by URL substring.
  const blockedByType = blockResourceType.includes(request.resourceType());
  const shouldBlock =
    blockedByType ||
    blockResourceName.some((resource) => requestUrl.includes(resource));

  if (shouldBlock) {
    request.abort();
    return;
  }

  request.continue();
});

You can try this to avoid the error message: it skips any request that another handler has already resolved.