34

I'm new to web scraping and want to download all images on a webpage using Puppeteer:

const puppeteer = require('puppeteer');

let scrape = async () => {
  // Actual Scraping goes Here...

  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');

  //   Right click and save images

};

scrape().then((value) => {
    console.log(value); // Success!
});

I have looked at the API docs but could not figure out how to achieve this. I'd appreciate your help.

Md. Abu Taher
supermario
  • Typically you would have a selector/id for the image and then can grab the url. Then do something like this with the url https://github.com/GoogleChrome/puppeteer/issues/1937 – Braden Brown Sep 27 '18 at 17:24
  • Yeah, I've seen that issue, but could not make use of it. Can you elaborate your answer with code please? – supermario Sep 27 '18 at 17:28
  • I posted an answer. This is where I started learning to use Puppeteer. https://medium.com/@e_mad_ehsan/getting-started-with-puppeteer-and-chrome-headless-for-web-scrapping-6bf5979dee3e It goes over the basics of looping through elements and getting info from them – Braden Brown Sep 27 '18 at 17:52

10 Answers

27

If you want to skip the manual DOM traversal, you can write the images to disk directly from the page response.

Example:

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    // Collect the pending writes so we can wait for them before closing
    // the browser; otherwise large images can be cut off mid-download.
    const pendingWrites = [];
    page.on('response', response => {
        const url = response.url();
        if (response.request().resourceType() === 'image') {
            pendingWrites.push(
                response.buffer().then(file => {
                    const fileName = url.split('/').pop();
                    const filePath = path.resolve(__dirname, fileName);
                    fs.writeFileSync(filePath, file);
                })
            );
        }
    });
    await page.goto('https://memeculture69.tumblr.com/');
    await Promise.all(pendingWrites);
    await browser.close();
})();

See the documentation for `page.on` and for the `HTTPResponse` object that you get from `page.on('response', ...)`.

Boris Verkhovskiy
Ben Adam
  • Can it work with bigger files? It saves only 1 KB. How do I save a video? https://sample-videos.com/video123/mp4/720/big_buck_bunny_720p_1mb.mp4 – Toolkit Dec 08 '20 at 11:29
16

Here is another example. It goes to a generic Google search and downloads the Google logo image at the top left.

const puppeteer = require('puppeteer');
const fs = require('fs');

async function run() {
    const browser = await puppeteer.launch({
        headless: false
    });
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 1200 });
    await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');

    const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';
    // Read the image's src and strip the leading slash, since the
    // origin is prepended below.
    let imageHref = await page.evaluate((sel) => {
        return document.querySelector(sel).getAttribute('src').replace('/', '');
    }, IMAGE_SELECTOR);

    console.log("https://www.google.com/" + imageHref);
    // Navigate directly to the image URL to get its response body
    const viewSource = await page.goto("https://www.google.com/" + imageHref);
    fs.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", await viewSource.buffer(), function (err) {
        if (err) {
            return console.log(err);
        }
        console.log("The file was saved!");
    });

    await browser.close();
}

run();

If you have a list of images you want to download, you could programmatically change the selector as needed and work down the list, downloading the images one at a time, as in the sketch below.
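For instance, a loop along those lines might look like the following. This is a rough sketch reusing `page` and `fs` from the snippet above; the selectors are hypothetical placeholders:

// Hypothetical selectors for the images you want
const IMAGE_SELECTORS = ['#image1', '#image2'];

// Collect all the srcs first, since page.goto() below navigates away
const srcs = [];
for (const sel of IMAGE_SELECTORS) {
    srcs.push(await page.evaluate(s => document.querySelector(s).src, sel));
}

// Visit each image URL and write its response body to disk
for (let i = 0; i < srcs.length; i++) {
    const response = await page.goto(srcs[i]);
    fs.writeFileSync(`image-${i}.png`, await response.buffer());
}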

Braden Brown
  • Why doesn't it work with bigger files? This doesn't work: https://gist.github.com/d668/2b5ae9654f42563ff5cb0546dea787c1 – Toolkit Dec 08 '20 at 11:18
16

You can use the following to scrape an array of all the src attributes of all images on the page:

const images = await page.evaluate(() => Array.from(document.images, e => e.src));

Then you can use the Node.js `fs` module along with the `http` or `https` module to download each image.

Complete Example:

'use strict';

const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');

/* ============================================================
  Promise-Based Download Function
============================================================ */

const download = (url, destination) => new Promise((resolve, reject) => {
  const file = fs.createWriteStream(destination);

  https.get(url, response => {
    response.pipe(file);

    file.on('finish', () => {
      // Close the stream, then resolve once it is fully flushed
      file.close(() => resolve(true));
    });
  }).on('error', error => {
    // Remove the partially written file before rejecting
    fs.unlink(destination, () => reject(error.message));
  });
});

/* ============================================================
  Download All Images
============================================================ */

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  await page.goto('https://www.example.com/');

  const images = await page.evaluate(() => Array.from(document.images, e => e.src));

  for (let i = 0; i < images.length; i++) {
    try {
      // download() rejects on failure, so handle it with try/catch
      await download(images[i], `image-${i}.png`);
      console.log('Success:', images[i], 'has been downloaded successfully.');
    } catch (error) {
      console.log('Error:', images[i], 'was not downloaded.');
      console.error(error);
    }
  }

  await browser.close();
})();
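One caveat: `https.get` only understands `https://` URLs. If the page serves images over plain HTTP, you could pick the client module based on the URL's protocol; a minimal sketch of the change inside `download`:

const http = require('http');

// Inside download(): choose the module that matches the URL's protocol
const client = url.startsWith('https') ? https : http;

client.get(url, response => {
  response.pipe(file);
  // ... same finish/error handling as above
});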
Grant Miller
9

The logic is simple, I think. You just need a function that takes the URL of an image and saves it to your directory. Puppeteer just scrapes the image URL and passes it to the downloader function. Here is an example:

const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');

// This is the main download function, which takes the URL of your image
function download(uri, filename) {
  return new Promise((resolve, reject) => {
    request.head(uri, function (err, res, body) {
      if (err) return reject(err);
      request(uri).pipe(fs.createWriteStream(filename)).on('close', resolve);
    });
  });
}

let main = async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');
  await page.waitFor(1000);
  const imageUrl = await page.evaluate(
    // Return the src string itself; returning the element would
    // serialize to undefined outside the page context.
    () => document.querySelector('img.image').src
  );
  // Now simply pass the image URL
  // to the downloader function to download the image.
  await download(imageUrl, 'image.png');
};

main();
Md. Abu Taher
Naimur Rahman
  • Well it reaches here: https://www.tumblr.com/privacy/consent?redirect=https%3A%2F%2Fmemeculture69.tumblr.com%2F and I need to click `Accept` to continue. How do I deal with that? – supermario Sep 27 '18 at 18:12
  • I just manually visited [https://memeculture69.tumblr.com/](https://memeculture69.tumblr.com/) and I didn't get any button to `Accept` anything; I just got an image src. You can wait for the button, and when it appears just click on it using `page.click(selector)`, then get the image src from the DOM. – Naimur Rahman Sep 27 '18 at 18:17
  • Well, the consent page appears for me (perhaps due to being in Europe?) and then I get `(node:31793) UnhandledPromiseRejectionWarning: Error: options.uri is a required argument` before I can click the button – supermario Sep 27 '18 at 18:22
  • I see, can you send your current code via a gist, so that I can try locally with a Europe proxy? – Naimur Rahman Sep 28 '18 at 04:38
  • Hey, just curious, but where is the variable `document` coming from? – B''H Bi'ezras -- Boruch Hashem Mar 27 '19 at 08:54
  • @bluejayke, just in case your question is still unanswered: the `page.evaluate` function is executed in the page context, so it has access to the global scope, and the variable `document` is available just like in the browser console or in a script executed in the browser. Docs: https://pptr.dev/#?product=Puppeteer&version=v1.18.0&show=api-pageevaluatepagefunction-args – ScriptyChris Sep 26 '19 at 18:45
8

This code saves all images found on the page into the `images` folder:

page.on('response', async (response) => {
  const url = response.url();
  const matches = /.*\.(jpg|png|svg|gif)$/.exec(url);
  if (matches && (matches.length === 2)) {
    const buffer = await response.buffer();
    // matches[0] is the entire URL, so keep only its final path segment
    // (which already includes the extension) as the file name.
    fs.writeFileSync(`images/${url.split('/').pop()}`, buffer);
  }
});
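Note that the handler has to be attached before navigating, and the `images` folder must already exist, since `fs.writeFileSync` will not create it. A minimal wiring sketch (the URL is a placeholder):

const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // Create the target folder up front
  fs.mkdirSync('images', { recursive: true });
  page.on('response', async (response) => {
    // ... the handler shown above
  });
  await page.goto('https://www.example.com/');
  await browser.close();
})();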
ggorlen
Sergey Gurin
  • This looks interesting, could you elaborate a bit, please? – M4hd1 Aug 09 '19 at 13:56
  • @M4hd1 I believe that instead of waiting for the page to load and then query-selecting the images like most people here are doing, he's intercepting the headers for all the files received and then filtering for image formats. This would definitely be faster, I think, since it searches through an array instead of through the DOM tree. – Jaacko Torus Jun 27 '20 at 02:39
  • Another point is that when you wait for the page to load, query for the images on the page and download them, you're downloading the images twice. If you intercept all requests and write out the ones that respond with images, you're only downloading them once. (I think; I haven't checked.) This answer is the same as [@BenAdam's answer](https://stackoverflow.com/a/56534741). – Boris Verkhovskiy Nov 21 '20 at 23:45
2

To download an image by its selector, I did the following:

  1. Obtained the URI for the image using the selector
  2. Passed the URI to the download function

const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');

// download function
const download = function (uri, filename, callback) {
    request.head(uri, function (err, res, body) {
        console.log('content-type:', res.headers['content-type']);
        console.log('content-length:', res.headers['content-length']);
        request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
    });
};

(async () => {
    const browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox'], // for no sandbox
    });
    const page = await browser.newPage();
    await page.goto('http://example.com'); // your url here

    let imageLink = await page.evaluate(() => {
        const image = document.querySelector('#imageId');
        return image.src;
    })

    await download(imageLink, 'myImage.png', function () {
        console.log('done');
    });

    ...
})();
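Note that `download` here is callback-based, so the `await` in front of it has no effect; if you need to be sure the file has finished writing before closing the browser, wrap it in a Promise as shown in the earlier answers.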

Resource: Downloading images with Node.js

Kate Orlova
Lovesh Dongre
  • This downloads the images twice: once when loading the page and once when using the download function. This can be costly, especially when using proxies. Ben Adam's solution avoids this issue. – Jeremie Jun 09 '23 at 12:41
2

It is possible to get all the images without visiting each URL independently. You need to listen to all the requests to the server:

await page.setRequestInterception(true)
// page.on returns the emitter, not a promise, so there is nothing to await
page.on('request', function (request) {
   request.continue()
})
page.on('response', async function (response) {
   // Filter those responses that are interesting
   const data = await response.buffer()
   // data contains the img information
})
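For example, the filtering step could keep only image responses and write their buffers to disk. A sketch; the naive file naming from the URL's last segment is an assumption:

const fs = require('fs')

page.on('response', async function (response) {
   if (response.request().resourceType() !== 'image') return
   const data = await response.buffer()
   // Naive naming: the last URL segment (may still contain query strings)
   fs.writeFileSync(response.url().split('/').pop(), data)
})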
Gabriel Furstenheim
0

You can also filter based on the request type.

const blocked_resources = [
  'stylesheet',
  /*'image',*/
  'media',
  'font'
];

const _handleRequest = request => {
  const type = request.resourceType();
  if (blocked_resources.some(r => type === r)) {
    request.abort();
    return;
  }
  request.continue();
  return;
}
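For this handler to take effect, request interception has to be enabled and the handler attached first; note that `'image'` is commented out in `blocked_resources`, so image requests are still allowed through:

await page.setRequestInterception(true);
page.on('request', _handleRequest);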
Joel Stransky
0

const puppeteer = require("puppeteer")
const fs = require("fs/promises")

// add the url of the website you want to scrape below
// (it must be a fully qualified URL, including the protocol)
const yourURL = "https://example.com"

async function scrapeIt() {
  // launch the browser
  const browser = await puppeteer.launch()

  // open a new page in the browser
  const page = await browser.newPage()

  // navigate to the webpage at your provided url
  await page.goto(yourURL)

  // collect the src of every <img> on the page
  const photos = await page.$$eval("img", (imgs) => {
    return imgs.map((x) => x.src)
  })

  // visit each image url and write its response body to disk
  for (const photo of photos) {
    const imagepage = await page.goto(photo)
    await fs.writeFile(photo.split("/").pop(), await imagepage.buffer())
  }

  await browser.close()
}

scrapeIt()
  • Please provide more information, not only code. – gru Feb 25 '22 at 08:28
  • The code above uses `$$eval(a, b)`, which gives us the ability to work with the DOM. It takes two arguments: `a` selects the DOM elements, as in `$$eval("h1", b)`, and `b` is the function that receives those DOM elements, as in `$$eval("h1", (data) => data.textContent)`. It returns the DOM data as an array, which is why I am using the map function. Documentation link: https://puppeteer.github.io/puppeteer/docs/puppeteer.page.__eval/ – Nabeel Yousaf Feb 26 '22 at 06:17
  • Please [edit] that into the answer. – ggorlen Jun 09 '22 at 15:35
-1

Download Google Images results in full quality, based on your search query, using Puppeteer in Node.js.

It is a straightforward approach: open Google Images, search for images using a keyword, click the images one by one to open the preview panel on the right, store all the links, and then download the images.

Note: if you download the images without previewing them, you will lose quality.

const fs = require('fs');
const puppeteer = require('puppeteer');
const axios = require('axios').default;

// fileUrl: the absolute url of the image or video you want to download
// localFilePath: the path of the downloaded file on your machine
const downloadFile = async (fileUrl, localFilePath) => {
  try {
    const response = await axios({
      method: 'GET',
      url: fileUrl,
      responseType: 'stream',
    });

    const w = response.data.pipe(fs.createWriteStream(localFilePath));
    w.on('finish', () => {
      console.log('Successfully downloaded file!');
    });
  } catch (err) {
    throw new Error(err);
  }
};

const Google_Image = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
let data = 'Ramayana HD Images Good Quality wallpaper'
let search_url = Google_Image + 'q=' + data;

const imagelinkslist = [];
let main = async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto(search_url);

  // XPath of the large preview image in the right-hand panel
  let previewimagexpath = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'
  // previewimagexpath = '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'

  for (let i = 1; i < 20; i++) {
    // XPath of the i-th thumbnail in the results grid
    let imagexpath = '/html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[' + i + ']/a[1]/div[1]/img'
    const elements = await page.$x(imagexpath)
    await elements[0].click();
    // give the preview panel time to load the full-quality image
    await page.waitForTimeout(3000);
    const image = await page.$x(previewimagexpath);
    const d = await image[0].getProperty('src')
    // jsonValue() is the public API for reading the handle's value
    imagelinkslist.push(await d.jsonValue())
  }
  await browser.close();
};

main().then(() => {
  console.log('Got image links');
  // the images folder must exist before we write into it
  fs.mkdirSync('./images', { recursive: true });
  imagelinkslist.map((el, index) => {
    const url = el;
    const path = `./images/image${index + 1}.png`;
    // only download links served over https (skips data: URIs)
    if (url.includes('https'))
      downloadFile(url, path);
  });
});