0

I am trying to download a file, but the downloaded file does not work. I do get files, but each one is only 1 KB, which is not the actual file size.

When I use fetchResp.text(), I am not able to open the downloaded file.

Here is full code.

I think the problem could be here: return await fetchResp.text();

This is just an example; it is also important to send cookies, because I want to download data that is behind a login.

How to handle puppeteer cookies and fetch?

What if I put the fetch call outside page.evaluate? Will { credentials: "include" } still work?

Thanks in advance for your help.

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const fs = require("fs");

// Scrape the sample-XLS listing page with Puppeteer + cheerio, then try to
// download every linked file by calling fetch() inside the page context.
(async () => {
  const browser = await puppeteer.launch({
    args: ["--no-sandbox"],
    headless: false,
    slowMo: 30,
  });
  const page = await browser.newPage();

  await page.goto(
    "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/"
  );
  // Take the page's HTML as a string and parse it with cheerio instead of
  // querying the live DOM.
  const content = await page.content();
  const $ = cheerio.load(content);

  // Build one { Filename, URL } entry per table row that contains a link.
  const listings = $("#table-files > tbody > tr:has(a)")
    .map((index, element) => {
      const URL = $(element).find("a").attr("href");

      // The last path segment of the href becomes the local file name.
      const Filename = $(element).find("a").attr("href").split("/").pop();
      //.replace(/^.*[\\\/]/g, "");

      // NOTE(review): `name` is computed here but never used afterwards.
      const name = $(element)
        .find("td:nth-child(1)")
        .text()
        .trim()
        .replace("\n", "");

      return {
        Filename,
        URL,
      };
    })
    .get();

  for (let val of listings) {
    const downloadUrl = val.URL;
    const Filename = val.Filename;
    console.log(val);

    // fetch() runs inside the page (with credentials: "include"); the body is
    // read as text and that string is what gets handed back to Node.
    const downloadedContent = await page.evaluate(async (downloadUrl) => {
      const fetchResp = await fetch(downloadUrl, { credentials: "include" });
      return await fetchResp.text();
    }, downloadUrl);

    // Callback-style write that is not awaited: the loop continues without
    // waiting for it, and the callback takes no error argument, so a failed
    // write would still log "Wrote file".
    fs.writeFile(`./${Filename}`, downloadedContent, () =>
      console.log("Wrote file")
    );
  }

  await page.close();
  await browser.close();
})();

user1862965
  • 327
  • 3
  • 15
  • Please provide more information about which packages you're using. – Leo Apr 08 '20 at 14:31
  • Please provide full code, this code does not represent the whole scenario. – Zubad Ibrahim Apr 08 '20 at 14:33
  • Does this answer your question? [How to download a file with Node.js (without using third-party libraries)?](https://stackoverflow.com/questions/11944932/how-to-download-a-file-with-node-js-without-using-third-party-libraries) – cbass Apr 08 '20 at 15:03
  • @lenny here is more information, code updated – user1862965 Apr 10 '20 at 16:43

1 Answers1

0

The main problem here is that you are reading the file contents as text. That would be fine for a plain-text file, but an Excel file is binary, so you need a Blob or an ArrayBuffer, and neither can be returned from the page.evaluate method because its return value has to be serializable. See https://github.com/puppeteer/puppeteer/issues/3722

So you don't need to fetch the excel files using the page.evaluate function from puppeteer, you can directly get them from node using https module after getting all the links and then stream the contents to the files, which is easier in this case and also less code. You'll need these modifications

First require the https module

const https = require('https');

Then close puppeteer after getting the links, since we don't need it anymore

.get();
await page.close();
await browser.close();

Call the function here, when looping through the links

// Download every scraped link. getFile's result is awaited so that, if it
// returns a promise, the files are fetched one at a time.
for (const val of listings) {
 console.log(val);
 await getFile(val.URL, val.Filename);
}

Finally, you need to create a function to read/write the file, outside of your main code block

/**
 * Download a file over HTTPS and stream the raw response bytes to disk.
 *
 * The body is piped straight into a write stream, so binary files (such as
 * .xls) are written unmodified. Redirects are not followed.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file to fetch.
 * @param {string} Filename - Path of the local file to create.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *   Rejects if the URL is invalid, the request fails, the server answers
 *   with a non-2xx status, or the file cannot be written.
 */
function getFile(downloadUrl, Filename) {
    return new Promise((resolve, reject) => {
        // https.get() ends the request itself, so no explicit req.end() is needed.
        const req = https.get(downloadUrl, (res) => {
            const { statusCode } = res;
            if (statusCode < 200 || statusCode >= 300) {
                res.resume(); // discard the body so the socket is released
                reject(new Error(`Download of ${downloadUrl} failed with HTTP status ${statusCode}`));
                return;
            }

            // Open the file only after a successful response, so a failed
            // request does not leave an empty file behind.
            const writeStream = fs.createWriteStream(Filename);
            writeStream.on('error', reject);
            // 'finish' fires after all data has been flushed to the file.
            writeStream.on('finish', resolve);
            res.on('error', (err) => {
                writeStream.destroy();
                reject(err);
            });
            res.on('end', () => {
                console.log('No more data in response.');
            });
            res.pipe(writeStream);
        });
        req.on('error', reject);
    });
}

Full snippet

const puppeteer = require('puppeteer');
const cheerio = require("cheerio");
const fs = require("fs");
const https = require('https');

// Scrape the download links with Puppeteer, shut the browser down, then fetch
// each linked file directly from Node via getFile().
(async () => {
 const browser = await puppeteer.launch({
  args: ["--no-sandbox"],
  headless: false,
  slowMo: 30,
 });

 let listings;
 try {
  const page = await browser.newPage();

  await page.goto(
   "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/"
  );
  const content = await page.content();
  const $ = cheerio.load(content);

  // One { Filename, URL } entry per table row that contains a link.
  listings = $("#table-files > tbody > tr:has(a)")
   .map((index, element) => {
    const URL = $(element).find("a").attr("href");
    // The last path segment of the href becomes the local file name.
    const Filename = URL.split("/").pop();

    return {
     Filename,
     URL,
    };
   })
   .get();
 } finally {
  // Always shut the browser down, even if navigation or scraping throws;
  // closing the browser also closes its pages.
  await browser.close();
 }

 for (const val of listings) {
  console.log(val);
  // Call the function with each link and filename. The result is awaited so
  // that, if getFile returns a promise, downloads run one at a time.
  await getFile(val.URL, val.Filename);
 }
})().catch((err) => {
 // Report the failure explicitly instead of leaving an unhandled rejection.
 console.error(err);
 process.exitCode = 1;
});
/**
 * Send an HTTPS GET request and stream the response body to a file.
 *
 * The response is piped as raw bytes, so binary downloads are written
 * unmodified. Redirects are not followed.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file to fetch.
 * @param {string} Filename - Path of the local file to create.
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *   Rejects if the URL is invalid, the request fails, the server answers
 *   with a non-2xx status, or the file cannot be written.
 */
function getFile(downloadUrl, Filename) {
 return new Promise((resolve, reject) => {
  // https.get() ends the request itself, so no explicit req.end() is needed.
  const req = https.get(downloadUrl, (res) => {
   const { statusCode } = res;
   if (statusCode < 200 || statusCode >= 300) {
    res.resume(); // discard the body so the socket is released
    reject(new Error(`Download of ${downloadUrl} failed with HTTP status ${statusCode}`));
    return;
   }

   // Open the file only after a successful response, so a failed request
   // does not leave an empty file behind.
   const writeStream = fs.createWriteStream(Filename);
   writeStream.on('error', reject);
   // 'finish' fires after all data has been flushed to the file.
   writeStream.on('finish', resolve);
   res.on('error', (err) => {
    writeStream.destroy();
    reject(err);
   });
   res.on('end', () => {
    console.log('No more data in response.');
   });
   res.pipe(writeStream);
  });
  req.on('error', reject);
 });
}

EDIT: I saw your comment. You can send cookies by modifying the GET request like this, but keep in mind the same-domain policy for cookies.

/**
 * Download a file over HTTPS, sending a Cookie header with the request, and
 * stream the raw response bytes to disk.
 *
 * The URL is passed to https.get() as a whole, so its query string and port
 * are sent along with the path. Redirects are not followed.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file to fetch.
 * @param {string} Filename - Path of the local file to create.
 * @param {string} [cookie='myCookie=myvalue'] - Value of the Cookie request
 *   header. The default is only a placeholder; pass the real session cookies
 *   as "name=value; name2=value2".
 * @returns {Promise<void>} Resolves once the file has been fully written.
 *   Rejects if the URL is invalid, the request fails, the server answers
 *   with a non-2xx status, or the file cannot be written.
 */
function getFile(downloadUrl, Filename, cookie = 'myCookie=myvalue') {
 return new Promise((resolve, reject) => {
  const options = {
   headers: {
    'Cookie': cookie
   }
  };
  // https.get() defaults to GET and ends the request itself.
  const req = https.get(downloadUrl, options, (res) => {
   const { statusCode } = res;
   if (statusCode < 200 || statusCode >= 300) {
    res.resume(); // discard the body so the socket is released
    reject(new Error(`Download of ${downloadUrl} failed with HTTP status ${statusCode}`));
    return;
   }

   // Open the file only after a successful response, so a failed request
   // does not leave an empty file behind.
   const writeStream = fs.createWriteStream(Filename);
   writeStream.on('error', reject);
   // 'finish' fires after all data has been flushed to the file.
   writeStream.on('finish', resolve);
   res.on('error', (err) => {
    writeStream.destroy();
    reject(err);
   });
   res.on('end', () => {
    console.log('No more data in response.');
   });
   res.pipe(writeStream);
  });
  req.on('error', reject);
 });
}
Zoilo Granda
  • 341
  • 2
  • 6
  • Hi Zoilo, Thanks a lot for you reply and informations. i need to download data behind login. with fetch(downloadUrl, { credentials: "include" }); i did not need cookies. But how can i download data behind login? i use puppeteer for log in and then get a cookies or? – user1862965 Apr 11 '20 at 23:25
  • @user1862965 you can send cookies with the http module too, i updated my answer to include them – Zoilo Granda Apr 12 '20 at 01:02