1
// Script from the question: parse every PDF in ./pdfs and collect the output.
// It is callback-based, and the final logging runs too early (see notes below).
var pdfParser = require('pdf-parser')
var fs = require('fs')
var PDF_PATH = __dirname + '/pdfs'
// Module-level accumulators filled in by the pdf2json callbacks below.
var results = []
var failed = []

fs.readdir(PDF_PATH, function(err, files){
    if(err){
        return console.log(err)
    }
    for(const file of files){
        // Per-file record; not referenced again in this trimmed-down snippet.
        let the_ent = {
            'name': '',
            'other data': []
        }

        // Starts one parse per file. The callback is invoked later, so this
        // loop finishes before any parse result has been delivered.
        pdfParser.pdf2json(PDF_PATH + '/' + file, function(error, pdf){
            if(error != null){
                console.log(error)
            }else if(pdf['pages'] == undefined){
                // Parsed without an error but produced no pages: record the file as failed.
                failed.push(file)
                console.log(file +' failed')
            }else{
                //populate 'results' array
            }
                // Placeholders: these two calls sit outside the if/else chain,
                // so they run for every file, including the error branch.
                console.log(/*pdf_data*/)
                results.push(/*pdf_data*/)

        })
    }
    // These statements run right after the loop has *started* the parses, not
    // after they complete, so both arrays are still empty when logged here.
    console.log(results)
    console.log(failed)
    results = JSON.stringify(results)
    //fs.writeFileSync() write results to json
})

I don't know what is wrong with me this morning, I can't work out how to write this in async; obviously the logs/writefile at the bottom fire as soon as the script executes.

I have tried wrapping in async functions and awaiting the readdir / pdf parsing instead of using callbacks - clearly not correctly. I'm just trying to parse every pdf in a folder - push what I want to some arrays and then log them once the loop finishes zzz.

AndreasPizsa
  • 1,736
  • 19
  • 26
anxxxious
  • 27
  • 6

3 Answers3

2

Wrap the smallest asynchronous tasks into Promises, then use async/await to combine them:

   // the Promise wrapper: adapts the callback-style pdfParser.pdf2json(file, cb)
   // to a Promise. Resolves with the parsed pdf object, rejects with the error
   // that the parser passes to its callback.
   const parsePdf = file => new Promise((res, rej) =>
     pdfParser.pdf2json(file, (err, r) => (err ? rej(err) : res(r)))
   );

 // Reads every file in ./pdfs, parses them one after another, and logs the
 // collected results together with the names of the files that had no pages.
 (async function () { // start an asynchronous context
   const PDF_PATH = __dirname + '/pdfs';
   // prefer const over let
   const results = [];
   const failed = [];

   // reading files in a promising way is already provided natively:
   const files = await fs.promises.readdir(PDF_PATH);

   for (const file of files) { // this is in series, in parallel would probably be faster
     let the_ent = {
       name: '',
       'other data': [], // avoid whitespace in property names!
     };

     let pdf;
     try {
       pdf = await parsePdf(PDF_PATH + '/' + file);
     } catch (error) {
       // Same as the callback version: log the parser error and carry on with
       // the next file instead of letting one bad PDF abort the whole run.
       console.log(error);
       continue;
     }

     if (pdf.pages === undefined) { // prefer strict equality (===) over loose equality (==)
       failed.push(file);
       console.log(file + ' failed');
     } else {
       // populate 'results' array
     }
   }

   // Runs only after every file has been processed.
   console.log(results, failed);
 })().catch(err => {
   // e.g. readdir failed; without this handler it would be an unhandled rejection
   console.log(err);
 });

You can probably process the files in parallel too.

Jonas Wilms
  • 132,000
  • 20
  • 149
  • 151
2

I would promisify the async operations and use async/await. For the fs operations, use the new fs.promises API. For others, use util.promisify() to make promisified versions.

The resolved value of the parsePDFs function I create will be an object holding an array of parsed JSON results and an array of failed filenames, so you get both pieces of information back:

const util = require('util');
const pdfParser = require('pdf-parser');
// make promisified version of the function; it is attached to pdfParser as a
// plain property (a `const` declaration cannot target `object.property`), so
// it is called as pdfParser.pdf2jsonP(...)
pdfParser.pdf2jsonP = util.promisify(pdfParser.pdf2json);

const fsp = require('fs').promises;
const path = require('path');
// directory that holds the PDFs to parse
const PDF_PATH = path.join(__dirname, 'pdfs');

/**
 * Parses every file found in `dir`, one at a time.
 *
 * @param {string} dir - directory whose entries are handed to the PDF parser
 * @returns {Promise<{results: Array, failed: string[]}>} parsed objects that
 *   have a `pages` property, plus the names of files that threw or had no pages
 */
async function parsePDFs(dir) {
    const results = [];
    const failed = [];
    const entries = await fsp.readdir(dir);

    for (const file of entries) {
        let the_ent = {
            'name': '',
            'other data': []
        }

        try {
            const parsed = await pdfParser.pdf2jsonP(path.join(dir, file));
            const hasPages = Boolean(parsed) && parsed.pages !== undefined;
            if (!hasPages) {
                // routed through the catch below so the file is recorded as failed
                throw new Error("pdf.pages is empty")
            }
            results.push(parsed);
        } catch (e) {
            console.log(e);
            failed.push(file);
        }
    }

    // both lists become the resolved value of the returned promise
    return {results, failed};
}

// Run the parser over PDF_PATH and report both lists once every file is done.
parsePDFs(PDF_PATH).then(data => {
    console.log("failed filenames: ", data.failed);
    console.log("json results: ", data.results);
    // do something with data.results and data.failed
}).catch(err => {
    // reached when parsePDFs itself rejects (e.g. the directory cannot be read)
    console.log(err);
});

Note: You declare, but never use the variable the_ent.

Jonas Wilms
  • 132,000
  • 20
  • 149
  • 151
jfriend00
  • 683,504
  • 96
  • 985
  • 979
  • Out of curiosity, is there an actual difference between `fs.readdirSync` and `await fs.promises.readdir`? – AndreasPizsa Nov 14 '19 at 23:42
  • 2
    @AndreasPizsa - Yes, huge difference. `fs.promises.readdir` is still non-blocking and asynchronous. It just returns a promise which you then use with `await` which can be much friendlier to program with than the plain callback of `fs.readdir()`. `fs.readdirSync()` is synchronous and blocking. In a server environment, a synchronous I/O call will ruin your server scalability because it blocks the ONE Javascript thread so nothing else can run while waiting for the file operation to finish. Asynchronous calls allow other things to run while they are finishing. – jfriend00 Nov 14 '19 at 23:44
  • Hey thanks! And yeah the_ent is used but I stripped most of the operations on the pdf objects out for cleanliness' sake. – anxxxious Nov 14 '19 at 23:46
  • Thanks @jfriend00. I get that part, and I get the Promise part; just in this example, where there’s nothing else going on, no other simultaneous nonblocking I/O, those would effectively be the same, yes? (Ah, just saw you extended your comment, same conclusion, k, thx!) – AndreasPizsa Nov 14 '19 at 23:50
  • 1
    @AndreasPizsa - In a non-server environment (like just a stand-alone script to accomplish a task which this is perhaps), it's perfectly OK to use synchronous file I/O and it is sometimes easier to program it that way. Just don't ever do it in a server environment or in any code that a server might want to use. – jfriend00 Nov 14 '19 at 23:55
1

You can use util.promisify to promisify the callback-based asynchronous functions:

// Promise-returning wrapper around the callback-style fs.readdir
const readdir = util.promisify(fs.readdir);
// Promise-returning wrapper around pdfParser.pdf2json
// NOTE(review): the method is detached from pdfParser here; confirm pdf2json does not rely on `this`
const reader = util.promisify(pdfParser.pdf2json);

Minimal demo:

const fs = require('fs');
const util = require('util');
const pdfParser = require('pdf-parser');

// Promise-returning versions of the callback-style APIs
const readdir = util.promisify(fs.readdir);
const reader = util.promisify(pdfParser.pdf2json);

const PDF_PATH = __dirname + '/pdfs';

(async () => {
    /**
     * Parses every file in PDF_PATH sequentially.
     * @returns {Promise<Array>} the parser output for each file, in readdir order
     */
    async function processFiles() {
        const results = [];

        const files = await readdir(PDF_PATH);

        for (const file of files) {
            const pdf = await reader(PDF_PATH + '/' + file);
            results.push(pdf);
        }

        return results;
    }

    const result = await processFiles();

    console.info(result);
})().catch((err) => {
    // A failed readdir or parse rejects the IIFE's promise; report it here
    // instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
});
Samuel Goldenbaum
  • 18,391
  • 17
  • 66
  • 104