5

I'm trying to implement a PDF word count in JavaScript. I came across pdf.js, which uses promises. Is there a way to wait until the script is done before returning the count? I know that this goes against the idea of promises, but the other JS PDF readers out there either sometimes produce a bunch of gibberish or return nothing. In its current form, the function always returns a word count of 0.

// Attempts to count the words in the PDF at `pdfUrl` using pdf.js.
// NOTE: as written, this cannot deliver the count to its caller. All of the
// work happens inside promise callbacks that run asynchronously, and the
// function itself contains no top-level `return` statement.
function countWords(pdfUrl){
// Start loading the document; the result is consumed through `.then` below.
var pdf = PDFJS.getDocument(pdfUrl);
// Running total that the innermost callbacks add to.
var count = 0;
pdf.then(function(pdf) {
     var maxPages = pdf.pdfInfo.numPages;
     // pdf.js page numbers are requested starting from 1 here.
     for (var j = 1; j <= maxPages; j++) {
        var page = pdf.getPage(j);

        // `var` is function-scoped, so this is ONE `txt` variable shared by
        // every loop iteration and by every callback created below.
        var txt = "";
        page.then(function(page) {
            var textContent = page.getTextContent();
            // The parameter named `page` here actually receives the text
            // content object (it is read via `page.items` below).
            textContent.then(function(page){

            for(var i=0;i<page.items.length;i++){
                // `txtadd` is never declared, so this assigns an implicit global.
                txtadd = page.items[i].str
                // Strips characters outside the listed set. Inside the class,
                // `!-(` is a character range, not a literal hyphen.
                txt += txtadd.replace(/[^a-zA-Z0-9:;,.?!-() ]/g,'');
            }
                // Adds the number of space-separated pieces of the shared `txt`.
                count = count + txt.split(" ").length;

            })
        })
     }
     // This returns from the `then` callback, not from countWords, and it is
     // reached before the nested page/text callbacks above have run, so
     // `count` is still 0 at this point.
     return count;
});

}

Joe Harrison
  • 105
  • 1
  • 1
  • 8
  • You'll need to invert your flow i.e. the logic which needs the `count` must be in a "then" which follows this count calculation. – Vasan Nov 08 '16 at 08:55
  • How do I do that? Do I make this method into an object and call "then" on it? This is some code I got from Spring.io, is this what you mean? var greetingPromise = sayHello(); greetingPromise.then(function (greeting) { console.log(greeting); // 'hello world’ }); – Joe Harrison Nov 08 '16 at 09:16
  • Something like that. Basically, your counting method returns a Promise (instead of the count itself) with the count resolved (`resolve(count)`). Next, you add a .then block to that returned promise and do stuff with the count inside the then block. – Vasan Nov 08 '16 at 09:30
  • Where do I return the promise? I've now added the following return after the pdf.then clause: return Promise.resolve({ then: function(onFulfill, onReject) { onFulfill(count); } }); I'm doing something wrong, because it returns a count of 0. Sorry for not understanding everything; I've never heard of promises before, and the documentation is leaving me more confused than enlightened. Thank you for helping me out! – Joe Harrison Nov 08 '16 at 10:08
  • Ok, let me add a trivial code sample when I find some time. – Vasan Nov 08 '16 at 13:44

1 Answer

10

Promises cannot be handled synchronously. countWords cannot return a value immediately; it has to wait for the inner promises (one for the document, plus one for each page and one for each page's text content) to be resolved. So countWords must either return a Promise or accept a callback. The best approach is to return the promises and chain then() calls. When you need to wait for several promises at once, use Promise.all:

/**
 * Counts the words in a PDF document using pdf.js.
 *
 * The document, every page, and every page's text content are loaded
 * asynchronously, so the result is delivered through a Promise rather than
 * returned directly.
 *
 * @param {string} pdfUrl - Location of the PDF, passed straight to
 *     PDFJS.getDocument.
 * @returns {Promise<number>} Resolves with the total number of words over all
 *     pages. Rejects if loading the document, a page, or its text content
 *     fails.
 */
function countWords(pdfUrl) {
  // Everything except letters, digits, the listed punctuation and spaces is
  // stripped before counting. The hyphen is escaped so that it is a literal
  // "-" rather than forming the character range `!-(`.
  var DISALLOWED_CHARS = /[^a-zA-Z0-9:;,.?!\-() ]/g;

  // Counts the words in one page's text content.
  function countWordsInTextContent(textContent) {
    // Each page builds its own string, so one page's text never leaks into
    // another page's count. Items are joined with a space so that the last
    // word of one text item does not fuse with the first word of the next.
    var pageText = textContent.items
      .map(function (item) { return item.str; })
      .join(" ")
      .replace(DISALLOWED_CHARS, "");

    // Drop empty pieces so that an empty page counts as 0 words and runs of
    // consecutive spaces do not inflate the count.
    return pageText.split(" ").filter(function (word) {
      return word !== "";
    }).length;
  }

  // Loads one page and resolves with its word count.
  function countWordsOnPage(pdf, pageNumber) {
    return pdf.getPage(pageNumber).then(function (page) {
      return page.getTextContent().then(countWordsInTextContent);
    });
  }

  var loadingTask = PDFJS.getDocument(pdfUrl);
  // Accept both a thenable loading task and one that exposes `.promise`.
  var documentPromise = Promise.resolve(loadingTask.promise || loadingTask);

  return documentPromise.then(function (pdf) {
    var pageCounts = [];
    // pdf.js page numbers are 1-based.
    for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      pageCounts.push(countWordsOnPage(pdf, pageNumber));
    }

    // Wait for every page, then sum the per-page counts.
    return Promise.all(pageCounts).then(function (counts) {
      return counts.reduce(function (total, pageCount) {
        return total + pageCount;
      }, 0);
    });
  });
}
// waiting on countWords to finish completion, or error
// Start the count, then either show the total or log why it failed.
var pendingCount = countWords("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf");

function showCount(total) {
  alert(total);
}

function logFailure(error) {
  console.error(error);
}

// Two-argument form: logFailure handles only a rejected count, not errors
// thrown from showCount.
pendingCount.then(showCount, logFailure);
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
async5
  • 2,505
  • 1
  • 20
  • 27