How to parse a PDF into a JavaScript array?

Question

How can I manually parse a PDF document into an array of words using JavaScript? I don't care about images, digits, or tables - only words, so that I can work with them as JavaScript objects.

Are you doing that serverside or client side ? If client side, how are you receiving the documents ? No cross domain problems ? — Denys Séguret, Jan 06 '14 at 16:40
@RuzelDavletyarov Do you already have the PDF file on your website, or does the user upload it? — Cilan, Jan 06 '14 at 16:44
Take a look at this project for reading PDFs in JavaScript: https://github.com/mozilla/pdf.js — JaredPar, Jan 06 '14 at 16:44
@RuzelDavletyarov Please don't add a useless signature to your message, we don't care about your nationality. And the language of SO is English. — Denys Séguret, Jan 06 '14 at 16:46
*manually parse* - do you mean *without using any third-party code?* — mkl, Jan 07 '14 at 09:01

Cilan · Answer 1 · 2014-01-06T17:03:09.197

Assuming you have the PDF's contents, you could download pdftotext and:

/**
 * Turns PDF bytes received via a "message" event into text and posts the
 * text back to the parent window.
 *
 * Relies on browser globals (`window`, `document`, `parent`) and on the
 * `PDFJS.PDFDoc` / `page.startRendering` API being loaded on the page.
 * Constructing an App registers the message listener, logs "Ready" and
 * posts "ready" to the parent.
 */
function App() {
  var self = this;

  // Number of pages whose rendering callback has fired so far.
  this.complete = 0;

  /**
   * Renders every page of the PDF onto its own canvas and, once the last
   * rendering callback has fired, collects the text of all
   * ".textLayer > div" nodes and hands it to sendOutput.
   *
   * @param {ArrayBuffer} data - PDF contents passed to `PDFJS.PDFDoc`.
   */
  this.pdfToText = function(data){
    // Start counting afresh; otherwise a second document could never make
    // the counter equal to its own page total.
    self.complete = 0;

    var pdf = new PDFJS.PDFDoc(data);
    var total = pdf.numPages;

    // `var i` keeps the loop counter local instead of leaking a global.
    for (var i = 1; i <= total; i++){
      var page = pdf.getPage(i);

      var canvas = document.createElement('canvas');
      canvas.id = 'page' + i;
      canvas.mozOpaque = true;
      // The original appended to an undefined `div`; attach the canvas to
      // the body, the same parent the text layer below uses.
      document.body.appendChild(canvas);

      canvas.width = page.width;
      canvas.height = page.height;

      // Paint a white background before the page is drawn.
      var context = canvas.getContext('2d');
      context.save();
      context.fillStyle = 'rgb(255, 255, 255)';
      context.fillRect(0, 0, canvas.width, canvas.height);
      context.restore();

      self.setMessage("Rendering...");

      // Passed to startRendering as its third argument; presumably PDFJS
      // fills it with one <div> per piece of text - confirm against the
      // PDFJS version in use.
      var textLayer = document.createElement('div');
      textLayer.className = 'textLayer';
      document.body.appendChild(textLayer);

      page.startRendering(context, function(){
        if (++self.complete === total){
          self.setMessage("Finished rendering. Extracting text...");

          // Extraction is deferred by one second after the last callback.
          window.setTimeout(function(){
            var layers = [];
            var nodes = document.querySelectorAll(".textLayer > div");
            for (var j = 0; j < nodes.length; j++){
              layers.push(nodes[j].textContent + "\n");
            }
            self.sendOutput(layers.join("\n"));

            self.setMessage("Done!");
          }, 1000);
        }
      }, textLayer);
    }
  };

  /**
   * "message" event handler: ignores messages that do not come from the
   * parent window, rejects payloads without a byteLength, and otherwise
   * starts text extraction.
   *
   * @param {MessageEvent} event - event whose `data` holds the PDF bytes.
   */
  this.receiveInput = function(event){
    if (event.source !== parent) return;
    // Guard against a null/undefined payload before reading byteLength.
    if (!event.data || !event.data.byteLength) return alert("The PDF data needs to be an ArrayBuffer");
    self.setMessage("Received data");
    self.pdfToText(event.data);
  };

  /**
   * Posts `text` to the parent window, falling back to `parent.document`
   * when the window itself has no postMessage.
   *
   * @param {string} text - message to deliver.
   */
  this.sendOutput = function(text){
    var recipient = parent.postMessage ? parent : (parent.document.postMessage ? parent.document : undefined);
    if (!recipient) {
      // Nothing can receive the message; report it instead of throwing.
      console.warn("No postMessage recipient available");
      return;
    }
    recipient.postMessage(text, "*");
  };

  /**
   * Logs a status message as an array of its space-separated words.
   *
   * @param {string} text - status message.
   */
  this.setMessage = function(text){
    console.log(text.split(' '));
  };

  window.addEventListener("message", self.receiveInput, true);
  self.setMessage("Ready");
  self.sendOutput("ready");
}

and make your input's onchange App().

DEMO (doesn't work in some browsers) This will log an array of words (with punctuation) from the PDF.

Somehow I did not manage to get the words of the document :( — Ruzel Davletyarov, Jan 06 '14 at 17:25

score 0 · Answer 2 · edited May 23 '17 at 12:26

0

There's a well-known JavaScript library called pdf.js. Its getTextContent() function would be very helpful in your case. Check out these two examples:

https://stackoverflow.com/a/20522307/2117492,

https://groups.google.com/d/msg/mozilla.dev.pdf-js/Qzq-xA2MHjs/nmlpttSIJcsJ

edited May 23 '17 at 12:26

Community

1
1

answered Jan 06 '14 at 16:53

gthacoder

2,213
3
15
17

score 0 · Answer 3 · answered Mar 13 '15 at 12:35

Using pdf.js, I would do this:

// NOTE(review): the value returned by require is never used below - the code
// calls the global `PDFJS`, and the callback parameter `pdf` shadows this
// variable. Presumably loading the module is what defines `PDFJS`; confirm
// for the pdf.js build in use.
var pdf = require("pdf.js");
// Load the document, take its first page and collect the string of every
// text item on that page.
PDFJS.getDocument('document.pdf').then(function(pdf){
  pdf.getPage(1).then(function(page){
    page.getTextContent().then(function(txt){
      // The text runs live on the resolved object's `items` property; the
      // original referenced a bare `items`, which is not defined anywhere.
      var arrayOfText = txt.items.map(function(item){
        return item.str;
      });
    });
  });
});

How to parse a PDF into a JavaScript array?

3 Answers

Linked