How can I manually parse a PDF document into an array of words using JavaScript? I don't care about images, digits, or tables - only words, so that I can work with them as JavaScript objects.
Asked
Active
Viewed 7,097 times
2
-
Are you doing that serverside or client side ? If client side, how are you receiving the documents ? No cross domain problems ? – Denys Séguret Jan 06 '14 at 16:40
-
@RuzelDavletyarov Do you already have the PDF file on your website, or does the user upload it? – Cilan Jan 06 '14 at 16:44
-
Take a look at this project for PDF reading in JavaScript: https://github.com/mozilla/pdf.js – JaredPar Jan 06 '14 at 16:44
-
1@RuzelDavletyarov Please don't add a useless signature to your message, we don't care about your nationality. And the language of SO is English. – Denys Séguret Jan 06 '14 at 16:46
-
@ManofSnow yes, any users upload document ^) – Ruzel Davletyarov Jan 06 '14 at 16:46
-
*manually parse* - do you mean *without using any third-party code?* – mkl Jan 07 '14 at 09:01
-
@mkl yes) i want read pure js) – Ruzel Davletyarov Jan 22 '14 at 23:25
3 Answers
0
Assuming you have the PDF's contents, you could download pdftotext and:
/**
 * Message-driven PDF-to-text converter.
 *
 * On construction it registers a "message" listener on `window`, logs
 * "Ready" and posts the string "ready" to the parent window. When the parent
 * posts an ArrayBuffer back, every page is rendered and the text found in the
 * generated ".textLayer > div" nodes is posted to the parent as one string.
 *
 * NOTE(review): PDFJS.PDFDoc, page.startRendering and the text-layer argument
 * belong to whatever pdf.js build the page loads - confirm they exist in the
 * version actually in use.
 */
function App() {
  var self = this;

  // Number of pages whose render callback has fired so far.
  this.complete = 0;

  /**
   * Renders all pages of the document and, once every page has reported
   * back, gathers the text-layer contents and sends them to the parent.
   *
   * @param {ArrayBuffer} data - PDF bytes (receiveInput checks byteLength).
   */
  this.pdfToText = function (data) {
    var pdf = new PDFJS.PDFDoc(data);
    var total = pdf.numPages;

    // Start counting from zero for every document; otherwise a second PDF
    // could never satisfy the "all pages done" check below.
    self.complete = 0;

    // `i` is declared here; the original leaked it as an implicit global.
    for (var i = 1; i <= total; i++) {
      var page = pdf.getPage(i);

      var canvas = document.createElement('canvas');
      canvas.id = 'page' + i;
      canvas.mozOpaque = true;
      // The original appended to an undeclared `div`; attach to the body,
      // the same parent the text layers use.
      document.body.appendChild(canvas);
      canvas.width = page.width;
      canvas.height = page.height;

      // Paint a white background before the page is drawn.
      var context = canvas.getContext('2d');
      context.save();
      context.fillStyle = 'rgb(255, 255, 255)';
      context.fillRect(0, 0, canvas.width, canvas.height);
      context.restore();

      self.setMessage("Rendering...");

      var textLayer = document.createElement('div');
      textLayer.className = 'textLayer';
      document.body.appendChild(textLayer);

      page.startRendering(context, function () {
        if (++self.complete == total) {
          self.setMessage("Finished rendering. Extracting text...");
          // Wait one second before reading the text-layer nodes.
          window.setTimeout(function () {
            var layers = [];
            var nodes = document.querySelectorAll(".textLayer > div");
            for (var j = 0; j < nodes.length; j++) {
              layers.push(nodes[j].textContent + "\n");
            }
            self.sendOutput(layers.join("\n"));
            self.setMessage("Done!");
          }, 1000);
        }
      }, textLayer);
    }
  };

  /**
   * "message" event handler: accepts only messages from the parent window
   * whose payload has a non-zero byteLength, then starts the conversion.
   *
   * @param {MessageEvent} event
   */
  this.receiveInput = function (event) {
    if (event.source != parent) return;
    if (!event.data.byteLength) return alert("The PDF data needs to be an ArrayBuffer");
    self.setMessage("Received data");
    self.pdfToText(event.data);
  };

  /**
   * Posts `text` to the parent window (or to parent.document when the window
   * itself has no postMessage).
   *
   * @param {string} text
   */
  this.sendOutput = function (text) {
    var recipient = parent.postMessage ? parent : (parent.document.postMessage ? parent.document : undefined);
    recipient.postMessage(text, "*");
  };

  /**
   * Logs a status message to the console as an array of its
   * space-separated words.
   *
   * @param {string} text
   */
  this.setMessage = function (text) {
    var words = text.split(' ');
    console.log(words);
  };

  // These start-up steps were outside the constructor in the original because
  // of a misplaced closing brace; they need `self` to be the App instance.
  window.addEventListener("message", self.receiveInput, true);
  self.setMessage("Ready");
  self.sendOutput("ready");
}
and make your input's onchange
App()
.
DEMO (doesn't work in some browsers) This will log an array of words (with punctuation) from the PDF.

Cilan
- 13,101
- 3
- 34
- 51
0
There's a well-known JavaScript library called pdf.js. Its getTextContent()
function would be very helpful in your case. Check out these two examples:
https://stackoverflow.com/a/20522307/2117492,
https://groups.google.com/d/msg/mozilla.dev.pdf-js/Qzq-xA2MHjs/nmlpttSIJcsJ
0
Using pdf.js, I would do this:
// Collects the text strings found on the first page of document.pdf.
// NOTE(review): assumes that loading "pdf.js" makes the global `PDFJS`
// available; the value returned by require() is not used below - confirm.
var pdf = require("pdf.js");
PDFJS.getDocument('document.pdf').then(function(pdf){
  pdf.getPage(1).then(function(page){
    page.getTextContent().then(function(txt){
      // The text items are a property of the resolved value; the original
      // referenced an undeclared `items` and threw a ReferenceError.
      var arrayOfText = txt.items.map(function(item){
        return item.str;
      });
    });
  });
});

Adrien Joly
- 5,056
- 4
- 28
- 43