I need to extract the text of a PDF using only client side JavaScript.
I have this JSFiddle http://jsfiddle.net/go279m0h/
document.getElementById('file').addEventListener('change', readFile, false);
/** This works
* Extract text from PDFs with PDF.js
* https://mozilla.github.io/pdf.js/getting_started/
*/
pdfToText = function(data) {
PDFJS.workerSrc = "{{ url_for('static', filename='js/pdf.worker.js') }}";
PDFJS.cMapUrl = "{{ url_for('static', filename='cmaps') }}";
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
// this function should get the text of a pdf file and print it to the console.
function readFile (evt) {
var files = evt.target.files;
var file = files[0];
// following from https://stackoverflow.com/questions/1554280/extract-text-from-pdf-in-javascript
// using PDFJS function
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
/*
var reader = new FileReader();
reader.onload = function() {
console.log(this.result);
}
//reader.readAsText(file)
*/
}
The PDF JS function to get text from the PDF currently works with a server side file path, BUT I can't get it to accept the files[0] argument for the file the user uploads.
The error I keep getting when I run this is "Uncaught Error: Invalid parameter in getDocument, need either Uint8Array, string or a parameter object"
The second option from the bottom was where I got the function, that I was able to use for text extraction. extract text from pdf in Javascript