0

I'm trying to convert a PDF file to text.

When I open a local file such as file:///D:/MyFiles/File.pdf, the generated HTML does not contain the contents of the document; instead, it is just an <embed> element.

Is it possible to read the PDF contents without it being hosted somewhere (with JavaScript or some Chrome API)? Are there any alternatives?

BrunoLM
  • 97,872
  • 84
  • 296
  • 452

1 Answer

-1

PDF.js seems like a good option for what you're trying to do. This is a similar question: "How to correctly extract text from a pdf using pdf.js". That question was answered with the following code:

/**
 * Extracts the text of every page of a PDF with PDF.js and concatenates it.
 *
 * @param {string} pdfUrl - Location of the PDF; passed unchanged to
 *   `pdfjsLib.getDocument`.
 * @returns {Promise<string>} Resolves with the text of all pages, in page
 *   order. Text items and pages are joined with no separator. Rejects if the
 *   document, a page, or a page's text content fails to load.
 */
function gettext(pdfUrl) {
  // Current PDF.js returns a loading task that exposes the document through
  // its `promise` field; older builds returned something that could be
  // chained with `.then` directly. Accept either shape.
  const loadingTask = pdfjsLib.getDocument(pdfUrl);
  const documentReady = Promise.resolve(loadingTask.promise || loadingTask);

  return documentReady.then((pdf) => {
    // Prefer the `numPages` accessor; fall back to the legacy `pdfInfo`
    // object only when the accessor is not available.
    const maxPages = typeof pdf.numPages === 'number' ? pdf.numPages : pdf.pdfInfo.numPages;

    // Request every page up front so they load in parallel; Promise.all
    // keeps the results in page order regardless of completion order.
    const pagePromises = [];
    for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
      pagePromises.push(
        pdf.getPage(pageNumber)
          .then((page) => page.getTextContent())
          .then((content) => content.items.map((item) => item.str).join(''))
      );
    }

    return Promise.all(pagePromises).then((texts) => texts.join(''));
  });
}

// Start extraction for the sample PDF. On success the extracted text is shown
// in an alert; if gettext rejects, the reason is written to the console.
gettext("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf").then(
  (text) => {
    alert('parse ' + text);
  },
  (reason) => {
    console.error(reason);
  }
);

HTML:

<!-- Loads the PDF.js build from a CDN. No version is pinned in the URL, so the
     build served may change over time. NOTE(review): the JavaScript snippet
     relies on a `pdfjsLib` global, presumably provided by this file - confirm
     for the version actually served. -->
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
  • Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer). – Community Nov 28 '21 at 21:47