25

How to make the text in a PDF selectable?

I have tried the code below. The PDF renders fine, but the text cannot be selected.

https://github.com/mozilla/pdf.js

https://github.com/mozilla/pdf.js/blob/master/web/text_layer_builder.css
https://github.com/mozilla/pdf.js/blob/master/web/text_layer_builder.js

'use strict';

// Load file.pdf, paint its first page onto #the-canvas, then pass the page's
// text content to a TextLayerBuilder that targets #text-layer.
PDFJS.getDocument('file.pdf').then(function(doc){
    var pageNumber = 1;

    doc.getPage(pageNumber).then(function(pdfPage){
        var pageViewport = pdfPage.getViewport(1.5);

        // Size the canvas to the scaled page before measuring its position.
        var canvasEl = document.getElementById('the-canvas');
        var drawingContext = canvasEl.getContext('2d');
        canvasEl.height = pageViewport.height;
        canvasEl.width = pageViewport.width;

        // Give the text layer the viewport's dimensions and the canvas' offset.
        var canvasPosition = $(canvasEl).offset();
        var layerStyles = {
            height : pageViewport.height+'px',
            width : pageViewport.width+'px',
            top : canvasPosition.top,
            left : canvasPosition.left
        };
        var layerElement = $('#text-layer').css(layerStyles).get(0);

        pdfPage.render({
            canvasContext : drawingContext,
            viewport : pageViewport
        });

        pdfPage.getTextContent().then(function(content){
            var builder = new TextLayerBuilder({
                textLayerDiv : layerElement,
                pageIndex : pageNumber - 1,
                viewport : pageViewport
            });

            builder.setTextContent(content);
            builder.render();
        });
    });
});

<body>
  <div>
    <canvas id="the-canvas" style="border:1px solid black;"></canvas>
    <div id="text-layer" class="textLayer"></div>
  </div>
</body>
clarkk
  • 27,151
  • 72
  • 200
  • 340
  • http://stackoverflow.com/questions/16775907/is-there-a-minimalistic-pdf-js-sample-that-supports-text-selection?rq=1 doesn't help you? – ChrLipp Oct 13 '15 at 15:31
  • have tried it.. cant make it work.. this is an older version of pdf.js – clarkk Oct 13 '15 at 17:45

4 Answers4

22

As of pdf.js version 2.8.61, the accepted answer no longer works. renderTextLayer() is now built into pdf.js, so no external source files are required, and neither is jQuery.

The following code makes the PDF text selectable. As an example, it loads the PDF document below; replace it with your own:

https://raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/web/compressed.tracemonkey-pldi-09.pdf

It uses mainly two html elements:

<canvas id="the-canvas"></canvas>
<div class="textLayer"></div>

The canvas displays the document itself, which is not selectable; the .textLayer div holds the selectable text. All text in the .textLayer div is transparent, and therefore invisible: it only provides the selection effect.


<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no">
<script src="//mozilla.github.io/pdf.js/build/pdf.js" crossorigin="anonymous"></script>
<link href="//mozilla.github.io/pdf.js/web/viewer.css" rel="stylesheet" type="text/css" />
<style type="text/css">

#the-canvas {
  border: 1px solid black;
  direction: ltr;
}

</style>
</head>

<body>

<h1>PDF.js Previous/Next example</h1>

<div>
  <button id="prev">Previous</button>
  <button id="next">Next</button>
  &nbsp; &nbsp;
  <span>Page: <span id="page_num"></span> / <span id="page_count"></span></span>
</div>

<canvas id="the-canvas"></canvas>
<div class="textLayer"></div>

<script>
// PDF to display. If this is an absolute URL on a remote server, that server
// has to send the CORS header.
var url = '//raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/web/compressed.tracemonkey-pldi-09.pdf';

// pdf.js was loaded via a <script> tag; keep a shortcut to its exports.
var pdfjsLib = window['pdfjs-dist/build/pdf'];

// The workerSrc property shall be specified.
pdfjsLib.GlobalWorkerOptions.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Viewer state shared by the functions of this script.
var pdfDoc = null;          // assigned once the document has loaded
var pageNum = 1;            // page currently displayed
var pageRendering = false;  // true while a page render is in progress
var pageNumPending = null;  // page requested while a render was in progress
var scale = 1;
var canvas = document.getElementById('the-canvas');
var ctx = canvas.getContext('2d');

/**
 * Get page info from document, resize canvas accordingly, render the page and
 * rebuild the selectable text layer on top of it.
 *
 * The render is only considered finished (and a queued page only started)
 * after the text layer has been rebuilt, so the text of an older page is never
 * laid over a newer one. The text layer is emptied before each rebuild;
 * otherwise the text of previously shown pages would pile up inside it.
 * A failure at any step is logged and still releases the rendering flag, so
 * navigation keeps working afterwards.
 * @param num Page number.
 */
function renderPage(num) {
  pageRendering = true;
  // Using promise to fetch the page
  pdfDoc.getPage(num).then(function(page) {
    var viewport = page.getViewport({scale: scale});
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render PDF page into canvas context
    var renderTask = page.render({
      canvasContext: ctx,
      viewport: viewport
    });

    // Wait for rendering to finish, then fetch the text contents of the page
    return renderTask.promise.then(function() {
      return page.getTextContent();
    }).then(function(textContent) {
      var textLayer = document.querySelector(".textLayer");

      // Drop the text left over from the previously displayed page
      textLayer.textContent = '';

      // Place the textLayer element exactly over the canvas
      textLayer.style.left = canvas.offsetLeft + 'px';
      textLayer.style.top = canvas.offsetTop + 'px';
      textLayer.style.height = canvas.offsetHeight + 'px';
      textLayer.style.width = canvas.offsetWidth + 'px';

      // Pass the data to the method for rendering of text over the pdf canvas
      // and wait until it is done.
      return pdfjsLib.renderTextLayer({
        textContent: textContent,
        container: textLayer,
        viewport: viewport,
        textDivs: []
      }).promise;
    });
  }).catch(function(reason) {
    console.error('Failed to render page ' + num, reason);
  }).then(function() {
    // Runs after success and after failure alike
    pageRendering = false;
    if (pageNumPending !== null) {
      // New page rendering is pending
      var pending = pageNumPending;
      pageNumPending = null;
      renderPage(pending);
    }
  });

  // Update page counters
  document.getElementById('page_num').textContent = num;
}

/**
 * Requests that page `num` be shown. If no page rendering is in progress the
 * page is rendered immediately; otherwise the request is stored in
 * pageNumPending until the rendering has finished.
 */
function queueRenderPage(num) {
  if (!pageRendering) {
    renderPage(num);
    return;
  }
  pageNumPending = num;
}

/**
 * Steps back one page. Does nothing when the first page is already shown.
 */
function onPrevPage() {
  if (pageNum > 1) {
    pageNum -= 1;
    queueRenderPage(pageNum);
  }
}
// Run onPrevPage whenever the #prev button is clicked.
document.getElementById('prev').addEventListener('click', onPrevPage);

/**
 * Displays next page. Does nothing on the last page, or while the document
 * has not been loaded yet.
 */
function onNextPage() {
  // pdfDoc is not set until the document has loaded; a click before that
  // must be ignored instead of throwing on pdfDoc.numPages.
  if (!pdfDoc || pageNum >= pdfDoc.numPages) {
    return;
  }
  pageNum++;
  queueRenderPage(pageNum);
}
// Run onNextPage whenever the #next button is clicked.
document.getElementById('next').addEventListener('click', onNextPage);

/**
 * Asynchronously downloads PDF, shows its page count and renders the first
 * page. A failure is reported on the console.
 */
pdfjsLib.getDocument(url).promise.then(function(pdfDoc_) {
  pdfDoc = pdfDoc_;
  document.getElementById('page_count').textContent = pdfDoc.numPages;

  // Initial/first page rendering
  renderPage(pageNum);
}).catch(function(reason) {
  // Without a handler, a failed download would only surface as an unhandled
  // promise rejection.
  console.error('Failed to load or display PDF document: ' + url, reason);
});

</script>

</body>
</html>
jacouh
  • 8,473
  • 5
  • 32
  • 43
19

Your javascript code is perfect. You just need to include the UI utilities that Text Layer Builder depends on:

https://github.com/mozilla/pdf.js/blob/master/web/ui_utils.js

Or in HTML:

<script src="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/ui_utils.js"></script>

If you run your code (without ui_utils) and check the debug console, you will see ReferenceError: CustomStyle is not defined. A quick search in PDFjs's repo will show you it is defined in ui_utils.js.

Here is my minimal but complete code for your reference. I am using PDFjs's demo pdf here. Note that in production you should not link to raw.github.

<!DOCTYPE html><meta charset="utf-8">
<link rel="stylesheet" href="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/text_layer_builder.css" />
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js"></script>
<script src="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/ui_utils.js"></script>
<script src="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/text_layer_builder.js"></script>
<script src="https://mozilla.github.io/pdf.js/build/pdf.js"></script>
<body>
  <div>
    <canvas id="the-canvas" style="border:1px solid black;"></canvas>
    <div id="text-layer" class="textLayer"></div>
  </div>
<script>
'use strict';

// Draw page 1 of file.pdf on #the-canvas and build a text layer for it in
// #text-layer via TextLayerBuilder.
PDFJS.getDocument('file.pdf').then(function(pdf){
    var FIRST_PAGE = 1;
    var ZOOM = 1.5;

    // Sizes #text-layer to the viewport, moves it to the canvas' document
    // offset and returns the underlying DOM node.
    function placeTextLayer(canvas, viewport){
        var origin = $(canvas).offset();
        return $('#text-layer').css({
            height : viewport.height+'px',
            width : viewport.width+'px',
            top : origin.top,
            left : origin.left
        }).get(0);
    }

    pdf.getPage(FIRST_PAGE).then(function(page){
        var viewport = page.getViewport(ZOOM);
        var canvas = $('#the-canvas')[0];
        var context = canvas.getContext('2d');
        canvas.height = viewport.height;
        canvas.width = viewport.width;

        var layerNode = placeTextLayer(canvas, viewport);

        page.render({
            canvasContext : context,
            viewport : viewport
        });

        page.getTextContent().then(function(textContent){
            console.log( textContent );

            var textLayer = new TextLayerBuilder({
                textLayerDiv : layerNode,
                pageIndex : FIRST_PAGE - 1,
                viewport : viewport
            });
            textLayer.setTextContent(textContent);
            textLayer.render();
        });
    });
});
</script>
Sheepy
  • 17,324
  • 4
  • 45
  • 69
  • This is beautiful.. Thanks! :D <3 – clarkk Oct 15 '15 at 17:41
  • @clarkk Which version of `pdf.js` does it work for? I have strange behavior for text highlighting (too small areas) in v1.3.91. It used to work with v1.0.277 – Dmitry Gonchar Sep 05 '16 at 17:32
  • This doesn't work anymore, apparently the way to go now is to build the whole viewer I wanted a simple 2kb~ viewer with text selection capability FTW – Dan Apr 01 '17 at 01:55
  • your link goes to "Trace-based Just-in-Time Type Specialization for Dynamic Languages" – user3217883 Mar 28 '19 at 17:33
  • Your code does not work. Produces the following error in the chrome debugger: "PDFJS is not defined". And the console shows "Refused to apply style from 'https://raw.githubusercontent.com/mozilla/pdf.js/master/web/text_layer_builder.css' because its MIME type ('text/plain') is not a supported stylesheet MIME type, and strict MIME checking is enabled. – user3217883 Mar 28 '19 at 18:11
  • and "index2.html:1 Refused to execute script from 'https://raw.githubusercontent.com/mozilla/pdf.js/master/web/ui_utils.js' because its MIME type ('text/plain') is not executable, and strict MIME type checking is enabled. index2.html:1 Refused to execute script from 'https://raw.githubusercontent.com/mozilla/pdf.js/master/web/text_layer_builder.js' because its MIME type ('text/plain') is not executable, and strict MIME type checking is enabled. network.js:68 The provided value 'moz-chunked-arraybuffer' is not a valid enum value of type XMLHttpRequestResponseType." – user3217883 Mar 28 '19 at 18:12
  • I also get the mimetype error. It may be a new development. – Chris Jensen Nov 21 '19 at 12:58
  • This answer is now outdated. `textLayer` is now an option passed to `page.render()` – Kevin Beal Sep 18 '20 at 16:03
5

After hours of struggling with this, I found the following article very helpful for selecting text and using pdf.js without Node: "Custom PDF Rendering in JavaScript with Mozilla’s PDF.Js".

Petter Ivarsson
  • 463
  • 6
  • 6
  • That didn't work for me either. First off, you need to replace all occurrences of "PDFJS" with "pdfjsLib". Then stuff will show up. But the overlay text doesn't line up with the pdf text. It's all a blob above the pdf text. 3-days wasted trying to get something, anything to work! – user3217883 Mar 28 '19 at 19:06
-1

Hello you have created canvas in your HTML content.

Canvas will not support text selection so you need to change the canvas to another way.

Mitul
  • 3,431
  • 2
  • 22
  • 35