11

This question is specific to pdf.js, a javascript based pdf renderer. I'm building a custom version where I need to extract the text that I select inside the pdf.

There are other posts that show how to fetch the text from one page or from the whole PDF document, such as the one here, but I'm looking to grab the specific text that the user selects and perhaps alert it or print it to the console.

Srikar Reddy
  • 131
  • 1
  • 1
  • 6

3 Answers3

9

What you are looking for is window.getSelection() method. This method returns a specific Selection object with the range of the selected text on the web page.

Here is how you can use getSelection() together with pdf.js:

/**
 * Capture the current text selection as rectangles expressed in PDF
 * coordinates of the page currently shown by the viewer.
 *
 * Every client rect of the first selection range is made relative to the
 * page canvas and its two corners are converted with the page viewport's
 * convertToPdfPoint(); the two converted points are concatenated into one
 * entry per rect.
 *
 * @returns {{page: number, coords: Array}} Zero-based page index and one
 *   entry per selection rect. `coords` is empty when nothing is selected.
 */
function getHightlightCoords() {
  const pageIndex = PDFViewerApplication.pdfViewer.currentPageNumber - 1;

  // getRangeAt(0) throws when the selection holds no range, so bail out early.
  const selection = window.getSelection();
  if (!selection || selection.rangeCount === 0) {
    return { page: pageIndex, coords: [] };
  }

  const page = PDFViewerApplication.pdfViewer.getPageView(pageIndex);
  const pageRect = page.canvas.getClientRects()[0];
  const viewport = page.viewport;

  // getClientRects() returns a DOMRectList: array-like, but without .map().
  // Array.from() accepts array-likes and applies the mapping in one pass.
  const selected = Array.from(
    selection.getRangeAt(0).getClientRects(),
    function (r) {
      return viewport
        .convertToPdfPoint(r.left - pageRect.x, r.top - pageRect.y)
        .concat(
          viewport.convertToPdfPoint(r.right - pageRect.x, r.bottom - pageRect.y)
        );
    }
  );

  return { page: pageIndex, coords: selected };
}


/**
 * Draw a pink overlay for every rectangle stored in `selected.coords`.
 *
 * Each rectangle is converted with the page viewport's
 * convertToViewportRectangle() and rendered as an absolutely positioned
 * <div> appended to the parent element of the page canvas.
 *
 * @param {{page: number, coords: Array}} selected Page index and the
 *   rectangles to highlight on that page.
 */
function showHighlight(selected) {
  var pageView = PDFViewerApplication.pdfViewer.getPageView(selected.page);
  var container = pageView.canvas.parentElement;
  var pageViewport = pageView.viewport;

  selected.coords.forEach(function (pdfRect) {
    var box = pageViewport.convertToViewportRectangle(pdfRect);

    // The converted corners may come back in either order, so normalise
    // them into a top-left origin plus a non-negative size.
    var left = Math.min(box[0], box[2]);
    var top = Math.min(box[1], box[3]);
    var width = Math.abs(box[0] - box[2]);
    var height = Math.abs(box[1] - box[3]);

    var overlay = document.createElement('div');
    overlay.setAttribute(
      'style',
      `position: absolute; background-color: pink;left:${left}px; top:${top}px;width:${width}px; height:${height}px;`
    );
    container.appendChild(overlay);
  });
}
semanser
  • 2,310
  • 2
  • 15
  • 33
  • I achieved what I wanted thanks to your suggestion. The solution provided achieves way more than what I needed, but pretty interesting though. I just needed window.getSelection().toString() to get the text inside, and coupled it with an event handler. :) – Srikar Reddy Feb 27 '18 at 08:34
  • 2
    On another note, in the solution you provided, the **.map()** inside **getHighlightCoords()** does not seem to work, as selectionRects i.e. **"window.getSelection().getRangeAt(0).getClientRects();"** returns an object of objects rather than an array of objects, so you would need to convert it to an array similar to this : **"var selectionRectsList = Object.values(selectionRects);"** – Srikar Reddy Feb 27 '18 at 08:45
  • @SrikarReddy can you please confirm how you accomplished it – Sreenath Ganga Oct 03 '18 at 12:37
  • @SrikarReddy window.getSelection().toString() doesn't give me the correct selection but all the content of the div. Could you have an hint to get the specific text selected ? – zagoa Jun 28 '19 at 10:23
  • doesnt work in pdf.js 2.12. all coordinates are NaN – chitgoks Jan 29 '22 at 13:10
5

pdf.js has a file viewer.html inside the directory web.

You have to do the following two steps in viewer.html:

  1. Add Event Listener onselectionchange.
  2. Define the event handling function onselectionchange().
<script>

  // addEventListener version: log the Selection object on every change.
  document.addEventListener('selectionchange', function () {
    console.log(document.getSelection());
  });

  // onselectionchange version: alert the selected text, skipping empty
  // selections so that no blank dialog is shown.
  document.onselectionchange = function () {
    var selectedText = getSelectedText();
    if (!selectedText) {
      return;
    }
    alert(selectedText);
  };

  /**
   * Return the text the user currently has selected.
   *
   * Uses window.getSelection() when it exists, otherwise falls back to
   * document.selection.createRange().text, and yields an empty string when
   * neither API is available.
   *
   * @returns {string} The selected text, or '' if no selection API exists.
   */
  function getSelectedText() {
    if (window.getSelection) {
      return window.getSelection().toString();
    }

    if (document.selection) {
      return document.selection.createRange().text;
    }

    return '';
  }
</script>
PNR
  • 99
  • 1
  • 5
0

In reply to chitgoks message (doesnt work in pdf.js 2.12)

I made a small adaptation to semanser's code and tested it in Chrome 97 with PDF.js 2.13.99:

/**
 * Capture the first rectangle of the current text selection, expressed in
 * PDF coordinates of the page currently shown by the viewer.
 *
 * The rect is made relative to the page canvas and its two corners are
 * converted with the page viewport's convertToPdfPoint(); the two converted
 * points are concatenated into a single flat array.
 *
 * @returns {{page: number, coords: Array}} Zero-based page index and the
 *   concatenated corner points of the first selection rect. `coords` is
 *   empty when there is no selection or the selection has no rects (for
 *   example a collapsed caret).
 */
function getHightlightCoords() {
  const pageIndex = PDFViewerApplication.pdfViewer.currentPageNumber - 1;

  // getRangeAt(0) throws when the selection holds no range, so bail out early.
  const selection = window.getSelection();
  if (!selection || selection.rangeCount === 0) {
    return { page: pageIndex, coords: [] };
  }

  // This function is typically driven by 'selectionchange', which also fires
  // when the selection collapses; in that case there is no rect to convert.
  const selectionRects = selection.getRangeAt(0).getClientRects();
  if (selectionRects.length === 0) {
    return { page: pageIndex, coords: [] };
  }

  const page = PDFViewerApplication.pdfViewer.getPageView(pageIndex);
  const pageRect = page.canvas.getClientRects()[0];
  const viewport = page.viewport;
  const r = selectionRects[0];

  const selected = viewport
    .convertToPdfPoint(r.left - pageRect.x, r.top - pageRect.y)
    .concat(
      viewport.convertToPdfPoint(r.right - pageRect.x, r.bottom - pageRect.y)
    );

  return { page: pageIndex, coords: selected };
}



// addEventListener version
  // Print the PDF-space coordinates of the selection each time it changes.
  document.addEventListener('selectionchange', function () {
    console.log(getHightlightCoords());
  });

It works for me!

schube
  • 652
  • 5
  • 18
  • This function with showHighlights will result in the div layer left, top, width, and height to also be NaN? Because of this bounds = viewport.convertToViewportRectangle(rect) results in bounds having [NaN,NaN, NaN, NaN]. While i have done this use case, my problem is with text selection sensitivity. pdf.js has enable_enhanced option for textLayerMode but the retrieved client recs are wrong but the text selection sensitivity works as expected – chitgoks Feb 02 '22 at 12:45