Marking text in a html document

Question

Let's say I have the following markup:

<html>
    <head>
        <title>Page Title</title>
    </head>
    <body>
        <h1>Some title</h1>
        <p>First paragraph</p>
        <p>Second paragraph</p>
    </body>
</html>

I need to mark some part of the text, namely "irst paragraph secon". It would look something like this:

<html>
    <head>
        <title>Page Title</title>
    </head>
    <body>
        <h1>Some title</h1>
        <p>F
            <mark>
                irst paragraph</p><p>Secon
            </mark>
        d paragraph</p>
    </body>
</html>

But the problem is that the HTML markup would then be broken. The more complex the markup, the more problems this approach would have.

Question:

I am looking for ideas on how I can take the first HTML example and apply a function that returns an HTML structure in which "irst paragraph second" is specifically marked somehow.

What I currently have is:

the parent container of the string "First paragraph"
the text "irst paragraph second"
the offset of the text "irst" in "First paragraph"

score 4 · Answer 1 · edited Mar 26 '16 at 16:46

4

If you want to highlight text in a document then this plug-in will be helpful for you.

https://github.com/julmot/jquery.mark

Example fiddle: https://jsfiddle.net/julmot/vpav6tL1/

Usage is as simple as:

$(".context").mark("keyword");

edited Mar 26 '16 at 16:46

dude

5,678
11
54
81

answered Feb 09 '16 at 09:02

Anamika Shrivastava

703
5
11

Awesome but without 'Separate word search' option it will not work as expected with more complex markup (e.g. when parts of the search term are found in different child elements). – B0Andrew Feb 09 '16 at 11:02
@B0Andrew I can not reproduce an error. What exactly do you mean? Can you open an issue on the repo page? – user3631654 Mar 20 '16 at 12:18

B0Andrew · Answer 2 · 2016-02-02T09:24:49.723

In principle you have to:

split the documents into words
identify the first word by parent element
skip the offset
mark matching words

Making changes at word level will prevent you from breaking the markup. I added a working example below. However, I am not sure that it will work in all browsers.

Some of the functions like mergeWords are not used in the example but I included them because they can prove useful.

// Set to true at the end of splitToWords(); mark() checks it so that it only
// triggers the split of document.body when no split has happened yet.
var splittedToWords = false;

/**
 * Decides whether splitToWords() should leave a node alone.
 *
 * A node is skipped when it is a comment (nodeType 8), when it is a
 * BLOCKQUOTE, SCRIPT or DIV element, or when it has no children and its
 * text contains no non-whitespace character.
 *
 * @param {Node} el - the node to test
 * @returns {boolean} true if the node must not be split into words
 */
function ignore(el) {
  // Comment nodes never contribute words.
  if (el.nodeType == 8) {
    return true;
  }

  // Elements whose content is deliberately left untouched.
  var skippedTags = ["BLOCKQUOTE", "SCRIPT", "DIV"];
  for (var t = 0; t < skippedTags.length; t++) {
    if (el.tagName == skippedTags[t]) {
      return true;
    }
  }

  // Anything that still has children is descended into by the caller.
  if (el.hasChildNodes()) {
    return false;
  }

  // Childless node: only worth splitting if it holds at least one word.
  return el.textContent.match(/\S+/) == null;
}

/**
 * Recursively replaces every text node below `el` with a run of
 * <span class="word"> elements, one per word (a word is a run of
 * non-whitespace characters plus the whitespace that follows it).
 * Nodes for which ignore() returns true are left untouched.
 *
 * Each span gets a "word-index" attribute holding the word's position
 * inside the text node it came from. When the function returns, the global
 * flag `splittedToWords` is set to true.
 *
 * @param {Node} el - root node to process (typically document.body)
 */
function splitToWords(el) {
  if (el.hasChildNodes()) {
    // Walk backwards: replacing child i with several spans only shifts the
    // indices after i, so the children still to be visited stay in place.
    for (var i = el.childNodes.length - 1; i >= 0; i--) {
      var node = el.childNodes[i];
      if (!ignore(node)) {
        splitToWords(node);
      }
    }
  }
  else if (el.nodeType === 3 && el.parentNode) { // attached text node
    // Only text nodes are replaced; a childless element passed in as the
    // root must not be removed from its parent.
    var content = el.textContent;
    var parentNode = el.parentNode;

    // The word pattern below never matches leading whitespace, so keep it
    // as a plain text node; otherwise adjacent inline content would be
    // glued together.
    var leading = content.match(/^\s+/);
    if (leading) {
      parentNode.insertBefore(document.createTextNode(leading[0]), el);
    }

    var words = content.match(/(\S+\s*)/g) || [];
    for (var w = 0; w < words.length; w++) {
      var wordNode = document.createElement("span");
      wordNode.className = "word";
      // textContent stores the text verbatim; assigning innerText would
      // turn newlines in the trailing whitespace into <br> elements.
      wordNode.textContent = words[w];
      wordNode.setAttribute("word-index", w);

      parentNode.insertBefore(wordNode, el);
    }
    parentNode.removeChild(el);
  }
  splittedToWords = true;
}

/**
 * Removes `element` from the document while keeping its children: every
 * child node is moved, in order, to the position the element occupied.
 *
 * Does nothing if the element has no parent.
 *
 * @param {Node} element - the wrapper node to dissolve
 */
function unwrap(element) {
  var parent = element.parentNode;
  if (!parent) {
    return;
  }
  // Move the children out BEFORE detaching the wrapper: once the wrapper is
  // removed it has no siblings, so its old position can no longer be found
  // through it.
  while (element.firstChild) {
    parent.insertBefore(element.firstChild, element);
  }
  parent.removeChild(element);
}

/**
 * Undoes splitToWords(): dissolves every <span class="word"> in the
 * document via unwrap() and resets the global `splittedToWords` flag so
 * that a later mark() call splits the document again.
 *
 * @param {Node} el - unused; kept so existing calls keep working
 */
function mergeWords(el) {
  // getElementsByClassName returns a live collection that shrinks as spans
  // are unwrapped, so iterate over a static copy instead.
  var words = Array.prototype.slice.call(document.getElementsByClassName("word"));
  for (var i = 0; i < words.length; i++) {
    unwrap(words[i]);
  }
  splittedToWords = false;
}

/**
 * Wraps `len` characters of the element's text, starting at index `pos`,
 * in a <mark> element by rewriting the element's innerHTML.
 *
 * The text is read from el.innerText and HTML-escaped before being written
 * back, so characters such as "<" or "&" in the page text stay text and are
 * not re-parsed as markup.
 *
 * @param {HTMLElement} el - the word element to modify
 * @param {number} pos - zero-based start index of the part to mark
 * @param {number} len - number of characters to mark
 */
function markWord(el, pos, len) {
  function escapeHtml(s) {
    return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  }

  var text = el.innerText;
  var end = pos + len;
  var pre = escapeHtml(text.substring(0, pos));
  var marked = "<mark>" + escapeHtml(text.substring(pos, end)) + "</mark>";
  var post = escapeHtml(text.substring(end));
  el.innerHTML = pre + marked + post;
}

/**
 * Marks the words of `text` in the document, starting inside `element`.
 *
 * The document body is split into word spans first (once, guarded by the
 * global `splittedToWords` flag). The function then locates the first word
 * span whose parent is `element`, skips forward to the word that contains
 * character `offset`, and marks each word of `text` in the consecutive word
 * spans from there on via markWord(). A search word that does not occur in
 * its corresponding span is skipped.
 *
 * Returns without marking anything if `element` has no word spans, if the
 * offset lies beyond the last word, or if `text` contains no words. Marking
 * stops quietly when the document runs out of words.
 *
 * @param {Element} element - direct parent of the text where marking starts
 * @param {number} offset - zero-based character offset of the start of
 *   `text` within the element's words
 * @param {string} text - the text to mark; compared word by word
 */
function mark(element, offset, text) {
  if (!splittedToWords) {
    splitToWords(document.body);
  }

  var words = document.getElementsByClassName("word");
  var wordsCount = words.length;

  // Index of the first word span that is a direct child of `element`.
  var i = -1;
  for (var w = 0; w < wordsCount; w++) {
    if (words[w].parentElement === element) {
      i = w;
      break;
    }
  }
  if (i === -1) {
    return;
  }

  // Skip whole words that end at or before `offset`. A word covers the
  // half-open range [pos, pos + length), so an offset equal to the end of a
  // word already belongs to the next one.
  var pos = 0;
  while (i < wordsCount) {
    var wordLength = words[i].innerText.length;
    if (offset < pos + wordLength) {
      break;
    }
    pos += wordLength;
    i++;
  }

  var tWords = text.match(/(\S+\s*)/g) || [];
  var tWordsCount = tWords.length;

  for (var ti = 0; ti < tWordsCount && i < wordsCount; ti++) {
    var wordEl = words[i++];
    var word = wordEl.innerText;
    var tWord = tWords[ti].trim();

    // In the first word, start looking at the requested offset so that an
    // earlier occurrence of the same letters is not marked by mistake; fall
    // back to a search from the start if nothing is found there.
    var from = (ti === 0) ? Math.max(0, offset - pos) : 0;
    var at = word.indexOf(tWord, from);
    if (at === -1 && from > 0) {
      at = word.indexOf(tWord);
    }

    if (at === -1) {
      continue; // no match in this span: leave it unmarked and move on
    }

    markWord(wordEl, at, tWord.length);
  }
}
// Look up the element with id "e"; it is handed to mark() as the parent
// element in which marking starts.
var e = document.getElementById("e");

// Do the magic: mark the words of 'irst paragraph Second', starting at
// character offset 1 inside that element.
mark(e, 1, 'irst paragraph Second');

<h1>Some title</h1>
<p id="e">First paragraph</p>
<p>Second paragraph</p>

The only downside of this is that it doesn't mark the spaces — Ferrybig, Feb 02 '16 at 09:21
It can be changed though. If you mark two adjacent words (spans) including the space at the end of the word they WILL appear as one contiguous marked element. — B0Andrew, Feb 02 '16 at 09:35
@B0Andrew +1, This is actually a good idea but wouldn't "split the documents into words" take a long time for larger web pages? — , Feb 09 '16 at 04:59
This is so buggy, it will only work in this little situation. Wrap "First paragraph" e.g. with `` (https://jsfiddle.net/2c91f8rs/) or search with a blank at the end (https://jsfiddle.net/2c91f8rs/1/) will throw errors. I could list a dozens of situations where this throws errors. Using the plugin recommended from @Anamika Shrivastava works and on top gives you the opportunity to customize element name, class name, mark also diacritics, use synonyms, etc. — user3631654, Mar 26 '16 at 13:51
@user3631654 One of the hypotheses was that "The parent element of the start of the **string** to be searched is known" not "any parent". Of course, if you wrap the start of the searched string with another element the function will not work. Call it a bug, fine. But don't forget this is SO. You get answers for specific problems not for every problem that could possibly arise. — B0Andrew, Mar 28 '16 at 06:59

Marking text in a html document

2 Answers2

Linked