1

It is easy to extract the text from HTML using the jQuery .text() method...

$("<p>This <b>That</b> Other</p>").text() == "This That Other"

But if there is no whitespace between the words/elements, then the text is concatenated...

$("<p>This <b>That</b><br/>Other</p>").text() == "This ThatOther"
Desired: "This That Other"

$("<div><h1>Title</h1><p>Text</p></div>").text() == "TitleText"
Desired: "Title Text"

Is there any way to get all the text from the HTML (either using .text() or other methods) which would mean that the above examples would come out as desired?

freefaller
  • 19,368
  • 7
  • 57
  • 87

2 Answers

4

You can traverse the DOM tree looking for a node with a nodeType of 3 (text node). When you find one, add it to an array. If you find a non-text node, you can pass it back into the function to keep looking.

/**
 * Collects the text of `element` and all of its descendants, placing a single
 * space between the text of neighbouring nodes.
 *
 * Each text node is trimmed individually and whitespace-only text nodes are
 * skipped, so the result never contains leading, trailing or doubled spaces
 * caused by markup indentation.
 *
 * @param {Node} element - Root node whose descendant text nodes are gathered.
 * @returns {string} The trimmed text fragments joined by single spaces
 *   (an empty string when there is no text).
 */
function innerText(element) {
  // Numeric value of Node.TEXT_NODE; spelled out so the function does not
  // depend on the global `Node` being available.
  const TEXT_NODE = 3;

  // Depth-first walk returning the non-empty text fragments in document order.
  function getTextLoop(node) {
    const texts = [];
    for (const child of Array.from(node.childNodes)) {
      if (child.nodeType === TEXT_NODE) {
        const text = child.textContent.trim();
        // Skip whitespace-only nodes: pushing '' would make join(' ') emit
        // extra spaces around and between the real fragments.
        if (text !== '') {
          texts.push(text);
        }
      } else {
        texts.push(...getTextLoop(child));
      }
    }
    return texts;
  }

  return getTextLoop(element).join(' ');
}

/* EXAMPLES */
// Each sample is parsed into a detached <div> (never attached to the page)
// and the extracted text is logged.

// Inline elements separated by a <br/> with no surrounding whitespace.
const div = Object.assign(document.createElement('div'), {
  innerHTML: `<p>This <b>That</b><br/>Other</p>`,
});
console.log(innerText(div));

// Adjacent block elements with no whitespace between them.
const div2 = Object.assign(document.createElement('div'), {
  innerHTML: `<div><h1>Title</h1><p>Text</p></div>`,
});
console.log(innerText(div2));
KevBot
  • 17,900
  • 5
  • 50
  • 68
0

If you are just worried about br tags, you can replace them with a text node.

// Extracts the text of #text with every <br> treated as a space.
// A deep clone is modified so the element in the page is left untouched.
const elem = document.querySelector("#text");
const clone = elem.cloneNode(true);

// Swap each <br> for a space text node so the words on either side of it
// are not glued together in textContent.
clone.querySelectorAll("br").forEach((br) => {
  br.replaceWith(document.createTextNode(' '));
});

// The `g` flag is required: without it only the FIRST whitespace run is
// collapsed and later newlines/indentation survive in the result.
const cleanedText = clone.textContent.trim().replace(/\s+/g, ' ');
console.log(cleanedText);
<!-- Fixture for the script above: a <br/> directly between two words. -->
<div id="text">
  <p>This <b>That</b><br/>Other</p>
</div>
epascarello
  • 204,599
  • 20
  • 195
  • 236