7

I have a document fetched by a $.get call; it's a big, bloated HTML document. I need to use jQuery to grab an element from it.

I'm trying this (in coffeescript):

# Request `url`; the callback receives the response as `data`.
$.get url, (data) ->
  # Wrap the response in a jQuery object and look for a <title> among its descendants.
  title = $(data).find('title').text()

This doesn't work. In browser console I've whittled this down to $(document.documentElement.outerHTML).find('title') where document.documentElement.outerHTML gives a string of the document.

I've tried jQuery.parseHTML, with the same result.

wjdp
  • 1,468
  • 19
  • 36

2 Answers

12

The reason why it does not work is because jQuery expects a DOM node to find the 'title' tags. As you noted, you need to parse the html text first.

From here and here, the solution is to parse the string and append it to a temporary div (or other element):

// Create a detached wrapper <div>, then attach the nodes parsed from `str`
// so that they become descendants of the wrapper.
var tempDom = $('<div></div>');
tempDom.append($.parseHTML(str));

Then, you can manipulate tempDom to find elements.

Working demo: http://codepen.io/anon/pen/wKwLMP

Community
  • 1
  • 1
Miguel Jiménez
  • 1,276
  • 1
  • 16
  • 24
10

TL;DR ... use the DOMParser API

var htmlString = "<html><head><title>Name</title></head><body><div class='content'>Hello</div></body></html>";
// Use the "text/html" MIME type so the string goes through the HTML parser.
// "text/xml" selects the XML parser instead, which requires well-formed XML
// and produces an error document for markup that is not.
var htmlDoc = (new DOMParser()).parseFromString(htmlString, "text/html");

Unfortunately, the current answers miss a lot of edge cases

You should not use $.parseHTML(htmlString), as it's immediately lossy. If we check the source code of $.parseHTML, it calls buildFragment, which creates a temporary DOM element and sets its innerHTML property.

innerHTML Parsing

Element.innerHTML provides an API for:

And here's the spec for Html Fragment Parsing Algorithm

Taking a sample string, here's the result of trying various HTML Parsing approaches:

// Sample document fed to every parsing strategy exercised below.
var htmlString = "<html><head><title>Name</title></head><body><div class='content'>Hello</div></body></html>";

/**
 * Parses `htmlString` with five different strategies and, for each one,
 * reports the resulting DOM, its serialized markup, and the text found for
 * the <title> element and the `.content` element via `console.LogOutput`.
 *
 * Expects jQuery (`$`), the browser globals `document` and `DOMParser`, and
 * `console.LogOutput` to be available at call time.
 */
function ParseHtmlTests() {

  /*** $.parseHTML ***/
  // Strategy 1: let jQuery turn the string into an array of nodes.
  var $parseHtml = $.parseHTML(htmlString)

  console.LogOutput(
    '1. $.parseHTML',
    $parseHtml,
    $parseHtml.map(function(el, i) { return el.outerHTML }),
    $($parseHtml).find("title").text(),
    $($parseHtml).find(".content").text()
  )


  /*** tempDiv.innerHTML ***/
  // Strategy 2: assign the string to the innerHTML of a detached <div>.
  var tempDiv = document.createElement("div")
  tempDiv.innerHTML = htmlString

  console.LogOutput(
    '2. tempDiv.innerHTML',
    tempDiv,
    tempDiv.outerHTML,
    $(tempDiv).find("title").text(),
    $(tempDiv).find(".content").text()
  )


  /*** divAppendContents ***/
  // Strategy 3: append the raw string to a jQuery-created <div>.
  var $divAppendContents = $('<div></div>').append(htmlString)

  console.LogOutput(
    '3. divAppendContents',
    $divAppendContents,
    $divAppendContents.html(),
    $divAppendContents.find("title").text(),
    $divAppendContents.find(".content").text()
  )


  /*** tempHtml.innerHTML ***/
  // Strategy 4: assign the string to the innerHTML of a detached <html> element.
  var tmpHtml = document.createElement( 'html' );
  tmpHtml.innerHTML = htmlString;

  console.LogOutput(
    '4. tempHtml.innerHTML',
    tmpHtml,
    tmpHtml.outerHTML,
    tmpHtml.getElementsByTagName('title')[0].innerText,
    tmpHtml.getElementsByClassName('content')[0].innerText
  )


  /*** DOMParser.parseFromString ***/
  // Strategy 5: parse into a separate document. The MIME type must be
  // "text/html" so the HTML parser is used; "text/xml" would select the XML
  // parser, which rejects markup that is not well-formed XML.
  var htmlDoc = (new DOMParser()).parseFromString(htmlString, "text/html");

  console.LogOutput(
    '5. DOMParser.parseFromString',
    htmlDoc,
    htmlDoc.documentElement.outerHTML,
    htmlDoc.documentElement.getElementsByTagName('title')[0].innerHTML,
    htmlDoc.documentElement.getElementsByClassName('content')[0].innerHTML
  )
}

/*** Console helpers: grouping fallbacks plus a labelled report logger ***/

// Keep the native console.group when present; otherwise print the label as a plain line.
console.group = console.group ? console.group : function(label) {
  console.log(label)
}

// Keep the native console.groupEnd when present; otherwise print a separator line.
console.groupEnd = console.groupEnd ? console.groupEnd : function(unused) {
  console.log("----------------------------")
}

/**
 * Logs one parsing result as a group titled `method`, with one labelled
 * line each for the DOM, the HTML, the title text and the content text.
 */
console.LogOutput = function(method, dom, html, title, content) {
  var rows = [
    ["DOM:", dom],
    ["HTML:", html],
    ["Title:", title],
    ["Content:", content]
  ]

  console.group(method);
  rows.forEach(function(row) {
    console.log(row[0], row[1])
  })
  console.groupEnd();
};

/*** Execute Script ***/
ParseHtmlTests()
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.js"></script>

And here's the output from the above script in chrome:

Output

The best approach seems to be creating an HTML root element by setting the innerHTML of a temporary <html> element, or using the DOMParser API

Further Reading:

Community
  • 1
  • 1
KyleMit
  • 30,350
  • 66
  • 462
  • 664