@Kian's answer mentions using a lexer, but in terms of algorithms I think you'll want to use recursion. HTML is after all a recursive structure:
<div>
<div>
<div>
</div>
</div>
</div>
Here is a naive JS example, although it is not a complete implementation. (I've included no support for `<empty />` elements, for `<!-- comments -->`, for `&entities;`, or for `xmlns:namespaces`. Writing a full-fledged HTML or XML parser is a huge undertaking, so don't take it lightly.)
This solution notably skips over the process of lexical analysis, but I've deliberately omitted that to contrast my answer with @Kian's.
// Sample document fed to the parser: one array entry per line of markup,
// joined with newlines (no trailing newline after the closing tag).
var markup = [
    "<!DOCTYPE html>",
    "<html>",
    " <head>",
    " <title>Example Input Markup</title>",
    " </head>",
    " <body>",
    " <p id=\"msg\">",
    " Hello World!",
    " </p>",
    " </body>",
    "</html>"
].join("\n");

// Function declarations are hoisted, so the parser can be invoked before
// its definition appears in the file.
parseHtmlDocument(markup);
// Function definitions
/**
 * Entry point: parses a whole document, logging a trace of what was found.
 *
 * The document is expected to consist of a DOCTYPE declaration followed by
 * a single root element. Anything left over after the root element is not
 * inspected, and nothing is returned.
 *
 * @param {string} markup - The complete document source.
 */
function parseHtmlDocument(markup) {
    console.log("BEGIN DOCUMENT");
    var afterDoctype = parseDoctypeDeclaration(markup);
    parseElement(afterDoctype);
    console.log("END DOCUMENT");
}
/**
 * Consumes a leading DOCTYPE declaration and any whitespace that follows it.
 *
 * The declaration is treated as optional: when the markup does not start
 * with one, the input is returned unchanged and nothing is logged.
 * Declarations containing a nested ">" (e.g. an internal DTD subset) are
 * not supported.
 *
 * @param {string} markup - Remaining unparsed markup.
 * @returns {string} The markup following the declaration.
 */
function parseDoctypeDeclaration(markup) {
    // [^>]* instead of .* so the match stops at the declaration's own ">"
    // rather than greedily running to the last ">" on the same line, which
    // would swallow the rest of a single-line document.
    var regEx = /^<!DOCTYPE\s[^>]*>\s*/i;
    var matches = regEx.exec(markup);
    if (!matches) {
        return markup;
    }
    console.log("DOCTYPE DECLARATION");
    return markup.substring(matches[0].length);
}
/**
 * Parses one element: start tag, attribute list, child nodes and end tag.
 * Logs "BEGIN ELEMENT: <name>" after reading the tag name and
 * "END ELEMENT: <name>" after consuming the end tag.
 *
 * Empty-element tags (<br />) are not handled; every element must have an
 * explicit end tag whose name matches the start tag exactly (case-sensitive).
 *
 * @param {string} markup - Remaining markup, expected to begin with "<name".
 * @returns {string} The markup that follows the element's end tag.
 * @throws {SyntaxError} If the start tag, the ">" closing it, or the
 *     matching end tag is missing.
 */
function parseElement(markup) {
    // Builds a parse error that quotes the beginning of the offending input.
    function unexpected(expected, rest) {
        var found = rest
            ? "\"" + String(rest).substring(0, 30) + "\""
            : "end of input";
        return new SyntaxError("Expected " + expected + " but found " + found);
    }

    // Require at least one name character, and allow "-", ":" and "." so
    // names such as "my-element" or "svg:rect" are read in full.
    var startMatch = /^<([A-Za-z_:][\w:.-]*)/.exec(markup);
    if (!startMatch) {
        throw unexpected("a start tag", markup);
    }
    var tagName = startMatch[1];
    console.log("BEGIN ELEMENT: " + tagName);
    markup = markup.substring(startMatch[0].length);

    markup = parseAttributeList(markup);

    // Whitespace is permitted between the last attribute and ">".
    var closeBracket = /^\s*>/.exec(markup);
    if (!closeBracket) {
        throw unexpected("\">\" closing the <" + tagName + "> start tag", markup);
    }
    markup = markup.substring(closeBracket[0].length);

    markup = parseNodeList(markup);

    // Compare the end tag's name as a plain string rather than splicing the
    // tag name into a dynamically built regular expression.
    var endMatch = /^<\/([^\s>]+)\s*>/.exec(markup);
    if (!endMatch || endMatch[1] !== tagName) {
        throw unexpected("end tag </" + tagName + ">", markup);
    }
    markup = markup.substring(endMatch[0].length);
    console.log("END ELEMENT: " + tagName);
    return markup;
}
/**
 * Consumes zero or more attributes from the front of the markup, logging
 * "ATTRIBUTE: <name>" for each one.
 *
 * An attribute is whitespace, a name (letters, digits, "_", ":", "." and
 * "-"), "=" with optional surrounding whitespace, and a value in double or
 * single quotes. Unquoted values and value-less (boolean) attributes are
 * not recognised and are left in the returned markup.
 *
 * @param {string} markup - Markup positioned just after a tag name.
 * @returns {string} The markup following the last recognised attribute.
 */
function parseAttributeList(markup) {
    var regEx = /^\s+([\w:.-]+)\s*=\s*(?:"[^"]*"|'[^']*')/;
    var matches;
    while ((matches = regEx.exec(markup)) !== null) {
        console.log("ATTRIBUTE: " + matches[1]);
        markup = markup.substring(matches[0].length);
    }
    return markup;
}
/**
 * Parses a run of sibling nodes (text and nested elements) until it reaches
 * the end tag of the enclosing element or runs out of input.
 *
 * @param {string} markup - Markup positioned just after a start tag's ">".
 * @returns {string} The markup beginning at the enclosing element's "</",
 *     or whatever remains (possibly "") when the input is exhausted. A
 *     string is always returned, never undefined.
 */
function parseNodeList(markup) {
    while (markup) {
        markup = parseTextNode(markup);
        // "</" belongs to the enclosing element: stop here and let the
        // caller consume it. Also stop if the text node used up the input.
        if (!markup || markup.substring(0, 2) === "</") {
            return markup;
        }
        markup = parseElement(markup);
    }
    // Input ran out before an end tag was seen; hand back the (empty)
    // remainder so the caller can report the missing end tag.
    return markup;
}
/**
 * Skips over character data up to (but not including) the next "<".
 *
 * @param {string} markup - Remaining unparsed markup.
 * @returns {string} The markup starting at the next "<", or "" when no
 *     further tag exists and the rest of the input was all text.
 */
function parseTextNode(markup) {
    var tagStart = markup.indexOf("<");
    if (tagStart === -1) {
        // Trailing text with no tag after it: everything is consumed.
        return "";
    }
    return markup.substring(tagStart);
}
Ideally each of these functions would map very closely onto the grammar defined in the XML specification. For example, the specification defines an element
like so:
element ::= EmptyElemTag | STag content ETag
... so ideally we'd want the parseElement()
function to look more like this:
// Illustrative sketch only: shows how parseElement() could mirror the
// grammar production
//     element ::= EmptyElemTag | STag content ETag
// one branch per alternative. nextTokenIsEmptyElemTag, parseEmptyElemTag,
// parseSTag, parseContent and parseETag are not defined in the code shown
// here, and the helpers' return values are not threaded through, so this
// is not runnable as written.
function parseElement(markup) {
if(nextTokenIsEmptyElemTag) { // this kind of logic is where a lexer will help!
// First alternative of the production: EmptyElemTag.
parseEmptyElemTag(markup);
}
else {
// Second alternative: STag, then content, then ETag, in grammar order.
parseSTag(markup);
parseContent(markup);
parseETag(markup);
}
}
... but I've cut some corners in writing my example, so it doesn't reflect the actual grammar as closely as it should.