Rather than using regular expressions, I'd strongly recommend using the DOM API to parse, and remove, HTML elements using a white-list of elements that may be retained:
/**
 * Strips every HTML element that is not white-listed from a string of
 * text/HTML. A removed element's child nodes are kept and moved into its
 * place, so only the tags themselves disappear.
 *
 * @param {Object} [opts] - Overrides for the default settings.
 * @param {string} [opts.html=null] - Text/HTML from which elements are removed.
 * @param {string[]} [opts.allowedHTML=['h2', 'br']] - Elements permitted to
 *   remain; each entry may be written as 'h2' or '<h2>'.
 * @returns {string|undefined} The remaining HTML with leading and trailing
 *   white-space trimmed, or undefined if opts.html is not a non-empty string.
 */
function stripHTML(opts) {
    // The default settings for the function; can be overridden by the user.
    const settings = {
        'html': null,
        'allowedHTML': ['h2', 'br']
    };

    // Copy any user-supplied properties over the defaults; falling back to
    // an empty object-literal avoids an error when no opts are passed.
    Object.keys(opts || {}).forEach((key) => {
        settings[key] = opts[key];
    });

    // Nothing to do unless settings.html is a non-empty String.
    if (!settings.html || typeof settings.html !== 'string') {
        return undefined;
    }

    // Normalise the white-list so that '<h2>' and 'h2' are equivalent by
    // removing all ('g') incidences of '<' or '>'. This normalised list -
    // not the raw settings.allowedHTML - is what elements are tested against.
    const allowedHTML = settings.allowedHTML.map((el) => el.replace(/<|>/g, ''));

    // Assign the String as the innerHTML of a created element so that it
    // is parsed into nodes.
    // NOTE(review): attributes on retained elements are not touched here, and
    // MDN warns about assigning untrusted markup to innerHTML - confirm this
    // is acceptable before using the function on untrusted input.
    const temp = document.createElement('div');
    temp.innerHTML = settings.html;

    // The universal selector ('*') retrieves every descendant element;
    // Array.from() converts the Array-like collection into an Array.
    Array.from(temp.querySelectorAll('*')).forEach((node) => {
        // Elements whose localName is in the white-list are left alone.
        if (allowedHTML.indexOf(node.localName) !== -1) {
            return;
        }
        const parent = node.parentNode;
        // Move each child in front of the disallowed node, then remove the
        // now-empty node from its parent.
        while (node.firstChild) {
            parent.insertBefore(node.firstChild, node);
        }
        parent.removeChild(node);
    });

    return temp.innerHTML.trim();
}
// Demo: call stripHTML() with only the 'html' option, so the default
// white-list applies, and log the result.
{
    const demoOptions = {
        'html': "jQuery is a JavaScript library.<br>And is the most widely-used such library (at this time)"
    };
    console.log(stripHTML(demoOptions));
}
// => jQuery is a JavaScript library.<br>And is the most widely-used such library (at this time)
/**
 * Strips every HTML element that is not white-listed from a string of
 * text/HTML. A removed element's child nodes are kept and moved into its
 * place, so only the tags themselves disappear.
 *
 * @param {Object} [opts] - Overrides for the default settings.
 * @param {string} [opts.html=null] - Text/HTML from which elements are removed.
 * @param {string[]} [opts.allowedHTML=['h2', 'br']] - Elements permitted to
 *   remain; each entry may be written as 'h2' or '<h2>'.
 * @returns {string|undefined} The remaining HTML with leading and trailing
 *   white-space trimmed, or undefined if opts.html is not a non-empty string.
 */
function stripHTML(opts) {
    const settings = {
        'html': null,
        'allowedHTML': ['h2', 'br']
    };

    // Copy any user-supplied properties over the defaults; falling back to
    // an empty object-literal avoids an error when no opts are passed.
    Object.keys(opts || {}).forEach((key) => {
        settings[key] = opts[key];
    });

    // Nothing to do unless settings.html is a non-empty String.
    if (!settings.html || typeof settings.html !== 'string') {
        return undefined;
    }

    // Normalise the white-list so that '<h2>' and 'h2' are equivalent. This
    // normalised list - not the raw settings.allowedHTML - is what elements
    // are tested against.
    const allowedHTML = settings.allowedHTML.map((el) => el.replace(/<|>/g, ''));

    // Assign the String as the innerHTML of a created element so that it
    // is parsed into nodes.
    // NOTE(review): attributes on retained elements are not touched here, and
    // MDN warns about assigning untrusted markup to innerHTML - confirm this
    // is acceptable before using the function on untrusted input.
    const temp = document.createElement('div');
    temp.innerHTML = settings.html;

    Array.from(temp.querySelectorAll('*')).forEach((node) => {
        // Elements whose localName is in the white-list are left alone.
        if (allowedHTML.indexOf(node.localName) !== -1) {
            return;
        }
        const parent = node.parentNode;
        // Move each child in front of the disallowed node, then remove the
        // now-empty node from its parent.
        while (node.firstChild) {
            parent.insertBefore(node.firstChild, node);
        }
        parent.removeChild(node);
    });

    return temp.innerHTML.trim();
}
// Demo: call stripHTML() with only the 'html' option and log the result.
{
    const demoOptions = {
        'html': "jQuery is a JavaScript library.<br>And is the most widely-used such library (at this time). "
    };
    console.log(stripHTML(demoOptions));
}
JS Fiddle demo.
The above allows for an empty array of `allowedHTML`, which causes the function to remove all HTML tags (from somewhat limited testing):
// Demo: pass an empty 'allowedHTML' Array alongside the 'html' String and
// log the result.
{
    const demoOptions = {
        'html': "jQuery is a JavaScript library.<br>And is the most widely-used such library (at this time). ",
        'allowedHTML': []
    };
    console.log(stripHTML(demoOptions));
}
// => jQuery is a JavaScript library.And is the most widely-used such library (at this time).
/**
 * Strips every HTML element that is not white-listed from a string of
 * text/HTML. A removed element's child nodes are kept and moved into its
 * place, so only the tags themselves disappear.
 *
 * @param {Object} [opts] - Overrides for the default settings.
 * @param {string} [opts.html=null] - Text/HTML from which elements are removed.
 * @param {string[]} [opts.allowedHTML=['h2', 'br']] - Elements permitted to
 *   remain; each entry may be written as 'h2' or '<h2>'. An empty Array
 *   removes every element.
 * @returns {string|undefined} The remaining HTML with leading and trailing
 *   white-space trimmed, or undefined if opts.html is not a non-empty string.
 */
function stripHTML(opts) {
    const settings = {
        'html': null,
        'allowedHTML': ['h2', 'br']
    };

    // Copy any user-supplied properties over the defaults; falling back to
    // an empty object-literal avoids an error when no opts are passed.
    Object.keys(opts || {}).forEach((key) => {
        settings[key] = opts[key];
    });

    // Nothing to do unless settings.html is a non-empty String.
    if (!settings.html || typeof settings.html !== 'string') {
        return undefined;
    }

    // Normalise the white-list so that '<h2>' and 'h2' are equivalent. This
    // normalised list - not the raw settings.allowedHTML - is what elements
    // are tested against.
    const allowedHTML = settings.allowedHTML.map((el) => el.replace(/<|>/g, ''));

    // Assign the String as the innerHTML of a created element so that it
    // is parsed into nodes.
    // NOTE(review): attributes on retained elements are not touched here, and
    // MDN warns about assigning untrusted markup to innerHTML - confirm this
    // is acceptable before using the function on untrusted input.
    const temp = document.createElement('div');
    temp.innerHTML = settings.html;

    Array.from(temp.querySelectorAll('*')).forEach((node) => {
        // Elements whose localName is in the white-list are left alone.
        if (allowedHTML.indexOf(node.localName) !== -1) {
            return;
        }
        const parent = node.parentNode;
        // Move each child in front of the disallowed node, then remove the
        // now-empty node from its parent.
        while (node.firstChild) {
            parent.insertBefore(node.firstChild, node);
        }
        parent.removeChild(node);
    });

    return temp.innerHTML.trim();
}
// Demo: pass an empty 'allowedHTML' Array alongside the 'html' String and
// log the result.
{
    const demoOptions = {
        'html': "jQuery is a JavaScript library.<br>And is the most widely-used such library (at this time).",
        'allowedHTML': []
    };
    console.log(stripHTML(demoOptions));
}
JS Fiddle demo.
It seems to cope reliably – insofar as any browser is capable of coping – with invalid HTML, such as unopened elements or elements which 'overlap' (the closing tag for the first-opened element appears before the closing tag for the second-opened element):
// Demo: feed stripHTML() markup whose <div> and <h1> tags overlap and log
// the result.
{
    const demoOptions = {
        'html': "<div><h1>jQuery</div> is a JavaScript library.</h1><br>And is the most widely-used such library (at this time). "
    };
    console.log(stripHTML(demoOptions));
}
// => jQuery is a JavaScript library.<br>And is the most widely-used such library (at this time).
/**
 * Strips every HTML element that is not white-listed from a string of
 * text/HTML. A removed element's child nodes are kept and moved into its
 * place, so only the tags themselves disappear.
 *
 * @param {Object} [opts] - Overrides for the default settings.
 * @param {string} [opts.html=null] - Text/HTML from which elements are removed.
 * @param {string[]} [opts.allowedHTML=['h2', 'br']] - Elements permitted to
 *   remain; each entry may be written as 'h2' or '<h2>'.
 * @returns {string|undefined} The remaining HTML with leading and trailing
 *   white-space trimmed, or undefined if opts.html is not a non-empty string.
 */
function stripHTML(opts) {
    const settings = {
        'html': null,
        'allowedHTML': ['h2', 'br']
    };

    // Copy any user-supplied properties over the defaults; falling back to
    // an empty object-literal avoids an error when no opts are passed.
    Object.keys(opts || {}).forEach((key) => {
        settings[key] = opts[key];
    });

    // Nothing to do unless settings.html is a non-empty String.
    if (!settings.html || typeof settings.html !== 'string') {
        return undefined;
    }

    // Normalise the white-list so that '<h2>' and 'h2' are equivalent. This
    // normalised list - not the raw settings.allowedHTML - is what elements
    // are tested against.
    const allowedHTML = settings.allowedHTML.map((el) => el.replace(/<|>/g, ''));

    // Assign the String as the innerHTML of a created element so that it
    // is parsed into nodes.
    // NOTE(review): attributes on retained elements are not touched here, and
    // MDN warns about assigning untrusted markup to innerHTML - confirm this
    // is acceptable before using the function on untrusted input.
    const temp = document.createElement('div');
    temp.innerHTML = settings.html;

    Array.from(temp.querySelectorAll('*')).forEach((node) => {
        // Elements whose localName is in the white-list are left alone.
        if (allowedHTML.indexOf(node.localName) !== -1) {
            return;
        }
        const parent = node.parentNode;
        // Move each child in front of the disallowed node, then remove the
        // now-empty node from its parent.
        while (node.firstChild) {
            parent.insertBefore(node.firstChild, node);
        }
        parent.removeChild(node);
    });

    return temp.innerHTML.trim();
}
// Demo: feed stripHTML() markup whose <div> and <h1> tags overlap and log
// the result.
{
    const demoOptions = {
        'html': "<div><h1>jQuery</div> is a JavaScript library.</h1><br>And is the most widely-used such library (at this time). "
    };
    console.log(stripHTML(demoOptions));
}
JS Fiddle demo.
It also seems to manage with (ridiculous) nesting:
// Demo: feed stripHTML() deeply nested markup and log the result.
{
    const demoOptions = {
        'html': "<div>jQuery <h1>is <br>a <span><strong><em><span>JavaScript</span></em> library</strong></span>.</span><br>And is the most widely-used such library (at this time).</h1></div> "
    };
    console.log(stripHTML(demoOptions));
}
/**
 * Removes all elements that are not white-listed from a string of
 * text/HTML, leaving the removed elements' child nodes in their place.
 *
 * @param {Object} [opts] - Overrides for the defaults.
 * @param {string} [opts.html=null] - The text/HTML to process.
 * @param {string[]} [opts.allowedHTML=['h2', 'br']] - Elements that may
 *   remain, written either as 'h2' or as '<h2>'.
 * @returns {string|undefined} The trimmed result, or undefined when
 *   opts.html is not a non-empty string.
 */
function stripHTML(opts) {
    var settings = {
        'html': null,
        'allowedHTML': ['h2', 'br']
    };
    var container = document.createElement('div');
    var overrides = opts || {};
    var permitted;
    var elements;

    // Overlay the caller's properties on top of the defaults.
    for (const name of Object.keys(overrides)) {
        settings[name] = overrides[name];
    }

    // Only a non-empty String is processed.
    if (typeof settings.html !== 'string' || !settings.html) {
        return;
    }

    // Have the markup parsed, then collect every descendant element.
    container.innerHTML = settings.html;
    elements = Array.from(container.querySelectorAll('*'));

    // Drop any '<' / '>' characters from the white-list entries.
    permitted = settings.allowedHTML.map(function(entry) {
        return entry.replace(/<|>/g, '');
    });

    for (const element of elements) {
        const holder = element.parentNode;
        if (permitted.indexOf(element.localName) >= 0) {
            continue;
        }
        // Unwrap: lift the children out in order, then discard the shell.
        while (element.firstChild) {
            holder.insertBefore(element.firstChild, element);
        }
        holder.removeChild(element);
    }

    return container.innerHTML.trim();
}
// Demo: feed stripHTML() deeply nested markup and log the result.
{
    const demoOptions = {
        'html': "<div>jQuery <h1>is <br>a <span><strong><em><span>JavaScript</span></em> library</strong></span>.</span><br>And is the most widely-used such library (at this time).</h1></div> "
    };
    console.log(stripHTML(demoOptions));
}
JS Fiddle demo.
I cannot, though, guarantee that this works, will work, or is able to work, against people inserting `<script>` elements in the `stripHTML` function's `html` string, such as:
// Demo: pass a <script> element to stripHTML() and log the result. The
// closing tag is assembled from two String pieces.
{
    const demoOptions = {
        'html': "<script>alert('Will this work?'); console.log('Maybe not?');</" + "script>"
    };
    console.log(stripHTML(demoOptions));
}
// => alert('Will this work?'); console.log('Maybe not?');
// The script doesn't execute in my (again: limited) testing, and
// there's no evaluation (eval()) of the inserted, or resulting,
// string, so it should be safe. This is not a guarantee, so
// please: test your edge cases.
/**
 * Strips every HTML element that is not white-listed from a string of
 * text/HTML. A removed element's child nodes are kept and moved into its
 * place, so only the tags themselves disappear.
 *
 * @param {Object} [opts] - Overrides for the default settings.
 * @param {string} [opts.html=null] - Text/HTML from which elements are removed.
 * @param {string[]} [opts.allowedHTML=['h2', 'br']] - Elements permitted to
 *   remain; each entry may be written as 'h2' or '<h2>'.
 * @returns {string|undefined} The remaining HTML with leading and trailing
 *   white-space trimmed, or undefined if opts.html is not a non-empty string.
 */
function stripHTML(opts) {
    const settings = {
        'html': null,
        'allowedHTML': ['h2', 'br']
    };

    // Copy any user-supplied properties over the defaults; falling back to
    // an empty object-literal avoids an error when no opts are passed.
    Object.keys(opts || {}).forEach((key) => {
        settings[key] = opts[key];
    });

    // Nothing to do unless settings.html is a non-empty String.
    if (!settings.html || typeof settings.html !== 'string') {
        return undefined;
    }

    // Normalise the white-list so that '<h2>' and 'h2' are equivalent. This
    // normalised list - not the raw settings.allowedHTML - is what elements
    // are tested against.
    const allowedHTML = settings.allowedHTML.map((el) => el.replace(/<|>/g, ''));

    // Assign the String as the innerHTML of a created element so that it
    // is parsed into nodes.
    // NOTE(review): attributes on retained elements are not touched here, and
    // MDN warns about assigning untrusted markup to innerHTML - confirm this
    // is acceptable before using the function on untrusted input.
    const temp = document.createElement('div');
    temp.innerHTML = settings.html;

    Array.from(temp.querySelectorAll('*')).forEach((node) => {
        // Elements whose localName is in the white-list are left alone.
        if (allowedHTML.indexOf(node.localName) !== -1) {
            return;
        }
        const parent = node.parentNode;
        // Move each child in front of the disallowed node, then remove the
        // now-empty node from its parent.
        while (node.firstChild) {
            parent.insertBefore(node.firstChild, node);
        }
        parent.removeChild(node);
    });

    return temp.innerHTML.trim();
}
// Demo: pass a <script> element to stripHTML() and log the result. The
// closing tag is assembled from two String pieces.
{
    const demoOptions = {
        'html': "<script>alert('Will this work?'); console.log('Maybe not?');</"+"script>"
    };
    console.log(stripHTML(demoOptions));
}
JS Fiddle demo.
References: