You could always get the content of a tag.
From the content, remove the inner tags, then trim the whitespace.
In the example we're using the `div`
tag, but you could also use
any tag with attributes, like the `p`
tag below.
Here is a JS example:
// Demo: extract the content of every <tag>...</tag> element found in an HTML
// string, strip the markup nested inside it, and normalize its whitespace.

// Name of the element whose content is extracted.
var tag = "div";
// var tag = "p"; // <= try this; works with tags with attributes as well

// Matches one complete element: open tag, content, close tag.
//   group 1 - the attribute text of the open tag (when it has any)
//   group 2 - the content between the open and the close tag
//   group 3 - the single unit consumed on each step through the content:
//             either a whole markup construct (tag, comment, CDATA, DOCTYPE,
//             processing instruction, or a script/style-like element together
//             with its body) or, failing that, any one character
//   group 4 - the name of a script/style-like element, so that its own
//             closing tag can be located with a backreference
// Because the content is walked one whole unit at a time, closing-tag text that
// sits inside a CDATA section, an attribute value or a script body is skipped
// instead of ending the match early.
var rxTagContent = new RegExp( "<" + tag + "(?:\\s*>|\\s+(?=((?:\"[\\S\\s]*?\"|'[\\S\\s]*?'|(?:(?!/>)[^>])?)+))\\1>)((?:(?=(<(?:(?:(?:(script|style|object|embed|applet|noframes|noscript|noembed)(?:\\s+(?:\"[\\S\\s]*?\"|'[\\S\\s]*?'|(?:(?!/>)[^>])?)+)?\\s*>)[\\S\\s]*?</\\4\\s*(?=>))|(?:/?[\\w:]+\\s*/?)|(?:[\\w:]+\\s+(?:\"[\\S\\s]*?\"|'[\\S\\s]*?'|[^>]?)+\\s*/?)|\\?[\\S\\s]*?\\?|(?:!(?:(?:DOCTYPE[\\S\\s]*?)|(?:\\[CDATA\\[[\\S\\s]*?\\]\\])|(?:--[\\S\\s]*?--)|(?:ATTLIST[\\S\\s]*?)|(?:ENTITY[\\S\\s]*?)|(?:ELEMENT[\\S\\s]*?))))>|[\\S\\s]))\\3)*?)</" + tag + "\\s*>", "g" );

// Matches any single markup construct (the same alternatives that group 3 of
// rxTagContent recognizes); replacing every match with "" leaves only the text.
var rxRmvInnerTags =
/<(?:(?:(?:(script|style|object|embed|applet|noframes|noscript|noembed)(?:\s+(?:"[\S\s]*?"|'[\S\s]*?'|(?:(?!\/>)[^>])?)+)?\s*>)[\S\s]*?<\/\1\s*(?=>))|(?:\/?[\w:]+\s*\/?)|(?:[\w:]+\s+(?:"[\S\s]*?"|'[\S\s]*?'|[^>]?)+\s*\/?)|\?[\S\s]*?\?|(?:!(?:(?:DOCTYPE[\S\s]*?)|(?:\[CDATA\[[\S\s]*?\]\])|(?:--[\S\s]*?--)|(?:ATTLIST[\S\s]*?)|(?:ENTITY[\S\s]*?)|(?:ELEMENT[\S\s]*?))))>/g;

// Matches every run of whitespace so it can be collapsed to a single space.
var rxWspTrim = /\s+/g;

////////////////////////////////////////////////
// Sample input
var html =
"<div>\n" +
" <p class=\"someclass\">\n" +
" Some plain text \n" +
" <strong>\n" +
" and some bold\n" +
" </strong>\n" +
" </p>\n" +
"</div>\n";

var match;
// With the "g" flag exec() resumes from lastIndex, so each pass yields the next
// element; it returns null (and resets lastIndex to 0) when none are left.
while ( ( match = rxTagContent.exec( html ) ) !== null )
{
    var cont = match[2]; // group 2 is content
    var clean = cont.replace( rxRmvInnerTags, "" );
    // Collapsing whitespace runs still leaves one space at each end of the
    // text, so strip those as well to really trim the result.
    var trim = clean.replace( rxWspTrim, " " ).trim();
    console.log ("content = " + cont );
    console.log ("clean and trim = \n" + trim );
}
The expanded, readable version of the constructed Tag Content
regex is shown further below.
Note that this regex and the one that removes the inner tags are
fairly sophisticated. Should you need specific information on
how they work, just let me know. I usually show up every few days,
sometimes every week or two, depending on how many of my comments are
being deleted by an administrator or whoever ...
Update: Modified regex to avoid matching the closing tag text
if it happens to be inside a CDATA or even if it's part of another
tag's value, or even if it's in invisible content like a script.
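For example, the sample input shown further below will be matched correctly.
Note that the only thing missing is the ability to handle nested occurrences of the tag.
JavaScript regular expressions have no recursion or balancing constructs, so that is not
possible with a single regex. Regex can still be used to
find tags and content a piece at a time for a fully custom parse,
but that's a different story.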
As written, this finds the first open tag and the first close tag that follows it.
It can still be modified one step further to find an un-nested
open / close tag pair if needed; a simple added assertion is all that is required.
Also note that this doesn't prevent matching the open tag
if it happens to be inside a CDATA section or one of the other constructs stated above.
This can be avoided, but it requires expanding the tag regex and adding a check within the while() loop to skip past these.
Let me know if you need this (or I may just add it in a
day or so; I don't want this to get too out of control). It is possible, though.
<tag>
Some content
more
and more
<script>
var xyz;
var tag = "</tag>";
</script>
<![CDATA[ </tag> asdfasdf]]>
</tag>
https://regex101.com/r/Bs4ySe/1
# Expanded (free-spacing) form of the tag-content pattern; the literal name
# "tag" stands in for the element being searched for.
#
# Open tag: the name followed either directly by '>' or by an attribute run.
<tag
(?:
# No attributes: optional whitespace, then '>'.
\s* >
| \s+
# Attribute run: captured inside a lookahead as group 1 and then consumed
# through the \1 backreference.
(?=
( # (1 start)
(?:
# One piece of the run: a double-quoted value, a single-quoted value, or a
# single character that is not '>' and does not begin '/>'.
" [\S\s]*? "
| ' [\S\s]*? '
| (?:
(?! /> )
[^>]
)?
)+
) # (1 end)
)
\1 >
)
# Group 2 is the element content. It is built lazily, one unit per step, so the
# walk stops at the first closing tag that is reached on a unit boundary.
( # (2 start)
(?:
# Each step captures one unit in a lookahead as group 3 and consumes it via \3.
(?=
( # (3 start)
# A whole markup construct: a script/style-like element including its body up
# to its own close tag (name held in group 4), an ordinary open/close tag with
# or without attributes, a <? ... ?> instruction, or a <! ... > declaration
# (DOCTYPE, CDATA, comment, ATTLIST, ENTITY, ELEMENT) ...
<(?:(?:(?:(script|style|object|embed|applet|noframes|noscript|noembed)(?:\s+(?:"[\S\s]*?"|'[\S\s]*?'|(?:(?!/>)[^>])?)+)?\s*>)[\S\s]*?</\4\s*(?=>))|(?:/?[\w:]+\s*/?)|(?:[\w:]+\s+(?:"[\S\s]*?"|'[\S\s]*?'|[^>]?)+\s*/?)|\?[\S\s]*?\?|(?:!(?:(?:DOCTYPE[\S\s]*?)|(?:\[CDATA\[[\S\s]*?\]\])|(?:--[\S\s]*?--)|(?:ATTLIST[\S\s]*?)|(?:ENTITY[\S\s]*?)|(?:ELEMENT[\S\s]*?))))>
# ... or, when no construct starts here, any single character.
| [\S\s]
) # (3 end)
)
\3
)*?
) # (2 end)
# Close tag of the element being searched for.
</tag \s* >