I'm using the Node.js dom-parser package, which (less than ideally) extracts tags from the HTML using regular expressions.
You can find dom-parser at: https://github.com/ershov-konst/dom-parser
Occasionally, the HTML of some web pages (e.g. https://www.ecosia.org/) causes the Node.js app to hang. I've tested with a plain vanilla matching script and found that tagRegExp is what causes the script to hang (perhaps because of catastrophic backtracking?).
I'm actually using it to find the <link rel="canonical"> tag and <a href="xyz"> tags (if any; ecosia has no canonical link).
tagRegExp:
/(<\/?[a-z][a-z0-9]*(?::[a-z][a-z0-9]*)?\s*(?:\s+[a-z0-9-_]+=(?:(?:'[\s\S]*?')|(?:"[\s\S]*?")))*\s*\/?>)|([^<]|<(?![a-z\/]))*/gi
Pure JS test script:
<script type="text/javascript">
// Browser-only reproduction: runs the tag pattern over a page's HTML source
// and writes the input, the pattern and every match into the document.
var text = '... html source ...';

/**
 * Escapes &, < and > so that raw markup is shown as literal text by
 * document.write instead of being parsed as HTML.
 *
 * @param {string} str - Text that may contain markup characters.
 * @returns {string} The text with &, < and > replaced by HTML entities.
 */
function escapeHtml(str) {
  // & is replaced first so the entities produced for < and > are not
  // escaped a second time.
  return str.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}

var text_esc = escapeHtml(text);

// The pattern under test, kept exactly as it is used for matching tags.
// NOTE(review): the quoted attribute values use lazy [\s\S]*? inside a
// repeated group that is itself surrounded by \s* ... \s*. When a tag fails
// to match, a value can be extended past its closing quote to any later
// quote, which is presumably the source of the backtracking blow-up;
// bounding the values with [^']* / [^"]* would likely avoid it - confirm
// against the failing page before changing the pattern.
var regex = /(<\/?[a-z][a-z0-9]*(?::[a-z][a-z0-9]*)?\s*(?:\s+[a-z0-9-_]+=(?:(?:'[\s\S]*?')|(?:"[\s\S]*?")))*\s*\/?>)|([^<]|<(?![a-z\/]))*/gi;

// match() with the g flag returns null when nothing matches; fall back to an
// empty list so the length lookup and the loop below stay safe if the
// pattern is swapped for one that can fail to match.
var found = text.match(regex) || [];
var found_len = found.length;

document.write("Text: " + text_esc + "<br /><br />" + "Regex pattern: " + escapeHtml(String(regex)) + "<br /><br />");
document.write("Matches: " + found_len + "<br /><br />");
for (var i = 0; i < found_len; i++) {
  // Escape each match in place so tags are displayed rather than rendered.
  found[i] = escapeHtml(found[i]);
  document.write("[" + i + "]: " + found[i] + "<br /><br />");
}
</script>
Any ideas most welcome. Thanks in advance.