I have a 2.2mb html file, pure trash generated by acrobat. I need to span every word that is in it. But I keep getting that the html page starts showing parts of the source code.
Here is a small example:
<p class="s21" style="padding-top: 10pt;padding-left: 31pt;text-indent: 0pt;text-align: left;">CONTINGENCY TIMEL
INES.. • • • • • •• • • • • • • • • • • •• • • • • • ••• • •• • • • • •• • • • • •• • •<span class="s25">
</span><span class="s26"> </span>4-<span class="s27">1</span></p>
.word:hover {
background-color: rgba(0,0,0,0.1);
}
const walkDOM = function (node, func) {
func(node);
node = node.firstChild;
while(node) {
walkDOM(node, func);
node = node.nextSibling;
if (node && node.nextSibling == undefined) {
// console.log(node.innerHTML);
}
}
};
walkDOM(document.body, function(node) {
if (node.nodeName == '#text') {
let pnode = node.parentElement;
pnode.innerHTML = pnode.innerHTML.replace(/(^|<\/?[^>]+>|\s+)([^\s<]+)/g, '$1<span class="word">$2</span>');
}
});
https://codepen.io/clankill3r/pen/rNaNmxE
Outputs:
• • ••• • •• • • • • •• • • • • •• • •class="s25"> class="s26"> 4-1
Is there any way of spanning each individual word without having to much pain of avoiding html tags?