This post does not explain why this happens;
it only provides a workaround for converting the HTML entities back.
In short
If you want to output the (whole document's) outerHTML
as the original text -- with the HTML entities properly escaped --
use the function encode_HtmlEntity_in_TagAttr
below.
Pre-note
It seems impossible to revert an HTML entity in a tag attribute back to its original escaped form
-- as long as you are modifying the actual attribute inside the DOM.
E.g.:
// Demo: writing an escaped string into an attribute does NOT restore the original
// markup, because the serializer escapes the `&` of whatever value is stored.
// Requires jQuery (`$`) and lodash (`_`) to be in scope.
// NOTE(review): the outputs shown assume a serializer that escapes only `&` and `"`
// inside attribute values; an engine that also escapes `<` / `>` prints `&gt;` where
// a bare `>` is shown below -- confirm against the engine you run this in.
const elt_AA = $(/* html */ `<span data-foo="Time &gt; 0">Pick_this_span</span>`)[0];
const attr = elt_AA.getAttributeNode('data-foo');
console.log(attr.value); // Time > 0   (the parser decoded the character reference)
console.log(elt_AA.outerHTML); // <span data-foo="Time > 0">Pick_this_span</span>

attr.value = _.escape(attr.value); // same as elt_AA.setAttribute('data-foo', _.escape(attr.value))
console.log(attr.value); // Time &gt; 0
console.log(elt_AA.outerHTML); // <span data-foo="Time &amp;gt; 0">Pick_this_span</span>
// << watch this
// The value looks escaped, but it is not -- check the outerHTML:
// the attribute is now double escaped, which is still not the original attribute text.

// More tests
attr.value = '&gt;'; // the literal 4-character string, not a decoded ">"
console.log(attr.value); // &gt;
console.log(elt_AA.outerHTML); // <span data-foo="&amp;gt;">Pick_this_span</span>

attr.value = '>';
console.log(attr.value); // >
console.log(elt_AA.outerHTML); // <span data-foo=">">Pick_this_span</span>

attr.value = '&';
console.log(attr.value); // &
console.log(elt_AA.outerHTML); // <span data-foo="&amp;">Pick_this_span</span>
Solution / workaround
As said:
it seems impossible to revert an HTML entity in a tag attribute back to its original escaped form
-- as long as you are modifying the actual attribute inside the DOM.
So we don't do that. Instead:
1. We replace each special character inside the tag attribute with something else (a custom placeholder).
2. We turn those placeholders "back into the original escaped form" on the serialized outerHTML string -- using a regex.
Notes:
The whole point is the logic of encode_HtmlEntity_in_TagAttr.
The logic is as stated above -- it is not pretty -- it really is just a workaround.
It escapes every special character in tag attributes
(-- which is more than just restoring the original, if some of your original characters were not escaped).
You can't run the test code below directly -- you need to import and set up some things first.
The code is tested, but I am still not sure whether the code I posted has bugs (mostly because of the JSDOM settings).
Code (+ test case demo):
// Round-trip test for encode_HtmlEntity_in_TagAttr: parse a fixture whose attribute
// values use character references, then check that the helper serialises <body>
// with those characters written as references again.
// Relies on names provided elsewhere: `it`, `expect`, `fs`, `JSDOM`, `pathStr_tmpFile_1`.
it('~test Tag Attribut contains Html Entity', async function () {
  // Both sides of every comparison are whitespace-normalised the same way:
  // every run of two or more whitespace characters is removed.
  const stripWhitespaceRuns = (text) => text.replaceAll(/\s{2,}/g, '');

  // #>>> Fixture. The special characters inside `title` are written as character
  // references so that the markup is well formed; the apostrophes are left raw on
  // purpose (the helper escapes them as well).
  const html_HtaHe_ori = /* html */ `
    <!DOCTYPE html>
    <html lang="en">
      <head>
        <title>Empty_Title</title>
      </head>
      <body>
        <div class="sect3" title="MIN &gt; 0, TIME == 0 (blocking read)"> AAA </div>
        <div class="sect3" title="MIN &lt;&gt;&gt;A&gt; &gt; &quot;TT&quot; &amp; 0'''''sssss "> BBB </div>
      </body>
    </html>
  `;

  // #>>> Parse the fixture; the parser decodes the character references.
  fs.writeFileSync(pathStr_tmpFile_1, html_HtaHe_ori);
  // NOTE(review): `dom` is assigned here without a declaration inside this block --
  // presumably it is declared by the enclosing suite; confirm.
  dom = await JSDOM.fromFile(pathStr_tmpFile_1, {
    contentType: 'text/html; charset="utf-8"',
  });
  const document = dom.window.document;
  const html_HtaHe_BrowserAutoUnesc = document.body.outerHTML; // serialised as-is, entities decoded

  // #>>> Escape HtmlEntity back
  /**
   * Serialises `doc.body` with the characters & < > " ' ` inside attribute values
   * written as the named references &amp; &lt; &gt; &quot; &apos; &grave;.
   *
   * Writing escaped text straight into an attribute would be escaped a second time
   * on serialisation, so the work is done in two steps:
   *   1. each special character in an attribute value is swapped for an alphanumeric
   *      placeholder (`BAUEHTAttr` + entity name + timestamp);
   *   2. in the serialised string the placeholders are replaced with `&name;`.
   * The original attribute values are written back before returning, so the DOM is
   * not left holding placeholders.
   *
   * Every special character is escaped, including ones that were not escaped in the
   * original markup. Attributes of all elements in `doc` are scanned, but only
   * `doc.body.outerHTML` is returned.
   *
   * @param {Document} [doc=document] Document to serialise; defaults to the
   *   `document` in scope.
   * @returns {string} Serialised `<body>` markup with escaped attribute values.
   */
  function encode_HtmlEntity_in_TagAttr(doc = document) {
    const entityNameByChar = {
      '&': 'amp',
      '>': 'gt',
      '<': 'lt',
      '"': 'quot',
      "'": 'apos',
      '`': 'grave',
    };
    const idNow = Date.now();
    const idDeli = 'BAUEHTAttr';
    // [attribute node, original value] pairs, used to undo the temporary edits.
    const touched = [];

    try {
      for (const elt of doc.querySelectorAll('*')) {
        // Copy the attribute list so the iteration is over a plain array.
        for (const attr_curr of [...elt.attributes]) {
          const original = attr_curr.value;
          const masked = original.replace(
            /[&<>"'`]/g,
            (ch) => `${idDeli}${entityNameByChar[ch]}${idNow}`,
          );
          if (masked !== original) {
            touched.push([attr_curr, original]);
            attr_curr.value = masked; // a single DOM write per attribute
          }
        }
      }

      // Placeholders are purely alphanumeric, so the serializer leaves them alone.
      const html_HtaHe_EscapeCustomly = doc.body.outerHTML;
      const placeholderPattern = new RegExp(
        `${idDeli}(?<main>amp|gt|lt|quot|apos|grave)${idNow}`,
        'g',
      );
      return html_HtaHe_EscapeCustomly.replaceAll(placeholderPattern, '&$<main>;');
    } finally {
      // Put the real values back even if serialisation failed part-way.
      for (const [attr_curr, original] of touched) {
        attr_curr.value = original;
      }
    }
  }

  const html_HtaHe_encode_HtmlEntity_in_TagAttr = encode_HtmlEntity_in_TagAttr(); // Escape HtmlEntity back

  // #>>> compare result
  // (To inspect the strings by eye, dump them with fs.writeFileSync.)

  // Plain serialisation.
  // NOTE(review): this expectation assumes the serializer escapes only `&` and `"`
  // inside attribute values; a serializer that also escapes `<` / `>` will produce
  // `&lt;` / `&gt;` here -- verify against the jsdom version in use.
  const html_HtaHe_BrowserAutoUnesc_Result = /* html */ `
    <body>
      <div class="sect3" title="MIN > 0, TIME == 0 (blocking read)"> AAA </div>
      <div class="sect3" title="MIN <>>A> > &quot;TT&quot; &amp; 0'''''sssss "> BBB </div>
    </body>
  `;
  expect(stripWhitespaceRuns(html_HtaHe_BrowserAutoUnesc)).toEqual(
    stripWhitespaceRuns(html_HtaHe_BrowserAutoUnesc_Result),
  );

  // Serialisation through the helper: every special character is a named reference.
  const html_HtaHe_encode_HtmlEntity_in_TagAttr_ExpectedResult = /* html */ `
    <body>
      <div class="sect3" title="MIN &gt; 0, TIME == 0 (blocking read)"> AAA </div>
      <div class="sect3" title="MIN &lt;&gt;&gt;A&gt; &gt; &quot;TT&quot; &amp; 0&apos;&apos;&apos;&apos;&apos;sssss "> BBB </div>
    </body>
  `;
  expect(stripWhitespaceRuns(html_HtaHe_encode_HtmlEntity_in_TagAttr)).toEqual(
    stripWhitespaceRuns(html_HtaHe_encode_HtmlEntity_in_TagAttr_ExpectedResult),
  );
});