Here's a solution similar to the HTML DOM answer. If your HTML is valid, you could try to parse it as XML. The advantage is that, whereas the InternetExplorer.Application
COM object loads an entire, fully bloated instance of Internet Explorer for each page load, this approach loads only a DLL (msxml3.dll). This should hopefully handle multiple files more efficiently. The downside is that the XML parser is finicky about the validity of your tag structure. If, for example, you have an unordered list where the list items are not closed:
<ul>
<li>Item 1
<li>Item 2
</ul>
... a web browser would understand that just fine, but the XML parser will probably raise an error. Anyway, it's worth a shot. I just tested this on a directory of 500 identical HTML files, and it worked through them in less than a minute.
@if (@CodeSection == @Batch) @then
@echo off
setlocal
rem // Batch half of the hybrid: run the JScript half of this same file (%~f0)
rem // once for every *.htm file in the current directory, passing its full path.
for %%F in ("*.htm") do cscript /nologo /e:JScript "%~f0" "%%~fF"
rem // end main runtime
goto :EOF
@end
// end batch / begin JScript chimera
// JScript half of the hybrid. Argument 0 is the path of the HTML file to
// process; the batch half above supplies it.
WSH.StdOut.Write('Checking ' + WSH.Arguments(0) + '... ');

var fso = WSH.CreateObject('scripting.filesystemobject'),
    DOM = WSH.CreateObject('Microsoft.XMLDOM'),
    htmlfile = fso.OpenTextFile(WSH.Arguments(0), 1), // 1 = open for reading
    html,
    head,
    body,
    changed;

// ReadAll() raises a runtime error on an empty stream, so test for that first.
// The handle is closed right away; the file is only reopened if it is rewritten.
html = htmlfile.AtEndOfStream ? '' : htmlfile.ReadAll();
htmlfile.Close();

// Split the document at the closing </head> tag: everything before it is kept
// verbatim, everything after it is the part handed to the XML parser.
html = html.split(/<\/head\b.*?>/i);
if (html.length < 2) {
    // No </head> means there is no body part to work on; skip the file
    // instead of failing on an undefined html[1].
    WSH.Echo('No </head> tag found; skipped.');
    WSH.Quit(0);
}

head = html[0] + '</head>';
// Drop the closing </html> tag; it is appended again when the file is rebuilt.
body = html[1].replace(/<\/html\b.*?>/i, '');
// Attempt to massage the body string into valid XHTML:
//   1. lower-case every tag name (opening and closing tags alike);
//   2. rewrite unclosed void elements such as <br> or <img src="x"> as
//      self-closed tags (<br />, <img src="x" />).
var self_closing_tags = ['area','base','br','col',
    'command','comment','embed','hr','img','input',
    'keygen','link','meta','param','source','track','wbr'];

var unclosed_void_tag = RegExp([ // should match <br>
    '<(',
    '(' + self_closing_tags.join('|') + ')',
    // Word boundary after the tag name. Without it "col" also matched the
    // start of <colgroup>, which was then rewritten to <colgroup /> and left
    // its closing tag unmatched.
    '\\b',
    '([^>]+[^\/])?', // for tags with properties, tag is unclosed
    ')>'
].join(''), 'ig');

body = body.replace(/<\/?\w+/g, function(m) { return m.toLowerCase(); }).replace(
    unclosed_void_tag, "<$1 />"
);
// Turn off asynchronous loading before handing the markup to the parser;
// assigning the flag after loadXML() cannot influence a load that has already run.
DOM.async = false;
DOM.loadXML(body);

if (DOM.parseError.errorCode) {
    // The body is not well-formed XML: report the parser's reason and stop
    // before anything is written back to the file.
    WSH.Echo(DOM.parseError.reason);
    WSH.Quit(0);
}
// For every div in the parsed body, take the first p element that
// getElementsByTagName('p') returns for it and unwrap it: the p's children are
// moved in front of it, then the empty p is removed.
// NOTE(review): only the first p per div is handled on each pass - confirm
// that this is the intended scope.
var divs = DOM.documentElement.getElementsByTagName('div');
for (var idx = 0; idx < divs.length; idx++) {
    var paragraphs = divs[idx].getElementsByTagName('p');
    if (!paragraphs || !paragraphs[0]) {
        continue;
    }
    var para = paragraphs[0],
        parent = para.parentNode;
    // move contents of the p node up to its parent, preserving their order
    while (para.hasChildNodes()) {
        parent.insertBefore(para.firstChild, para);
    }
    // delete the now empty p node
    parent.removeChild(para);
    changed = true;
}
// Rebuild the document: untouched head, serialized (possibly modified) body
// element, and the closing </html> tag that was stripped before parsing.
html = head + DOM.documentElement.xml + '</html>';

if (!changed) {
    WSH.Echo('Nothing to change.');
} else {
    // Overwrite the input file in place (second argument 1 = overwrite).
    var output = fso.CreateTextFile(WSH.Arguments(0), 1);
    output.Write(html);
    output.Close();
    WSH.Echo('Fixed!');
}