I want to diff HTML files by structure rather than by content. For example, a and b should be reported as identical by such a diff because their structures are equal.
Does anyone know of a tool (preferably in Python) or an implementation that does this?
I want to diff HTML files by structure rather than by content. For example, a and b should be reported as identical by such a diff because their structures are equal.
Does anyone know of a tool (preferably in Python) or an implementation that does this?
You need to parse the HTML/XML into a DOM tree and then compare those trees. The preferred solution for parsing in Python is the lxml library. For the comparison I am not sure any library exists, but below is some source code to use as a guideline.
Here is one XML comparison function from Ian Bicking (original source, under the Python Software Foundation License: https://bitbucket.org/ianb/formencode/src/tip/formencode/doctest_xml_compare.py?fileviewer=file-view-default#doctest_xml_compare.py-70 )
# Module setup: locate doctest and ElementTree, then define the two names the
# rest of the module relies on (XMLParseError and RealOutputChecker).
try:
    import doctest
    doctest.OutputChecker
except AttributeError:  # Python < 2.4
    import util.doctest24 as doctest
try:
    import xml.etree.ElementTree as ET
except ImportError:
    import elementtree.ElementTree as ET
from xml.parsers.expat import ExpatError

# ElementTree 1.3+ (Python 2.7 and 3.x) reports malformed input with its own
# ET.ParseError, which is not an ExpatError subclass, so catching ExpatError
# alone lets real parse failures escape.  A tuple is accepted by ``except``
# and ``isinstance`` alike; the getattr fallback keeps older ElementTree
# releases (which raise ExpatError directly) working.
XMLParseError = (ExpatError, getattr(ET, 'ParseError', ExpatError))

# Reference to the stock checker, taken before install() can replace
# doctest.OutputChecker; HTMLOutputChecker subclasses and delegates to it.
RealOutputChecker = doctest.OutputChecker
def debug(*msg):
    """Write the given values to standard error as one space-separated line.

    Every positional argument is converted with ``str`` before joining, and a
    newline is appended.  Calling it with no arguments writes an empty line.
    """
    import sys
    # ``print >> sys.stderr, ...`` is a Python 2 statement; on Python 3 the
    # same text is parsed as a shift expression and raises TypeError.  Writing
    # to the stream directly produces identical output on both versions.
    sys.stderr.write(' '.join(map(str, msg)) + '\n')
class HTMLOutputChecker(RealOutputChecker):
    """Doctest output checker that falls back to a structural XML comparison.

    The stock checker is always consulted first.  Only when it rejects the
    output are ``want`` and ``got`` parsed with ``make_xml`` and compared with
    ``xml_compare``.
    """

    def check_output(self, want, got, optionflags):
        """Return True if ``got`` matches ``want`` textually or as XML.

        ``want`` is the expected output, ``got`` the actual output and
        ``optionflags`` the doctest option flags, passed through unchanged to
        the stock checker.  If the stock checker accepts the output, or if
        ``got`` is empty, its verdict is returned as-is.  Otherwise both
        strings must parse as XML and compare equal; a parse failure on
        either side counts as a mismatch.
        """
        normal = RealOutputChecker.check_output(self, want, got, optionflags)
        if normal or not got:
            return normal
        try:
            # ``want`` is parsed first, so an unparsable expectation is
            # rejected before ``got`` is even looked at.
            want_xml = make_xml(want)
            got_xml = make_xml(got)
        except XMLParseError:
            return False
        return xml_compare(want_xml, got_xml)

    def output_difference(self, example, got, optionflags):
        """Return the stock difference message, extended with XML details.

        ``example`` is the doctest example (its ``want`` attribute holds the
        expected output) and ``got`` the actual output.  When the expected
        output does not look like markup (does not start with ``<``) and
        either side fails to parse, the stock message is returned unchanged.
        Otherwise the normalised XML of both sides is appended, followed by a
        difference report whenever both sides parsed successfully.
        """
        actual = RealOutputChecker.output_difference(
            self, example, got, optionflags)
        want_xml = got_xml = None
        try:
            want_xml = make_xml(example.want)
            want_norm = make_string(want_xml)
        except XMLParseError as e:
            if example.want.startswith('<'):
                want_norm = '(bad XML: %s)' % e
            else:
                return actual
        try:
            got_xml = make_xml(got)
            got_norm = make_string(got_xml)
        except XMLParseError as e:
            # The decision is keyed on the *expected* text: if markup was
            # expected, unparsable actual output is still worth reporting.
            if example.want.startswith('<'):
                got_norm = '(bad XML: %s)' % e
            else:
                return actual
        s = '%s\nXML Wanted: %s\nXML Got : %s\n' % (
            actual, want_norm, got_norm)
        # Compare against None explicitly: an Element with no children is
        # falsy, so a plain truth test would skip the report whenever the
        # wrapped fragment contains only text.
        if got_xml is not None and want_xml is not None:
            result = []
            xml_compare(want_xml, got_xml, result.append)
            s += 'Difference report:\n%s\n' % '\n'.join(result)
        return s
def xml_compare(x1, x2, reporter=None):
    """Return True if the element trees ``x1`` and ``x2`` are equivalent.

    Two elements are equivalent when they have the same tag, the same
    attributes with the same values, matching ``text`` and ``tail`` (as judged
    by ``text_compare``) and pairwise-equivalent children in the same order.

    ``reporter``, if given, is called with a one-line description of the
    first difference found; when the difference is inside a child, each
    enclosing level adds its own line as the recursion unwinds.
    """
    if x1.tag != x2.tag:
        if reporter:
            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
        return False
    for name, value in x1.attrib.items():
        if x2.attrib.get(name) != value:
            if reporter:
                reporter('Attributes do not match: %s=%r, %s=%r'
                         % (name, value, name, x2.attrib.get(name)))
            return False
    # The loop above covered every attribute of x1; only extras on x2 remain.
    for name in x2.attrib:
        if name not in x1.attrib:
            if reporter:
                reporter('x2 has an attribute x1 is missing: %s'
                         % name)
            return False
    if not text_compare(x1.text, x2.text):
        if reporter:
            reporter('text: %r != %r' % (x1.text, x2.text))
        return False
    if not text_compare(x1.tail, x2.tail):
        if reporter:
            reporter('tail: %r != %r' % (x1.tail, x2.tail))
        return False
    # Element.getchildren() was removed in Python 3.9; list(element) yields
    # the same direct children on every ElementTree version.
    cl1 = list(x1)
    cl2 = list(x2)
    if len(cl1) != len(cl2):
        if reporter:
            reporter('children length differs, %i != %i'
                     % (len(cl1), len(cl2)))
        return False
    # Child positions are reported 1-based.
    for i, (c1, c2) in enumerate(zip(cl1, cl2), 1):
        if not xml_compare(c1, c2, reporter=reporter):
            if reporter:
                reporter('children %i do not match: %s'
                         % (i, c1.tag))
            return False
    return True
def text_compare(t1, t2):
    """Return True if two text values should be treated as equal.

    Two empty or missing values match each other, a value of ``'*'`` on
    either side matches anything, and otherwise the values are compared with
    surrounding whitespace stripped (a missing value counts as ``''``).
    """
    if not (t1 or t2):
        return True
    if '*' in (t1, t2):
        return True
    first = t1.strip() if t1 else ''
    second = t2.strip() if t2 else ''
    return first == second
def make_xml(s):
    """Parse ``s`` as an XML fragment and return the enclosing element.

    The text is wrapped in a synthetic ``<xml>`` root before parsing, so the
    fragment may hold several top-level elements or plain text.
    """
    wrapped = '<xml>%s</xml>' % s
    return ET.XML(wrapped)
def make_string(xml):
    """Return the serialised content of an ``<xml>``-wrapped element.

    ``xml`` is either an element whose root tag is ``xml`` or a string, which
    is first parsed with ``make_xml``.  The result is the serialised markup
    with the synthetic ``<xml>`` wrapper removed; an empty wrapper yields
    ``''``.

    Raises AssertionError if the serialised form is not enclosed in
    ``<xml>...</xml>``.
    """
    # ``unicode`` does not exist on Python 3.  type(u'') is ``unicode`` on
    # Python 2 and ``str`` on Python 3, so this accepts the same text types
    # as before without a NameError.
    if isinstance(xml, (str, type(u''))):
        xml = make_xml(xml)
    s = ET.tostring(xml)
    # On Python 3 tostring() returns bytes by default.  Its default encoding
    # is us-ascii (other characters become character references), so an ASCII
    # decode is lossless.  On Python 2 the value is already ``str``.
    if not isinstance(s, str):
        s = s.decode('ascii')
    if s == '<xml />':
        return ''
    assert s.startswith('<xml>') and s.endswith('</xml>'), repr(s)
    # Strip the leading '<xml>' (5 chars) and trailing '</xml>' (6 chars).
    return s[5:-6]
def install():
    """Replace ``doctest.OutputChecker`` with ``HTMLOutputChecker``."""
    doctest.OutputChecker = HTMLOutputChecker
Sidenote: <\head>
is not a valid HTML tag and will be interpreted as text. HTML close tags look like this: </head>
As other answerers may tell you, using a library that actually knows what a DOM is is probably the most reliable option if you're comparing well-structured, complete HTML documents or fragments. A simpler solution than using a DOM is to use regex to match HTML tags.
<pre>
or <textarea>
elements.</head>
, while DOM/parsing libraries might complain that a <head>
tag is missing.
Demo
Following is some code that normalizes HTML input (the HTML of this page, actually) by finding all the tags and printing them in succession.
import re
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

# closing() releases the HTTP response even if read() fails, and works with
# the Python 2 response object, which is not a context manager itself.
with closing(urlopen('http://stackoverflow.com/questions/33204018/html-structure-diff-in-python')) as f:
    html = f.read()
# Python 3 hands back bytes, which a str pattern cannot search; Python 2
# already returns str and is left untouched.
if not isinstance(html, str):
    html = html.decode('utf-8', 'replace')
# Print every opening, closing or self-closing tag (attributes included), one
# per line, in document order.
for m in re.finditer(r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    print(m.group(0))
You can take the output from the above and use whatever command-line diff tool you prefer to compare them.
Or maybe you want to compare them using Python. Instead of printing out all the lines, you might be interested in concatenating them into a single string:
# Concatenate every matched tag into a single string.  The original loop
# accumulated into an undefined name ``s`` and never filled tags_as_string.
# Each tag is followed by a newline, which makes diff output look nicer.
tags_as_string = ''.join(
    m.group(0) + '\n'
    for m in re.finditer(r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html))
or list:
# Collect every matched tag into a list.  The original loop appended to an
# undefined name ``s`` and left tags_as_list empty.  Group 1 of this pattern
# captures the bare tag name.
tags_as_list = [
    m.group(0)
    for m in re.finditer(r'''</?(\w+)((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html)]
Further steps to consider (can be done inside the for loop):
m.group(1)
(the first regex group in parentheses) in the for-loop.
Credit: The actual regex is from http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/