Highlight differences between two xml files in a Tkinter textbox

Question

I have tried all kinds of logic and methods and googled a lot, but I have not been able to come up with a satisfactory answer to my question. I have written the program shown below to highlight specific XML code, and I am facing a problem with it. Sorry for making this post a bit long; I only wanted to explain my problem clearly.

EDIT: To run the program below you will need two xml files, which are here: sample1 and sample2. Save these files and, in the code below, edit the path C:/Users/editThisLocation/Desktop/sample1.xml (and likewise for sample2.xml) to point to the location where you saved them.

from lxml import etree
from collections import defaultdict
from collections import OrderedDict
from distutils.filelist import findall
from lxml._elementpath import findtext

from Tkinter import * 
import Tkinter as tk
import ttk

# Create the top-level Tk window of the application.
root = Tk()

class CustomText(tk.Text):
    """Text widget that can add or remove a tag on every match of a pattern.

    Both public helpers walk the widget between two indices with
    ``Text.search`` and act on each matched span. They use the marks
    ``matchStart``, ``matchEnd`` and ``searchLimit`` as scratch state.
    """

    def __init__(self, *args, **kwargs):
        tk.Text.__init__(self, *args, **kwargs)

    def _for_each_match(self, pattern, start, end, regexp, action):
        """Call ``action("matchStart", "matchEnd")`` once per match of pattern."""
        start = self.index(start)
        end = self.index(end)
        self.mark_set("matchStart", start)
        self.mark_set("matchEnd", start)
        self.mark_set("searchLimit", end)

        count = tk.IntVar()
        while True:
            index = self.search(pattern, "matchEnd", "searchLimit",
                                count=count, regexp=regexp)
            if index == "":
                break
            if count.get() == 0:
                # A zero-length match would leave "matchEnd" where it is, so
                # the next search would find the same spot again forever.
                break
            self.mark_set("matchStart", index)
            self.mark_set("matchEnd", "%s+%sc" % (index, count.get()))
            action("matchStart", "matchEnd")

    def highlight_pattern(self, pattern, tag, start="1.0", end="end",
                          regexp=True):
        """Add ``tag`` to every match of ``pattern`` between ``start`` and ``end``.

        ``pattern`` is treated as a regular expression when ``regexp`` is
        true (the default) and as literal text otherwise.
        """
        self._for_each_match(
            pattern, start, end, regexp,
            lambda first, last: self.tag_add(tag, first, last))

    def Remove_pattern(self, pattern, tag, start="1.0", end="end",
                          regexp=True):
        """Remove ``tag`` from every match of ``pattern`` between ``start`` and ``end``.

        Only the matched spans lose the tag; text between matches keeps it.
        """
        self._for_each_match(
            pattern, start, end, regexp,
            lambda first, last: self.tag_remove(tag, first, last))



# Parser created with recover=True; it is reused for every parse below.
recovering_parser = etree.XMLParser(recover=True)


# Read both files as plain text for display in the text boxes. The ``with``
# blocks close each handle as soon as its contents have been read.
with open('C:/Users/editThisLocation/Desktop/sample1.xml', 'r') as sample1File:
    contents_sample1 = sample1File.read()

with open('C:/Users/editThisLocation/Desktop/sample2.xml', 'r') as sample2File:
    contents_sample2 = sample2File.read()


# Header strip for the first file. No parent is passed to Frame here.
frame1 = Frame(width=768, height=25, bg="#000000", colormap="new")
frame1.pack()
Label(frame1, text="sample 1 below - scroll to see more").pack()

# Text box showing the raw text of sample1.xml; high() highlights in this one.
textbox = CustomText(root)
textbox.insert(END,contents_sample1)
textbox.pack(expand=1, fill=BOTH)

# Header strip for the second file.
frame2 = Frame(width=768, height=25, bg="#000000", colormap="new")
frame2.pack()
Label(frame2, text="sample 2 below - scroll to see more").pack()


# Text box showing the raw text of sample2.xml.
textbox1 = CustomText(root)
textbox1.insert(END,contents_sample2)
textbox1.pack(expand=1, fill=BOTH)

# Parse both files into element trees and keep their root elements.
sample1 = etree.parse("C:/Users/editThisLocation/Desktop/sample1.xml", parser=recovering_parser).getroot()
sample2 = etree.parse("C:/Users/editThisLocation/Desktop/sample2.xml", parser=recovering_parser).getroot()

# Serialize each root and parse it again with the same recovering parser.
ToStringsample1 = etree.tostring(sample1)
sample1String = etree.fromstring(ToStringsample1, parser=recovering_parser)

ToStringsample2 = etree.tostring(sample2)
sample2String = etree.fromstring(ToStringsample2, parser=recovering_parser)

# Direct <time> children of each root, in the eHorizon namespace.
timesample1 = sample1String.findall('{http://www.example.org/eHorizon}time')
timesample2 =  sample2String.findall('{http://www.example.org/eHorizon}time')

# Compare the files pairwise: i-th <time> against i-th <time>, and within each
# pair the k-th <feature> against the k-th <feature>. zip() stops at the shorter
# sequence, so surplus elements in either file are never compared.
for i,j in zip(timesample1,timesample2):       

    # findall() with a plain tag only returns direct children of the <time>
    # element. NOTE(review): confirm <feature> is a direct child in the real
    # sample files.
    for k,l in zip(i.findall("{http://www.example.org/eHorizon}feature"), j.findall("{http://www.example.org/eHorizon}feature")):

        if [k.attrib.get('color'), k.attrib.get('type')] != [l.attrib.get('color'), l.attrib.get('type')]:

            # Remember the sample1 feature that differs. Each mismatch overwrites
            # the previous one, and the name stays undefined if nothing differs.
            faultyLine = [k.attrib.get('color'), k.attrib.get('type'), k.text]


def high(event):
    """Highlight the timestamp-5 header and the differing feature line in ``textbox``.

    Bound to a mouse click; ``event`` is not used. The search is limited to
    the text between the ``nTimestamp="5"`` header and the ``nTimestamp="6"``
    header. Nothing happens when no differing feature was recorded or when
    the timestamp-5 header is not present in the text box.
    """
    # The comparison code only defines ``faultyLine`` when it finds a mismatch.
    faulty = globals().get("faultyLine")
    if faulty is None:
        return

    textbox.tag_configure("yellow", background="yellow")
    limit_1 = '<p1:time nTimestamp="{0}">'.format(5)     #limit my search between timestamp 5 and timestamp 6 
    limit_2 = '<p1:time nTimestamp="{0}">'.format((5+1)) # timestamp 6

    highlightString = '<p1:feature color="{0}" type="{1}">{2}</p1:feature>'.format(faulty[0],faulty[1],faulty[2]) #string to be highlighted

    # Locate the region once. search() returns "" when nothing is found, and
    # passing "" on as a text index raises a TclError.
    region_start = textbox.search(limit_1, '1.0', stopindex=END)
    if not region_start:
        return
    # With no later timestamp header, the region runs to the end of the text.
    region_end = textbox.search(limit_2, '1.0', stopindex=END) or END

    # NOTE: highlight_pattern is called with its default regexp=True, so any
    # regex metacharacters in these strings are interpreted as such.
    for pattern in (limit_1, highlightString):
        textbox.highlight_pattern(pattern, "yellow", start=region_start, end=region_end)


# Caption of the clickable label that triggers the highlighting.
button = 'press here to highlight error line' 
# A ttk.Label acts as the button: a <Button-1> event on it calls high().
c = ttk.Label(root, text=button)
c.bind("<Button-1>",high)
c.pack()  

# Start the Tk event loop.
root.mainloop()

What I want

If you run above code, it would present an output given below:

As you can see in the image, I only intend to highlight code marked with green tick. Some of you might think of limiting the starting and ending index to highlight that pattern. However, if you see in my program I am already making use of starting and ending indexes for limiting my output to only nTimestamp="5" and for that I am using limit_1 and limit_2 variables.

So in this type of a data how to correctly highlight one pattern out of many inside individual nTimestamp?

EDIT: Here I specifically want to highlight the 3rd item in nTimestamp="5", because this item is not present in sample2.xml, as you can see from the two xml files; when the program runs it also detects this difference. The only problem is highlighting the correct item, which is the 3rd one in my case.

I am using highlighting class from Bryan Oakley's code here

EDIT Recent

In context to what kobejohn asked below in comments, the target file won't ever be empty. There are always chances that target file may have extra or missing elements. Finally, my current intention is to highlight only deep elements which are different or missing and timestamps in which they are located. However, highlighting of timestamps is done correctly but the issue to highlight deep elements like explained above is still an issue. Thank you kobejohn for clarifying this.

NOTE:

One method which I know and you might suggest that works correctly is to extract the index of green color ticked pattern and simply run highlight tag over it, but this approach is very hard-coded and in large data where you have to deal with lots of variations it is completely ineffective. I am searching for another better option.

It's great that you included fully functioning code and the desired vs actual output. However I'm having a hard time understanding your logic. Can you please explain step by step how you want to identify the text to highlight? — KobeJohn, Aug 19 '15 at 23:18
Yes sure and I am sorry for late response. The text that I want to highlight consists of entire line as a string. For e.g. in above code that line is defined in variable `highlightString`. Now comes the task of identifying out of all 3 `location` elements which instance of `highlightString` to be highlighted since there are same strings in all three location elements. So in above case I intend to highlight string located in 3rd `location` element in `nTimestamp="5"` — Radheya, Aug 20 '15 at 08:14
I see what you are trying to do now. Thanks for explaining. However I don't see the logic for how to identify which of the items within the limit. I understand that is your question, but how would you expect a human to identify the 3rd item? If you can explain how a human should identify the correct one then I can probably help you put that logic into your code. These comments are becoming involved so can we move further discussion to [this chat room](http://chat.stackoverflow.com/rooms/87469/highlight-specific-xml-code-in-textbox-using-tkinter)? — KobeJohn, Aug 20 '15 at 10:07
@kobejohn I re-edited my question and added entire code of my xml comparison of two files. Its a working code and you only need to download those two xml files and edit the file location instance for your computer. It will produce the output as shown. The logic here is just to do line wise comparison of `feature` elements located inside those xml files. — Radheya, Aug 20 '15 at 13:16
Is it the case that all you are asking for is how to highlight data in one file that is not in another? Does the data have to be precisely the same (eg: same amount of whitespace?) — Bryan Oakley, Aug 20 '15 at 15:30
@BryanOakley yes that is true. Also data should be precisely same including whitespace. Right now the code shown above is limited to highlight data which is there in one file and absent in another, but in future I am willing to include this logic also to highlight such data which are not in order. — Radheya, Aug 20 '15 at 15:39
Thanks to BryanOakley I could understand that you are basically trying to diff two xml files. I do not know a quick way to do what you want with this code because you are looking for structural information that you threw away when you converted the xml to simple text. What I think you need to do is parse these xml files (as you are doing with lxml) and then make code to compare xml elements instead of working with text. [Here are some examples](http://stackoverflow.com/q/13465807/377366) of how to walk through an xml structure. — KobeJohn, Aug 21 '15 at 15:10
Alternatively you could use [```difflib.SequenceMatcher```](https://docs.python.org/2/library/difflib.html#sequencematcher-objects) and I think provide '\n' as "junk" to compare by line. I'm sorry but I don't have time to create a new design with these techniques. Maybe someone can find an easier way than I have or actually implement it with tkinter as you are doing. — KobeJohn, Aug 21 '15 at 15:10

KobeJohn · Accepted Answer · 2015-08-27T17:08:53.263

This solution works by performing a simplified diff between base.xml and test.xml based on the description you provided. The diff result is a 3rd XML tree that combines the original trees. The output is the diff with color-coded highlighting for the lines that don't match between the files.

I hope you can use this or adapt it to what you need.

Copy-Paste Script

import copy
from lxml import etree
import Tkinter as tk


# assumption: the root element of both trees is the same
# note: missing subtrees will only have the parent element highlighted


def element_content_equal(e1, e2):
    """Return True when two elements have the same tag, text, tail and attributes.

    Children are not compared. If either argument lacks one of those
    attributes (e.g. ``None`` is passed in for an element) the result is
    False.
    """
    # starting point here: http://stackoverflow.com/a/24349916/377366
    try:
        return (e1.tag == e2.tag
                and e1.text == e2.text
                and e1.tail == e2.tail
                and e1.attrib == e2.attrib)
    except AttributeError:
        # e.g. None is passed in for an element
        return False


def element_is_in_sequence(element, sequence):
    """Return True if any item of ``sequence`` is content-equal to ``element``."""
    return any(element_content_equal(candidate, element) for candidate in sequence)


def copy_element_without_children(element):
    """Build a new element with the same tag, attributes, nsmap, text and tail.

    No child elements are copied to the new element.
    """
    clone = etree.Element(element.tag, attrib=element.attrib, nsmap=element.nsmap)
    clone.text, clone.tail = element.text, element.tail
    return clone


# start at the root of both xml trees
# (both files are parsed with one parser configured with recover=True and
# remove_blank_text=True)
parser = etree.XMLParser(recover=True, remove_blank_text=True)
base_root = etree.parse('base.xml', parser=parser).getroot()
test_root = etree.parse('test.xml', parser=parser).getroot()
# each element from the original xml trees will be placed into a merge tree
# (the merge root is a childless copy of the base root)
merge_root = copy_element_without_children(base_root)


# additionally each merge tree element will be tagged with its source
# DIFF_ATTRIB is the attribute name added to merge elements; the two values
# are also used later as the names of the text-widget highlight tags.
DIFF_ATTRIB = 'diff'
FROM_BASE_ONLY = 'base'
FROM_TEST_ONLY = 'test'

# process the pair of trees, one set of parents at a time
# Each stack entry is (base parent, test parent, merge parent): the children of
# the first two are compared and the results are appended to the third.
parent_stack = [(base_root, test_root, merge_root)]
while parent_stack:
    base_parent, test_parent, merge_parent = parent_stack.pop()
    # list(element) is the documented replacement for the deprecated getchildren()
    base_children = list(base_parent)
    test_children = list(test_parent)

    # compare children and transfer to merge tree
    base_children_iter = iter(base_children)
    test_children_iter = iter(test_children)
    base_child = next(base_children_iter, None)
    test_child = next(test_children_iter, None)
    while (base_child is not None) or (test_child is not None):
        # first handle the case of a unique base child; a base child left over
        # after the test side is exhausted has no partner, so it is base-only too
        if (base_child is not None) and (
                test_child is None
                or not element_is_in_sequence(base_child, test_children)):
            # base_child is unique: deep copy with base only tag
            merge_child = copy.deepcopy(base_child)
            merge_child.attrib[DIFF_ATTRIB] = FROM_BASE_ONLY
            merge_parent.append(merge_child)
            # this unique child has already been fully copied to the merge tree so it doesn't go on the stack
            # only move the base child since test child hasn't been handled yet
            base_child = next(base_children_iter, None)
        elif (test_child is not None) and (
                base_child is None
                or not element_is_in_sequence(test_child, base_children)):
            # test_child is unique (or the base side is exhausted): deep copy with test only tag
            merge_child = copy.deepcopy(test_child)
            merge_child.attrib[DIFF_ATTRIB] = FROM_TEST_ONLY
            merge_parent.append(merge_child)
            # this unique child has already been fully copied to the merge tree so it doesn't go on the stack
            # only move test child since base child hasn't been handled yet
            test_child = next(test_children_iter, None)
        elif element_content_equal(base_child, test_child):
            # both trees share the same element: shallow copy either child with shared tag
            merge_child = copy_element_without_children(base_child)
            merge_parent.append(merge_child)
            # put pair of children on stack as parents to be tested since their children may differ
            parent_stack.append((base_child, test_child, merge_child))
            # move on to next children in both trees since this was a shared element
            base_child = next(base_children_iter, None)
            test_child = next(test_children_iter, None)
        else:
            # Both children also occur on the other side but do not line up
            # with each other, which this simplified diff cannot align.
            raise RuntimeError(
                'cannot align children %r and %r: both occur in the other '
                'tree but not at matching positions'
                % (base_child.tag, test_child.tag))

# display merge_tree with highlighting to indicate source of each line
#   no highlight: common element in both trees
#   green: line that exists only in test tree (i.e. additional)
#   red: line that exists only in the base tree (i.e. missing)
root = tk.Tk()
textbox = tk.Text(root)
textbox.pack(expand=1, fill=tk.BOTH)
textbox.tag_config(FROM_BASE_ONLY, background='#ff5555')
textbox.tag_config(FROM_TEST_ONLY, background='#55ff55')

# find diff lines to highlight within merge_tree string that includes kludge attributes
# Match the serialized marker attribute (e.g. ' diff="base"') rather than the
# bare word, so a line whose text or other attributes merely contain "base" or
# "test" is not highlighted by mistake.
merge_tree_string = etree.tostring(merge_root, pretty_print=True)
diffs_by_line = []
for line, line_text in enumerate(merge_tree_string.split('\n'), 1):
    for diff_type in (FROM_BASE_ONLY, FROM_TEST_ONLY):
        marker = ' {}="{}"'.format(DIFF_ATTRIB, diff_type)
        if marker in line_text:
            # text widget lines are numbered from 1, hence enumerate(..., 1)
            diffs_by_line.append((line, diff_type))

# remove kludge attributes
# Each marker sits on its element's own line, so the line numbers collected
# above remain valid for the re-serialized string.
for element in merge_root.iter():
    try:
        del(element.attrib[DIFF_ATTRIB])
    except KeyError:
        pass
merge_tree_string = etree.tostring(merge_root, pretty_print=True)

# highlight final lines
textbox.insert(tk.END, merge_tree_string)
for line, diff_type in diffs_by_line:
    # tag from the start of the line up to the start of the next line
    textbox.tag_add(diff_type, '{}.0'.format(line), '{}.0'.format(line + 1))
root.mainloop()

Inputs:

Please note that I cleaned up the xml because I was getting inconsistent behavior with the original XML. The original was basically using back slashes instead of forward slashes and also had false closing slashes on opening tags.

base.xml (in the same location as this script)

<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<p1:sample1 xmlns:p1="http://www.example.org/eHorizon">
   <p1:time nTimestamp="5">
      <p1:location hours = "1" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="2" type="a">564</p1:feature>
         <p1:feature color="3" type="b">570</p1:feature>
         <p1:feature color="4" type="c">570</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="7" type="b">570</p1:feature>
         <p1:feature color="8" type="c">580</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
      </p1:location>
   </p1:time>
   <p1:time nTimestamp="6">
      <p1:location hours = "1" path = '1'>
         <p1:feature color="2" type="a">564</p1:feature>
         <p1:feature color="3" type="b">570</p1:feature>
         <p1:feature color="4" type="c">570</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="9" type="b">590</p1:feature>
         <p1:feature color="10" type="c">600</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="7" type="b">570</p1:feature>
         <p1:feature color="8" type="c">580</p1:feature>
      </p1:location>
   </p1:time>
</p1:sample1>

test.xml (in the same location as this script)

<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<p1:sample1 xmlns:p1="http://www.example.org/eHorizon">
   <p1:time nTimestamp="5">
      <p1:location hours = "1" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="2" type="a">564</p1:feature>
         <p1:feature color="3" type="b">570</p1:feature>
         <p1:feature color="4" type="c">570</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="7" type="b">570</p1:feature>
         <p1:feature color="8" type="c">580</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="9" type="b">1111</p1:feature>
         <p1:feature color="10" type="c">2222</p1:feature>
      </p1:location>
   </p1:time>
   <p1:time nTimestamp="6">
      <p1:location hours = "1" path = '1'>
         <p1:feature color="2" type="a">564</p1:feature>
         <p1:feature color="3" type="b">570</p1:feature>
         <p1:feature color="4" type="c">570</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="9" type="b">590</p1:feature>
         <p1:feature color="10" type="c">600</p1:feature>
      </p1:location>
      <p1:location hours = "5" path = '1'>
         <p1:feature color="6" type="a">560</p1:feature>
         <p1:feature color="7" type="b">570</p1:feature>
         <p1:feature color="8" type="c">580</p1:feature>
      </p1:location>
   </p1:time>
</p1:sample1>

Highlight differences between two xml files in a Tkinter textbox

1 Answer

Inputs: