This is the answer - finally cracked it myself -:)
import os
import sys
import re
import webbrowser
Comparison function - compares the two texts line by line:
def CompareFiles(str_file1, str_file2):
    '''
    Compare two multi-line texts and return the lines found in only one of them.

    Both texts are split on the newline character and empty lines are ignored.
    A line counts as a difference when no identical line exists anywhere in
    the other text; the position of the line does not matter.

    Parameters:
        str_file1: full content of the first text, as one string.
        str_file2: full content of the second text, as one string.

    Returns:
        A tuple (unique_file1, unique_file2) of two lists. Each list keeps the
        original line order. A line that occurs several times in one text and
        never in the other is reported once per occurrence.
    '''
    file1_lines = str_file1.split("\n")
    file2_lines = str_file2.split("\n")

    # Build the lookup sets once: testing membership against the lists inside
    # the loops would rescan the whole other text for every single line.
    file1_known = set(file1_lines)
    file2_known = set(file2_lines)

    # The lists (not the sets) are iterated so that order and repeats survive.
    unique_file1 = [
        line for line in file1_lines
        if line != '' and line not in file2_known
    ]
    unique_file2 = [
        line for line in file2_lines
        if line != '' and line not in file1_known
    ]
    return unique_file1, unique_file2
Use this function to mask the fields that should be ignored by the comparison:
def Masker(pattern_lines, file2mask):
    '''
    Replace the values captured by the given patterns with dummy text.

    Every value captured by any pattern is replaced by 'xxxxxxxxxx' wherever
    that exact text occurs in file2mask, not only at the place where the
    pattern matched.

    Parameters:
        pattern_lines: iterable of compiled regular expressions. Each match
            contributes its captured group(s), or the whole match when the
            pattern has no group.
        file2mask: the text to mask, as one string.

    Returns:
        The masked text. When nothing matches, the text is returned unchanged.
    '''
    mask = 'x' * 10

    # Start from the input and keep accumulating replacements. Restarting
    # from the unmasked input on every replacement would discard all but the
    # last one, and would leave nothing to return when no pattern matches.
    masked_file = file2mask

    for pattern in pattern_lines:
        # Matches are always searched in the unmasked input so that earlier
        # replacements cannot influence what later patterns capture.
        for value in pattern.findall(file2mask):
            # findall yields a plain string for patterns with at most one
            # group and a tuple of strings for patterns with several groups.
            captured = value if isinstance(value, tuple) else (value,)
            for text in captured:
                # An empty capture must be skipped: replacing '' would insert
                # the mask between every character of the text.
                if text:
                    masked_file = masked_file.replace(str(text), mask)
    return masked_file
Open the files:
# Read both input files completely into memory. The relative names are
# resolved against the current working directory. The `with` blocks close
# each handle even when reading raises.
with open("file1.txt", "r") as f1:
    data1 = f1.read()
with open("file2.txt", "r") as f3:
    data3 = f3.read()
Work out where the output file goes (an "outputs" folder next to this script):
# The report goes into an "outputs" folder located next to this script.
save_path = os.path.join(os.path.dirname(__file__), 'outputs')
# Create the folder up front; opening the report for writing would fail
# if it were missing. An already existing folder is left untouched.
os.makedirs(save_path, exist_ok=True)
filename = os.path.normpath(os.path.join(save_path, "interim.txt"))
Pattern lines for masking:
# Compiled patterns whose captured groups are the values to mask.
# All of them are compiled with re.M.
# NOTE(review): the dots in "3000.3422." are unescaped, so each one matches
# any character - confirm whether literal dots were intended.
pattern_lines = [
    re.compile(expression, re.M)
    for expression in (
        r'\- This file is located in 3000.3422.(.*) description \"(.*)\"',
        r'\- City address of file is \"(.*)\"',
        r'\- Country of file is (.*)',
    )
]
Mask the two files:
# Mask both inputs with the same patterns, first file first.
data1_masked, data3_masked = (
    Masker(pattern_lines, raw_text) for raw_text in (data1, data3)
)
Compare the two files and return the unique lines for both:
# unique1 holds the lines found only in the first masked text,
# unique2 the lines found only in the second one.
unique1, unique2 = CompareFiles(
    str_file1=data1_masked,
    str_file2=data3_masked,
)
Reporting - this part could be wrapped in a function:
# Write the report: one headed section per input, each listing the lines
# found only in that input. The `with` block closes the file even when a
# write fails part-way through.
with open(filename, 'w') as file:
    file.write("-------------------------\n")
    file.write("\nONLY in FILE ONE\n")
    file.write("\n-------------------------\n")
    file.write('\n'.join(unique1))
    file.write("\n-------------------------\n")
    file.write("\nONLY in FILE TWO\n")
    file.write("\n-------------------------\n")
    file.write('\n'.join(unique2))
And finally open the comparison output file:
# Ask the system to display the finished report.
# NOTE(review): the webbrowser documentation describes opening a plain file
# path (rather than a URL) as neither supported nor portable - confirm that
# this works on the platforms this script targets.
webbrowser.open(filename)