Here's one approach that I think satisfies your requirements. It also lets you choose whether every line must differ by the same word (in which case your second example file would not be considered a match):
UPDATE: This now accounts for lines in the master and the other files not necessarily being in the same order.
from itertools import zip_longest
def get_min_diff(master_lines, to_check):
    """Find the master line that differs least from *to_check*.

    Each line is a list of words. Lines are compared position by position;
    when one line is shorter, the missing positions are padded with ``None``
    and therefore count as differences.

    Args:
        master_lines: Sequence of word lists to search.
        to_check: Word list to compare against every master line.

    Returns:
        A tuple ``(min_diff, diff, match_line)`` where ``min_diff`` is the
        smallest number of differing positions, ``diff`` lists the master-side
        words (or ``None`` padding) at those positions for the best-matching
        line, and ``match_line`` is that line's index in *master_lines*.
        On ties the earliest master line wins. If *master_lines* is empty the
        result is ``(None, [], None)``.
    """
    min_diff = None
    best_diff = []
    match_line = None
    for ln, ml in enumerate(master_lines):
        diff = [w for w, m in zip_longest(ml, to_check) if w != m]
        n_diffs = len(diff)
        if min_diff is None or n_diffs < min_diff:
            min_diff = n_diffs
            # Remember the diff that belongs to the best match; the loop
            # variable alone would hold the diff of the last line compared.
            best_diff = diff
            match_line = ln
    return min_diff, best_diff, match_line
def check_files(master, files, *, same_word_only=False):
    """Return the files whose lines each differ from a master line by at most one word.

    Every line of a candidate file is paired with the closest remaining line
    of the master file, so the two files need not list their lines in the
    same order. A master line is consumed once it has been paired, which
    prevents it from matching several lines of the same candidate.

    Args:
        master: Path of the master file.
        files: Iterable of paths of the candidate files to check.
        same_word_only: When true, additionally require that every line
            which differs does so by the same word in the candidate file.

    Returns:
        A list of the paths from *files* that qualify, in input order. A file
        qualifies when the largest per-line difference is exactly one word.
        Empty files, and files with more lines than the master, never qualify.
    """
    # get lines to compare against
    with open(master) as mstr:
        master_lines = [line.strip().split() for line in mstr]

    matches = []
    for f in files:
        # Per-file copy of the search space; matched lines are removed from it.
        remaining = list(master_lines)
        diff_sizes = set()
        diff_words = set()
        has_unpaired_line = False
        with open(f) as checkfile:
            for line in checkfile:
                to_check = line.strip().split()
                if not remaining:
                    # The candidate has more lines than the master, so this
                    # line has nothing left to be compared against.
                    has_unpaired_line = True
                    break
                min_diff, _, match_index = get_min_diff(remaining, to_check)
                diff_sizes.add(min_diff)
                if min_diff <= 1:  # acceptable number of differences
                    matched = remaining.pop(match_index)
                    if min_diff == 1:
                        # Record the candidate's word at the single differing
                        # position (None if the candidate line is one word
                        # shorter than the master line).
                        diff_words.update(
                            c
                            for m, c in zip_longest(matched, to_check)
                            if m != c
                        )

        # max() of an empty set raises, so an empty candidate is rejected here.
        if has_unpaired_line or not diff_sizes or max(diff_sizes) != 1:
            continue
        if same_word_only and len(diff_words) != 1:
            continue
        matches.append(f)
    return matches
I've made a few test files to check, based on your supplied examples:
::::::::::::::
test1.txt
::::::::::::::
file contains y
the image is of y type
the user is admin
the address is y
::::::::::::::
test2.txt
::::::::::::::
file contains x
the image is of x type
the user is admin
the address is x
::::::::::::::
test3.txt
::::::::::::::
file contains xyz
the image is of abc type
the user is admin
the address is pqrs
::::::::::::::
testmaster.txt
::::::::::::::
file contains m
the image is of m type
the user is admin
the address is m
::::::::::::::
test_nomatch.txt
::::::::::::::
file contains y and some other stuff
the image is of y type unlike the other
the user is bongo the clown
the address is redacted
::::::::::::::
test_scrambled.txt
::::::::::::::
the image is of y type
file contains y
the address is y
the user is admin
When run, the code above returns the correct files:
In: check_files('testmaster.txt', ['test1.txt', 'test2.txt', 'test3.txt', 'test_nomatch.txt', 'test_scrambled.txt'])
Out: ['test1.txt', 'test2.txt', 'test3.txt', 'test_scrambled.txt']