I have two files: fileA and fileB. I'd like to get the line numbers of all the lines in the fileB that exist in the fileA. But if a line is indeed in fileA, I won't identify it as "exists in fileA" unless the next line is also in it. So I've written the following code:
def compare_two(fileA, fileB):
with open(fileA, 'r') as fa:
fa_content = fa.read()
with open(fileB, 'r') as fb:
keep_line_num = [] # the line number that's not in fileA
i = 1
while True:
line = fb.readline()
if line == '': # There are no blank lines in both files
break
last_pos = fb.tell()
theFollowing = line
new_line = fb.readline() # get the next line
theFollowing += new_line
fb.seek(last_pos)
if theFollowing not in fa_content:
keep_line_num.append(i)
i += 1
fb.close()
fa.close()
return keep_line_num
compare_two(fileA, fileB)
This works fine for small files. But I want to use it for large files as large as 2GB and this method is too slow for me. Are there any other way to work with this in Python2.7?