Got around to this. After minor testing this seems to work, though more testing is needed. Again, this can take an extremely long time, depending on both the number of files and their sizes:
import filecmp
import os
from collections import defaultdict
from sys import argv
def _scan_dir(path):
    """Split the direct children of *path* into files grouped by size and sub-directories.

    Returns a ``(files_by_size, subdirs)`` pair: a dict mapping a file size to
    the list of file paths with that size, and a list of sub-directory paths.
    """
    files_by_size = defaultdict(list)
    subdirs = []
    # The context manager releases the directory handle as soon as the scan
    # is done instead of leaving it to the garbage collector.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_dir():
                subdirs.append(entry.path)
            else:
                files_by_size[entry.stat().st_size].append(entry.path)
    return files_by_size, subdirs


def compareDirs(d1, d2):
    """Return True if the two directory trees hold the same content.

    File and directory names are ignored: every file directly inside ``d1``
    must pair up with a distinct, byte-identical file directly inside ``d2``,
    and every sub-directory of ``d1`` must pair up with a distinct
    sub-directory of ``d2`` for which this function recursively returns True.

    Args:
        d1: Path of the first directory.
        d2: Path of the second directory.

    Returns:
        True when all files and sub-directories can be paired, else False.

    Errors raised by ``os.scandir`` or ``DirEntry.stat`` (for example for a
    missing or unreadable path) are not caught here.
    """
    files1, subd1 = _scan_dir(d1)
    files2, subd2 = _scan_dir(d2)

    # Cheap structural checks first: a mismatch is assumed to be the common
    # case, so rule it out before reading any file content.
    if len(subd1) != len(subd2):
        return False
    counts1 = {size: len(paths) for size, paths in files1.items()}
    counts2 = {size: len(paths) for size, paths in files2.items()}
    if counts1 != counts2:
        return False

    # Only files of the same size can be equal, so match within each size
    # group. The per-size counts are equal, so pairing every file of d1 with
    # a distinct file of d2 also accounts for every file of d2.
    for size, paths in files1.items():
        unmatched = list(files2[size])
        for path in paths:
            for index, candidate in enumerate(unmatched):
                if filecmp.cmp(path, candidate, shallow=False):
                    del unmatched[index]
                    break
            else:
                return False  # No file in d2 has this content.

    # Taking the first matching sub-directory is sufficient because "same
    # content" is transitive; the counts are equal, so matching all of d1's
    # sub-directories accounts for all of d2's.
    unmatched_dirs = list(subd2)
    for sd1 in subd1:
        for index, sd2 in enumerate(unmatched_dirs):
            if compareDirs(sd1, sd2):
                del unmatched_dirs[index]
                break
        else:
            return False  # Did not find a matching sub-directory.

    return True
if __name__ == "__main__":
    # Guarded so that importing this module does not start a comparison.
    if len(argv) < 3:
        # Report how to call the script instead of failing with an IndexError.
        raise SystemExit(f"usage: {argv[0]} DIR1 DIR2")
    # Prints True if the two directory trees match, False otherwise.
    print(compareDirs(argv[1], argv[2]))
Recursively enter both directories. Compare files on the first level - fail if no match. Then try and match any sub-dir in the first directory to any sub-dir in the next recursively, until all are matched.
This is the most naive solution. Traversing the tree and matching only sizes and structure first would possibly be beneficial in the average case. In that case the function would look similar, except that we would compare `getsize` results instead of using `filecmp`, and save the matching tree structures, so the second run would be faster.
Of course, in case of a few sub-directories with the exact same structures and sizes we would still need to compare all possibilities of matching.