I am trying to avoid duplicates in my mp3 collection (quite large). I want to check for duplicates by checking file contents, instead of looking for same file name. I have written the code below to do this but it throws a MemoryError after about a minute. Any suggestions on how I can get this to work?
import os
import hashlib
walk = os.walk('H:\MUSIC NEXT GEN')
mySet = set()
dupe = []
hasher = hashlib.md5()
for dirpath, subdirs, files in walk:
for f in files:
fileName = os.path.join(dirpath, f)
with open(fileName, 'rb') as mp3:
buf = mp3.read()
hasher.update(buf)
hashKey = hasher.hexdigest()
print hashKey
if hashKey in mySet:
dupe.append(fileName)
else:
mySet.add(hashKey)
print 'Dupes: ' + str(dupe)