83

I am writing a Python program to find and remove duplicate files from a folder.

I have multiple copies of mp3 files, and some other files. I am using the SHA-1 algorithm.

How can I find these duplicate files and remove them?

Laurel
sanorita
  • 1
    For anyone interested, I've written a humble program with a GUI that compares based on file size and binary chunks: https://github.com/nav9/duplicateFileFinder – Nav Mar 15 '21 at 15:03
  • Just wanted to say that algorithm should rely on either the _file properties_ or _file content hash_ or _image similarity_. Based on this the program would differ. – Rajesh Swarnkar Jan 27 '23 at 16:07

10 Answers

125

Fastest algorithm - 100x performance increase compared to the accepted answer (really :))

The approaches in the other solutions are very cool, but they forget about an important property of duplicate files: they have the same file size. Calculating the expensive hash only on files with the same size saves a tremendous amount of CPU; performance comparisons are at the end, and here's the explanation.

Building on the solid answer given by @nosklo, and borrowing @Raffi's idea of taking a fast hash of just the beginning of each file and calculating the full hash only on collisions in the fast hash, here are the steps:

  1. Build a hash table of the files, where the file size is the key.
  2. For files with the same size, create a hash table with the hash of their first 1024 bytes; non-colliding elements are unique.
  3. For files with the same hash on the first 1k bytes, calculate the hash on the full contents - files with matching ones are NOT unique.

The code:

#!/usr/bin/env python3
from collections import defaultdict
import hashlib
import os
import sys


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')

    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()

    file_object.close()
    return hashed


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ]
    hashes_on_1k = defaultdict(list)  # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ]
    hashes_full = {}   # dict of full_file_hash: full_path_to_file_string

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will 
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                    hashes_by_size[file_size].append(full_path)
                except (OSError,):
                    # not accessible (permissions, etc) - pass on
                    continue


    # For all files with the same file size, get their hash on the 1st 1024 bytes only
    for size_in_bytes, files in hashes_by_size.items():
        if len(files) < 2:
            continue    # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True, hash=hash)
                # the key is the hash on the first 1024 bytes plus the size - to
                # avoid collisions on equal hashes in the first part of the file
                # credits to @Futal for the optimization
                hashes_on_1k[(small_hash, size_in_bytes)].append(filename)
            except (OSError,):
                # the file might have become inaccessible since it was listed
                continue

    # For all files with the hash on the 1st 1024 bytes, get their hash on the full file - collisions will be duplicates
    for __, files_list in hashes_on_1k.items():
        if len(files_list) < 2:
            continue    # this hash of first 1k file bytes is unique, no need to spend CPU cycles on it

        for filename in files_list:
            try: 
                full_hash = get_hash(filename, first_chunk_only=False, hash=hash)
                duplicate = hashes_full.get(full_hash)
                if duplicate:
                    print("Duplicate found: {} and {}".format(filename, duplicate))
                else:
                    hashes_full[full_hash] = filename
            except (OSError,):
                # the file might have become inaccessible since it was listed
                continue


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")

And, here's the fun part - performance comparisons.

Baseline -

  • a directory with 1047 files: 32 mp4 and 1015 jpg, total size 5445.998 MiB - i.e. my phone's camera auto-upload directory :)
  • small (but fully functional) processor - 1600 BogoMIPS, 1.2 GHz, 32 KB L1 + 256 KB L2 cache, /proc/cpuinfo:

Processor : Feroceon 88FR131 rev 1 (v5l) BogoMIPS : 1599.07

(i.e. my low-end NAS :), running Python 2.7.11.

So, the output of @nosklo's very handy solution:

root@NAS:InstantUpload# time ~/scripts/checkDuplicates.py 
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg

real    5m44.198s
user    4m44.550s
sys     0m33.530s

And here's the version with the size filter first, then the small hashes, and finally the full hash if collisions are found:

root@NAS:InstantUpload# time ~/scripts/checkDuplicatesSmallHash.py . "/i-data/51608399/photo/Todor phone"
Duplicate found: ./IMG_20160216_074620 (2).jpg and ./IMG_20160216_074620.jpg
Duplicate found: ./IMG_20160204_150311.jpg and ./IMG_20160204_150311 (2).jpg
Duplicate found: ./IMG_20151231_143053 (2).jpg and ./IMG_20151231_143053.jpg
Duplicate found: ./IMG_20151125_233019 (2).jpg and ./IMG_20151125_233019.jpg

real    0m1.398s
user    0m1.200s
sys     0m0.080s

Both versions were run 3 times each, to get the average of the time needed.

So v1 is (user+sys) 284s, the other - 2s; quite a diff, huh :) With this increase, one could go to SHA-512, or even fancier - the performance penalty will be mitigated by the fewer calculations needed.
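
For instance, assuming the hash parameter is passed through to get_hash as in the listing above, switching the whole scan to SHA-512 is a one-line change (the path below is just a placeholder):

import hashlib

# same scan, but with SHA-512 as the hash function
check_for_duplicates(["/path/to/photos"], hash=hashlib.sha512)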

Negatives:

  • More disk access than the other versions - every file is accessed once for size stats (that's cheap, but it is still disk IO), and every duplicate is opened twice (once for the small hash of the first 1k bytes, and once for the full-contents hash)
  • Will consume more memory, since the hash tables are kept in memory at runtime
Todor Minakov
  • @TodorMinakov - You are welcome. I only use `os.readlink()` if it _is_ a symlink. There are different uses cases for this code, but in my case I don't want the file pointed to by a symlink to 'count'. As I recall, the `OSError` handling can be needed when the user has read permissions for the directory (and therefore the file's size), but not the file itself. But we're beyond where DVCS would be useful. :-) – bitinerant Jan 10 '19 at 18:30
  • @TodorMinakov I'm just curious, how many false positives from file size check to short hash check and how many from short hash to full hash? tia, – gboffi Aug 16 '19 at 11:55
  • You could write `hashes_by_size.setdefault(file_size, []).append(filename)` instead of checking an aux variable (the same applies to the other `hashes_by_x`) and (but this is really nitpicking...) you write `filename` (w/o underscore) and `file_size` (with underscore). It's perfectly fine, i understand that, nevertheless... Ciao – gboffi Aug 16 '19 at 12:08
  • 9
    I updated this script for python3 with some small code improvements: https://gist.github.com/tfeldmann/fc875e6630d11f2256e746f67a09c1ae – tfeldmann Nov 27 '19 at 11:16
  • This isn't true and is a bad approach, especially if you're creating image datasets. Sometimes you have the same picture but one is cropped. So while the image is the same, the size isn't. I get that the other method is expensive but since you'll most likely do this only once, it isn't a problem. – Onur-Andros Ozbek Mar 18 '20 at 23:21
  • I don't understand you @OnukOzbek? This script looks for identical, byte-to-byte files. Having a 2nd cropped version of a picture is **not** identical - it may look so cognitively, having the same subject & structure to the human eye, but even so - it _is_ a different picture, with less information (the cropped out data), and a different & not identical file. – Todor Minakov Mar 19 '20 at 06:27
  • 3
    You increase the risk of false positives because dictionaries `hashes_on_1k` and `hashes_full` compare hashes for files which may have different sizes. You should use tuples (size, hash) as keys, not the hash alone. If there are millions of files, the risk is not negligible. – Futal Mar 27 '20 at 10:28
  • @Futal that's a really nice optimization, on a huge set of files it really will decrease the # of calculations of the `hashes_on_1k` items. – Todor Minakov Mar 27 '20 at 10:52
  • 3
    @Futal Good point! I updated https://gist.github.com/tfeldmann/fc875e6630d11f2256e746f67a09c1ae accordingly. – tfeldmann Jun 15 '20 at 15:03
  • doesn't work, no files are deleted when duplicates are detected – prismspecs Jan 14 '21 at 12:08
  • 1
    On the contrary, it does work :); but - it doesn't delete the files, it was never the intent - check the code, it just prints their names. I didn't add it on purpose - in my case, I want to see which of the duplicates to actually keep. If you want to do it automatically - just add `os.remove(duplicate)` just after the print, at your own risk. @prismspecs – Todor Minakov Jan 14 '21 at 13:58
  • 3
    post title "Finding duplicate files and removing them" thanks for the updated code – prismspecs Jan 14 '21 at 16:16
  • 1
    I think you can improve this algorithm even more. Let me explain why and how it can be done and what you have missed. In the first step you do a checksum on the first chunk of each file. In the second step you do a full hash (ALL CHUNKS) of the entire file. Imagine if the file is huge, like 5Gb (or even more) in size; then it will take time to do it. So, I think in the second step it should do a checksum on the first and second chunk, or just the second chunk only (if faster), and this will reduce even more the number of duplicates. On the 3rd step you should decide which is faster: – YoYoYo Mar 29 '22 at 06:20
  • 1
    Doing 3rd chunk checksum on the third step or full checksum. Also you should decide which is faster for both of these situations: very large number of duplicate files with very large size vs very large number of duplicate files different but with first Xth chuncks the same, e.g. PDFs which have the same size and the same first X pages (e.g. advertising pages or so). – YoYoYo Mar 29 '22 at 06:26
  • 1
    What do you think? – YoYoYo Mar 29 '22 at 06:26
  • 2
    @YoYoYo I see your point, and yes on a huge collection of large(-ish) files having relatively high collision rate in the bytesize this will help. A version of this could probably work with a recursive function, going as deep (in read file chunks) as needed. Overall for an approach like you suggest the negative will be runtime memory usage - to keep pointers of file:number_of_chunks:hash for all objects. Still in a "regular" use case what you describe can be mitigated with a larger `chunk` value in the reader; and - most file formats start with metainfo @ head, thus large chance of differences. – Todor Minakov Mar 29 '22 at 09:47
  • 1
    Instead of doing a for loop you can have two lists (or dicts), one with files still to check and one with final files checked completely and you do a while len(still_to_check_list) >0: => check chunk by chunk. #Also you need to move the item in the final_full checked_list if the file checked already as its size is already smaller than the chunk size so it means it is full checked once the chunck is done. I think it will be even more faster. Just need to be tested this algorithm to see how fast it is (if it is) compared to this one above. It will be very useful and interesting. Hopesome1 do it. – YoYoYo Mar 29 '22 at 11:06
47

Recursive folders version:

This version uses the file size and a hash of the contents to find duplicates. You can pass it multiple paths; it will scan all paths recursively and report all duplicates found.

import sys
import os
import hashlib

def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes = {}
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                hashobj = hash()
                for chunk in chunk_reader(open(full_path, 'rb')):
                    hashobj.update(chunk)
                file_id = (hashobj.digest(), os.path.getsize(full_path))
                duplicate = hashes.get(file_id, None)
                if duplicate:
                    print "Duplicate found: %s and %s" % (full_path, duplicate)
                else:
                    hashes[file_id] = full_path

if sys.argv[1:]:
    check_for_duplicates(sys.argv[1:])
else:
    print "Please pass the paths to check as parameters to the script"
nosklo
  • @Jakob Bowyer: Sure, the implementation is iterative. By "Recursive folders" I mean that it recurses the entire folder tree. – nosklo Aug 03 '11 at 21:54
  • pls am new to python, how do i pass in my paths...? – X-Black... Jun 05 '18 at 22:56
  • 1
    @X-Black... pass it as command line parameters. Example: open a cmd prompt, navigate to the folder and type: `python myscript.py c:\path1 c:\path2` – nosklo Jun 06 '18 at 19:05
  • Resurrecting this old post. This is a great script, but it fails if it comes up against a file which it does not have permission to access (for example pagefile.sys). How can the "for chunk in ..." line be modified to handle this error? – Michael Jul 09 '18 at 12:18
  • 1
    @Michael Use a try/except; `try: for chunk..... except OSError: print('Skipping file')` – nosklo Jul 09 '18 at 15:31
  • Please Can you explain the purpose of the chunk_reader function? Do you want to avoid to load big files in memory? – Zioalex Jan 04 '19 at 15:55
  • @Alex yes, I have big files and loading just one of them entirely would easily exceed my available ram. – nosklo Jan 04 '19 at 17:15
  • Brilliant. Wouldn't checking filetype also reduce time. – user96265 Jan 19 '20 at 01:53
  • @user96265 no, one can have identical file content but different name or filetype (extension). – AcK Apr 07 '22 at 23:49
25
import os
import md5  # Python 2's legacy md5 module; on Python 3 use hashlib.md5 instead

def remove_duplicates(dir):
    unique = []
    for filename in os.listdir(dir):
        path = os.path.join(dir, filename)
        if os.path.isfile(path):
            # read in binary mode so the hash reflects the exact bytes on disk
            filehash = md5.md5(file(path, 'rb').read()).hexdigest()
            if filehash not in unique:
                unique.append(filehash)
            else:
                os.remove(path)

Edit:

For MP3s you may also be interested in this topic: Detect duplicate MP3 files with different bitrates and/or different ID3 tags?

FloPinguin
zalew
  • For performance, you should probably change unique to be a set (though it probably won't be a big factor unless there are lots of small files). Also, your code will fail if there is a directory in the dir. Check os.path.isfile() before you process them. – Brian Apr 14 '09 at 19:31
  • yep, this code it's more like a basis. I added isfile as you suggested. – zalew Apr 14 '09 at 20:07
  • 2
    Warning: I don't know why but your MD5 code generated the same hash for many files *which were not duplicate*... When I replaced by `hashlib.md5(open(filename, 'rb').read()).hexdigest()` it worked correctly. – Basj Nov 15 '16 at 09:28
  • I would like to know why we use hash for this task? Can you clarify? – João Víctor Melo Aug 26 '22 at 16:15
6

Faster algorithm

In case many big files (images, mp3s, PDF documents) have to be analyzed, it would be interesting/faster to use the following comparison algorithm:

  1. a first, fast hash is performed on the first N bytes of the file. This hash can say without doubt that two files are different, but cannot say that they are exactly the same (accuracy of the hash, limited data read from disk)

  2. a second, slower hash, which is more accurate and is performed on the whole content of the file, if a collision occurs in the first stage

Here is an implementation of this algorithm:

import hashlib
import os
import stat
import types
def Checksum(current_file_name, check_type = 'sha512', first_block = False):
  """Computes the hash for the given file. If first_block is True,
  only the first block of size size_block is hashed."""
  size_block = 1024 * 1024 # the size of the first block to hash (1 MiB)

  d = {'sha1' : hashlib.sha1, 'md5': hashlib.md5, 'sha512': hashlib.sha512}

  if(not d.has_key(check_type)):
    raise Exception("Unknown checksum method")

  file_size = os.stat(current_file_name)[stat.ST_SIZE]
  with file(current_file_name, 'rb') as f:
    key = d[check_type].__call__()
    while True:
      s = f.read(size_block)
      key.update(s)
      file_size -= size_block
      if(len(s) < size_block or first_block):
        break
  return key.hexdigest().upper()

def find_duplicates(files):
  """Find duplicates among a set of files.
  The implementation uses two types of hashes:
  - A small and fast one on the first block of the file, 
  - and in case of collision a complete hash on the file. The complete hash 
  is not computed twice.
  It flushes the files that seem to have the same content 
  (according to the hash method) at the end.
  """

  print 'Analyzing', len(files), 'files'

  # this dictionary will receive small hashes
  d = {}
  # this dictionary will receive full hashes. It is filled
  # only in case of collision on the small hash (contains at least two 
  # elements)
  duplicates = {}

  for f in files:

    # small hash to be fast
    check = Checksum(f, first_block = True, check_type = 'sha1')

    if(not d.has_key(check)):
      # d[check] is a list of files that have the same small hash
      d[check] = [(f, None)]
    else:
      l = d[check]
      l.append((f, None))

      for index, (ff, checkfull) in enumerate(l):

        if(checkfull is None):
          # computes the full hash in case of collision
          checkfull = Checksum(ff, first_block = False)
          l[index] = (ff, checkfull)

          # for each new full hash computed, check if there is 
          # a collision in the duplicate dictionary. 
          if(not duplicates.has_key(checkfull)):
            duplicates[checkfull] = [ff]
          else:
            duplicates[checkfull].append(ff)

  # prints the detected duplicates
  if(len(duplicates) != 0):
    print
    print "The following files have the same sha512 hash"

    for h, lf in duplicates.items():
      if(len(lf)==1):
        continue
      print 'Hash value', h
      for f in lf:
        print '\t', f.encode('unicode_escape') if \
          type(f) is types.UnicodeType else f
  return duplicates

The find_duplicates function takes a list of files. This way, it is also possible to compare two directories (for instance, to better synchronize their content). An example of a function that creates a list of files with specified extensions, avoiding some directories, is below:

def getFiles(_path, extensions = ['.png'], 
             subdirs = False, avoid_directories = None):
  """Returns the list of files in the path :'_path', 
     of extension in 'extensions'. 'subdir' indicates if 
     the search should also be performed in the subdirectories. 
     If extensions = [] or None, all files are returned.
     avoid_directories: if set, do not parse subdirectories that 
     match any element of avoid_directories."""

  l = []
  extensions = [p.lower() for p in extensions] if not extensions is None \
    else None
  for root, dirs, files in os.walk(_path, topdown=True):

    for name in files:
      if(extensions is None or len(extensions) == 0 or \
         os.path.splitext(name)[1].lower() in extensions):
        l.append(os.path.join(root, name))

    if(not subdirs):
      while(len(dirs) > 0):
        dirs.pop()
    elif(not avoid_directories is None):
      for d in avoid_directories:
        if(d in dirs): dirs.remove(d)

  return l    

This method is convenient for not parsing .svn paths for instance, which would surely produce colliding files in find_duplicates.
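
For instance, a small usage sketch combining the two functions (the directory paths below are just placeholders):

files = getFiles('/photos/2015', extensions=[], subdirs=True, avoid_directories=['.svn'])
files += getFiles('/photos/backup', extensions=[], subdirs=True, avoid_directories=['.svn'])
duplicates = find_duplicates(files)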

Feedback is welcome.

Raffi
6

I wrote one in Python some time ago -- you're welcome to use it.

import sys
import os
import hashlib

check_path = (lambda filepath, hashes, p = sys.stdout.write:
        (lambda hash = hashlib.sha1 (file (filepath).read ()).hexdigest ():
                ((hash in hashes) and (p ('DUPLICATE FILE\n'
                                          '   %s\n'
                                          'of %s\n' % (filepath, hashes[hash])))
                 or hashes.setdefault (hash, filepath)))())

scan = (lambda dirpath, hashes = {}: 
                map (lambda (root, dirs, files):
                        map (lambda filename: check_path (os.path.join (root, filename), hashes), files), os.walk (dirpath)))

((len (sys.argv) > 1) and scan (sys.argv[1]))
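
Note that the tuple-unpacking lambdas and file() above are Python 2 only. A rough Python 3 sketch of the same idea (not the original author's code) could look like this:

import sys
import os
import hashlib

def check_path(filepath, hashes):
    # hash the whole file and report if the digest was already seen
    digest = hashlib.sha1(open(filepath, 'rb').read()).hexdigest()
    if digest in hashes:
        print('DUPLICATE FILE\n   %s\nof %s' % (filepath, hashes[digest]))
    else:
        hashes[digest] = filepath

def scan(dirpath, hashes=None):
    hashes = {} if hashes is None else hashes
    for root, dirs, files in os.walk(dirpath):
        for filename in files:
            check_path(os.path.join(root, filename), hashes)

if len(sys.argv) > 1:
    scan(sys.argv[1])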
Ali
John Millikin
5

@IanLee1521 has a nice solution here. It is very efficient because it checks for duplicates based on the file size first.

#! /usr/bin/env python

# Originally taken from:
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# Original Author: Andres Torres

# Adapted to only compute the md5sum of files with the same size

import argparse
import os
import sys
import hashlib


def find_duplicates(folders):
    """
    Takes in an iterable of folders and prints & returns the duplicate files
    """
    dup_size = {}
    for i in folders:
        # Iterate the folders given
        if os.path.exists(i):
            # Find the duplicated files and append them to dup_size
            join_dicts(dup_size, find_duplicate_size(i))
        else:
            print('%s is not a valid path, please verify' % i)
            return {}

    print('Comparing files with the same size...')
    dups = {}
    for dup_list in dup_size.values():
        if len(dup_list) > 1:
            join_dicts(dups, find_duplicate_hash(dup_list))
    print_results(dups)
    return dups


def find_duplicate_size(parent_dir):
    # Dups in format {hash:[names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parent_dir):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Check to make sure the path is valid.
            if not os.path.exists(path):
                continue
            # Calculate sizes
            file_size = os.path.getsize(path)
            # Add or append the file path
            if file_size in dups:
                dups[file_size].append(path)
            else:
                dups[file_size] = [path]
    return dups


def find_duplicate_hash(file_list):
    print('Comparing: ')
    for filename in file_list:
        print('    {}'.format(filename))
    dups = {}
    for path in file_list:
        file_hash = hashfile(path)
        if file_hash in dups:
            dups[file_hash].append(path)
        else:
            dups[file_hash] = [path]
    return dups


# Joins two dictionaries
def join_dicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]


def hashfile(path, blocksize=65536):
    afile = open(path, 'rb')
    hasher = hashlib.md5()
    buf = afile.read(blocksize)
    while len(buf) > 0:
        hasher.update(buf)
        buf = afile.read(blocksize)
    afile.close()
    return hasher.hexdigest()


def print_results(dict1):
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('Duplicates Found:')
        print(
            'The following files are identical. The name could differ, but the'
            ' content is identical'
            )
        print('___________________')
        for result in results:
            for subresult in result:
                print('\t\t%s' % subresult)
            print('___________________')

    else:
        print('No duplicate files found.')


def main():
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'folders', metavar='dir', type=str, nargs='+',
        help='A directory to parse for duplicates',
        )
    args = parser.parse_args()

    find_duplicates(args.folders)


if __name__ == '__main__':
    sys.exit(main())
qun
3
import hashlib
import os
import sys

def read_chunk(fobj, chunk_size=2048):
    """ Files can be huge so read them in chunks of bytes. """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def remove_duplicates(dir, hashfun=hashlib.sha512):
    unique = set()
    for filename in os.listdir(dir):
        filepath = os.path.join(dir, filename)
        if os.path.isfile(filepath):
            hashobj = hashfun()
            for chunk in read_chunk(open(filepath, 'rb')):
                hashobj.update(chunk)
                # the size of the hashobj is constant
                # print("hashfun: ", hashfun.__sizeof__())
            hashfile = hashobj.hexdigest()
            if hashfile not in unique:
                unique.add(hashfile)
            else:
                os.remove(filepath)

try:
    hashfun = hashlib.sha256
    remove_duplicates(sys.argv[1], hashfun)
except IndexError:
    print("Please pass a path to a directory with "
          "duplicate files as a parameter to the script.")
ady
2

Python has a standard library called filecmp to compare files and directories.

It checks the file size first, then compares content in 8 KiB chunks. It works on binary files.

It does not hash.

Python docs for filecmp: https://docs.python.org/3/library/filecmp.html
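
A minimal sketch of how filecmp could be used for duplicate detection - grouping by size first and then byte-comparing the candidates (the path at the bottom is just a placeholder):

import filecmp
import os
from collections import defaultdict

def find_duplicate_files(root):
    # group files by size; only same-size files can be duplicates
    by_size = defaultdict(list)
    for dirpath, dirnames, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                by_size[os.path.getsize(path)].append(path)
            except OSError:
                continue
    # within each size group, byte-compare each file against the distinct ones seen so far
    for size, paths in by_size.items():
        distinct = []
        for path in paths:
            for other in distinct:
                if filecmp.cmp(path, other, shallow=False):
                    print('Duplicate found: %s and %s' % (path, other))
                    break
            else:
                distinct.append(path)

find_duplicate_files('/path/to/folder')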

brocla
  • `filecmp` is nice for checking a small number of files. If you have to deal with larger numbers, it has the potential to be rather slow. Checking `n` files of equal size for duplicates with `filecmp` can require to open and to (partly) read each file `n-1` times, whereas hashing only requires it once. – Timus Feb 01 '22 at 22:30
0

In order to be safe (removing them automatically can be dangerous if something goes wrong!), here is what I use, based on @zalew's answer.

Please also note that the md5 sum code is slightly different from @zalew's, because his code generated too many wrong duplicate files (that's why I said removing them automatically is dangerous!).

import hashlib, os
unique = dict()
for filename in os.listdir('.'):
    if os.path.isfile(filename):
        filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest()

        if filehash not in unique: 
            unique[filehash] = filename
        else:
            print(filename + ' is a duplicate of ' + unique[filehash])
Basj
0

I have found 100% working code for removing duplicate files recursively inside a folder. Just replace the folder name in the clean method with your folder name.

import time
import os
import shutil
from hashlib import sha256


class Duplython:
    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        print('******************************************************************')
        print('****************        DUPLYTHON      ****************************')
        print('********************************************************************\n\n')
        print('----------------        WELCOME        ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')
        return None

    def generate_hash(self, Filename: str) -> str:
        Filehash = sha256()
        try:
            with open(Filename, 'rb') as File:
                fileblock = File.read(self.block_size)
                while len(fileblock) > 0:
                    Filehash.update(fileblock)
                    fileblock = File.read(self.block_size)
                Filehash = Filehash.hexdigest()
            return Filehash
        except:
            return False

    def clean(self) -> None:
        all_dirs = [path[0] for path in os.walk('E:\\songs')]
        for path in all_dirs:
            os.chdir(path)
            All_Files = [file for file in os.listdir() if os.path.isfile(file)]
            for file in All_Files:
                filehash = self.generate_hash(file)
                if not filehash in self.File_hashes:
                    if filehash:
                        self.File_hashes.append(filehash)
                        # print(file)
                else:
                    byte_saved = os.path.getsize(file)
                    self.count_cleaned += 1
                    self.Total_bytes_saved += byte_saved
                    os.remove(file)
                    filename = file.split('/')[-1]
                    print(filename, '.. cleaned ')
            os.chdir(self.home_dir)

    def cleaning_summary(self) -> None:
        mb_saved = self.Total_bytes_saved / 1048576
        mb_saved = round(mb_saved, 2)
        print('\n\n--------------FINISHED CLEANING ------------')
        print('File cleaned  : ', self.count_cleaned)
        print('Total Space saved : ', mb_saved, 'MB')
        print('-----------------------------------------------')

    def main(self) -> None:
        self.welcome()
        self.clean()
        self.cleaning_summary()


#
# if __name__ == '__main__':
#     App = Duplython()
#     App.main()


def dedupe_bing_images():
    App = Duplython()
    App.main()
    return True


dedupe_bing_images()
Praveen Kumar