8
import filecmp

# Compare the local and server directory trees (dir_local / dir_server are
# defined outside this snippet). dircmp compares files with the same shallow
# comparison that filecmp.cmp() uses by default.
comparison = filecmp.dircmp(dir_local, dir_server)
# Print the comparison for both directories and, recursively, for their
# common subdirectories.
comparison.report_full_closure()

I want to compare all CSV files kept on my local machine to files kept on a server. The folder structure is the same for both of them. I only want to do a data comparison and not metadata (like time of creation, etc). I am using filecmp but it seems to perform metadata comparison. Is there a way to do what I want?

Brhaka
  • 1,622
  • 3
  • 11
  • 31
user308827
  • 21,227
  • 87
  • 254
  • 417

9 Answers9

6

There are multiple ways to compare the .csv files between the 2 repositories (server file system and local file system).


Method 1: using hashlib

This method uses the Python module hashlib. I used the hashing algorithm sha256 to compute the hash digest for the files. I compare the hashes for files with the exact file name. This method works well, but it will overlook any file that doesn't exist in both directories.

import hashlib

def compare_common_files_by_hash(directory_one, directory_two):
    """Report whether same-named files in two directories have identical content.

    Every file name present in both directories is hashed with SHA-256 and
    the two digests are compared; one line per file is printed. Names that
    exist in only one directory, and common names that are not regular files
    in both directories (e.g. sub-directories), are skipped.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be read.
    """
    def _sha256_of(path):
        # Hash in chunks so large files are never loaded into memory at once,
        # and use a context manager so the handle is always closed.
        digest = hashlib.sha256()
        with open(path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(128 * 1024), b''):
                digest.update(chunk)
        return digest.hexdigest()

    common_files = set(os.listdir(directory_one)) & set(os.listdir(directory_two))
    # Sorted so the report order is stable from run to run.
    for filename in sorted(common_files):
        path_one = os.path.join(directory_one, filename)
        path_two = os.path.join(directory_two, filename)
        if not (os.path.isfile(path_one) and os.path.isfile(path_two)):
            continue
        if _sha256_of(path_one) == _sha256_of(path_two):
            print(f'The file - {filename} is identical in the directories {directory_one} and {directory_two}')
        else:
            print(f'The file - {filename} is different in the directories {directory_one} and {directory_two}')

Method 2: using os st_size

This method uses the Python module os. In this example, I compared the size of files. This method works ok, but it will misclassify any file that has any data change that doesn't change the size of the file.

import os 

def compare_common_files_by_size(directory_one, directory_two):
    """Print, for every name found in both directories, whether the sizes match.

    Only ``st_size`` is inspected, so two files of equal length are reported
    as identical even when their contents differ.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.
    """
    names_one = set(os.listdir(directory_one))
    names_two = set(os.listdir(directory_two))
    for name in list(names_one & names_two):
        stat_one = os.stat(f'{directory_one}/{name}')
        stat_two = os.stat(f'{directory_two}/{name}')
        if stat_one.st_size != stat_two.st_size:
            print(f'The file - {name} is different in the directories {directory_one} and'
                  f' {directory_two}')
            continue
        print(f'The file - {name} is identical in the directories {directory_one} and {directory_two}')

Method 3: using os st_size and st_mtime

This method also uses the Python module os. In this example, I compared not only the size of the file, but also the last modification time. This method works well, but it can misclassify identical files as being different. In testing, I saved a file with no data modifications and st_mtime flagged the file as being different, even though its contents had not changed.

import os

 def compare_common_files_by_metadata(directory_one, directory_two):
     """Print, for every name found in both directories, whether size and mtime match.

     A file is reported as identical only when both ``st_size`` and
     ``st_mtime`` are equal in the two directories; file contents are never
     read.

     Args:
         directory_one: Path of the first directory.
         directory_two: Path of the second directory.
     """
     names_one = set(os.listdir(directory_one))
     names_two = set(os.listdir(directory_two))
     for name in list(names_one & names_two):
         stat_one = os.stat(f'{directory_one}/{name}')
         stat_two = os.stat(f'{directory_two}/{name}')
         same_size = stat_one.st_size == stat_two.st_size
         same_mtime = stat_one.st_mtime == stat_two.st_mtime
         if not (same_size and same_mtime):
             print(f'The file - {name} is different in the directories {directory_one} and'
                   f' {directory_two}')
             continue
         print(f'The file - {name} is identical in the directories {directory_one} and {directory_two}')

Method 4: using set()

This example uses Python set() to determine the line to line differences between 2 csv files with the same name. This method will output the exact change between the 2 csv files.

import os

def compare_common_files_by_lines(directory_one, directory_two):
    """Print every CSV row that appears in only one of two same-named files.

    For each ``.csv`` file name present in both directories, the rows of both
    files are loaded into sets and their symmetric difference is printed.
    Because sets are used, row order and duplicate rows are ignored.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be read.
    """
    # Local import: the snippet's import block only brings in ``os``.
    import csv

    common_files = set(os.listdir(directory_one)) & set(os.listdir(directory_two))
    # Sorted so the report order is stable from run to run.
    for filename in sorted(common_files):
        if not filename.endswith('.csv'):
            continue
        path_one = os.path.join(directory_one, filename)
        path_two = os.path.join(directory_two, filename)
        # newline='' is what the csv module asks for; the context managers
        # guarantee both handles are closed.
        with open(path_one, 'r', encoding='ISO-8859-1', newline='') as file_01, \
                open(path_two, 'r', encoding='ISO-8859-1', newline='') as file_02:
            csv_file_01 = set(map(tuple, csv.reader(file_01)))
            csv_file_02 = set(map(tuple, csv.reader(file_02)))
        different = csv_file_01 ^ csv_file_02
        for row in sorted(different, reverse=True):
            if row:  # blank lines parse to an empty tuple; skip them
                print(f'This row: \n {row} \n was different between the file {filename} in the directories'
                      f' {directory_one} and {directory_two}')

Method 5: using filecmp.cmp

This method uses the Python module filecmp. In this example I used filecmp.cmp with shallow set to False. Setting this parameter to False instructs filecmp to look at the contents of the files and not the metadata, such as filesize, which is the default for filecmp.cmp. This method works as well as Method 1, that used hashlib.

import filecmp

def compare_common_files(directory_one, directory_two):
    """Print, for every name found in both directories, whether the contents match.

    Uses ``filecmp.cmp`` with ``shallow=False`` so the file contents, not the
    ``os.stat()`` signatures, decide the result.

    Args:
        directory_one: Path of the first directory.
        directory_two: Path of the second directory.
    """
    names_one = set(os.listdir(directory_one))
    names_two = set(os.listdir(directory_two))
    for name in list(names_one & names_two):
        left_path = f'{directory_one}/{name}'
        right_path = f'{directory_two}/{name}'
        if filecmp.cmp(left_path, right_path, shallow=False):
            print(f'The file - {name} is identical in the directories - {directory_one} and {directory_two}')
        else:
            print(f'The file - {name} is different in the directories - {directory_one} and {directory_two}')

Method 6: using filecmp.dircmp

This method also uses the Python module filecmp. In this example I used filecmp.dircmp, which allows me not only to identify files that exist in just one of the 2 directories, but also to find files that have the same name but different content.

import filecmp

def directory_recursive(directory_one, directory_two):
    """Recursively report differing and one-sided entries of two directory trees.

    The two directories are compared with ``filecmp.dircmp`` and the report
    is repeated for every sub-directory that exists on both sides, so nested
    differences are listed too. ``dircmp`` compares files with the same
    shallow comparison that ``filecmp.cmp()`` uses by default.

    Args:
        directory_one: Path of the first ("left") directory.
        directory_two: Path of the second ("right") directory.
    """
    _report_directory_comparison(filecmp.dircmp(directory_one, directory_two))


def _report_directory_comparison(files):
    """Print the findings of one ``dircmp`` node, then descend into its common subdirs."""
    for filename in files.diff_files:
        print(f'The file - {filename} is different in the directories - {files.left} and {files.right}')
    for filename in files.left_only:
        print(f'The file - {filename} - was only found in the directory {files.left}')
    for filename in files.right_only:
        print(f'The file - {filename} - was only found in the directory {files.right}')
    # ``subdirs`` maps each common sub-directory name to its own dircmp object.
    for child in files.subdirs.values():
        _report_directory_comparison(child)

Method 7: line-by-line comparison

This example does a line-by-line comparison of 2 csv files and outputs the lines that are different. The output can be added to either a Python dictionary or a JSON file for secondary processing.

import csv

def get_csv_file_lines(file):
    """Lazily yield the rows of a UTF-8 encoded CSV file.

    Args:
        file: Path of the CSV file to read.

    Yields:
        Each row as a list of strings, in file order.
    """
    # newline='' lets the csv module do its own line-ending handling, so
    # newlines embedded in quoted fields are returned unchanged.
    with open(file, 'r', encoding='utf-8', newline='') as csv_file:
        yield from csv.reader(csv_file)

def compare_csv_files_line_by_line(csv_file_one, csv_file_two):
    """Print every row position at which two CSV files differ.

    Rows are paired by position. When one file has fewer rows than the
    other, the missing rows are reported as ``None`` instead of raising
    ``StopIteration`` (file two shorter) or being silently ignored (file two
    longer).

    Args:
        csv_file_one: Path of the first CSV file.
        csv_file_two: Path of the second CSV file.
    """
    # Local import: the snippet's import block only brings in ``csv``.
    from itertools import zip_longest

    row_pairs = zip_longest(get_csv_file_lines(csv_file_one),
                            get_csv_file_lines(csv_file_two))
    for line_one, line_two in row_pairs:
        if line_two != line_one:
            print('File names being compared:')
            print(f'csv_file_one: {csv_file_one}')
            print(f'csv_file_two: {csv_file_two}')
            print('The following rows have difference in the files being compared.')
            print('csv_file_one:', line_one)
            print('csv_file_two:', line_two)
            print('\n')

Local file system to S3 bucket using hashlib

The example below is a real-world use case for comparing files between a local file system and a remote S3 bucket. I was originally going to use the object.e_tag that AWS S3 creates, but that tag can have issues and shouldn't be used in a hashing comparison operation. I decided to query S3 and load an individual file into a memory file system that could be queried and emptied during each comparison operation. This method worked very well and had no adverse impact on my system performance.

import fs
import os
import boto3
import hashlib

def create_temp_memory_filesystem():
    """Open an in-memory filesystem containing a single 'hidden_dir' directory.

    Returns:
        A 2-tuple ``(memory_filesystem, directory)`` where ``directory`` is
        whatever ``makedir('hidden_dir')`` returns for the new directory.
    """
    memory_fs = fs.open_fs('mem://')
    return memory_fs, memory_fs.makedir('hidden_dir')

def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy one S3 object into the in-memory filesystem, byte for byte.

    The object whose key equals ``filename`` is downloaded and stored as
    ``<temp_directory>/s3_<filename>``. Nothing is written when the bucket
    has no such key.

    Args:
        filename: Key of the S3 object to fetch.
        memory_filesystem: Filesystem object the copy is written to.
        temp_directory: Directory inside ``memory_filesystem`` for the copy.
    """
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    # The prefix filter narrows the listing on the server side instead of
    # walking every object in the bucket.
    for obj in bucket.objects.filter(Prefix=filename):
        if obj.key == filename:
            body = obj.get()['Body'].read()
            # Binary mode: writing str(body) would store the "b'...'" repr of
            # the bytes, which can never match the local file.
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'wb') as f:
                f.write(body)
            break  # keys are unique, so there is nothing more to find

 def compare_local_files_to_s3_files(local_csv_files):
     """Compare local CSV files with their S3 counterparts by SHA-256 digest.

     Each ``.csv`` file in ``local_csv_files`` is hashed, the S3 object of the
     same name is copied into a temporary in-memory filesystem, hashed there,
     and one line per compared file is printed. Local files with no matching
     S3 object produce no output.

     Args:
         local_csv_files: Path of the local directory holding the CSV files.
     """
     virtual_disk = create_temp_memory_filesystem()
     try:
         # Name of the scratch directory, recovered from the string form of
         # the directory handle returned by the helper.
         directory_name = str(virtual_disk[1]).split('/')[1]
         for filename in sorted(set(os.listdir(local_csv_files))):
             if not filename.endswith('.csv'):
                 continue
             with open(os.path.join(local_csv_files, filename), 'rb') as local_file:
                 local_file_hash = hashlib.sha256(local_file.read()).hexdigest()
             query_s3_file_by_name(filename, virtual_disk[0], directory_name)
             virtual_files = virtual_disk[0].opendir(directory_name)
             for file_name in virtual_files.listdir('/'):
                 # Read through the memory filesystem: the builtin open()
                 # would look for this name on the real disk.
                 with virtual_files.open(file_name, 'rb') as remote_file:
                     s3_file_hash = hashlib.sha256(remote_file.read()).hexdigest()
                 if local_file_hash == s3_file_hash:
                     print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                 else:
                     print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                 # Empty the scratch directory before the next comparison.
                 virtual_files.remove(file_name)
     finally:
         virtual_disk[0].close()

Local file system to S3 bucket using filecmp

This example is the same as the one above except I use filecmp.cmp instead of hashlib for the comparison operation.

import fs
import os
import boto3
import filecmp

def create_temp_memory_filesystem():
    """Open an in-memory filesystem containing a single 'hidden_dir' directory.

    Returns:
        A 2-tuple ``(memory_filesystem, directory)`` where ``directory`` is
        whatever ``makedir('hidden_dir')`` returns for the new directory.
    """
    memory_fs = fs.open_fs('mem://')
    return memory_fs, memory_fs.makedir('hidden_dir')

def query_s3_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy one S3 object into the in-memory filesystem, byte for byte.

    The object whose key equals ``filename`` is downloaded and stored as
    ``<temp_directory>/s3_<filename>``. Nothing is written when the bucket
    has no such key.

    Args:
        filename: Key of the S3 object to fetch.
        memory_filesystem: Filesystem object the copy is written to.
        temp_directory: Directory inside ``memory_filesystem`` for the copy.
    """
    s3 = boto3.resource('s3', aws_access_key_id='your_access_key_id',
                        aws_secret_access_key='your_secret_access_key')
    bucket = s3.Bucket('your_bucket_name')
    # The prefix filter narrows the listing on the server side instead of
    # walking every object in the bucket.
    for obj in bucket.objects.filter(Prefix=filename):
        if obj.key == filename:
            body = obj.get()['Body'].read()
            # Binary mode: writing str(body) would store the "b'...'" repr of
            # the bytes, which can never match the local file.
            with memory_filesystem.open(f'{temp_directory}/s3_{filename}', 'wb') as f:
                f.write(body)
            break  # keys are unique, so there is nothing more to find

def compare_local_files_to_s3_files(local_csv_files):
    """Compare local CSV files with their S3 counterparts using ``filecmp.cmp``.

    Each S3 object is first copied into a temporary in-memory filesystem.
    Because ``filecmp.cmp`` can only read paths on the real filesystem, the
    in-memory copy is then written to a short-lived temporary directory
    before the content comparison (``shallow=False``) is run. Local files
    with no matching S3 object produce no output.

    Args:
        local_csv_files: Path of the local directory holding the CSV files.
    """
    # Local import: the snippet's import block does not bring in tempfile.
    import tempfile

    virtual_disk = create_temp_memory_filesystem()
    try:
        # Name of the scratch directory, recovered from the string form of
        # the directory handle returned by the helper.
        directory_name = str(virtual_disk[1]).split('/')[1]
        for filename in sorted(set(os.listdir(local_csv_files))):
            if not filename.endswith('.csv'):
                continue
            local_file = os.path.join(local_csv_files, filename)
            query_s3_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                with tempfile.TemporaryDirectory() as scratch_dir:
                    remote_copy = os.path.join(scratch_dir, file_name)
                    with virtual_files.open(file_name, 'rb') as source, \
                            open(remote_copy, 'wb') as target:
                        target.write(source.read())
                    comparison = filecmp.cmp(local_file, remote_copy, shallow=False)
                if comparison:
                    print(f'The file - {filename} is identical in both the local file system and the S3 bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the S3 bucket.')
                # Empty the scratch directory before the next comparison.
                virtual_files.remove(file_name)
    finally:
        virtual_disk[0].close()

Local file system to Google Cloud storage bucket using hashlib

This example is similar to the S3 hashlib code example above, but it uses a Google Cloud storage bucket.

import fs
import os
import hashlib
from google.cloud import storage

def create_temp_memory_filesystem():
    """Open an in-memory filesystem containing a single 'hidden_dir' directory.

    Returns:
        A 2-tuple ``(memory_filesystem, directory)`` where ``directory`` is
        whatever ``makedir('hidden_dir')`` returns for the new directory.
    """
    memory_fs = fs.open_fs('mem://')
    return memory_fs, memory_fs.makedir('hidden_dir')

def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
    """Copy one Google Cloud Storage blob into the in-memory filesystem.

    The blob whose name equals ``filename`` is downloaded and stored as
    ``<temp_directory>/<filename>``. Nothing is written when the bucket has
    no such blob.

    Args:
        filename: Name of the blob to fetch.
        memory_filesystem: Filesystem object the copy is written to.
        temp_directory: Directory inside ``memory_filesystem`` for the copy.
    """
    client = storage.Client.from_service_account_json('path_to_your_credentials.json')
    bucket = client.get_bucket('your_bucket_name')
    # The prefix narrows the listing on the server side instead of walking
    # every blob in the bucket.
    for blob in bucket.list_blobs(prefix=filename):
        if blob.name == filename:
            # download_to_filename() saves to local disk and returns None, so
            # the old code stored the text "None"; fetch the bytes instead.
            with memory_filesystem.open(f'{temp_directory}/{filename}', 'wb') as f:
                f.write(blob.download_as_bytes())
            break  # blob names are unique within a bucket

def compare_local_files_to_google_storage_files(local_csv_files):
    """Compare local CSV files with their Google Cloud Storage copies by SHA-256.

    Each ``.csv`` file in ``local_csv_files`` is hashed, the blob of the same
    name is copied into a temporary in-memory filesystem, hashed there, and
    one line per compared file is printed. Local files with no matching blob
    produce no output.

    Args:
        local_csv_files: Path of the local directory holding the CSV files.
    """
    virtual_disk = create_temp_memory_filesystem()
    try:
        # Name of the scratch directory, recovered from the string form of
        # the directory handle returned by the helper.
        directory_name = str(virtual_disk[1]).split('/')[1]
        for filename in sorted(set(os.listdir(local_csv_files))):
            if not filename.endswith('.csv'):
                continue
            with open(os.path.join(local_csv_files, filename), 'rb') as local_file:
                local_file_hash = hashlib.sha256(local_file.read()).hexdigest()
            query_google_cloud_storage_file_by_name(filename, virtual_disk[0], directory_name)
            virtual_files = virtual_disk[0].opendir(directory_name)
            for file_name in virtual_files.listdir('/'):
                # Read through the memory filesystem: the builtin open()
                # would look for this name on the real disk.
                with virtual_files.open(file_name, 'rb') as remote_file:
                    gs_file_hash = hashlib.sha256(remote_file.read()).hexdigest()
                if local_file_hash == gs_file_hash:
                    print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                else:
                    print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                # Empty the scratch directory before the next comparison.
                virtual_files.remove(file_name)
    finally:
        virtual_disk[0].close()

Local file system to Google Cloud storage bucket using filecmp

This example is similar to the S3 filecmp code example above, but it uses a Google Cloud storage bucket.

 import fs
 import os
 import filecmp
 from google.cloud import storage

 def create_temp_memory_filesystem():
     """Open an in-memory filesystem containing a single 'hidden_dir' directory.

     Returns:
         A 2-tuple ``(memory_filesystem, directory)`` where ``directory`` is
         whatever ``makedir('hidden_dir')`` returns for the new directory.
     """
     memory_fs = fs.open_fs('mem://')
     return memory_fs, memory_fs.makedir('hidden_dir')

 def query_google_cloud_storage_file_by_name(filename, memory_filesystem, temp_directory):
     """Copy one Google Cloud Storage blob into the in-memory filesystem.

     The blob whose name equals ``filename`` is downloaded and stored as
     ``<temp_directory>/<filename>``. Nothing is written when the bucket has
     no such blob.

     Args:
         filename: Name of the blob to fetch.
         memory_filesystem: Filesystem object the copy is written to.
         temp_directory: Directory inside ``memory_filesystem`` for the copy.
     """
     client = storage.Client.from_service_account_json('path_to_your_credentials.json')
     bucket = client.get_bucket('your_bucket_name')
     # The prefix narrows the listing on the server side instead of walking
     # every blob in the bucket.
     for blob in bucket.list_blobs(prefix=filename):
         if blob.name == filename:
             # download_to_filename() saves to local disk and returns None, so
             # the old code stored the text "None"; fetch the bytes instead.
             with memory_filesystem.open(f'{temp_directory}/{filename}', 'wb') as f:
                 f.write(blob.download_as_bytes())
             break  # blob names are unique within a bucket

 def compare_local_files_to_google_storage_files(local_csv_files):
     """Compare local CSV files with their Google Cloud Storage copies via ``filecmp.cmp``.

     Each blob is first copied into a temporary in-memory filesystem. Because
     ``filecmp.cmp`` can only read paths on the real filesystem, the
     in-memory copy is then written to a short-lived temporary directory
     before the content comparison (``shallow=False``) is run. Local files
     with no matching blob produce no output.

     Args:
         local_csv_files: Path of the local directory holding the CSV files.
     """
     # Local import: the snippet's import block does not bring in tempfile.
     import tempfile

     virtual_disk = create_temp_memory_filesystem()
     try:
         # Name of the scratch directory, recovered from the string form of
         # the directory handle returned by the helper.
         directory_name = str(virtual_disk[1]).split('/')[1]
         for filename in sorted(set(os.listdir(local_csv_files))):
             if not filename.endswith('.csv'):
                 continue
             local_file = os.path.join(local_csv_files, filename)
             query_google_cloud_storage_file_by_name(filename, virtual_disk[0], directory_name)
             virtual_files = virtual_disk[0].opendir(directory_name)
             for file_name in virtual_files.listdir('/'):
                 with tempfile.TemporaryDirectory() as scratch_dir:
                     remote_copy = os.path.join(scratch_dir, file_name)
                     with virtual_files.open(file_name, 'rb') as source, \
                             open(remote_copy, 'wb') as target:
                         target.write(source.read())
                     comparison = filecmp.cmp(local_file, remote_copy, shallow=False)
                 if comparison:
                     print(f'The file - {filename} is identical in both the local file system and the Google Cloud bucket.')
                 else:
                     print(f'The file - {filename} is different between the local file system and the Google Cloud bucket.')
                 # Empty the scratch directory before the next comparison.
                 virtual_files.remove(file_name)
     finally:
         virtual_disk[0].close()
Life is complex
  • 15,374
  • 5
  • 29
  • 58
3

Use the optional shallow argument for filecmp : A bool value ‘True’ or ‘False’. The default value of this parameter is True.

From Documentation :

If shallow is true and the os.stat() signatures (file type, size, and modification time) of both files are identical, the files are taken to be equal.

Otherwise, the files are treated as different if their sizes or contents differ.

import filecmp

# Path of the first file to compare
file1 = "/home/geeks/Desktop/gfg/data.txt"

# Path of the second file to compare
file2 = "/home/geeks/Desktop/gfg/gfg.txt"

# Shallow comparison (the default, shallow=True): files whose os.stat()
# signatures (file type, size, modification time) are identical are taken
# to be equal.
comp = filecmp.cmp(file1, file2)

# Print the result of the shallow comparison
print(comp)

# shallow=False: compare the contents of both files
comp = filecmp.cmp(file1, file2, shallow = False)

# Print the result of the content comparison
print(comp)

documentation link

https://www.geeksforgeeks.org/python-filecmp-cmp-method/#:~:text=cmp()%20method%20in%20Python,size%2C%20date%20modified%20etc.)

tommytucker7182
  • 213
  • 1
  • 5
  • 11
Darwin
  • 1,695
  • 1
  • 19
  • 29
2

The issue is that filecmp.dircmp performs a shallow comparison:

The dircmp class compares files by doing shallow comparisons as described for filecmp.cmp()

Shallow comparison means that filecmp will check if file A and file B os.stat is equal. In that case, it returns true. If false, it then compares A and B contents and returns true if they are equal, and false otherwise.


In order to ignore os.stat, you can use filecmp.cmpfiles(dir1, dir2, common, shallow=False). Take note that filecmp.cmpfiles works as following:

Compare the files in the two directories dir1 and dir2 whose names are given by common.

You can read more about it here.


Also, you can loop through all the files inside dir1 and dir2, and for each one run filecmp.cmp(f1, f2, shallow=False). You can read more about filecmp.cmp here.


If you have doubts on how shallow works, this answer might help you.

Brhaka
  • 1,622
  • 3
  • 11
  • 31
1

According to filecmp documentation:

The filecmp module defines functions to compare files and directories, with various optional time/correctness trade-offs. For comparing files, see also the difflib module.

Specifically, .cmp and .cmpfiles compare files using their signatures, as well as, other metadata:

filecmp.cmp(f1, f2, shallow=True)

Compare the files named f1 and f2, returning True if they seem equal, False otherwise. If shallow is true, files with identical os.stat() signatures are taken to be equal. Otherwise, the contents of the files are compared. Note that no external programs are called from this function, giving it portability and efficiency. This function uses a cache for past comparisons and the results, with cache entries invalidated if the os.stat() information for the file changes. The entire cache may be cleared using clear_cache().

filecmp.cmpfiles(dir1, dir2, common, shallow=True)

Compare the files in the two directories dir1 and dir2 whose names are given by common. Returns three lists of file names: match, mismatch, errors. match contains the list of files that match, mismatch contains the names of those that don’t, and errors lists the names of files which could not be compared. Files are listed in errors if they don’t exist in one of the directories, the user lacks permission to read them or if the comparison could not be done for some other reason. The shallow parameter has the same meaning and default value as for filecmp.cmp(). For example, cmpfiles('a', 'b', ['c', 'd/e']) will compare a/c with b/c and a/d/e with b/d/e. 'c' and 'd/e' will each be in one of the three returned lists.

Furthermore if you wish for a diff output consider using difflib noted in the filecmp documentation.

References

filecmp: https://docs.python.org/3/library/filecmp.html

difflib: https://docs.python.org/3/library/difflib.html#module-difflib

pygeek
  • 7,356
  • 1
  • 20
  • 41
1

Try using git and diff.

https://gitpython.readthedocs.io/en/stable/

from git import Repo

# Open the repository at the path 'my_repo' (Repo comes from GitPython)
repo = Repo('my_repo')

# Check differences between current files and last commit; the result is
# printed below.
diff = repo.git.diff(repo.head.commit.tree)
print(diff)
Merlin
  • 24,552
  • 41
  • 131
  • 206
1

If you need a once in a while solution, winmerge compares specific files as well as entire directories. The compare does not include meta information.

If you need a continuous solution, winscp is an ftp client that can be set to constantly compare directories. There are probably other ftp clients that can do this. And you can programmatically move or process files using powershell or something similar.

I realize that this isn't a python answer, but you may be doing unnecessary work by trying to code this yourself (and you won't be able to do a better job).

mson
  • 7,762
  • 6
  • 40
  • 70
  • Don't these solutions do more than compare and actually merge directories so that the "target" directory is updated with all the files that are in the "source" directory? That may be biting off far more than the OP is willing to chew. – Booboo Nov 10 '20 at 18:23
  • @Booboo - winmerge is manual, so you can do whatever you intend manually. winscp can be automated to push/move/overwrite/alert depending on what is intended. – mson Nov 13 '20 at 02:26
  • Looking at winmerge's folder comparison, it seems to be based on file dates and sizes (not what the OP wants). WinScp's synchronize command has a `-preview` option but as far as I can tell it too is based on dates and/or sizes depending on how you set the `-criteria` option; I don't think there is any way in which WinScp will do a byte-for-byte comparison of the files' contents. – Booboo Nov 13 '20 at 12:31
1

This is quick, dirty, and resource intensive ;) If you're on linux, call diff, if you're on windows, call fc. That is, if you just want to know if they have the same data. You would need to be able to access the files 'from the server' locally, so either download them and bin them once you've compared - or mount a shared drive if that's feasible. As you're going to compare the data wherever your code is running, you either have to upload your data or download the server data anyway, so just pull it down and bin it when you're done. e.g. on windows:

import subprocess
def files_are_a_match(file1, file2):
    """Return True when two files are byte-identical according to Windows ``fc``.

    Runs ``fc /B`` (binary compare) on the two paths and looks for fc's
    "no differences" message in the combined output.

    Args:
        file1: Windows path of the first file, as a string.
        file2: Windows path of the second file, as a string.

    Returns:
        True for matching files; False on mismatch or when the command
        fails or cannot be started.
    """
    try:
        # An argument list without shell=True keeps quotes or shell
        # metacharacters in the paths from being interpreted as commands.
        txt = subprocess.check_output(["fc", "/B", file1, file2], stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError):
        # A non-zero exit status or a missing/unlaunchable fc both count as
        # "not a match", as documented above.
        return False
    return b"FC: no differences encountered" in txt

A better approach to get a 'is this not the same?' answer would be to generate a hash of the files, if you have control of the server you would do that there, and your own locally, and then compare hashes (less data flying about). But it's not clear what your intention is or what your control of the server is.

amateur
  • 38
  • 7
1

Here is one way to compare contents of the CSV files.

  • Create a dictionary with file names as keys and SHA hashes as values.
  • Do this on both local and remote machines.
  • Compare dictionaries (identical contents will have identical hashes).

Import packages and create two functions:

import hashlib
from pathlib import Path
from time import perf_counter

def sha256sum(filename):
    """Return the SHA-256 hex digest of a file, read in 128 KiB chunks.

    Source: https://stackoverflow.com/a/44873382/13608599

    Args:
        filename: Path of the file to hash.
    """
    digest = hashlib.sha256()
    # One reusable buffer plus a memoryview over it, so each chunk is hashed
    # without allocating a new bytes object.
    buffer = bytearray(128 * 1024)
    view = memoryview(buffer)
    with open(filename, 'rb', buffering=0) as stream:
        while True:
            count = stream.readinto(view)
            if count == 0:
                break
            digest.update(view[:count])
    return digest.hexdigest()

def csv_hashes(dir_name):
    """Map every CSV file found recursively under ``dir_name`` to its SHA-256 digest.

    Args:
        dir_name: Directory object providing ``rglob`` (e.g. a ``pathlib.Path``).

    Returns:
        A dict whose keys are the paths yielded by ``rglob('*.csv')`` and
        whose values are the matching ``sha256sum`` results.
    """
    hashes = {}
    for csv_path in dir_name.rglob('*.csv'):
        hashes[csv_path] = sha256sum(csv_path)
    return hashes

Specify the top-level directory and build filename: hash value dict on the local machine.

# Top-level directory that csv_hashes() searches recursively for CSV files.
local_dir = Path('../../../projects')

# Time how long it takes to hash every CSV file under local_dir.
start = perf_counter()
local_hashes = csv_hashes(local_dir)
elapsed = perf_counter() - start

# Throughput: number of files hashed per second of elapsed time.
rate = len(local_hashes) / elapsed
print(f'indexed {rate:.3f} files/sec')

indexed 53.342 files/sec  ## too slow for real-world use case?

Repeat on the remote machine and compare the two dictionaries.

jsmart
  • 2,921
  • 1
  • 6
  • 13
0

This program uses package pysftp from the PyPI repository. It recursively walks the local directory looking for csv files. For each file found, it computes the path in the remote directory of the corresponding remote csv file and using pysftp first tests whether the remote file exists or not. If the file exists, the file is read. pysftp (and the sftp 3 protocol in general) only supports reading binary. So an assumption is made that the file contents can be decoded using utf-8. Both the local and remote files are "normalized" to take into account that different line-ending conventions may be in use for the two files if different OS platforms are being used before being compared. The files are then compared for equality. You can, of course, modify how the output is to be displayed.

#!/usr/bin/env python3

import pysftp
import sys
from pathlib import Path
from io import BytesIO
import re

# Root of the local tree that main() scans recursively for *.csv files.
LOCAL_DIR = 'C:\\My\\Directory\\' # with closing separator
# Remote root that main() passes to Sftp as deploymentDirectory.
REMOTE_DIR = '/home/directory/' # absolute directory with closing separator


class Sftp:
    """Small wrapper around a pysftp connection rooted at one remote directory."""

    def __init__(self, host, port, username, password, deploymentDirectory, verbose=True):
        """Connect to the SFTP server, exiting the process with status 1 on failure.

        ``deploymentDirectory`` gets a trailing '/' appended when it lacks
        one, so remote file names can simply be concatenated to it.
        """
        needs_separator = deploymentDirectory[-1] != '/'
        if needs_separator:
            deploymentDirectory += '/'
        self.deployment_directory = deploymentDirectory
        self.verbose = verbose
        self.connection = None
        try:
            self.connection = pysftp.Connection(
                host, port=port, username=username, password=password)
        except Exception:
            print('Could not connect to remote sftp server with the specified arguments.', file=sys.stderr)
            sys.exit(1)

    def __del__(self):
        """Close the connection when the wrapper is destroyed."""
        self.close()

    def close(self):
        """Close the connection if one is open; later calls do nothing."""
        if not self.connection:
            return
        self.connection.close()
        self.connection = None

    def read_text_file(self, remote_file_name):
        """Download a remote file and return its content decoded as UTF-8."""
        buffer = BytesIO()
        self.connection.getfo(self.deployment_directory + remote_file_name, buffer)
        return buffer.getvalue().decode('utf-8')

    def remote_file_exists(self, remote_file_name):
        """Return the connection's ``isfile`` result for the remote file."""
        return self.connection.isfile(self.deployment_directory + remote_file_name)


def compare(local_text, remote_text):
    r"""Return True when two texts are equal once line endings are normalized.

    The files could be the same except for the way the hosts handle the
    line-termination sequence (Windows: \r\n, Unix/Linux: \n, Mac: \r), so
    every \r\n and every lone \r is converted to \n before comparing.

    Args:
        local_text: Content of the local file.
        remote_text: Content of the remote file.
    """
    rex = re.compile(r'\r\n?')
    local_text = rex.sub('\n', local_text)
    remote_text = rex.sub('\n', remote_text)
    # Compare local against remote; comparing local_text with itself always
    # returned True.
    return local_text == remote_text


def main():
    """Compare every local CSV file under LOCAL_DIR with its remote counterpart.

    For each ``*.csv`` file found recursively under ``LOCAL_DIR``, the path
    relative to ``LOCAL_DIR`` is looked up below ``REMOTE_DIR`` on the SFTP
    server, and one line is printed saying whether the remote file is
    missing, matches, or differs.
    """
    sftp = Sftp(host='demo.com', port=22, username='xxxx', password='xxxx', deploymentDirectory=REMOTE_DIR)
    try:
        for path in Path(LOCAL_DIR).rglob('*.csv'):
            # Relative path with forward slashes. relative_to() avoids slicing
            # the string by len(LOCAL_DIR), which depended on the trailing
            # separator and gave a leading '/' for files directly in LOCAL_DIR.
            remote_file_name = path.relative_to(LOCAL_DIR).as_posix()
            if not sftp.remote_file_exists(remote_file_name):
                print(f'{path}: This file does not exist in remote directory.')
                continue
            remote_text = sftp.read_text_file(remote_file_name)
            local_text = path.read_text(encoding='utf-8')
            if compare(local_text, remote_text):
                print(f'{path} exists in the remote directory and matches.')
            else:
                print(f'{path} exists in the remote directory but does not match.')
    finally:
        # Release the connection even if a read or decode fails part-way.
        sftp.close()


# Run the comparison as soon as this script is executed.
main()
Booboo
  • 38,656
  • 3
  • 37
  • 60