2

I am writing a program to compare two binary files and plot the first difference. I want to read 16 bytes of data from each file continuously and compare them. For that I am storing 16 bytes from both file into char *buffer1, buffer2. When I print the output I am getting that buffer1 has both the data of file1 and file2.

The code is as follows:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

void printConversion(char *buf1, char *buf2) {
    size_t len = strlen(buf1);
    char *binary = malloc(len * 8 + 1); 
    binary[0] = '\0';
    for (size_t i = 0; i < len; ++i) {
        char ch = buf1[i];
        for (int j = 7; j >= 0; --j) {
            if (ch & (1 << j)) {
                strcat(binary,"1");
            } else {
                strcat(binary,"0");
            }
        }
    }

    printf("File1: %s\t", binary);
    free(binary);
    printf("File2:");
    for (int i = 0; i < sizeof(buf2); i++) {
        printf("%x", buf2[i] - '0');
    }
}

void fileRead(FILE *fp, char *buf, int count) {
    fseek(fp, count, SEEK_SET);    
    fread(buf, 1, 16, fp);
}

int fileSize(FILE *fp) {
    fseek(fp, 0, SEEK_END);
    int size = ftell(fp) + 1;
    return size;
}

int main(int argc, char *argv[]) {
    printf("***Binary File Comparator***\n ");
    int count = 0;
    int index = 0;
    char buffer1[16];
    char buffer2[16];
    char buffer3[16];
    char buffer4[16];

    // Invalid Number of Arguments
    if (argc < 3 || argc > 3) {
        printf("Invalid Number of Arguments\n");
    }

    FILE *fp1, *fp2;
    fp1 = fopen(argv[1], "rb");
    int size = fileSize(fp1);
    int size1 = size;
    fclose(fp1);

    while (size > 1) {
        fp1 = fopen(argv[1], "rb");
        fileRead(fp1, buffer1, count);
        fclose(fp1);

        fp2 = fopen(argv[2], "rb");
        fileRead(fp2, buffer2, count);
        if (size1 < count) {
            int lastSize = count - size1;
            count = count + lastSize;
            fclose(fp2);
        } else {
            count = count+16;
            fclose(fp2);
        }

        **printf("buffer1:%s\tbuffer2:%s\n", buffer1, buffer2)**;
        size = size - 16;

        int result = strcmp(buffer1, buffer2);
        if (result != 0) {
            for (int i = 0; i < sizeof(buffer1); i++) {
                if (buffer1[i] != buffer2[i]) {
                    int count1 = (count - 16) + i;
                    index++;
                    if (index == 1) {
                        printf("Byte_Offset:%x\n", count1);
                        fp1 = fopen(argv[1], "rb");
                        fileRead(fp1, buffer3, count1);
                        fclose(fp1);
                        fp2 = fopen(argv[2], "rb");
                        fileRead(fp2, buffer4, count1);
                        fclose(fp2);
                        printConversion(buffer3, buffer4);
                        break;
                    }
                } else {
                    continue;
                }
            } 
        }
    }
}

I have tried to highlight the printf part that is printing my buffer1 and buffer2

The output is as follows:

 buffer1:83867715933586928386771593358692   buffer2:8386771593358692
buffer1:49216227905963264921622790596326    buffer2:4921622790596326
buffer1:40267236116867294026723611686729    buffer2:4026723611686729
buffer1:82306223673529228230622367352922    buffer2:8230622367352922
buffer1:25869679356114222586967935611422    buffer2:2586967935611422

Can anybody help what I am doing wrong. Please point me the error and what optimization changes could be done in code. I am at learning stage your feedback will be very helpful.

chqrlie
  • 131,814
  • 10
  • 121
  • 189
Bhavin Thakar
  • 43
  • 1
  • 7
  • You should only call fopen() once on each file, at the top of your program. Then read through the file as necessary, and call fclose() on each file, once, at the end of your program. – Jeremy Friesner Mar 06 '20 at 19:59
  • 1
    Also open the file in "rb" for read binary instead of "r" – EvilTeach Mar 06 '20 at 19:59
  • Also you call `malloc()` to allocate memory from the heap, but never `free()` the memory when you're done with it, which means you are leaking memory. Be sure to call `free()` when you are done using the `malloc()`'d buffer. – Jeremy Friesner Mar 06 '20 at 20:01
  • I might be wrong but based on my observation I am saying earlier when I tried the steps you told I got in my console as file1 output was printed ok but when I tried to print output for file 2 . console leave some space and then started printing. I guess that gap was for file 1. So as you said open both the files process and then close. It just opened both the files and there was difference in space when printed output for only file 2. – Bhavin Thakar Mar 06 '20 at 20:08
  • Why are you finding the size of each file? Just read in a loop until there's no more data. IOW, rather than trying to pre-compute how many iterations you'll need, just read until you can't read any more. – William Pursell Mar 06 '20 at 20:14
  • There is a question (and answer) that covers this [here](https://stackoverflow.com/questions/20688258/compare-two-files-byte-by-byte). – ryyker Mar 06 '20 at 20:14
  • You start out with `count == 0`. So you read 0 bytes from file 1, and zero bytes from file 2. – William Pursell Mar 06 '20 at 20:16
  • Are you sure your `bufferN` arrays contain a room for a zero byte to terminate the string? That is necessary for example for `strlen()` to work correctly in `printConversion()`. – CiaPan Mar 06 '20 at 20:16
  • No,count is for offset to tell from what point we need to read next string @WilliamPursell – Bhavin Thakar Mar 06 '20 at 20:20
  • Ack! I just looked at your `fileRead` implementation! Don't do that. Don't keep seeking all over the place. For this code, you should have zero `seek` calls. Open a file and do `while(( n = fread()) > 0)`. – William Pursell Mar 06 '20 at 20:24
  • I have done that because I have to read both the file and compare it we can't write ``` while((n1=fread(file1))>0 || (n2 = fread(file2))>0)``` @WilliamPursell – Bhavin Thakar Mar 06 '20 at 20:31
  • Can you also point out the difference in buffer1 and buffer 2 output? – Bhavin Thakar Mar 06 '20 at 20:48
  • You really, really don't want to seek. You absolutely can (and should) do `while( (n1 = fread(file1...)) > 0 || (n2 = fread(file2..)) > 0)` – William Pursell Mar 06 '20 at 20:54
  • Well, except that you shouldn't be using `fread` at all. `fgetc` is sufficient for this. – William Pursell Mar 06 '20 at 20:58
  • Can You please point out the error why buffer 1 is having both the values of buf1 and buf 2 – Bhavin Thakar Mar 06 '20 at 21:08
  • It could simply be that the `fseek` or the `fread` (or both) is failing. You should check the return values. Also, if these are binary files, you ought not use `%s` to print them. – William Pursell Mar 06 '20 at 21:15
  • then what to use to print them – Bhavin Thakar Mar 06 '20 at 21:25
  • If you have arbitrary binary data, one good approach is to use `%02x` and print a hex dump. It completely depends on your use case. – William Pursell Mar 06 '20 at 22:53

3 Answers3

5

You are complicating the task by reading 16 bytes at a time. If the goal is to indicate the first difference, just read one byte at a time from both files with getc() this way:

int compare_files(FILE *fp1, FILE *fp2) {
    unsigned long pos;
    int c1, c2;
    for (pos = 0;; pos++) {
        c1 = getc(fp1);
        c2 = getc(fp2);
        if (c1 != c2 || c1 == EOF)
            break;
    }
    if (c1 == c2) {
        printf("files are identical and have %lu bytes\n", pos);
        return 0;  // files are identical
    } else
    if (c1 == EOF) {
        printf("file1 is included in file2, the first %lu bytes are identical\n", pos);
        return 1;
    } else
    if (c2 == EOF) {
        printf("file2 is included in file1, the first %lu bytes are identical\n", pos);
        return 2;
    } else {
        printf("file1 and file2 differ at position %lu: 0x%02X <> 0x%02X\n", pos, c1, c2);
        return 3;
    }
}

In terms of efficiency, reading one byte at a time does not pose a problem if the streams are buffered. For large files, you can get better performance by memory mapping the file contents if available on the target system and for the given input streams.

chqrlie
  • 131,814
  • 10
  • 121
  • 189
  • @chux-ReinstateMonica: I resisted the temptation to simplify the loop test as `(c1 = getc(fp1)) == (c2 == getc(fp2)) && c1 != EOF)` with a empty loop body, because it is less readable for beginners. – chqrlie Mar 07 '20 at 16:00
  • You don't need `else` after `return`. – CiaPan Mar 13 '20 at 09:06
  • @CiaPan: Indeed the `else` clauses are not necessary, but they do not produce any extra code and help show the regular structure of the final dispatch. There are 4 cases, each producing a message on `stdout` and returning a different value. – chqrlie Mar 13 '20 at 16:52
1

Not an actual answer, but a word on optimisation. You can increase the speed of the program if you have a bigger buffer. Basically the larger the buffer the faster the program runs HOWEVER the speed you gain from just making it larger will increase logarithmically.

Here is a picture of a graph that will help you understand. Also, what i mentioned applies to any simmilar situation. This includes: Copying files, filling the sound buffer etc. Loading the entire file in your RAM first and operationg on it will usually be faster than loading parts of it. Ofc this is not possible with larger files but still this is what you should aim for if you want speed.

PS: I'm writting here because i don't have rep to comment.

EDIT: I came up with solution but since you did not state what you need to do with your buffer3 and buffer4 i packed it up inside a function.

If you are sure that you are only going to use 16 bytes as a buffer size, remove the nBufferSize parameter and replace the buffer dynamic allocation with a static one.

If after the execution you need the buffers, add them as parameters and keep the nBufferSize param. Keep in mind that if you intend to use them outside the function, you should also allocate them outside the function, so things don't get messy.

/** Returns 0 if files are identical, 1 if they are different and -1 if there 
is an error. */
int FileCmp(char* szFile1, char* szFile2, int nBufferSize)
{
    FILE *f1, *f2;
    f1 = fopen(szFile1, "rb");
    f2 = fopen(szFile2, "rb");

    // Some error checking?
    if (f1 == NULL || f2 == NULL)
        return -1;

    // You can check here for file sizes before you start comparing them.
    //  ...

    // Start the comparrison.

    /// Replace this part with static allocation. --------
    char* lpBuffer1 = malloc(sizeof(char)*nBufferSize);
    if (lpBuffer1 == NULL) // close the files and return error.
    {
        fclose(f1);
        fclose(f2);
        return -1;
    }
    char* lpBuffer2 = malloc(sizeof(char)*nBufferSize);
    if (lpBuffer2 == NULL) // close the files, free buffer1 and return error.
    {
        free(lpBuffer1);
        fclose(f1);
        fclose(f2);
        return -1;
    }
    /// --------------------------------------------------

    while(1)
    {
        unsigned int uRead1 = fread(lpBuffer1, sizeof(char), nBufferSize, f1);
        unsigned int uRead2 = fread(lpBuffer2, sizeof(char), nBufferSize, f2);

        if (uRead1 != uRead2)
            goto lFilesAreDifferent;

        for(unsigned int i = 0; i < uRead1; i++)
            if (lpBuffer1[i] != lpBuffer2[i])
                goto lFilesAreDifferent;

        if ((feof(f1) != 0) && (feof(f2) != 0))
            break; // both files have nothing more to read and are identical.

        goto lSkip;

        lFilesAreDifferent:
            free(lpBuffer1);
            free(lpBuffer2);
            fclose(f1);
            fclose(f2);
            return 1;

        lSkip:;
    }

    // The files are the same. Close them, free the buffers and return 0.
    free(lpBuffer1);
    free(lpBuffer2);
    fclose(f1);
    fclose(f2);
    return 0;
}

A simple Demo:

#define BUFFER_SIZE 16
int main(int nArgs, char** szArgs)
{
    if (nArgs != 3)
    {
        printf("Invalid number of arguments.");
        return 0;
    }

    int nResult = FileCmp(szArgs[1], szArgs[2], BUFFER_SIZE);
    switch (nResult)
    {
        case 0: printf("Files [%s] and [%s] are identical.", szArgs[1], szArgs[2]); break;
        case 1: printf("Files [%s] and [%s] are different.", szArgs[1], szArgs[2]); break;
        case -1: printf("Error."); break;
    }

    return 0;
}

EDIT II: Personally, i have never used the C standard FILE library (it was either C++ fstream or pure win32 fileapi) so don't take my word here for granted but fread is the fastest function i could find (faster than fgets or fgetc). If you want even faster than this you should get into OS dependant functions (like ReadFile() for Windows).

Tudor
  • 436
  • 3
  • 10
  • This is a good answer but I have limitation of 16 bytes of memory buffer – Bhavin Thakar Mar 06 '20 at 20:51
  • Then 16 it should be. I hit my `sweet spot` when copying files at around 2048 bytes on my laptop. – Tudor Mar 06 '20 at 20:54
  • Not opening, seeking, reading, and closing each file every time through the loop would surely help too. Open both files outside the `while` loop, and then just keep reading them each time through the loop. The file pointers will keep track of the current position, so you don't have to keep seeking to the next 16 bytes. – Caleb Mar 06 '20 at 20:59
0

chqrlie's solution using getc is absolutely the right way to do this. I wanted to address some points brought up in comments, and find it's best to do that with code. In one comment, I recommend pseudo code which could be confusing (namely, you can't write fwrite(file1...) || fwrite(file2 ...) because of the short circuit. But you can implement the idea of that with:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Compare two files, 16 bytes at a time. (Purely to demonstrate memcmp.
 * Clearly, this should be implemented with getc.)
 */

FILE * xfopen(const char *, const char *);
size_t xfread(void *, FILE *, const char *);

int
main(int argc, char **argv)
{
        FILE *fp[2];
        size_t n[2];
        char buf[2][16];
        unsigned count = 0;

        if(argc != 3) { return EXIT_FAILURE; }
        fp[0] = xfopen(argv[1], "r");
        fp[1] = xfopen(argv[2], "r");
        do {
                n[0] = xfread(buf[0], fp[0], argv[1]);
                n[1] = xfread(buf[1], fp[1], argv[2]);
                if( n[0] != n[1] || (n[0] && memcmp(buf[0], buf[1], n[0]))) {
                        fprintf(stderr, "files differ in block %u\n", count);
                        return 1;
                }
                count += 1;
        } while(n[0]);
        puts("files are identical");
        return 0;
}

size_t
xfread(void *b, FILE *fp, const char *name)
{
        size_t n = fread(b, 1, 16, fp);
        if(n == 0 && ferror(fp)) {
                fprintf(stderr, "Error reading %s\n", name);
                exit(EXIT_FAILURE);
        }
        return n;
}

FILE *
xfopen(const char *path, const char *mode)
{
        FILE *fp = strcmp(path, "-") ? fopen(path, mode) : stdin;
        if( fp == NULL ) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        return fp;
}
William Pursell
  • 204,365
  • 48
  • 270
  • 300