1

I am working on a program written in C that recursively walks a given directory in order to print out the Nth largest files and their sizes in bytes. I am using two arrays to account for the filesystem entry names and filesystem entry sizes respectively.

EDIT: I have updated my program to implement the suggestions shared in the comment section. My focus now is on correctly implementing a swap operation within my iSort function.

#include <stdio.h>
#include <dirent.h>
#include <sys/stat.h>
#include <string.h>

// number of files to display size information for
const int N = 10;

// a struct used to hold filesystem entry names and corresponding sizes in bytes
struct info {
    char name[1024];
    int size;
};

/* A simple implementation of insertion sort that will operate upon
   an array of info structs, sorting them by their member size in
   ascending order */
void iSort(struct info *fs_info[], int size, int info_size)
{
    int i, j, key;

    for (i = 1; i < size; i++)
    {
        key = fs_info[i]->size;
        j = i - 1;

        while (j >= 0 && fs_info[j]->size > key)
        {
            printf("info_size: %d\n", info_size);

            // TODO complete a swap operation
            memmove(fs_info[j + 1], fs_info[j], info_size);

            j = j - 1;
        }
        fs_info[j + 1]->size = key;
    }
}

void get_size(char *path, struct info fs_info[N], int info_size)
{
    static int items_added = 0;
    static int max_size = 0;

    struct stat st;
    if (stat(path, &st) == 0)
    {
        if (items_added < N) // if array capacity will not be exceeded
        {
            strcpy(fs_info[items_added].name, path);
            fs_info[items_added].size = st.st_size;

            if (st.st_size > max_size)
                max_size = st.st_size;

            items_added++;
        }
        else
        {
            // do a comparison to determine where to insert
            // sort first
            iSort(&fs_info, 10, info_size);  // this function call results in a seqfault
        }
    }
    else
    {
        printf("Error getting stat for entry %s: %d\n", path, stat(path, &st));
    }
}

void walk(const char *currDir, struct info fs_info[N], int info_size)
{
    DIR *dir = opendir(currDir);
    struct dirent *entry;

    if (dir == NULL)
    {
        // directory could not be opened
        return;
    }

    while ((entry = readdir(dir)) != NULL)
    {
        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
        {
            // if directory is current dir or parent dir
            continue;
        }

        char path_to_entry[1024];
        snprintf(path_to_entry, sizeof(path_to_entry), "%s/%s", currDir, entry->d_name);

        // use path_to_entry to call stats on the entry
        get_size(path_to_entry, fs_info, info_size);

        if (entry->d_type == DT_DIR)
        {
            // recursively visit subdirectories
            walk(path_to_entry, fs_info, info_size);
        }
    }
    closedir(dir);
}

int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("Usage: %s <target directory>\n", argv[0]);
    }

    const char *target_dir = argv[1];

    struct info fs_entries[N];
    const int info_size = sizeof(struct info);

    for (int i = 0; i < N; i++)
    {
        strcpy(fs_entries[i].name, "");
        fs_entries[i].size = 0;
    }

    printf("Finding %d largest files in: %s\n", N, target_dir);

    walk(target_dir, fs_entries, info_size);

    for (int i = 0; i < N; i++)
    {
        printf("%s : %d\n", fs_entries[i].name, fs_entries[i].size);
    }

    return 0;
}

Currently, the memmove() invocation in iSot() results in a EXC_BAD_ACCESS error, I am working on improving this function now. Any suggestions in the interim are appreciated. Thanks also to those who commented on this question earlier.

John Harrington
  • 1,314
  • 12
  • 36
  • 2
    Don't use two arrays, use one array of structs. – Barmar Dec 23 '22 at 22:45
  • It doesn't actually make a difference, but why do you put `[N]` in the parameter declarations of `entries`, but not `sizes`? – Barmar Dec 23 '22 at 22:49
  • 2
    You need to make a copy of the `path` string when putting it into `entries[items_added]`. The memory of a variable created in a loop becomes invalid when the next iteration of the loop occurs. – Barmar Dec 23 '22 at 22:54
  • 1
    Insertion sort requires you to swap the two array elements in the inner loop, not just assign one to the other. – Barmar Dec 23 '22 at 22:57
  • @Barmar thank you. I was using `[N]`in the declaration of `entries` to see if this would change anything, no strong reason to do so. Do you know what difference it makes for the compiler to provide size information or not? – John Harrington Dec 23 '22 at 23:08
  • As I said, it doesn't make any difference in this case. The size information isn't used for the first array dimension. – Barmar Dec 23 '22 at 23:12
  • 1
    re: `isort()`... Go experiment with `memmove()` shifting multiple contiguous elements within the boundaries of a fixed size array... Get comfortable with `memmove()` and you'll save lots of 'hacky' looping code (that is all-to-often wrong.) Experiment with a separate program until you've got the understanding... – Fe2O3 Dec 23 '22 at 23:38
  • Does this answer your question? [Can a local variable's memory be accessed outside its scope?](https://stackoverflow.com/questions/6441218/can-a-local-variables-memory-be-accessed-outside-its-scope) – Raymond Chen Dec 23 '22 at 23:45
  • 1
    Please give us a program not a bunch of snippets that we have to assemble. – Allan Wind Dec 24 '22 at 02:39
  • 1
    [qsort](https://en.cppreference.com/w/c/algorithm/qsort) is your friend. – Dúthomhas Dec 24 '22 at 02:46
  • Since your program targets POSIX systems, you might as well use the `nftw` function to walk the filesystem rather than roll your own. Also, `d_type` is only on Linux. – Kaz Dec 24 '22 at 03:33

2 Answers2

2

This is not so much an answer as it is an example of how one might code this with a bit less fiddling about with every bit/byte.

I don't run a flavour of UNIX, so this offering is untested and may contain typos and even bugs. I hope not.

#include <stdio.h>  // From 'generic' to 'specific'
#include <string.h>
#include <sys/stat.h>
#include <dirent.h>

const int N = 10;

// Global array holding names and sizes
struct {
    char name[ MAX_PATH ];
    size_t size;
} biggies[ N ], wrk; // and a working buffer.

/* "Global" is bad for large projects.
 * In this 'utility' program, global saves a LOT of typing/reading.
 * Seek clarity, not conformity,
 *
 * "wrk" is used to buffer ALL paths encountered.
 * Notice that each recursion is an EXTENSION of its parent.
 * ONE working buffer to deal with.
 */
void walk() {
    size_t len = strlen( wrk.name ); // The path so far...

    DIR *dir;
    if( ( dir = opendir( wrk.name ) ) == NULL )
        return;

    wrk.name[ len++ ] = '/'; // append a slash ahead of strcpy() below

    struct dirent *entry;
    while( ( entry = readdir( dir ) ) != NULL ) {
        if( strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0 )
            continue;

        // Notice how each 'local' name is written to "the right spot" in one buffer
        strcpy( wrk.name + len, entry->d_name );

        if( entry->d_type == DT_DIR ) // directory, so recursion...
            walk();
        else {
            struct stat st;
            if( stat( wrk.name, &st ) != 0 ) {
                fprintf( stderr, "Error stat'ing '%s'\n", wrk.name );
                exit( EXIT_FAILURE );
            }
            // Add this info to working buffer.
            wrk.size = st.st_size;

            // Find where to 'insert' this (if it is larger than descending ordered list
            for( int i = 0; i < N && biggies[i].size > wrk.size; i++ ) {} // loop

            if( i < N ) {
                // Slide smaller ones down (don't go out of bounds)
                memmove( biggies[i + 1], biggies[i], (N-i-1) * sizeof biggies[0] );
                // Copy this one in place.
                memcpy( biggies[i], &wrk, sizeof biggies[0] );
            }
        }
    }
    closedir(dir);
}

int main( int argc, char *argv[])  {
    char *target_dir = argv[1]; // This is okay, so far...

    if( argc != 2 ) {
        printf( "Usage: %s <target directory>\n", argv[0] );
        puts( "Using current directory..." );
        target_dir = ".";
    }

    strcpy( wrk.name, target_dir );

    printf( "Finding %d largest files in: %s\n", N, wrk.name );

    walk();

    for( size_t i = 0; i < N && biggies[i].size != 0; i++ )
        printf( "%s : %d\n", biggies[i].name, biggies[i].size);

    return 0;
}
Fe2O3
  • 6,077
  • 2
  • 4
  • 20
  • 1
    Good use of `strcpy( wrk.name + len, entry->d_name );` to reduce string copying. On normal return, could use `wrk.name[ --len] = 0;` to restore `wrk.name[]`. – chux - Reinstate Monica Dec 24 '22 at 06:59
  • @chux-ReinstateMonica Right-o, but, for this use (in recursion), it doesn't really matter what's "the rest" of the buffer. Whatever it was, it'll be clobbered almost immediately... `:-)` – Fe2O3 Dec 24 '22 at 07:04
1

You don't need to pass the address of fs_info to iSort() function. fs_info is a pointer to first element of fs_entries array, which is enough to sort the array if the size of array is known. Also, you don't need to pass the size of element of array to iSort().

iSort() function implementation:

void iSort(struct info *fs_info, int size) {
    for (int i = 1; i < size; ++i) {
        int key = fs_info[i].size;
        struct info x = fs_info[i];
        int j = i - 1;

        while (j >= 0 && fs_info[j].size > key) {
            fs_info[j + 1] = fs_info[j];
            j--;
        }

        fs_info[j + 1] = x;
    }
}

Use memmove() in iSort():

void iSort (struct info *fs_info, int size) {
    for (int i = 1; i < size; ++i) {
        int key = fs_info[i].size;
        struct info x = fs_info[i];
        int j = i - 1;

        while (j >= 0 && fs_info[j].size > key) {
            j--;
        }

        if (j != i - 1) {
            memmove (&fs_info[j + 2], &fs_info[j + 1], sizeof(*fs_info) * (i - j - 1));
        }

        memmove (&fs_info[j + 1], &x, sizeof(*fs_info));
    }
}

Call iSort() function like this:

iSort(fs_info, N);

There is a lot of scope of improvement in your code, like, it would be good to have array of pointers to struct info instead of array of struct info, so that, during sort simply swapping the pointers required instead of swapping whole structure. Leaving it up to you to identify the improvements and implement them.

H.S.
  • 11,654
  • 2
  • 15
  • 32