The following C code lists the files and directories under a path, and it does so about 4 times faster than the Linux find command. I only need the count of the directories; I am not interested in the file count, let alone in listing the files. Is there a way to optimize the code below and make it more efficient?

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>

void listdir(char *path, size_t size) {
    DIR *dir;
    struct dirent *entry;
    size_t len = strlen(path);

    if (!(dir = opendir(path))) {
        fprintf(stderr, "path not found: %s: %s\n",
                path, strerror(errno));
        return;
    }

    puts(path);
    while ((entry = readdir(dir)) != NULL) {
        char *name = entry->d_name;
        if (entry->d_type == DT_DIR) {
            if (!strcmp(name, ".") || !strcmp(name, ".."))
                continue;
            if (len + strlen(name) + 2 > size) {
                fprintf(stderr, "path too long: %s/%s\n", path, name);
            } else {
                path[len] = '/';
                strcpy(path + len + 1, name);
                listdir(path, size);
                path[len] = '\0';
            }
        } else {
            printf("%s/%s\n", path, name);
        }
    }
    closedir(dir);
}

int main(int argc, char *argv[]) {

    if (argc == 2) {
        printf("Path:  %s\n", argv[1]);
    }
    else if (argc > 2) {
        printf("Too many arguments supplied.\n");
    }
    else {
        printf("One argument expected.\n");
        return 0;
    }
    char path[1024];
    memcpy(path, argv[1], 1024);
    listdir(path, sizeof path);
    return 0;
}

Removing the following lines will of course stop the files from being displayed, but it does not speed up the execution time:

} else {
            printf("%s/%s\n", path, name);
        }
zlobul
  • Use `strncpy` instead of `memcpy`. Your code has out-of-bounds access. – S.S. Anne Jun 28 '19 at 17:40
  • Use `strlen(path)` instead of `sizeof path`. Again, possible out-of-bounds access. – S.S. Anne Jun 28 '19 at 17:41
  • Use `PATH_MAX` instead of `1024`. Best to make use of what you have. – S.S. Anne Jun 28 '19 at 17:42
  • An `opendir` error does not necessarily mean "path not found". An error message of the form "path not found: /foo: permission denied" is not quite as helpful (and potentially confusing) as the simpler "/foo: permission denied". Don't try to guess. Remove the "path not found". – William Pursell Jun 28 '19 at 17:49
  • @JL2210 - `strncpy` does not ensure a *nul-terminated* string and is of no help here. See [man strcpy(3)](http://man7.org/linux/man-pages/man3/strcpy.3.html) and particularly the sentence in the DESCRIPTION that begins with **Warning:**. (your latter two comments are fine) – David C. Rankin Jun 28 '19 at 18:40
  • @DavidC.Rankin Still **way** better than copying 1024 bytes off of `argv[1]`. – S.S. Anne Jun 28 '19 at 18:41
  • There is no question about that. The `PATH_MAX` size and a simple `strcpy` (after checking the `strlen`) would suffice. (but if you check `strlen` there is no need to scan for end-of-string again, simply `size_t plen = strlen (argv[1]); if (plen < PATH_MAX - 1) memcpy (path, argv[1], plen + 1)`) – David C. Rankin Jun 28 '19 at 18:43
  • What are you measuring as "code execution time"? The code is inherently I/O bound since it is accessing data from a (typically slow compared with, say, RAM) physical device. The main way to speed it up would be to either optimise access to the directory entries themselves by some low level mechanism, or to somehow do caching (e.g. if doing a count on the same directory multiple times, avoid recounting). The benefits of tweaking string copying - the other work your code is doing - will be insignificant compared with the device access. – Peter Jun 28 '19 at 23:37
  • @ElPpop: you can accept the answer by clicking on the grey checkmark below its score. – chqrlie Aug 17 '19 at 22:33
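
For reference, here is a minimal sketch of the bounded copy the comments above describe: measure the argument once, reject it if it cannot fit, then copy the bytes including the terminator. This is an illustration under stated assumptions, not authoritative code: it assumes `PATH_MAX` is defined in `<limits.h>` on the target platform, and the buffer name `path` simply mirrors the question's code.

#include <limits.h> /* PATH_MAX: assumed to be defined on this platform */
#include <stdio.h>
#include <string.h>

int main(int argc, char *argv[]) {
    char path[PATH_MAX];

    if (argc != 2) {
        fprintf(stderr, "one argument expected.\n");
        return 1;
    }
    size_t plen = strlen(argv[1]);   /* scan the string once */
    if (plen >= sizeof path) {       /* leave room for the '\0' */
        fprintf(stderr, "path too long: %s\n", argv[1]);
        return 1;
    }
    memcpy(path, argv[1], plen + 1); /* + 1 copies the terminator too */
    printf("Path:  %s\n", path);
    return 0;
}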

1 Answer

If you are not interested in printing the filenames, just remove the printf statements.

Note however that there are some problems in the code:

  • memcpy(path, argv[1], 1024); may read beyond the end of the string pointed to by argv[1], which is undefined behavior, or not produce a proper C string, which leads to undefined behavior in the function listdir.

You could also avoid recomputing the length of the directory name in each recursive call.

Here is a modified version you can try:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <dirent.h>
#include <unistd.h>

long long countdirs(char *path, size_t size, size_t len) {
    DIR *dir;
    struct dirent *entry;
    long long count;

    if (!(dir = opendir(path))) {
        fprintf(stderr, "path not found: %s: %s\n",
                path, strerror(errno));
        return 0;
    }

    count = 1; // count this directory
    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_type == DT_DIR) {
            char *name = entry->d_name;
            size_t len1 = strlen(name);
            if (*name == '.' && (len1 == 1 || (len1 == 2 && name[1] == '.')))
                continue;
            if (len + len1 + 2 > size) {
                count++;
                fprintf(stderr, "path too long: %s/%s\n", path, name);
            } else {
                path[len] = '/';
                memcpy(path + len + 1, name, len1 + 1);
                count += countdirs(path, size, len + 1 + len1);
                path[len] = '\0';
            }
        }
    }
    closedir(dir);
    return count;
}

int main(int argc, char *argv[]) {
    char buf[4096];
    char *path;
    size_t len;

    if (argc != 2) {
        fprintf(stderr, "one argument expected.\n");
        return 1;
    }
    path = argv[1];
    len = strlen(path);
    if (len >= sizeof(buf)) {
        fprintf(stderr, "path too long: %s\n", path);
        return 1;
    }   
    memcpy(buf, path, len + 1);
    printf("%s: %lld directories\n", path, countdirs(buf, sizeof buf, len));
    return 0;
}

Further notes:

  • The above code might fail if the directory tree is too deep or contains loops. Failure may come from running out of file descriptors, causing opendir to fail.

  • You should try an alternative approach using the POSIX standard function nftw(), documented in this answer: https://stackoverflow.com/a/29402705/4593267 (a minimal sketch follows these notes).

  • As suggested by EOF, since the paths are not actually used, constructing them is not required. It might be safer and more efficient to use openat() and fdopendir(), documented here: https://pubs.opengroup.org/onlinepubs/9699919799/functions/open.html and https://pubs.opengroup.org/onlinepubs/9699919799/functions/fdopendir.html (a sketch also follows these notes).

  • There is little point in optimizing this function as most of the time is spent in the OS or waiting for the storage device. The effect of file system caching may be huge: I measured a 15x difference on Linux for 133000 directories. Using a different set of system calls may improve or worsen the speed, but small improvements are probably highly system specific.
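
To make the first two notes concrete, here is a minimal sketch of the nftw() approach, assuming a POSIX system; it illustrates the idea rather than being a tested drop-in replacement. nftw() bounds the number of directory handles held open at once (the third argument), and FTW_PHYS keeps it from following symbolic links, which addresses the depth and loop concerns above. Unreadable directories are reported with the FTW_DNR flag, so they are counted the same way the code above counts them:

#define _XOPEN_SOURCE 700 /* expose nftw() in <ftw.h> */
#include <ftw.h>
#include <stdio.h>

static long long dir_count; /* nftw() passes no user pointer, hence a global */

static int count_dir(const char *fpath, const struct stat *sb,
                     int typeflag, struct FTW *ftwbuf) {
    (void)fpath; (void)sb; (void)ftwbuf;
    if (typeflag == FTW_D || typeflag == FTW_DNR) /* readable or not */
        dir_count++;
    return 0; /* keep walking */
}

int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "one argument expected.\n");
        return 1;
    }
    /* 20 caps the simultaneously open directory handles;
       FTW_PHYS: do not follow symbolic links, avoiding loops */
    if (nftw(argv[1], count_dir, 20, FTW_PHYS) == -1) {
        perror("nftw");
        return 1;
    }
    printf("%s: %lld directories\n", argv[1], dir_count);
    return 0;
}

And a sketch of the openat()/fdopendir() idea, again an illustration under assumptions rather than a definitive implementation: no path strings are built at all, since each subdirectory is opened relative to its parent's descriptor. Like the recursive version above, it relies on d_type (which some file systems do not fill in) and keeps one descriptor open per level of depth:

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Count this directory and everything below it; takes ownership of dirfd. */
static long long countdirs_at(int dirfd) {
    DIR *dir = fdopendir(dirfd); /* on success, the DIR owns dirfd */
    struct dirent *entry;
    long long count = 1;         /* count this directory */

    if (!dir) {
        close(dirfd);
        return count;
    }
    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_type == DT_DIR) {
            const char *name = entry->d_name;
            if (!strcmp(name, ".") || !strcmp(name, ".."))
                continue;
            int fd = openat(dirfd, name, O_RDONLY | O_DIRECTORY);
            if (fd != -1)
                count += countdirs_at(fd);
            else
                count++;         /* unreadable: count it, skip its contents */
        }
    }
    closedir(dir);               /* also closes dirfd */
    return count;
}

int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "one argument expected.\n");
        return 1;
    }
    int fd = open(argv[1], O_RDONLY | O_DIRECTORY);
    if (fd == -1) {
        fprintf(stderr, "%s: %s\n", argv[1], strerror(errno));
        return 1;
    }
    printf("%s: %lld directories\n", argv[1], countdirs_at(fd));
    return 0;
}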

chqrlie
  • Thanks a lot for your feedback and outputs. Regarding the performance: your code's execution time is the same as the above one. – zlobul Jun 28 '19 at 18:36
  • Why muck around with paths at all? Just use `openat()` and `fdopendir()`. – EOF Jun 28 '19 at 18:37
  • I have tested the POSIX approach and it might be more stable, but it performs a lot slower than your code. – zlobul Jun 28 '19 at 18:53
  • @EOF: indeed, since paths are not used, constructing them is not required. It might be safer and more efficient to use `openat()` and `fdopendir()` (documented here: https://pubs.opengroup.org/onlinepubs/9699919799/functions/open.html and https://pubs.opengroup.org/onlinepubs/9699919799/functions/fdopendir.html). – chqrlie Jun 29 '19 at 15:07
  • I have tested the code with fdopendir() and it performs 1 s slower (9 s) than opendir() (8 s) for the same number of directories: 690000. – zlobul Jul 01 '19 at 07:53
  • 8s for 690000 directories! that's not bad, on MacOS, I get 4,7s of user time for 895410 directories, but 35s of system time for a total of 77s elapsed time! I suspect `readdir` does a lot of work behind the scenes on MacOS, such as sorting the entries. On linux, my timings are consistent with yours, once the directory tree is cached (15x improvement from uncached). Most of the time is spent in the system or waiting for the storage device. – chqrlie Jul 01 '19 at 08:32