68

I need to recursively list all directories and files in C programming. I have looked into FTW but that is not included with the 2 operating systems that I am using (Fedora and Minix). I am starting to get a big headache from all the different things that I have read over the past few hours.

If somebody knows of a code snippet I could look at that would be amazing, or if anyone can give me good direction on this I would be very grateful.

Alex Riley
  • 169,130
  • 45
  • 262
  • 238
Charlie
  • 1,308
  • 5
  • 14
  • 24
  • Why not just do this in a scripting language? That would be faster and easier to write. – dbeer Dec 08 '11 at 20:21
  • 4
    @dbeer What if he needs this information inside a C program? –  Oct 01 '13 at 13:38
  • 1
    Are you sure you want to perform the action recursively? I would point out that cyclic links and open file limits might pose an issue for recursive implementations. I would consider using a linked list (or two), so the code could check against previously processed folders. This will also allow the code to use a single open file while traversing deep hierarchies. – Myst Jun 27 '17 at 05:20

6 Answers6

118

Why does everyone insist on reinventing the wheel again and again?

POSIX.1-2008 standardized the nftw() function, also defined in the Single Unix Specification v4 (SuSv4), and available in Linux (glibc, man 3 nftw), OS X, and most current BSD variants. It is not new at all.

Naïve opendir()/readdir()/closedir() -based implementations almost never handle the cases where directories or files are moved, renamed, or deleted during the tree traversal, whereas nftw() should handle them gracefully.

As an example, consider the following C program that lists the directory tree starting at the current working directory, or at each of the directories named on the command line, or just the files named at the command line:

/* We want POSIX.1-2008 + XSI, i.e. SuSv4, features */
#define _XOPEN_SOURCE 700

/* Added on 2017-06-25:
   If the C library can support 64-bit file sizes
   and offsets, using the standard names,
   these defines tell the C library to do so. */
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64 

#include <stdlib.h>
#include <unistd.h>
#include <ftw.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* POSIX.1 says each process has at least 20 file descriptors.
 * Three of those belong to the standard streams.
 * Here, we use a conservative estimate of 15 available;
 * assuming we use at most two for other uses in this program,
 * we should never run into any problems.
 * Most trees are shallower than that, so it is efficient.
 * Deeper trees are traversed fine, just a bit slower.
 * (Linux allows typically hundreds to thousands of open files,
 *  so you'll probably never see any issues even if you used
 *  a much higher value, say a couple of hundred, but
 *  15 is a safe, reasonable value.)
*/
#ifndef USE_FDS
#define USE_FDS 15
#endif

int print_entry(const char *filepath, const struct stat *info,
                const int typeflag, struct FTW *pathinfo)
{
    /* const char *const filename = filepath + pathinfo->base; */
    const double bytes = (double)info->st_size; /* Not exact if large! */
    struct tm mtime;

    localtime_r(&(info->st_mtime), &mtime);

    printf("%04d-%02d-%02d %02d:%02d:%02d",
           mtime.tm_year+1900, mtime.tm_mon+1, mtime.tm_mday,
           mtime.tm_hour, mtime.tm_min, mtime.tm_sec);

    if (bytes >= 1099511627776.0)
        printf(" %9.3f TiB", bytes / 1099511627776.0);
    else
    if (bytes >= 1073741824.0)
        printf(" %9.3f GiB", bytes / 1073741824.0);
    else
    if (bytes >= 1048576.0)
        printf(" %9.3f MiB", bytes / 1048576.0);
    else
    if (bytes >= 1024.0)
        printf(" %9.3f KiB", bytes / 1024.0);
    else
        printf(" %9.0f B  ", bytes);

    if (typeflag == FTW_SL) {
        char   *target;
        size_t  maxlen = 1023;
        ssize_t len;

        while (1) {

            target = malloc(maxlen + 1);
            if (target == NULL)
                return ENOMEM;

            len = readlink(filepath, target, maxlen);
            if (len == (ssize_t)-1) {
                const int saved_errno = errno;
                free(target);
                return saved_errno;
            }
            if (len >= (ssize_t)maxlen) {
                free(target);
                maxlen += 1024;
                continue;
            }

            target[len] = '\0';
            break;
        }

        printf(" %s -> %s\n", filepath, target);
        free(target);

    } else
    if (typeflag == FTW_SLN)
        printf(" %s (dangling symlink)\n", filepath);
    else
    if (typeflag == FTW_F)
        printf(" %s\n", filepath);
    else
    if (typeflag == FTW_D || typeflag == FTW_DP)
        printf(" %s/\n", filepath);
    else
    if (typeflag == FTW_DNR)
        printf(" %s/ (unreadable)\n", filepath);
    else
        printf(" %s (unknown)\n", filepath);

    return 0;
}


int print_directory_tree(const char *const dirpath)
{
    int result;

    /* Invalid directory path? */
    if (dirpath == NULL || *dirpath == '\0')
        return errno = EINVAL;

    result = nftw(dirpath, print_entry, USE_FDS, FTW_PHYS);
    if (result >= 0)
        errno = result;

    return errno;
}

int main(int argc, char *argv[])
{
    int arg;

    if (argc < 2) {

        if (print_directory_tree(".")) {
            fprintf(stderr, "%s.\n", strerror(errno));
            return EXIT_FAILURE;
        }

    } else {

        for (arg = 1; arg < argc; arg++) {
            if (print_directory_tree(argv[arg])) {
                fprintf(stderr, "%s.\n", strerror(errno));
                return EXIT_FAILURE;
            }
        }

    }

    return EXIT_SUCCESS;
}

Most of the code above is in print_entry(). Its task is to print out each directory entry. In print_directory_tree(), we tell nftw() to call it for each directory entry it sees.

The only hand-wavy detail above is the decision on how many file descriptors one should let nftw() use. If your program uses at most two extra file descriptors (in addition to the standard streams) during the file tree walk, 15 is known to be safe (on all systems having nftw() and being mostly POSIX-compliant).

In Linux, you could use sysconf(_SC_OPEN_MAX) to find the maximum number of open files, and subtract the number you use concurrently with the nftw() call, but I wouldn't bother (unless I knew the utility would be used mostly with pathologically deep directory structures). Fifteen descriptors does not limit the tree depth; nftw() just gets slower (and might not detect changes in a directory if walking a directory deeper than 13 directories from that one, although the tradeoffs and general ability to detect changes vary between systems and C library implementations). Just using a compile-time constant like that keeps the code portable -- it should work not just on Linux, but on Mac OS X and all current BSD variants, and most other not-too-old Unix variants, too.

In a comment, Ruslan mentioned that they had to switch to nftw64() because they had filesystem entries that required 64-bit sizes/offsets, and the "normal" version of nftw() failed with errno == EOVERFLOW. The correct solution is to not switch to GLIBC-specific 64-bit functions, but to define _LARGEFILE64_SOURCE and _FILE_OFFSET_BITS 64. These tell the C library to switch to 64-bit file sizes and offsets if possible, while using the standard functions (nftw(), fstat(), et cetera) and type names (off_t etc.).

Nominal Animal
  • 38,216
  • 5
  • 59
  • 86
  • 13
    The primary reason for reinventing this particular wheel is that it is set as an exercise in the use of `opendir()`, `readdir()`, `closedir()`, and the object is to teach people to think carefully about full path names vs directory entries. In production code, `nftw()` is the way to go — don't get me wrong. But there's also a pedagogic reason for the exercise using the raw functions. – Jonathan Leffler Apr 02 '16 at 17:00
  • @JonathanLeffler: I understand your point, but I disagree. That particular pedagogical approach has the severe downside that *people actually learn to rely on it*, and no teacher I know of shows `nftw()` at all. We also do not have a standard `opendirat()`, and that teaches learners to construct paths instead of traversing directory structures. I know I am flailing uselessly against windmills... :( – Nominal Animal Apr 02 '16 at 17:41
  • nftw and other C callback patterns encourage the use of non-thread-safe globals, gross. See [this answer](http://stackoverflow.com/questions/10281198/how-avoid-using-global-variable-when-using-nftw) for some further discussion. – moodboom Jul 25 '16 at 19:11
  • 5
    @moodboom: The entire filesystem is essentially a non-thread-safe global, gross. Just because A Coding Guru states globals are gross does not mean they do not have their valid uses. Directory tree traversal **correctly** is not simple, and most claimed-correct examples I've seen are as fragile as my inner flower. Very fragile. I would use [fts](http://man7.org/linux/man-pages/man3/fts.3.html) if it was defined in POSIX, but alas, it is 4.4BSD only, and support is scattered. Even in GNU C library, the support (e.g. for large files) was iffy up to 2.23. The linked discussion is just silly. – Nominal Animal Jul 25 '16 at 19:24
  • Yep, agreed, but gross nonetheless. :-) I was just posting a warning, but you're right, there are lots of warnings to post when working at this low level. Personally I prefer to step away from a lot of them (into others!) via C++. Different strokes. – moodboom Jul 25 '16 at 19:30
  • 4
    @moodboom: Right. Note that the key point of my answer is that the typical example of using `opendir()`/`readdir()`/`closedir()` is unsafe and prone to races, and may do really weird stuff if you move the directories involved while the walk is in progress. It is **bad code**. Fragile. `nftw()` et al. are supposed to handle all those correctly. We can write a safe directory tree walker using POSIX.1-2008 `openat()`/`fdopendir()`/`readdir()`/`closedir()`, but the tree depth we can do will be limited by the number of file descriptors we can use. `nftw()` should avoid even that, safely. – Nominal Animal Jul 25 '16 at 19:41
  • @NominalAnimal yep, you raised an important point that anything done to the structure while traversing with readdir() can lead to disaster. – moodboom Jul 25 '16 at 19:50
  • @NominalAnimal there was nothing to edit there, I just thought you were something I knew..Sorry. :) – gsamaras Sep 10 '16 at 19:03
  • 1
    @gsamaras: :) I do know you from cboard; we've emailed and everything, talked about cage matches and so on. If you've lost my e-mail address (I no longer use cboard -- my answers were apparently unwanted, too long and too detailed to actually read), you can always find one that reaches me at [my home page](http://www.nominal-animal.net/). – Nominal Animal Sep 10 '16 at 19:17
  • 3
    I didn't even know about alternatives to `*dir` functions before seeing this post. Thanks a million. – Daniel Kamil Kozar Jan 28 '17 at 16:30
  • Beware of the potential problems that this function has with some (long, deep?..) trees, similar to `stat`: glibc has an alternative called `nftw64`, which passes `struct stat64` instead of `struct stat` to the callback. I got a "Value too large for defined data type" error from `nftw` with my filesystem, which was solved by switching to `nftw64`. – Ruslan Jun 24 '17 at 15:40
  • 2
    @Ruslan: If you define `_XOPEN_SOURCE 500`, `_LARGEFILE64_SOURCE`, and `_FILE_OFFSET_BITS 64`, the GNU C library will use `nftw64()`, `stat64()` etc. automatically, when you call ``nftw()`, `stat()`, etc. There is absolutely no reason to use glibc-specific names! – Nominal Animal Jun 25 '17 at 13:51
  • @Ruslan: I added the `#define`s to my example code, in the hopes that it will help avoid such issues in the future. Thank you for commenting on this! I think it is an important issue and a problem that should be acknowledged (even if I recommend a different solution). – Nominal Animal Jun 25 '17 at 14:03
  • Actually it seems `_FILE_OFFSET_BITS` is enough. Even if I explicitly `#undef` both `_XOPEN_SOURCE` and `_LARGEFILE64_SOURCE` before inclusion of each header in my source, it still works fine with `_FILE_OFFSET_BITS=64`. – Ruslan Jun 25 '17 at 14:40
  • 2
    @Ruslan: If you look at [feature test macros](http://man7.org/linux/man-pages/man7/feature_test_macros.7.html), you'll see *"New programs should not employ `_LARGEFILE64_SOURCE`; instead `_FILE_OFFSET_BITS 64` should be employed instead"*. Unfortunately, there seem to be lots of people still using old glibc versions (2.2.x, 2.3.x), which *do need* that legacy macro to select the correct interfaces. Since it does no harm on newer C libraries (and will not harm in the future, due to backwards compatibility requirements), I chose to include it, too. If you know of a contradiction, let me know! – Nominal Animal Jun 25 '17 at 15:50
  • Does it handle non-english names? E.g. UTF-8 strings? – Netherwire Apr 07 '19 at 09:06
  • Note from the man page on macOS 10.15: `These functions are provided for compatibility with legacy code. New code should use the fts(3) functions.` – twodayslate Nov 01 '19 at 18:59
  • What is this `const char *const dirpath`? – 71GA Jan 18 '21 at 06:50
  • @NominalAnimal *We can write a safe directory tree walker using POSIX.1-2008 `openat()`/`fdopendir()`/`readdir()`/`closedir()` ...* That also allows walking the tree without having to construct path names. https://stackoverflow.com/a/75493638/4756299 Yeah, a few years late. I'm slow. ;-) – Andrew Henle Feb 18 '23 at 14:05
84

Here is a recursive version:

#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>

void listdir(const char *name, int indent)
{
    DIR *dir;
    struct dirent *entry;

    if (!(dir = opendir(name)))
        return;

    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_type == DT_DIR) {
            char path[1024];
            if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
                continue;
            snprintf(path, sizeof(path), "%s/%s", name, entry->d_name);
            printf("%*s[%s]\n", indent, "", entry->d_name);
            listdir(path, indent + 2);
        } else {
            printf("%*s- %s\n", indent, "", entry->d_name);
        }
    }
    closedir(dir);
}

int main(void) {
    listdir(".", 0);
    return 0;
}
Claudio Cortese
  • 1,372
  • 2
  • 10
  • 21
Lloyd Macrohon
  • 1,432
  • 9
  • 7
  • 1
    Should be defined in . What platform are you compiling this on? – Lloyd Macrohon Dec 08 '11 at 23:00
  • it was actually the built in compiler with the IDE i was using didnt like it, it ran fine through GCC in terminal – Charlie Dec 08 '11 at 23:24
  • Oh BTW, change this code so it's while ((entry = readdir(dir)) and remove if (!(entry = readdir(dir)), or at least if readdir fails, make sure you call closedir before returning. – Lloyd Macrohon Dec 09 '11 at 00:00
  • 1
    How does this behave in the presence of cyclic links? – Kerrek SB Mar 25 '12 at 10:47
  • 1
    Won't handle deep hieararchies. :( There's an OPEN_MAX limit. – Petr Skocik Dec 12 '16 at 19:25
  • If the first readdir(dir) fails you don't call closedir but return straight away. – hetepeperfan Jan 12 '17 at 14:29
  • @lloydm: I simplified the code and removed a bug: `path[len] = 0;` was incorrect as `len` could be larger than the buffer size and the buffer is null terminated anyway. – chqrlie Jun 24 '17 at 15:52
  • 1
    @Charlie The use of d_type is higly discouraged because it's not POSIX.1 compliant. BTW if you are planning to support only Linux AND working only with ext4,btrfs and a bunch of other FS you should be fine. – Claudio Cortese Sep 05 '17 at 20:27
10
int is_directory_we_want_to_list(const char *parent, char *name) {
  struct stat st_buf;
  if (!strcmp(".", name) || !strcmp("..", name))
    return 0;
  char *path = alloca(strlen(name) + strlen(parent) + 2);
  sprintf(path, "%s/%s", parent, name);
  stat(path, &st_buf);
  return S_ISDIR(st_buf.st_mode);
}

int list(const char *name) {
  DIR *dir = opendir(name);
  struct dirent *ent;
  while (ent = readdir(dir)) {
    char *entry_name = ent->d_name;
    printf("%s\n", entry_name);
    if (is_directory_we_want_to_list(name, entry_name)) {
      // You can consider using alloca instead.
      char *next = malloc(strlen(name) + strlen(entry_name) + 2);
      sprintf(next, "%s/%s", name, entry_name);
      list(next);
      free(next);
    }
  }
  closedir(dir);
}

Header files worth being skimmed in this context: stat.h, dirent.h. Bear in mind that the code above isn't checking for any errors which might occur.

A completely different approach is offered by ftw defined in ftw.h.

Jan
  • 11,636
  • 38
  • 47
  • you aren't recursing. it doesn't look like it at least. Did you mean to call `list(entry_name)` below your comment? – Jon Egeland Dec 08 '11 at 20:09
  • 1
    @Jon, that's true, I just wrote a skeleton to help the OP to get started. If it's not enough I can elaborate. – Jan Dec 08 '11 at 20:12
  • So that would list all files in a directory along with all the sub directories and all of the files inside of the sub directories? – Charlie Dec 08 '11 at 20:24
  • It's the start of a snippet that does. He put in comments what else needed to be added. As posted there is no check for file vs. dir, or to ensure directory isn't `.` or `..`, and no recursive call, so I think it will only print the files in the given directory. – prelic Dec 08 '11 at 20:31
  • I've updated the answer. Now it lists contents of given directory and recurs to subdirectories for which `is_directory_we_want_to_list` returns a non-zero value. HTH. – Jan Dec 08 '11 at 20:33
  • Ok Just these few comments and the snippet is a whole lot more helpful than everything I have read so far. So if this line if (is_directory_we_want_to_list(name, entry_name)) { was to be if (/etc(name, entry_name)) { then everything in the /etc file would be listed? – Charlie Dec 08 '11 at 20:35
  • Sorry that was me being an idiot of course if(/etc...) wouldnt work, could you please elaborate on what is_directory_we_want_to_list is please? – Charlie Dec 08 '11 at 20:44
  • There you go, I've implemented `is_directory_we_want_to_list`. – Jan Dec 08 '11 at 20:52
  • Im so sorry to keep asking stupid questions I really am... Is the above snippet two different functions? Thanks – Charlie Dec 08 '11 at 20:56
6

As I mentioned in my comment, I believe a recursive approach to have two inherent flaws to this task.

The first flaw is the limit on open files. This limit imposes a limit on deep traversal. If there are enough sub-folders, the recursive approach will break. (See edit regarding stack overflow)

The second flaw is a bit more subtle. The recursive approach makes it very hard to test for hard links. If a folder tree is cyclic (due to hard links), the recursive approach will break (hopefully without a stack overflow). (See edit regarding hard links)

However, it is quite simple to avoid these issues by replacing recursion with a single file descriptor and linked lists.

I assume this isn't a school project and that recursion is optional.

Here's an example application.

Use a.out ./ to view folder tree.

I apologize for the macros and stuff... I usually use inline functions, but I thought it would be easier to follow the code if it was all in a single function.

#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

int main(int argc, char const *argv[]) {
  /* print use instruction unless a folder name was given */
  if (argc < 2)
    fprintf(stderr,
            "\nuse:\n"
            "    %s <directory>\n"
            "for example:\n"
            "    %s ./\n\n",
            argv[0], argv[0]),
        exit(0);

  /*************** a small linked list macro implementation ***************/

  typedef struct list_s {
    struct list_s *next;
    struct list_s *prev;
  } list_s;

#define LIST_INIT(name)                                                        \
  { .next = &name, .prev = &name }

#define LIST_PUSH(dest, node)                                                  \
  do {                                                                         \
    (node)->next = (dest)->next;                                               \
    (node)->prev = (dest);                                                     \
    (node)->next->prev = (node);                                               \
    (dest)->next = (node);                                                     \
  } while (0);

#define LIST_POP(list, var)                                                    \
  if ((list)->next == (list)) {                                                \
    var = NULL;                                                                \
  } else {                                                                     \
    var = (list)->next;                                                        \
    (list)->next = var->next;                                                  \
    var->next->prev = var->prev;                                               \
  }

  /*************** a record (file / folder) item type ***************/

  typedef struct record_s {
    /* this is a flat processing queue. */
    list_s queue;
    /* this will list all queued and processed folders (cyclic protection) */
    list_s folders;
    /* this will list all the completed items (siblings and such) */
    list_s list;
    /* unique ID */
    ino_t ino;
    /* name length */
    size_t len;
    /* name string */
    char name[];
  } record_s;

/* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name)                                           \
  ((record_s *)(((uintptr_t)(node)) -                                          \
                ((uintptr_t) & ((record_s *)0)->list_name)))

/* initializes a new record */
#define RECORD_INIT(name)                                                      \
  (record_s){.queue = LIST_INIT((name).queue),                                 \
             .folders = LIST_INIT((name).folders),                             \
             .list = LIST_INIT((name).list)}

  /*************** the actual code ***************/

  record_s records = RECORD_INIT(records);
  record_s *pos, *item;
  list_s *tmp;
  DIR *dir;
  struct dirent *entry;

  /* initialize the root folder record and add it to the queue */
  pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
  *pos = RECORD_INIT(*pos);
  pos->len = strlen(argv[1]);
  memcpy(pos->name, argv[1], pos->len);
  if (pos->name[pos->len - 1] != '/')
    pos->name[pos->len++] = '/';
  pos->name[pos->len] = 0;
  /* push to queue, but also push to list (first item processed) */
  LIST_PUSH(&records.queue, &pos->queue);
  LIST_PUSH(&records.list, &pos->list);

  /* as long as the queue has items to be processed, do so */
  while (records.queue.next != &records.queue) {
    /* pop queued item */
    LIST_POP(&records.queue, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, queue);
    /* add record to the processed folder list */
    LIST_PUSH(&records.folders, &pos->folders);

    /* process the folder and add all folder data to current list */
    dir = opendir(pos->name);
    if (!dir)
      continue;

    while ((entry = readdir(dir)) != NULL) {

      /* create new item, copying it's path data and unique ID */
      item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2);
      *item = RECORD_INIT(*item);
      item->len = pos->len + entry->d_namlen;
      memcpy(item->name, pos->name, pos->len);
      memcpy(item->name + pos->len, entry->d_name, entry->d_namlen);
      item->name[item->len] = 0;
      item->ino = entry->d_ino;
      /* add item to the list, right after the `pos` item */
      LIST_PUSH(&pos->list, &item->list);

      /* unless it's a folder, we're done. */
      if (entry->d_type != DT_DIR)
        continue;

      /* test for '.' and '..' */
      if (entry->d_name[0] == '.' &&
          (entry->d_name[1] == 0 ||
           (entry->d_name[1] == '.' && entry->d_name[2] == 0)))
        continue;

      /* add folder marker */
      item->name[item->len++] = '/';
      item->name[item->len] = 0;

      /* test for cyclic processing */
      list_s *t = records.folders.next;
      while (t != &records.folders) {
        if (NODE2RECORD(t, folders)->ino == item->ino) {
          /* we already processed this folder! */
          break; /* this breaks from the small loop... */
        }
        t = t->next;
      }
      if (t != &records.folders)
        continue; /* if we broke from the small loop, entry is done */

      /* item is a new folder, add to queue */
      LIST_PUSH(&records.queue, &item->queue);
    }
    closedir(dir);
  }

  /*************** Printing the results and cleaning up ***************/
  while (records.list.next != &records.list) {
    /* pop list item */
    LIST_POP(&records.list, tmp);
    /* collect and process record */
    pos = NODE2RECORD(tmp, list);
    fwrite(pos->name, pos->len, 1, stderr);
    fwrite("\n", 1, 1, stderr);
    /* free node */
    free(pos);
  }
  return 0;
}

EDIT

@Stargateur mentioned in the comments that the recursive code will probably overflow the stack before reaching the open file limit.

Although I don't see how a stack-overflow is any better, this assessment is probably correct as long as the process isn't close to the file limit when invoked.

Another point mentioned by @Stargateur in the comments was that the depth of the recursive code is limited by the maximum amount of sub-directories (64000 on the ext4 filesystem) and that hard links are extremely unlikely (since hard links to folders aren't allowed on Linux/Unix).

This is good news if the code is running on Linux (which it is, according to the question), so this issue isn't a real concern (unless running the code on macOS or, maybe, Windows)... although 64K subfolders in recursion might blow the stack wide open.

Having said that, the none recursive option still has advantages, such as being able to easily add a limit to the amount of items processed as well as being able to cache the result.

P.S.

According to the comments, here's a non-recursive version of the code that doesn't check for cyclic hierarchies. It's faster and should be safe enough to use on a Linux machine where hard links to folders aren't allowed.

#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

int main(int argc, char const *argv[]) {
  /* print use instruction unless a folder name was given */
  if (argc < 2)
    fprintf(stderr,
            "\nuse:\n"
            "    %s <directory>\n"
            "for example:\n"
            "    %s ./\n\n",
            argv[0], argv[0]),
        exit(0);

  /*************** a small linked list macro implementation ***************/

  typedef struct list_s {
    struct list_s *next;
    struct list_s *prev;
  } list_s;

#define LIST_INIT(name)                                                        \
  { .next = &name, .prev = &name }

#define LIST_PUSH(dest, node)                                                  \
  do {                                                                         \
    (node)->next = (dest)->next;                                               \
    (node)->prev = (dest);                                                     \
    (node)->next->prev = (node);                                               \
    (dest)->next = (node);                                                     \
  } while (0);

#define LIST_POP(list, var)                                                    \
  if ((list)->next == (list)) {                                                \
    var = NULL;                                                                \
  } else {                                                                     \
    var = (list)->next;                                                        \
    (list)->next = var->next;                                                  \
    var->next->prev = var->prev;                                               \
  }

  /*************** a record (file / folder) item type ***************/

  typedef struct record_s {
    /* this is a flat processing queue. */
    list_s queue;
    /* this will list all the completed items (siblings and such) */
    list_s list;
    /* unique ID */
    ino_t ino;
    /* name length */
    size_t len;
    /* name string */
    char name[];
  } record_s;

/* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name)                                           \
  ((record_s *)(((uintptr_t)(node)) -                                          \
                ((uintptr_t) & ((record_s *)0)->list_name)))

/* initializes a new record */
#define RECORD_INIT(name)                                                      \
  (record_s){.queue = LIST_INIT((name).queue), .list = LIST_INIT((name).list)}

  /*************** the actual code ***************/

  record_s records = RECORD_INIT(records);
  record_s *pos, *item;
  list_s *tmp;
  DIR *dir;
  struct dirent *entry;

  /* initialize the root folder record and add it to the queue */
  pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
  *pos = RECORD_INIT(*pos);
  pos->len = strlen(argv[1]);
  memcpy(pos->name, argv[1], pos->len);
  if (pos->name[pos->len - 1] != '/')
    pos->name[pos->len++] = '/';
  pos->name[pos->len] = 0;
  /* push to queue, but also push to list (first item processed) */
  LIST_PUSH(&records.queue, &pos->queue);
  LIST_PUSH(&records.list, &pos->list);

  /* as long as the queue has items to be processed, do so */
  while (records.queue.next != &records.queue) {
    /* pop queued item */
    LIST_POP(&records.queue, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, queue);

    /* process the folder and add all folder data to current list */
    dir = opendir(pos->name);
    if (!dir)
      continue;

    while ((entry = readdir(dir)) != NULL) {

      /* create new item, copying it's path data and unique ID */
      item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2);
      *item = RECORD_INIT(*item);
      item->len = pos->len + entry->d_namlen;
      memcpy(item->name, pos->name, pos->len);
      memcpy(item->name + pos->len, entry->d_name, entry->d_namlen);
      item->name[item->len] = 0;
      item->ino = entry->d_ino;
      /* add item to the list, right after the `pos` item */
      LIST_PUSH(&pos->list, &item->list);

      /* unless it's a folder, we're done. */
      if (entry->d_type != DT_DIR)
        continue;

      /* test for '.' and '..' */
      if (entry->d_name[0] == '.' &&
          (entry->d_name[1] == 0 ||
           (entry->d_name[1] == '.' && entry->d_name[2] == 0)))
        continue;

      /* add folder marker */
      item->name[item->len++] = '/';
      item->name[item->len] = 0;

      /* item is a new folder, add to queue */
      LIST_PUSH(&records.queue, &item->queue);
    }
    closedir(dir);
  }

  /*************** Printing the results and cleaning up ***************/
  while (records.list.next != &records.list) {
    /* pop list item */
    LIST_POP(&records.list, tmp);
    /* collect and process record */
    pos = NODE2RECORD(tmp, list);
    fwrite(pos->name, pos->len, 1, stderr);
    fwrite("\n", 1, 1, stderr);
    /* free node */
    free(pos);
  }
  return 0;
}
Myst
  • 18,516
  • 2
  • 45
  • 67
  • @Stargateur , if the recursive code is running on a server, than you're probably right - although, in that case, keeping the stack intact and minimizing the use of file descriptors is probably a priority. Also, if a cyclic folder hierarchy exists, than the sub-directory limit won't save you... However, the linked list approach prevents a stack overflow and it will never hang or crash (even when processing a big tree). It's also quite easy to add a limit on the number of items processed. I love recursive code for it's simplicity, but it's often more dangerous to use and it threatens the stack. – Myst Jun 27 '17 at 07:45
  • @Stargateur what about hard links...? aren't they a risk, or does the OS always prevent hard links from creating cyclic hierarchies? – Myst Jun 27 '17 at 07:56
  • macOS doesn't officially allow them, but they are used inside TimeMachine and can be created (although not by the layman)... so I know one system where these hard links might pop up (either because they were created or because the code runs in a folder that contains TimeMachine backups)... however, you might be right about the risk being unlikely. – Myst Jun 27 '17 at 08:03
  • @Stargateur - I updated the answer to reflect your comments. – Myst Jun 27 '17 at 08:22
  • In the final loop, the second use of the LIST_POP(&records.list, tmp) macro is popping items that I think the caller would actually want printed. I find with *both* LIST_POP() uses, several "leaf" file items are skipped. When I remove the second LIST_POP() instance, the program prints the files I expected. – f1fan44 Dec 17 '20 at 01:33
  • His @f1fan44 - thanks for pointing this out. It's been a bit too long for me to remember the logic, bu a quick read through the loop suggests there should only be a single `pop`... I'll fix that. – Myst Dec 17 '20 at 03:37
5

Here is a simplified version that is recursive but uses much less stack space:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>

void listdir(char *path, size_t size) {
    DIR *dir;
    struct dirent *entry;
    size_t len = strlen(path);

    if (!(dir = opendir(path))) {
        fprintf(stderr, "path not found: %s: %s\n",
                path, strerror(errno));
        return;
    }

    puts(path);
    while ((entry = readdir(dir)) != NULL) {
        char *name = entry->d_name;
        if (entry->d_type == DT_DIR) {
            if (!strcmp(name, ".") || !strcmp(name, ".."))
                continue;
            if (len + strlen(name) + 2 > size) {
                fprintf(stderr, "path too long: %s/%s\n", path, name);
            } else {
                path[len] = '/';
                strcpy(path + len + 1, name);
                listdir(path, size);
                path[len] = '\0';
            }
        } else {
            printf("%s/%s\n", path, name);
        }
    }
    closedir(dir);
}

int main(void) {
    char path[1024] = ".";
    listdir(path, sizeof path);
    return 0;
}

On my system, its output is exactly identical to that of find .

chqrlie
  • 131,814
  • 10
  • 121
  • 189
0

Walking a Directory Tree Without Constructing Path Names

This is a version that uses file descriptors to refer to directories, with fdopendir(), fstatat(), and openat() to walk a directory tree without having to construct any path names.

This is simpler to implement, and can be useful on systems with deeply-nested directory trees, where a full path name might exceed PATH_MAX - and note that PATH_MAX may not even exist.

The posted code is compressed, broken up, and all error checking removed to remove vertical scroll bars and improve readability. A complete example is at the end of the question.

Headers

#define _POSIX_C_SOURCE 200809L

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>

Actual directory tree walk implementation:

// the actual walking is done by descriptor, not name
static int myftwImp( int dirfd )
{
    DIR *dirp = fdopendir( dirfd );

    for ( ;; )
    {
        struct dirent *dent = readdir( dirp );
        if ( NULL == dent ) break;

        if ( ( 0 == strcmp( ".", dent->d_name ) ) ||
             ( 0 == strcmp( "..", dent->d_name ) ) )
        {
            continue;
        }

        struct stat sb = { 0 };
        fstatat( dirfd, dent->d_name, &sb, 0 );

        if ( S_ISDIR( sb.st_mode ) )
        {
            printf( "dir: %s\n", dent->d_name );
            int newdirfd = openat( dirfd, dent->d_name,
                O_RDONLY | O_DIRECTORY );
            myftwImp( newdirfd );
        }
        printf( "    file: %s\n", dent->d_name );
    }

    // this will close the descriptor, too
    closedir( dirp );
    return( 0 );
}

Public call that uses directory name:

int myftw( const char *dirname )
{
    int dirfd = open( dirname, O_RDONLY | O_DIRECTORY );

    myftwImp( dirfd );

    return( 0 );
}

Example use:

int main( int argc, char **argv )
{
    int rc = myftw( argv[ 1 ] );

    return( rc );
}

No error checking is done here for brevity. Real code should check all calls for errors and handle them appropriately.

Full code with error checking:

#define _POSIX_C_SOURCE 200809L

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>


static int myftwImp( int dirfd )
{
    DIR *dirp = fdopendir( dirfd );
    if ( NULL == dirp )
    {
        return( -1 );
    }

    int rc = 0;

    for ( ;; )
    {
        struct dirent *dent = readdir( dirp );
        if ( NULL == dent )
        {
            break;
        }

        if ( 0 == strcmp( ".", dent->d_name ) )
        {
            continue;
        }

        if ( 0 == strcmp( "..", dent->d_name ) )
        {
            continue;
        }

        struct stat sb = { 0 };

        rc = fstatat( dirfd, dent->d_name, &sb, 0 );
        if ( 0 != rc )
        {
            break;
        }

        if ( S_ISDIR( sb.st_mode ) )
        {
            int newdirfd = openat( dirfd, dent->d_name, O_RDONLY | O_DIRECTORY );
            if ( -1 == newdirfd )
            {
                rc = -1;
                break;
            }

            printf( "dir: %s\n", dent->d_name );

            rc = myftwImp( newdirfd );
            if ( 0 != rc )
            {
                break;
            }
        }

        printf( "    file: %s\n", dent->d_name );
    }

    closedir( dirp );

    return( rc );
}

int myftw( const char *dirname )
{
    int dirfd = open( dirname, O_RDONLY | O_DIRECTORY );
    if ( -1 == dirfd )
    {
        return( -1 );
    }

    int rc = myftwImp( dirfd );

    return( rc );
}

int main( int argc, char **argv )
{
    int rc = myftw( argv[ 1 ] );

    return( rc );
}
Andrew Henle
  • 32,625
  • 3
  • 24
  • 56