170

One of the things which I miss while writing programs in C is a dictionary data structure. What's the most convenient way to implement one in C? I am not looking for performance, but ease of coding it from scratch. I don't want it to be generic either -- something like char*int will do. But I do want it to be able to store an arbitrary number of items.

This is intended more as an exercise. I know that there are 3rd party libraries available which one can use. But consider for a moment, that they don't exist. In such a situation what's the quickest way you can implement a dictionary satisfying the above requirements.

Neuron
  • 5,141
  • 5
  • 38
  • 59
Rohit
  • 7,449
  • 9
  • 45
  • 55
  • 4
    If you miss having it provided for you, then why do you want to make it from scratch, instead of using a third-party implementation? – Karl Knechtel Dec 08 '10 at 05:12
  • Yes, that alternative always exists. I posed this question more as an exercise. – Rohit Dec 08 '10 at 05:16
  • 18
    Writing a hashtable in C is a fun exercise -- every serious C programmer should do it at least once. – Lee Dec 08 '10 at 05:24
  • I think of a dictionary being a datatype rather than a datastructure, since it could be implemented lots of ways -- a list, a hashtable, a tree, a self-balancing tree, etc. Are you asking for a dictionary, or a hashtable? – Paul Hankin Apr 21 '13 at 07:54
  • 1
    Related: How to represent a Python-like dictionary in C?[](https://stackoverflow.com/questions/3269881/how-to-represent-a-python-like-dictionary-in-c) – Gaurang Tandon Jun 23 '18 at 06:05

12 Answers12

155

Section 6.6 of The C Programming Language presents a simple dictionary (hashtable) data structure. I don't think a useful dictionary implementation could get any simpler than this. For your convenience, I reproduce the code here.

struct nlist { /* table entry: */
    struct nlist *next; /* next entry in chain */
    char *name; /* defined name */
    char *defn; /* replacement text */
};

#define HASHSIZE 101
static struct nlist *hashtab[HASHSIZE]; /* pointer table */

/* hash: form hash value for string s */
unsigned hash(char *s)
{
    unsigned hashval;
    for (hashval = 0; *s != '\0'; s++)
      hashval = *s + 31 * hashval;
    return hashval % HASHSIZE;
}

/* lookup: look for s in hashtab */
struct nlist *lookup(char *s)
{
    struct nlist *np;
    for (np = hashtab[hash(s)]; np != NULL; np = np->next)
        if (strcmp(s, np->name) == 0)
          return np; /* found */
    return NULL; /* not found */
}

char *strdup(char *);
/* install: put (name, defn) in hashtab */
struct nlist *install(char *name, char *defn)
{
    struct nlist *np;
    unsigned hashval;
    if ((np = lookup(name)) == NULL) { /* not found */
        np = (struct nlist *) malloc(sizeof(*np));
        if (np == NULL || (np->name = strdup(name)) == NULL)
          return NULL;
        hashval = hash(name);
        np->next = hashtab[hashval];
        hashtab[hashval] = np;
    } else /* already there */
        free((void *) np->defn); /*free previous defn */
    if ((np->defn = strdup(defn)) == NULL)
       return NULL;
    return np;
}

char *strdup(char *s) /* make a duplicate of s */
{
    char *p;
    p = (char *) malloc(strlen(s)+1); /* +1 for ’\0’ */
    if (p != NULL)
       strcpy(p, s);
    return p;
}

Note that if the hashes of two strings collide, it may lead to an O(n) lookup time. You can reduce the likelihood of collisions by increasing the value of HASHSIZE. For a complete discussion of the data structure, please consult the book.

lifebalance
  • 1,846
  • 3
  • 25
  • 57
Vijay Mathew
  • 26,737
  • 4
  • 62
  • 93
  • 12
    why is here `hashval = *s + 31 * hashval;` exactly 31 and not anything else? – Incerteza Sep 25 '14 at 08:50
  • 21
    31 is prime. Primes are often used in hash functions to reduce probability of collisions. It has something to do with integer factorization (i.e. you cannot factor a prime). – jnovacho Oct 04 '14 at 11:08
  • 2
    @アレックス 31 performed well on test data. Choosing a prime has its benefits, but would not be necessary if the size of the hash table itself was prime. – G. Bach Apr 03 '16 at 16:43
  • 5
    Note that the K&R C hashing algorithm is an appalling hash algorithm. See: http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed for details on how truly terrible is really is. Stop using it!! – carveone Aug 06 '16 at 11:01
  • 2
    @Overdrivr: Not necessary in this instance. hashtab is of static duration. Uninitialized variables with static duration (that is, those declared outside of functions, and those declared with the storage class static), are guaranteed to start out as a zero of the right type (ie: 0 or NULL or 0.0) – carveone Aug 06 '16 at 11:20
  • In the book, arrows are drawn pointing from the array, hashtab, to the nodes, np. But it seems like it should have been drawn the other way around because np->next = hashtab[hashval] (np points to the array), is this true? – scrollout May 08 '20 at 05:56
  • 1
    Can someone explain np->next = hashtab[hashval]; hashtab[hashval] = np; – scrollout May 08 '20 at 06:04
  • @scrollout this is a singly linked list insertion. Reason: hash tables are subject to 'collisions' because different object keys can reduce to the same hash code (consider e.g. 257 or more unique object keys with an 8-bit 0..255 hash: even a perfectly distributed hash will have duplicate hash values). This implementation uses `struct nlist` table entries with `next` pointers, which implement a singly-linked list. Table entries may be NULL or `nlist` entries; each `nlist` is the head of a chain of 1+ entries with identical hash values. – Technophile Jun 27 '22 at 15:40
28

The quickest way would be to use an already-existing implementation, like uthash.

And, if you really want to code it yourself, the algorithms from uthash can be examined and re-used. It's BSD-licensed so, other than the requirement to convey the copyright notice, you're pretty well unlimited in what you can do with it.

Lord Elrond
  • 13,430
  • 7
  • 40
  • 80
paxdiablo
  • 854,327
  • 234
  • 1,573
  • 1,953
13

For ease of implementation, it's hard to beat naively searching through an array. Aside from some error checking, this is a complete implementation (untested).

typedef struct dict_entry_s {
    const char *key;
    int value;
} dict_entry_s;

typedef struct dict_s {
    int len;
    int cap;
    dict_entry_s *entry;
} dict_s, *dict_t;

int dict_find_index(dict_t dict, const char *key) {
    for (int i = 0; i < dict->len; i++) {
        if (!strcmp(dict->entry[i], key)) {
            return i;
        }
    }
    return -1;
}

int dict_find(dict_t dict, const char *key, int def) {
    int idx = dict_find_index(dict, key);
    return idx == -1 ? def : dict->entry[idx].value;
}

void dict_add(dict_t dict, const char *key, int value) {
   int idx = dict_find_index(dict, key);
   if (idx != -1) {
       dict->entry[idx].value = value;
       return;
   }
   if (dict->len == dict->cap) {
       dict->cap *= 2;
       dict->entry = realloc(dict->entry, dict->cap * sizeof(dict_entry_s));
   }
   dict->entry[dict->len].key = strdup(key);
   dict->entry[dict->len].value = value;
   dict->len++;
}

dict_t dict_new(void) {
    dict_s proto = {0, 10, malloc(10 * sizeof(dict_entry_s))};
    dict_t d = malloc(sizeof(dict_s));
    *d = proto;
    return d;
}

void dict_free(dict_t dict) {
    for (int i = 0; i < dict->len; i++) {
        free(dict->entry[i].key);
    }
    free(dict->entry);
    free(dict);
}
Paul Hankin
  • 54,811
  • 11
  • 92
  • 118
  • 5
    "For ease of implementation": You're exactly right: this is the easiest. Plus it implements the OP's request "I do want it to be able to store an arbitrary number of items" - the highest voted answer doesn't do that (unless you believe that picking a _compile time_ constant satisfies "arbitrary"...) – davidbak Jun 05 '15 at 04:23
  • 2
    This may be a valid approach depending on the use-case, but the OP explicitly requested a dictionary, and this is definitely not a dictionary. – Dan Bechard Apr 14 '19 at 21:20
4

I am surprised no one mentioned hsearch/hcreate set of libraries which although is not available on windows, but is mandated by POSIX, and therefore available in Linux / GNU systems.

The link has a simple and complete basic example that very well explains its usage.

It even has thread safe variant, is easy to use and very performant.

Ciro Santilli OurBigBook.com
  • 347,512
  • 102
  • 1,199
  • 985
fkl
  • 5,412
  • 4
  • 28
  • 68
  • 3
    Worth noting that people here say it is kind of unusable, although I haven't tried it myself: https://stackoverflow.com/a/6118591/895245 – Ciro Santilli OurBigBook.com Apr 26 '18 at 10:15
  • 1
    Fair enough, however, i have tried the hcreate_r (for multiple hash tables) version in at least one app which ran for a reasonably long enough time to consider it real world. Agreed that its a GNU extension but then that's the case for many other libs too. Though i would still argue that you might still be able to use it for one large key value pair being operated in some real world app – fkl Apr 27 '18 at 18:50
3

GLib and gnulib

These are your likely best bets if you don't have more specific requirements, since they are widely available, portable and likely efficient.

See also: Are there any open source C libraries with common data structures?

Zephyr project implementation

This well-known mostly-C embedded OS added one at some point:

Ciro Santilli OurBigBook.com
  • 347,512
  • 102
  • 1,199
  • 985
2

Create a simple hash function and some linked lists of structures , depending on the hash , assign which linked list to insert the value in . Use the hash for retrieving it as well .

I did a simple implementation some time back :

...
#define K 16 // chaining coefficient

struct dict
{
    char *name; /* name of key */
    int val;   /*  value */
    struct dict *next; /* link field */
};

typedef struct dict dict;
dict *table[K];
int initialized = 0;


void  putval ( char *,int);

void init_dict()
{   
    initialized = 1;
    int i;  
    for(i=0;iname = (char *) malloc (strlen(key_name)+1);
    ptr->val = sval;
    strcpy (ptr->name,key_name);


    ptr->next = (struct dict *)table[hsh];
    table[hsh] = ptr;

}


int getval ( char *key_name )
{   
    int hsh = hash(key_name);   
    dict *ptr;
    for (ptr = table[hsh]; ptr != (dict *) 0;
        ptr = (dict *)ptr->next)
    if (strcmp (ptr->name,key_name) == 0)
        return ptr->val;
    return -1;
}
abc def foo bar
  • 2,360
  • 6
  • 30
  • 41
1

here is a quick implement, i used it to get a 'Matrix'(sruct) from a string. you can have a bigger array and change its values on the run also:

typedef struct  { int** lines; int isDefined; }mat;
mat matA, matB, matC, matD, matE, matF;

/* an auxilary struct to be used in a dictionary */
typedef struct  { char* str; mat *matrix; }stringToMat;

/* creating a 'dictionary' for a mat name to its mat. lower case only! */
stringToMat matCases [] =
{
    { "mat_a", &matA },
    { "mat_b", &matB },
    { "mat_c", &matC },
    { "mat_d", &matD },
    { "mat_e", &matE },
    { "mat_f", &matF },
};

mat* getMat(char * str)
{
    stringToMat* pCase;
    mat * selected = NULL;
    if (str != NULL)
    {
        /* runing on the dictionary to get the mat selected */
        for(pCase = matCases; pCase != matCases + sizeof(matCases) / sizeof(matCases[0]); pCase++ )
        {
            if(!strcmp( pCase->str, str))
                selected = (pCase->matrix);
        }
        if (selected == NULL)
            printf("%s is not a valid matrix name\n", str);
    }
    else
        printf("expected matrix name, got NULL\n");
    return selected;
}
dagoltz
  • 62
  • 7
0

A hashtable is the traditional implementation of a simple "Dictionary". If you don't care about speed or size, just search for it. There are many freely available implementations.

Here's the first one I saw -- at a glance, it looks OK to me. (It's pretty basic. If you really want it to hold an unlimited amount of data, then you'll need to add some logic to "realloc" the table memory as it grows.)

halfer
  • 19,824
  • 17
  • 99
  • 186
Lee
  • 13,462
  • 1
  • 32
  • 45
-1

Hashing is the key. I think use lookup table and hashing key for this. You can find many hashing function online.

ashmish2
  • 2,885
  • 8
  • 40
  • 54
-2

redis is a good reference as redis is a key-value system and it's hash table is a good example

  • Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer). – Community Sep 05 '22 at 03:11
-3

The quickest method would be using binary tree. Its worst case is also only O(logn).

  • 17
    This is incorrect. Worst case lookup for a binary tree is O(n) (degenerate case due to bad insertion order, resulting in a link list, basically) when it is unbalanced. – Randy Howard Apr 21 '13 at 08:13
-3

Additionally, you can use Google CityHash:

#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#include <byteswap.h>

#include "city.h"

void swap(uint32* a, uint32* b) {
    int temp = *a;
    *a = *b;
    *b = temp;
}

#define PERMUTE3(a, b, c) swap(&a, &b); swap(&a, &c);

// Magic numbers for 32-bit hashing.  Copied from Murmur3.
static const uint32 c1 = 0xcc9e2d51;
static const uint32 c2 = 0x1b873593;

static uint32 UNALIGNED_LOAD32(const char *p) {
  uint32 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

static uint32 Fetch32(const char *p) {
  return UNALIGNED_LOAD32(p);
}

// A 32-bit to 32-bit integer hash copied from Murmur3.
static uint32 fmix(uint32 h)
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;
  return h;
}

static uint32 Rotate32(uint32 val, int shift) {
  // Avoid shifting by 32: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (32 - shift)));
}

static uint32 Mur(uint32 a, uint32 h) {
  // Helper from Murmur3 for combining two 32-bit values.
  a *= c1;
  a = Rotate32(a, 17);
  a *= c2;
  h ^= a;
  h = Rotate32(h, 19);
  return h * 5 + 0xe6546b64;
}

static uint32 Hash32Len13to24(const char *s, size_t len) {
  uint32 a = Fetch32(s - 4 + (len >> 1));
  uint32 b = Fetch32(s + 4);
  uint32 c = Fetch32(s + len - 8);
  uint32 d = Fetch32(s + (len >> 1));
  uint32 e = Fetch32(s);
  uint32 f = Fetch32(s + len - 4);
  uint32 h = len;

  return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h)))))));
}

static uint32 Hash32Len0to4(const char *s, size_t len) {
  uint32 b = 0;
  uint32 c = 9;
  for (size_t i = 0; i < len; i++) {
    signed char v = s[i];
    b = b * c1 + v;
    c ^= b;
  }
  return fmix(Mur(b, Mur(len, c)));
}

static uint32 Hash32Len5to12(const char *s, size_t len) {
  uint32 a = len, b = len * 5, c = 9, d = b;
  a += Fetch32(s);
  b += Fetch32(s + len - 4);
  c += Fetch32(s + ((len >> 1) & 4));
  return fmix(Mur(c, Mur(b, Mur(a, d))));
}

uint32 CityHash32(const char *s, size_t len) {
  if (len <= 24) {
    return len <= 12 ?
        (len <= 4 ? Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) :
        Hash32Len13to24(s, len);
  }

  // len > 24
  uint32 h = len, g = c1 * len, f = g;
  uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2;
  uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2;
  uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2;
  uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2;
  uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2;
  h ^= a0;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  h ^= a2;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  g ^= a1;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  g ^= a3;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  f += a4;
  f = Rotate32(f, 19);
  f = f * 5 + 0xe6546b64;
  size_t iters = (len - 1) / 20;
  do {
    uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2;
    uint32 a1 = Fetch32(s + 4);
    uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2;
    uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2;
    uint32 a4 = Fetch32(s + 16);
    h ^= a0;
    h = Rotate32(h, 18);
    h = h * 5 + 0xe6546b64;
    f += a1;
    f = Rotate32(f, 19);
    f = f * c1;
    g += a2;
    g = Rotate32(g, 18);
    g = g * 5 + 0xe6546b64;
    h ^= a3 + a1;
    h = Rotate32(h, 19);
    h = h * 5 + 0xe6546b64;
    g ^= a4;
    g = bswap_32(g) * 5;
    h += a4 * 5;
    h = bswap_32(h);
    f += a0;
    PERMUTE3(f, h, g);
    s += 20;
  } while (--iters != 0);
  g = Rotate32(g, 11) * c1;
  g = Rotate32(g, 17) * c1;
  f = Rotate32(f, 11) * c1;
  f = Rotate32(f, 17) * c1;
  h = Rotate32(h + g, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  h = Rotate32(h + f, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  return h;
}
Alan Turing
  • 2,482
  • 17
  • 20