I'm currently working on a hash table implementation in C. I'm trying to implement dynamic resizing, but came across a problem.

If resizing a hash table means creating a new one with double (or half) the size, rehashing, and deleting the old one, how can I deal with references the user may still hold into the old table? Example code (I've omitted error checking just for this example):

#include <stdio.h>

#include "hash.h"

int main(int argc, char *argv[])
{
    hashtable_t *ht = ht_create(5, STRING); /* make hashtable with size 5 */
    ht_insert(ht, "john", "employee"); /* key-val pair "john -> employee" */
    ht_insert(ht, "alice", "employee");
    char *position = ht_get(ht, "alice"); /* get alice's position from hashtable ht */

    ht_insert(ht, "bob", "boss"); /* this insert exceeds the load factor, resizing the hash table */

    printf("%s", position); /* undefined behavior: position dangles, because the old table's entries were freed during the resize */

    return 0;
}

In this case position pointed to alice's value, which was found in the hash table. When the table was resized, the old entries were freed and that pointer was left dangling. How can I fix this problem, so the user won't have to worry that a previously obtained pointer has been freed?

EDIT: my current hash table implementation

hash.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "hash.h"

#define LOADFACTOR 0.75

typedef struct tableentry /* hashtab entry */
{
    struct tableentry *next;
    char *key;
    void *val;
} tableentry_t;

typedef struct hashtable
{
    datatype_t type;
    size_t size;
    size_t load; /* number of keys filled */
    struct tableentry **tab;
} hashtable_t;

/* creates hashtable */
/* NOTE: dynamically allocated, remember to ht_free() */
hashtable_t *ht_create(size_t size, datatype_t type)
{
    hashtable_t *ht = NULL;
    if ((ht = malloc(sizeof(hashtable_t))) == NULL)
        return NULL;
    /* allocate ht's array of bucket pointers */
    if ((ht->tab = malloc(sizeof(tableentry_t *) * size)) == NULL)
    {
        free(ht); /* avoid leaking the struct if the table allocation fails */
        return NULL;
    }
    /* null-initialize table */
    size_t i;
    for (i = 0; i < size; i++)
        ht->tab[i] = NULL;
    ht->size = size;
    ht->load = 0; /* no keys yet */
    ht->type = type;
    return ht;
}

/* creates hash for a hashtab */
static unsigned hash(char *s)
{
    unsigned hashval;
    for (hashval = 0; *s != '\0'; s++)
        hashval = *s + 31 * hashval;
    return hashval;
}

static int *intdup(int *i)
{
    int *new;
    if ((new = malloc(sizeof(int))) == NULL)
        return NULL;
    *new = *i;
    return new;
}

static void free_te(tableentry_t *te)
{
    free(te->key);
    free(te->val);
    free(te);
}

/* loops through linked list freeing */
static void free_te_list(tableentry_t *te)
{
    tableentry_t *next;
    while (te != NULL)
    {
        next = te->next;
        free_te(te);
        te = next;
    }
}

/* creates a key-val pair */
static tableentry_t *alloc_te(char *k, void *v, datatype_t type)
{
    tableentry_t *te = NULL;
    int status = 0;
    /* alloc struct */
    if ((te = calloc(1, sizeof(*te))) == NULL)
        return NULL; /* nothing allocated yet, so nothing to free */
    /* alloc key */
    if ((te->key = strdup(k)) == NULL)
        status = -1;
    /* alloc value */
    int *d;
    char *s;
    switch (type)
    {
        case STRING:
            s = (char *) v;
            if ((te->val = strdup(s)) == NULL)
                status = -1;
            break;
        case INTEGER:
            d = (int *) v;
            if ((te->val = intdup(d)) == NULL)
                status = -1;
            break;
        default:
            status = -1;
    }
    if (status < 0)
    {
        free_te_list(te);
        return NULL;
    }
    te->next = NULL;
    return te;
}

static tableentry_t *lookup(hashtable_t *ht, char *k)
{
    tableentry_t *te;
    /* step through linked list */
    for (te = ht->tab[hash(k) % ht->size]; te != NULL; te = te->next)
        if (strcmp(te->key, k) == 0)
            return te; /* found */
    return NULL; /* not found */
}

/* inserts the key-val pair */
hashtable_t *ht_insert(hashtable_t *ht, char *k, void *v)
{
    tableentry_t *te;
    /* unique entry */
    if ((te = lookup(ht, k)) == NULL)
    {
        if ((te = alloc_te(k, v, ht->type)) == NULL)
            return NULL;
        unsigned hashval = hash(k) % ht->size;
        /* insert at beginning of linked list */
        te->next = ht->tab[hashval];
        ht->tab[hashval] = te;
        ht->load++;
    }
    /* replace val of previous entry */
    else
    {
        free(te->val);
        switch (ht->type)
        {
            case STRING:
                if ((te->val = strdup(v)) == NULL)
                    return NULL;
                break;
            case INTEGER:
                if ((te->val = intdup(v)) == NULL)
                    return NULL;
                break;
            default:
                return NULL;
        }
    }
    return ht;
}

static void delete_te(hashtable_t *ht, char *k)
{
    tableentry_t *te, *prev;
    unsigned hashval = hash(k) % ht->size;
    te = ht->tab[hashval];
    /* point head to next element if deleting head */
    if (strcmp(te->key, k) == 0)
    {
        ht->tab[hashval] = te->next;
        free_te(te);
        ht->load--;
        return;
    }
    /* otherwise walk the rest of the list, keeping track of prev to relink its ->next */
    for (prev = te, te = te->next; te != NULL; prev = te, te = te->next)
    {
        if (strcmp(te->key, k) == 0)
        {
            prev->next = te->next;
            free_te(te);
            ht->load--;
            return;
        }
    }
}

hashtable_t *ht_delete(hashtable_t *ht, char *k)
{
    if (lookup(ht, k) == NULL)
        return NULL;
    delete_te(ht, k);
    return ht;
}

/* retrieve value from key */
void *ht_get(hashtable_t *ht, char *k)
{
    tableentry_t *te;
    if ((te = lookup(ht, k)) == NULL)
        return NULL;
    return te->val;
}

/* frees hashtable created from ht_create() */
void ht_free(hashtable_t *ht)
{
    size_t i;
    if (ht)
    {
        for (i = 0; i < ht->size; i++)
            if (ht->tab[i] != NULL)
                free_te_list(ht->tab[i]);
        free(ht->tab); /* free the array of bucket pointers too */
        free(ht);
    }
}

/* resizes hashtable, returns new hashtable and frees old */
static hashtable_t *resize(hashtable_t *oht, size_t size)
{
    hashtable_t *nht; /* new hashtable */
    if ((nht = ht_create(size, oht->type)) == NULL)
        return NULL;
    /* rehash */
    size_t i;
    tableentry_t *te;
    /* loop through hashtable */
    for (i = 0; i < oht->size; i++)
        /* loop through linked list */
        for (te = oht->tab[i]; te != NULL; te = te->next)
            /* insert & rehash old vals into new ht */
            if (ht_insert(nht, te->key, te->val) == NULL)
            {
                ht_free(nht); /* don't leak the half-built table */
                return NULL;
            }
    ht_free(oht);
    return nht;
}

hash.h

/* a hash-table implementation in c */
/*
hashing algorithm: hashval = *s + 31 * hashval
resolves collisions using linked lists
*/

#ifndef HASH
#define HASH

#include <stddef.h> /* for size_t */

typedef struct hashtable hashtable_t;

typedef enum datatype {STRING, INTEGER} datatype_t;

/* inserts the key-val pair */
hashtable_t *ht_insert(hashtable_t *ht, char *k, void *v);

/* creates hashtable */
/* NOTE: dynamically allocated, remember to ht_free() */
hashtable_t *ht_create(size_t size, datatype_t type);

/* frees hashtable created from ht_create() */
void ht_free(hashtable_t *ht);

/* retrieve value from key */
void *ht_get(hashtable_t *ht, char *k);

hashtable_t *ht_delete(hashtable_t *ht, char *k);

#endif
dav
  • you can add an entry to your hash table indicating what hashing algorithm it's using. you can use pointers that are under 0x30000 to indicate it's not a pointer to a function, but using a standard hashing algorithm, and pointers above 0x30000 as pointers to functions. This way all you need to do is rehash it into new bucket size. You can use 0 as a placeholder for "invalid hash function" hash function. eg 0 could mean "invalid hash function", 1 could mean standard hash function #1, ..., and pointers over 0x30000 can be used to indicate it's a hashing function pointer. – Dmytro Sep 24 '17 at 15:30
  • oh you mean, if someone has a weak pointer to a hash table, but that table is resized? it's a non issue because you never destroy the table; you just stop the world(prevent anything from interacting with the hash table using semaphores/locks) and rehash everything. any later attempt to use table will use the new bucket size when performing hashing. – Dmytro Sep 24 '17 at 15:36
  • So you never delete/free the old hash table after resizing? Would that not eventually result in a huge amount of wasted memory space? – dav Sep 24 '17 at 15:39
  • why would it? a hash table is an abstract data structure, it is just the same as an associative array, except it's a table of buckets instead of a table of associations. you just copy over the buckets. The hash function takes care of which bucket to access and then you search it like a regular associative array. – Dmytro Sep 24 '17 at 15:40
  • I'm thinking because you'll have a bunch of dead, empty, memory space (from the old hash table) that will never be used. – dav Sep 24 '17 at 15:41
  • your only dead space is unused buckets, the rest is association-dense buckets. When you rehash all you're doing is changing the number of buckets. later hashing will just hash against a smaller/larger number of buckets. You can have a one bucket hash table, it would literally be an associative array of key, value structs with a one pointer overhead). – Dmytro Sep 24 '17 at 15:41
  • @david: if you always double the hash table size when increasing it, and never reallocate it smaller, the total of all allocations will be less than twice the maximum extent of the hash table, which might be acceptable. – rici Sep 24 '17 at 15:44
  • I understand, I think I've been resizing wrong too. I should only `realloc` the list of buckets to include more and rehash in the same hash table. I've been creating an entirely new hash table with double the buckets, rehashing to the new table, and deleting the old. – dav Sep 24 '17 at 15:48
  • @David: yes, if your hash table uses buckets which are linked lists of individually allocated cells, then your solution is to simply relink the same cells when you resize. But consider the overhead of that solution compared with an open addressed hash table. All those allocations, plus the chain links, occupy a lot of space. If it more than doubles the space requirements, just keeping all the old allocations around starts to look less crazy, no? Moral: there is always more than one approach, and the right one often depends on your precise use case. – rici Sep 24 '17 at 16:36
  • should I remake the hash table then? – dav Sep 28 '17 at 00:09

2 Answers


Do not use the hash table as the container for the data; only use it to refer to the data, and you won't have that problem.

For example, let's say you have key-value pairs, using a structure with the actual data in the C99 flexible array member:

struct pair {
    struct pair  *next; /* For hash chaining */
    size_t        hash; /* For the raw key hash */

    /* Payload: */
    size_t        offset; /* value starts at (data + offset) */
    char          data[]; /* key starts at (data) */
};

static inline const char *pair_key(struct pair *ref)
{
    return (const char *)(ref->data);
}

static inline const char *pair_value(struct pair *ref)
{
    return (const char *)(ref->data + ref->offset);
}
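
For illustration, one possible constructor for such a pair might look like the sketch below; pair_create is not part of the answer's code, just an assumed helper showing how the key and value can be copied into the single flexible-array allocation:

#include <stdlib.h>
#include <string.h>

/* Hypothetical helper (not from the answer above): allocate one block
   holding the struct, the key, and the value back to back. */
struct pair *pair_create(const char *key, const char *value, size_t key_hash)
{
    const size_t keylen = strlen(key) + 1;   /* include the '\0' */
    const size_t vallen = strlen(value) + 1;
    struct pair *p = malloc(sizeof *p + keylen + vallen);
    if (!p)
        return NULL;
    p->next = NULL;
    p->hash = key_hash;
    p->offset = keylen;                      /* value starts right after the key */
    memcpy(p->data, key, keylen);
    memcpy(p->data + p->offset, value, vallen);
    return p;
}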

Your hash table can then be simply

struct pair_hash_table {
    size_t        size;
    struct pair **entry;
};

If you have struct pair_hash_table *ht, and struct pair *foo with foo->hash containing the hash of the key, then foo should be in the singly-linked list hanging off ht->entry[foo->hash % ht->size].

Let's say you wish to resize the hash table ht. You choose a new size, and allocate enough memory for that many struct pair *. Then, you go through each singly-linked list in each old hash entry, detaching them from the old list, and prepending them to the lists in correct hash table entries in the new hash table. Then you just free the old hash table entry array, replacing it with the new one:

int resize_pair_hash_table(struct pair_hash_table *ht, const size_t new_size)
{
    struct pair **entry, *curr, *next;
    size_t        i, k;

    if (!ht || new_size < 1)
        return -1; /* Invalid parameters */

    entry = malloc(new_size * sizeof entry[0]);
    if (!entry)
        return -1; /* Out of memory */

    /* Initialize new entry array to empty. */
    for (i = 0; i < new_size; i++)
        entry[i] = NULL;

    for (i = 0; i < ht->size; i++) {

        /* Detach the singly-linked list. */
        next = ht->entry[i];
        ht->entry[i] = NULL;

        while (next) {
            /* Detach the next element, as 'curr' */
            curr = next;
            next = next->next;

            /* k is the index to this hash in the new array */
            k = curr->hash % new_size;

            /* Prepend to the list in the new array */
            curr->next = entry[k];
            entry[k] = curr;
        }
    }

    /* Old array is no longer needed, */
    free(ht->entry);

    /* so replace it with the new one. */
    ht->entry = entry;
    ht->size = new_size;

    return 0; /* Success */
}

Note that the hash field in struct pair is not modified, nor recalculated.

Having the raw hash (as opposed to the hash modulo the table size) means you can speed up the key search even when different keys land in the same slot:

struct pair *find_key(struct pair_hash_table *ht,
                      const char *key, const size_t key_hash)
{
    struct pair *curr = ht->entry[key_hash % ht->size];

    while (curr)
        if (curr->hash == key_hash && !strcmp(key, pair_key(curr)))
            return curr;
        else
            curr = curr->next;

    return NULL; /* Not found. */
}

In C, the logical AND operator && short-circuits: if the left side is not true, the right side is not evaluated at all, because the entire expression can never be true in that case.

Above, this means that the raw hash values are compared first, and the actual strings only when the hashes match. If your hash algorithm is even halfway good, then when the key already exists, typically only one string comparison is done; and when the key does not exist in the table, typically no string comparisons are done at all.
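
As a usage sketch (hash_string here is a stand-in for whatever string hash you use, such as the one in the question; it is not a function defined in this answer):

/* Hypothetical call site: compute the raw hash once, then keep the
   returned pair; it stays valid even if the table is resized later. */
size_t h = hash_string("alice");
struct pair *p = find_key(ht, "alice", h);
if (p != NULL)
    printf("%s\n", pair_value(p));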

Nominal Animal
  • Conventional wisdom suggests that storing the computed hash in the hash table entry is not optimal, because it increases memory requirements without providing significant value. It is extremely unlikely that different strings with the same hash will have the same prefix, so comparing the hash value before comparing the strings normally *increases* the number of comparisons performed. It does save the trouble of recalculating on resize, but exponential resizing means that an entry's hash value will be recomputed on average only once. – rici Sep 25 '17 at 16:45
  • (On the other hand, comparing the lengths of the entry's key and the target key before comparing the string values can be a win -- the lengths of two equally-hashed strings are likely to be different, but more importantly you can optimize the comparison code if you know in advance that the two strings are the same length.) – rici Sep 25 '17 at 16:47
  • 1
    @rici: I don't know where you get your conventional wisdom from (I myself don't trust authorities just because they're well respected); I'm ornery, and look at actual measurables to decide what I use. In the key-value hash table case, I typically store the key hash, and the lengths of both the key and the value, and even align and pad the data to native word size to allow for faster comparisons. The extra memory cost is well covered by the simplicity (and long-term maintainability) of the code, and the efficiency of simple implementation -- no "tricks" are needed to get very performant code. – Nominal Animal Sep 25 '17 at 19:38
  • 1
    @rici: That said, I do also like to use simple but less-than-perfect hash functions, like the DJB2 xor hash variant (for stuff that is usually text, that is). So, I'm not claiming you are wrong; I am just saying I do not agree with your claims, definitely not without actual known examples or practical proof. Popularity is not proof: while there are billions of flies, it does not mean that poop necessarily tastes good. – Nominal Animal Sep 25 '17 at 19:41
  • I make the easily testable claim that the probability that two full hashes in a chain differ is *less* than the probability that the first sizeof(hash) bytes of two keys differ. Both of those of those comparisons can be done in the same amount of time. If the hashes compare, as they eventually will if the target key is present, then the key comparison will have to be done as well. So the hash-comparison is wasted cycles. Your fly analogy is cute but a reasoned argument would be more intellectually satisfying. – rici Sep 25 '17 at 21:42
  • @rici: That last claim I can accept; yet, it does not lead to the conclusion that storing the hash in the hash table entries is unwarranted. Not even in the case of exponential resizing policy -- which is another "conventional wisdom" that I consider extremely wasteful for larger numbers of items (in the millions). I usually use a three-stage allocation policy: initial fixed-size allocation, exponential growth up to a compile-time limit (usually in the megabyte range), then linear growth in fixed-size chunks; all three compile-time constants. And don't tell me *"memory is cheap"* [...] – Nominal Animal Sep 26 '17 at 13:39
  • @rici: [...] or *"getting more/better hardware is cheaper than investing the time into development"*, because I habitually work with simulators run on computing clusters (as well as embedded development for fun). Your assertions may well cover the majority of desktop applications -- which I suppose is all you care about --, but that still does not make them universally true, or even sensible as "conventional wisdom". It's just dogma. – Nominal Animal Sep 26 '17 at 13:42
  • You seem to think you know quite a lot about me: what I might tell you, and what systems I care about. This suggests that you are happier arguing with strawmen, so I'll leave you to it. (For what it's worth, you are completely wrong on both counts.) – rici Sep 27 '17 at 03:10
  • @rici: No; this is just awfully similar to arguments I've heard before -- starting at assertions similar to "conventional wisdom says" (although it is even more common to just assert something without any basis or background at all). In my experience, those arguments inevitably lead to those the followups I assumed above (memory is cheap, just buy more hardware, and desktop covers 99% of computing anyway). If you were not going to do that, then I apologize for the unwarranted and unfair characterization. – Nominal Animal Sep 27 '17 at 18:23
  • @DavidTran: There will not be any missing pointers this way. You won't have pointers to the hash table entries at all; you only have pointers to the hashed entries -- and those will not change when you resize the hash table. Consider the `find_key()` function in particular: it returns a pointer to the data entry itself; the function just uses the hash table as a fast way to find it. Even if the hash table was resized immediately afterwards, the pointers returned by `find_key()` do not change. – Nominal Animal Sep 30 '17 at 01:18

You can deal with them the same way the standard library (C++) deals with this exact problem:

Some operations on containers (e.g. insertion, erasing, resizing) invalidate iterators.

For instance, std::unordered_map, which is basically a hash table implemented with buckets, has these rules:

  • insertion

unordered_[multi]{set,map}: all iterators invalidated when rehashing occurs, but references unaffected [23.2.5/8]. Rehashing does not occur if the insertion does not cause the container's size to exceed z * B where z is the maximum load factor and B the current number of buckets. [23.2.5/14]

  • erasure

unordered_[multi]{set,map}: only iterators and references to the erased elements are invalidated [23.2.5/13]

Iterator invalidation rules

The C++ concept of iterators is a generalization of pointers, so the same invalidation rules can be applied to C.


Your only other alternative is that instead of holding the objects directly in the container, you add another level of indirection and hold some sort of proxy. The elements then always stay at the same position in memory; it's the proxies that move around on resizing, inserting, etc. But you need to analyze this scenario: are the added double indirection (which will surely affect performance negatively) and the increased implementation complexity worth it? Is it that important to have persistent pointers?
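
For that second alternative, a minimal sketch in C (the slot and table_get names are illustrative, not from any existing library): the table stores pointers to separately allocated values, so rehashing moves only the slots, while value pointers held by the user stay valid.

#include <stddef.h>
#include <string.h>

/* Each value lives in its own allocation; the bucket array and the
   slots can be reallocated and rehashed freely without moving values. */
struct slot {
    char *key;
    void *value;            /* proxy: points at a stable allocation */
    struct slot *next;
};

/* Returns the stable value pointer; it survives later resizes because
   only the slots move, never the values they point to. */
void *table_get(struct slot **buckets, size_t nbuckets,
                const char *key, unsigned (*hashfn)(const char *))
{
    struct slot *s;
    for (s = buckets[hashfn(key) % nbuckets]; s != NULL; s = s->next)
        if (strcmp(s->key, key) == 0)
            return s->value;
    return NULL;
}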

bolov
  • you are talking about the C++ standard library but this is a C question. (Also, *references* to C++ associative container entries are only invalidated by deletion, not by resizing.) – rici Sep 24 '17 at 15:34
  • @rici iterators are a generalization of pointers, so the C++ concept of iterator invalidation can be applied to C pointers. – bolov Sep 24 '17 at 15:35
  • @rici different type of containers have different rules. – bolov Sep 24 '17 at 15:38
  • Yes, I know. But this question is about hashtables in C, so an answer about vectors in C++ would appear to require at least a couple of disclaimers. C++ hashtables *do* solve the problem OP is asking about. – rici Sep 24 '17 at 15:42
  • To be fair C/C++ are close, you can easily just export std::unordered_map c interfacing from a dynamic library and treat it like a void *, or numbers representing keys in number -> std::unordered_map * map stored in the library, and C could use C++ services, but at a one pointer + function call overhead. – Dmytro Sep 24 '17 at 15:48
  • there's never a problem of invalidation if you prevent different forces from acting on the same universe. When dealing with ANY object/service, you must be sure that it only does one thing at a time via locks/monitors/semaphores, this is why getters/setters become really important when you add threads, "classes" give you races on data for free. and you should iterate collections backwards if you do intend to mutate them mid-iteration to prevent resizing issues. But you shouldn't really loop over buckets unless you're on a single thread universe/trust others not to interfere anyway. – Dmytro Sep 24 '17 at 15:51
  • @Dmitry I never implied to take the implementation of `std::unordered_map`. My whole point is to just consider pointers invalidated after a resize. That is all. The `c++` example is there to show that a respected standard library does it this way so it can be a valid option. – bolov Sep 24 '17 at 16:06
  • nobody should ever take pointers to buckets, as they can be changed at rehashing, and pointers to records should always be valid because they remain the same after hashing unless they were deleted. you don't delete/add anything during rehashing, you just move the records to new buckets. If you do remove something, the key would be invalid, but the value will still be valid unless the value object itself was deleted. but that's going into handle invalidation. – Dmytro Sep 24 '17 at 16:08
  • @Dmitry buckets can grow/shrink. This most likely means `realloc`. This can invalidate pointers. And I never said anything about taking pointers to buckets. – bolov Sep 24 '17 at 16:15
  • realloc only occurs on the bucket container(the "data" of the hash table object), and buckets(the "data" referenced by each bucket), the records consist of key, value pairs, the keys will get invalidated, the values are just pointers, so they won't get invalidated. If you do however store values as non pointers, they will get invalidated, yes. but if they are pointers, they will stay the same between resizes, the pointer's address may change, but the value of the pointer won't. – Dmytro Sep 24 '17 at 16:16
  • @Dmitry not necessary. You can store the values directly or you can store pointers to values. You can store the values directly or you can store pointers to values. All 4 combinations are valid depending on the needs – bolov Sep 24 '17 at 16:19
  • that's my point. you can get efficiency by storing types like int directly in records but then you can't fix invalidation without resorting to events, and having each object listen on such events via callbacks. which is more expensive. When you commit to such tables, you might wanna never rehash them, or only use their values as values and never point to them. – Dmytro Sep 24 '17 at 16:21
  • I've seen people simply return a pointer to a copy of the returned data, not a pointer to the original data. That way a change to the hashtable wouldn't affect anything on the outside. It would be the user's responsibility to check the hashtable again if a value changed for a given key. Of course there is the additional cost of copying. – synchronizer Sep 24 '17 at 16:29
  • In C++, `std::dequeue` also preserves references during resizing, and it does that without storing every value in a separate allocation. So there are other alternatives, as well. – rici Sep 24 '17 at 16:30
  • as far as i know dequeue is implemented using linked lists, so the discussion about performance goes out the window. im guessing it just doesn't realloc the bucket data to grow it. if you are reallocating the bucket to grow it/shrink it, you will invalidate the non separate allocation pointers. – Dmytro Sep 24 '17 at 16:32
  • @dmitry: `std::deque` is required to provide random access to elements in amortized constant time, which is not possible with a linked list. – rici Sep 24 '17 at 21:29
  • unlike vectors, deques are not guaranteed to store all its elements in contiguous storage locations. (http://www.cplusplus.com/reference/deque/deque/) – Dmytro Sep 24 '17 at 22:30
  • @Dmitry: that is correct. The normal implementation is to store elements in a number of blocks of contiguous elements. The blocks are indexed by a vector of block pointers, not a linked list. That still allows for constant-time random access, and amortized constant time insertion at either end of the list (the size of a block is fixed but occasionally it will be necessary to increase the size of the block index vector.) – rici Sep 24 '17 at 22:51
  • I meant that it's a linked list in that it it is a linked collection. beside that, there's the same issue as taking a pointer of a value in std::vector which isn't a reference, it may be invalidated by resizing. – Dmytro Sep 24 '17 at 22:55
  • @Dmitry: it is not a linked list of blocks either; you can't achieve amortized constant random access that way. And references are not invalidated on resizing or insertion/deletion at either end. Only if the referenced element is deleted or there is an insertion/deletion in the middle. These are guaranteed by the standard. – rici Sep 25 '17 at 13:32