You can use a single binary search tree (AVL/Red-black/...) to contain all the strings, from all sets, by keying them lexicographically as (set_number, string). You don't need to store sets explicitly anywhere. For example, the comparator defining the order of nodes for the tree could look like:
function compare_nodes (node1, node2) {
if (node1.set_number < node2.set_number) return LESS;
if (node1.set_number > node2.set_number) return GREATER;
if (node1.string < node2.string) return LESS;
if (node1.string > node2.string) return GREATER;
return EQUAL;
}
With such a structure, some common operations are possible (but maybe not straightforward).
To find whether a string s
exists in the set set_number
, simply lookup (set_number
, s
) in the tree, for an exact match.
To find all strings in the set set_number
:
function iterate_all_strings_in_set (set_number) {
// Traverse the tree from root downwards, looking for the given key. Return
// wherever the search ends up, whether it found the value or not.
node = lookup_tree_weak(set_number, "");
// tree empty?
if (node == null) {
return;
}
// We may have gotten the greatest node from the previous set,
// instead of the first node from the set we're interested in.
if (node.set_number != set_number) {
node = successor(node);
}
while (node != null && node.set_number == set_number) {
do_something_with(node.string);
node = successor(node);
}
}
The above requires O((k+1)*log(n))
time, where k
is the number of strings in set_number
, and n
is the number of all strings.
To find all set numbers with at least one string associated:
function iterate_all_sets ()
{
node = first_node_in_tree();
while (node != null) {
current_set = node.set_number;
do_something_with(current_set);
if (cannot increment current_set) {
return;
}
node = lookup_tree_weak(current_set + 1, "");
if (node.set_number == current_set) {
node = successor(node);
}
}
}
The above requires O((k+1)*log(n))
time, where k
is the number of sets with at least one string, and n
is the number of all strings.
Note that the above code assumes that the tree is not modified in the "do_something" calls; it may crash if nodes are removed.
Addidionally, here's some real C code which demonstrates this, using my own generic AVL tree implemetation. To compile it, it's enough to copy the misc/
and structure/
folders from BadVPN source somewhere and add an include path there.
Note how my AVL tree does not contain any "data" in its nodes, and how it doesn't do any of its own memory allocation. This comes handy when you have a lot of data to work with. To make it clear: the program below does only a single malloc()
, which is the one that allocates the nodes array.
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <assert.h>
#include <structure/BAVL.h>
#include <misc/offset.h>
struct value {
uint32_t set_no;
char str[3];
};
struct node {
uint8_t is_used;
struct value val;
BAVLNode tree_node;
};
BAVL tree;
static int value_comparator (void *unused, void *vv1, void *vv2)
{
struct value *v1 = vv1;
struct value *v2 = vv2;
if (v1->set_no < v2->set_no) {
return -1;
}
if (v1->set_no > v2->set_no) {
return 1;
}
int c = strcmp(v1->str, v2->str);
if (c < 0) {
return -1;
}
if (c > 0) {
return 1;
}
return 0;
}
static void random_bytes (unsigned char *out, size_t n)
{
while (n > 0) {
*out = rand();
out++;
n--;
}
}
static void random_value (struct value *out)
{
random_bytes((unsigned char *)&out->set_no, sizeof(out->set_no));
for (size_t i = 0; i < sizeof(out->str) - 1; i++) {
out->str[i] = (uint8_t)32 + (rand() % 94);
}
out->str[sizeof(out->str) - 1] = '\0';
}
static struct node * find_node (const struct value *val)
{
// find AVL tree node with an equal value
BAVLNode *tn = BAVL_LookupExact(&tree, (void *)val);
if (!tn) {
return NULL;
}
// get node pointer from pointer to its value (same as container_of() in Linux kernel)
struct node *n = UPPER_OBJECT(tn, struct node, tree_node);
assert(n->val.set_no == val->set_no);
assert(!strcmp(n->val.str, val->str));
return n;
}
static struct node * lookup_weak (const struct value *v)
{
BAVLNode *tn = BAVL_Lookup(&tree, (void *)v);
if (!tn) {
return NULL;
}
return UPPER_OBJECT(tn, struct node, tree_node);
}
static struct node * first_node (void)
{
BAVLNode *tn = BAVL_GetFirst(&tree);
if (!tn) {
return NULL;
}
return UPPER_OBJECT(tn, struct node, tree_node);
}
static struct node * next_node (struct node *node)
{
BAVLNode *tn = BAVL_GetNext(&tree, &node->tree_node);
if (!tn) {
return NULL;
}
return UPPER_OBJECT(tn, struct node, tree_node);
}
size_t num_found;
static void iterate_all_strings_in_set (uint32_t set_no)
{
struct value v;
v.set_no = set_no;
v.str[0] = '\0';
struct node *n = lookup_weak(&v);
if (!n) {
return;
}
if (n->val.set_no != set_no) {
n = next_node(n);
}
while (n && n->val.set_no == set_no) {
num_found++; // "do_something_with_string"
n = next_node(n);
}
}
static void iterate_all_sets (void)
{
struct node *node = first_node();
while (node) {
uint32_t current_set = node->val.set_no;
iterate_all_strings_in_set(current_set); // "do_something_with_set"
if (current_set == UINT32_MAX) {
return;
}
struct value v;
v.set_no = current_set + 1;
v.str[0] = '\0';
node = lookup_weak(&v);
if (node->val.set_no == current_set) {
node = next_node(node);
}
}
}
int main (int argc, char *argv[])
{
size_t num_nodes = 10000000;
// init AVL tree, using:
// key=(struct node).val,
// comparator=value_comparator
BAVL_Init(&tree, OFFSET_DIFF(struct node, val, tree_node), value_comparator, NULL);
printf("Allocating...\n");
// allocate nodes (missing overflow check...)
struct node *nodes = malloc(num_nodes * sizeof(nodes[0]));
if (!nodes) {
printf("malloc failed!\n");
return 1;
}
printf("Inserting %zu nodes...\n", num_nodes);
size_t num_inserted = 0;
// insert nodes, giving them random values
for (size_t i = 0; i < num_nodes; i++) {
struct node *n = &nodes[i];
// choose random set number and string
random_value(&n->val);
// try inserting into AVL tree
if (!BAVL_Insert(&tree, &n->tree_node, NULL)) {
printf("Insert collision: (%"PRIu32", '%s') already exists!\n", n->val.set_no, n->val.str);
n->is_used = 0;
continue;
}
n->is_used = 1;
num_inserted++;
}
printf("Looking up...\n");
// lookup all those values
for (size_t i = 0; i < num_nodes; i++) {
struct node *n = &nodes[i];
struct node *lookup_n = find_node(&n->val);
if (n->is_used) { // this node is the only one with this value
ASSERT(lookup_n == n)
} else { // this node was an insert collision; some other
// node must have this value
ASSERT(lookup_n != NULL)
ASSERT(lookup_n != n)
}
}
printf("Iterating by sets...\n");
num_found = 0;
iterate_all_sets();
ASSERT(num_found == num_inserted)
printf("Removing all strings...\n");
for (size_t i = 0; i < num_nodes; i++) {
struct node *n = &nodes[i];
if (!n->is_used) { // must not remove it it wasn't inserted
continue;
}
BAVL_Remove(&tree, &n->tree_node);
}
return 0;
}