Note about storing both null-terminated strings and binary data in an array
In your question, you specify that the fixed-length arrays could represent
- null-terminated strings or
- binary data.
If your program has no way of knowing which one of these two possibilities is represented by the array contents, then, if the array contents is supposed to represent a null-terminated string, you will have to fill the remainder of the array with null characters (for example using the function strncpy). It won't be sufficient to only write a single null terminating character.
This is important, because if you don't do this, your program will have no way of knowing how to interpret the remaining data after it encounters a byte with the value 0
. It won't know whether to
interpret this byte as a null-terminating character of a string and ignore all remaining bytes of the string, or
interpret this byte as binary data and treat the remaining bytes also as binary data (i.e. not ignore the remaining bytes).
However, if you always fill all remaining bytes after a terminating null character of a string with null characters, then your program won't habe to worry about whether the array contents represents a string or binary data, because it can treat both the same way.
Therefore, in the remainder of my answer, I will assume that if the array contents represents a string, then all remaining bytes after the end of the string will be filled with bytes with the value 0
. That way, I can assume that no bytes should ever be ignored.
Hash table solution
In Python, I would use a set/hashtable/dict and not a list (because of very fast O(1) lookup):
It is also possible to use a hash table in C, although you will have to program it yourself or use an already existing library. The C standard library does not provide any hashing functions.
The following code will randomly generate an array of 10,000 fixed-length (not null-terminated) strings of length 10 with the characters a
to z
, A
to Z
and 0
to 9
, but it will also place three hard-coded words into the array in different places, so you can search for these words later, in order to test the search function. The program will then insert all words into a hash table, and then perform several hard-coded lookups into the hash table.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <stdbool.h>
#define NUM_ARRAYS 10000
#define ARRAY_LENGTH 10
#define HASH_TABLE_SIZE 10000
struct hash_table_entry
{
struct hash_table_entry *next;
unsigned char data[ARRAY_LENGTH];
};
//function prototype declarations
void fill_with_random_data( unsigned char data[NUM_ARRAYS][ARRAY_LENGTH] );
size_t hash_array( const unsigned char array[ARRAY_LENGTH] );
void build_hash_table_from_data( unsigned char data[][ARRAY_LENGTH], size_t data_length, struct hash_table_entry *hash_table[HASH_TABLE_SIZE] );
void cleanup_hash_table( struct hash_table_entry *hash_table[HASH_TABLE_SIZE] );
bool lookup( const unsigned char array[ARRAY_LENGTH], struct hash_table_entry *hash_table[HASH_TABLE_SIZE] );
bool verbose_lookup( const unsigned char array[ARRAY_LENGTH], struct hash_table_entry *hash_table[HASH_TABLE_SIZE] );
int main( void )
{
//declare 2D array for the input data
static unsigned char data[NUM_ARRAYS][ARRAY_LENGTH];
//declare hash table and initialize all fields to zero
static struct hash_table_entry *hash_table[HASH_TABLE_SIZE] = {NULL};
//seed random number generator
srand( (unsigned)time( NULL ) );
//fill array "data" with random data
printf( "Generating random data..." );
fflush( stdout );
fill_with_random_data( data );
printf( "done\n" );
fflush( stdout );
//overwrite a few strings, so that we can search for
//them later
_Static_assert( NUM_ARRAYS > 500, "unexpected value" );
_Static_assert( ARRAY_LENGTH == 10, "unexpected value" );
memcpy( data[47], "abcdefghij", 10 );
memcpy( data[218], "klmnopqrst", 10 );
memcpy( data[419], "uvwxyz0123", 10 );
//build hash table
printf( "Building hash table..." );
fflush( stdout );
build_hash_table_from_data( data, NUM_ARRAYS, hash_table );
printf( "done\n" );
fflush( stdout );
//perform lookups
verbose_lookup( (unsigned char *)"uvwxyz0123", hash_table );
verbose_lookup( (unsigned char *)"hsdkesldhg", hash_table );
verbose_lookup( (unsigned char *)"abcdefghij", hash_table );
verbose_lookup( (unsigned char *)"erlsodn3ls", hash_table );
verbose_lookup( (unsigned char *)"klmnopqrst", hash_table );
//cleanup
printf( "Performing cleanup..." );
fflush( stdout );
cleanup_hash_table( hash_table );
printf( "done\n" );
fflush( stdout );
}
void fill_with_random_data( unsigned char data[NUM_ARRAYS][ARRAY_LENGTH] )
{
static const unsigned char random_chars[62] =
"abcdefghijklmnopqrstuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"0123456789";
for ( size_t i = 0; i < NUM_ARRAYS; i++ )
{
for ( size_t j = 0; j < ARRAY_LENGTH; j++ )
{
data[i][j] = random_chars[rand()%sizeof random_chars];
}
}
}
//This is a simple hash function. Depending on the type of data, a
//more complex hashing function may be required, in order to prevent
//hash collisions.
size_t hash_array( const unsigned char array[ARRAY_LENGTH] )
{
size_t hash = 0;
for ( size_t i = 0; i < ARRAY_LENGTH; i++ )
{
hash = hash * 7 + array[i];
}
return hash % HASH_TABLE_SIZE;
}
void build_hash_table_from_data( unsigned char data[][ARRAY_LENGTH], size_t data_length, struct hash_table_entry *hash_table[HASH_TABLE_SIZE] )
{
for ( size_t i = 0; i < data_length; i++ )
{
struct hash_table_entry **pp, *p;
//hash the array and store the result
size_t hash = hash_array( data[i] );
//perform hash table lookup
pp = &hash_table[hash];
//make pp point to the last null pointer in the list
while ( ( p = *pp ) != NULL )
pp = &p->next;
//allocate memory for linked list node
p = malloc( sizeof *p );
if ( p == NULL )
{
fprintf( stderr, "Memory allocation failure!\n" );
exit( EXIT_FAILURE );
}
//fill in data for new node
p->next = NULL;
memcpy( p->data, data[i], ARRAY_LENGTH );
//insert new node into linked list
*pp = p;
}
}
void cleanup_hash_table( struct hash_table_entry *hash_table[HASH_TABLE_SIZE] )
{
for ( size_t i = 0; i < HASH_TABLE_SIZE; i++ )
{
struct hash_table_entry *p;
p = hash_table[i];
while ( p != NULL )
{
struct hash_table_entry *q = p;
p = p->next;
free( q );
}
}
}
bool lookup( const unsigned char array[ARRAY_LENGTH], struct hash_table_entry *hash_table[HASH_TABLE_SIZE] )
{
size_t hash;
struct hash_table_entry *p;
//find hash
hash = hash_array( array );
//index into hash table
p = hash_table[hash];
//attempt to find array in linked list
while ( p != NULL )
{
if ( memcmp( array, p->data, ARRAY_LENGTH ) == 0 )
{
//array was found
return true;
}
p = p->next;
}
//array was not found
return false;
}
//This function serves as a wrapper function for the
//function "lookup". It prints to "stdout" what it is
//looking for and what the result of the lookup was.
bool verbose_lookup( const unsigned char array[ARRAY_LENGTH], struct hash_table_entry *hash_table[HASH_TABLE_SIZE] )
{
//print message
printf( "Searching for %.*s...", ARRAY_LENGTH, array );
fflush( stdout );
if ( lookup( array, hash_table ) )
{
printf( "found\n" );
fflush( stdout );
return true;
}
else
{
printf( "not found\n" );
fflush( stdout );
return false;
}
}
This program has the following output:
Generating random data...done
Building hash table...done
Searching for uvwxyz0123...found
Searching for hsdkesldhg...not found
Searching for abcdefghij...found
Searching for erlsodn3ls...not found
Searching for klmnopqrst...found
Performing cleanup...done
As you can see, it found all 3 strings that were explicitly inserted, and no other strings were found.
In the code, I am using unsigned char
instead of char
, because in C, a char *
is usually used for passing null-terminated strings. Therefore, it seemed more appropriate to use unsigned char *
for passing data that could be binary or not null-terminated.
bsearch
solution
For comparison, here is a solution which generates the input array in the same way, but uses bsearch
instead of a hash table for searching it:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <stdbool.h>
#define NUM_ARRAYS 10000
#define ARRAY_LENGTH 10
//function prototype declarations
void fill_with_random_data( unsigned char data[NUM_ARRAYS][ARRAY_LENGTH] );
int compare_func( const void *a, const void *b );
bool verbose_search( const unsigned char array[ARRAY_LENGTH], unsigned char data[NUM_ARRAYS][ARRAY_LENGTH] );
int main( void )
{
//declare 2D array for the input data
static unsigned char data[NUM_ARRAYS][ARRAY_LENGTH];
//seed random number generator
srand( (unsigned)time( NULL ) );
//fill array "data" with random data
printf( "Generating random data..." );
fflush( stdout );
fill_with_random_data( data );
printf( "done\n" );
fflush( stdout );
//overwrite a few strings, so that we can search for
//them later
_Static_assert( NUM_ARRAYS > 500, "unexpected value" );
_Static_assert( ARRAY_LENGTH == 10, "unexpected value" );
memcpy( data[47], "abcdefghij", 10 );
memcpy( data[218], "klmnopqrst", 10 );
memcpy( data[419], "uvwxyz0123", 10 );
//sort the array
printf( "Sorting array..." );
fflush( stdout );
qsort( data, NUM_ARRAYS, ARRAY_LENGTH, compare_func );
printf( "done\n" );
fflush( stdout );
//perform lookups
verbose_search( (unsigned char *)"uvwxyz0123", data );
verbose_search( (unsigned char *)"hsdkesldhg", data );
verbose_search( (unsigned char *)"abcdefghij", data );
verbose_search( (unsigned char *)"erlsodn3ls", data );
verbose_search( (unsigned char *)"klmnopqrst", data );
}
void fill_with_random_data( unsigned char data[NUM_ARRAYS][ARRAY_LENGTH] )
{
static const unsigned char random_chars[62] =
"abcdefghijklmnopqrstuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"0123456789";
for ( size_t i = 0; i < NUM_ARRAYS; i++ )
{
for ( size_t j = 0; j < ARRAY_LENGTH; j++ )
{
data[i][j] = random_chars[rand()%sizeof random_chars];
}
}
}
int compare_func( const void *a, const void *b )
{
return memcmp( a, b, ARRAY_LENGTH );
}
//This function serves as a wrapper function for the
//function "bsearch". It prints to "stdout" what it is
//looking for and what the result of the search was.
bool verbose_search( const unsigned char array[ARRAY_LENGTH], unsigned char data[NUM_ARRAYS][ARRAY_LENGTH] )
{
//print message
printf( "Searching for %.*s...", ARRAY_LENGTH, array );
fflush( stdout );
if ( bsearch( array, data, NUM_ARRAYS, ARRAY_LENGTH, compare_func ) != NULL )
{
printf( "found\n" );
fflush( stdout );
return true;
}
else
{
printf( "not found\n" );
fflush( stdout );
return false;
}
}
This program has the following output:
Generating random data...done
Sorting array...done
Searching for uvwxyz0123...found
Searching for hsdkesldhg...not found
Searching for abcdefghij...found
Searching for erlsodn3ls...not found
Searching for klmnopqrst...found
The hash table solution is probably faster than the binary search solution though, assuming that a good hash function is selected for the input, so that the number of hash collisions is minimal.