The sizes of some buffers (66, 1466) reminded me that I helped another poster with a quite similar problem. They got one of the large Yelp files, massaged into a 90MB CSV file with the names (some included spaces) as the keys and the rest as data. Using the name as the key means that that key is not unique, of course, and I proposed a linked list for the data to handle it (was the simplest I could come up with).
Another problem: the CSV file was sorted, so a simple BST would be quite skewed, it needs to be balanced.
As it is very similar to the problem of the OP, although not the actual question, I take the liberty to post it here. May it be helpful.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
// make a string out of a number macro
#define MAKE_STRING(x) #x
#define CONVERT_TO_STRING(x) MAKE_STRING(x)
#define MAX(x,y) (((x) > (y)) ? (x) : (y))
// For search-key buffer
#define MAXNAMELENGTH 512
#define MAXSCANF CONVERT_TO_STRING(MAXNAMELENGTH)
// linked-list node to hold the actual data
// TODO: extend to a skip-list/another tree
struct list {
char *data;
struct list *next;
};
typedef struct list list;
// AVL tree
struct node {
// should have been named "key", I s'ppose?
char *name;
// Data goes in a linked list
list *ll;
struct node *left;
struct node *right;
// the single extra information needed for balancing: the level
int height;
};
typedef struct node node;
node *search(node ** tree, char *key);
int height(node * tree);
int diff_height(node * tree);
node *rot_left(node * tree);
node *rot_right(node * tree);
int insertion(node ** r, char *key, char *value);
void bt_print(node * leaf);
void deltree(node * tree);
//void trim(char *source);
// getline() used for clarity of this example.
// Uses a version from the Stackoverflow documentation
// if not available natively. See getline() at the end
// of this code for a link
#if !(defined _POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809L
# include <sys/types.h>
ssize_t getline(char **pline_buf, size_t * pn, FILE * fin);
#endif
#ifdef SORTED_LIST
int ll_sorted_insert(list ** head, char *value);
#else
int ll_insert(list ** head, char *value);
#endif
void ll_free(list * knot);
void ll_print(list * knot);
void ll_fprint(FILE * fp, list * knot);
// int ll_search(list * knot, char *value);
int main(int argc, char **argv)
{
node *root;
FILE *fp;
int res;
size_t bufsize = MAXNAMELENGTH + 2;
char *buffer;
char *keyin = NULL;
char *valuein = NULL;
char *comma;
char *point;
char *line_buf = NULL;
size_t line_buf_size = 0;
ssize_t line_size;
int exitvalue = EXIT_SUCCESS;
int processed = 0;
root = NULL;
// must have at least one argument
if (argc < 2) {
fprintf(stderr, "Usage: %s datafile [< key-list]\n", argv[0]);
exit(EXIT_FAILURE);
}
// tried to keep everything on the heap.
buffer = malloc(bufsize);
if (buffer == NULL) {
fprintf(stderr, "Malloc failed\n");
exit(EXIT_FAILURE);
}
// the datafile
fp = fopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Opening \"%s\" for reading failed\n", argv[1]);
exit(EXIT_FAILURE);
}
// takes search keys from stdin only. You might see a need to change that
// main difference to fgets(): getline() takes care of the memory
while ((line_size = getline(&line_buf, &line_buf_size, fp)) > 0) {
if (ferror(fp)) {
// TODO: check errno for the exact kind of error
fprintf(stderr, "An eror occured during reading \"%s\"\n", argv[1]);
exitvalue = EXIT_FAILURE;
goto _END;
} else if (feof(fp)) {
break;
}
// assuming strict KEY","DATA"\n" and without whitespace
//TODO: check if line_size < bufsize!
// valuein points to the comma before DATA
valuein = strchr(line_buf, ',');
// skip comma
valuein++;
// valuein points to DATA now
// might have no new line at the end
if ((point = strchr(valuein, '\n')) != NULL) {
*point = '\0';
}
//printf("%s",valuein);
// comma points to the comma before DATA
comma = strchr(line_buf, ',');
// make *comma NUL, that is: replace ',' with '\0'
*comma = '\0';
keyin = line_buf;
// keyin points to KEY now
if (!insertion(&root, keyin, valuein)) {
fprintf(stderr, "An eror occured during inserting \"%s\"\n", keyin);
exitvalue = EXIT_FAILURE;
goto _END;
}
if (++processed % 1000 == 0) {
printf("Processing line %d\n", processed);
}
}
// bt_print(root);
free(line_buf);
// search-keys come from either stdin or get typed in.
// things typed in are also stdin
puts("Searching...");
while ((res = scanf("%" MAXSCANF "s", buffer)) == 1) {
if (strcmp(buffer, "exit") == 0) {
break;
}
// ignoring return for now
(void) search(&root, buffer);
}
_END:
// clean up
deltree(root);
free(buffer);
fclose(fp);
exit(exitvalue);
}
// Not used here, code is already overly cluttered
/*
#include <ctype.h>
// trim whitespace, fore and aft
// look into your textbook, you might find it there, too
char *trim_both(char *s)
{
char *end;
while (isspace(*s)) {
s++;
}
if (*s == '\0') {
return s;
}
end = s + stringlength(s) - 1;
while (end > s && isspace(*end)) {
end--;
}
*(end + 1) = '\0';
return s;
}
*/
// traverse tree (defauls to preorder) and print the linked lists at the nodes
void bt_print(node * leaf)
{
if (leaf) {
#ifdef PRINT_POSTORDER
bt_print(leaf->left);
bt_print(leaf->right);
puts("LEAF_START");
printf("%s\n", leaf->name);
ll_print(leaf->ll);
puts("LEAF_END\n");
#elif (defined PRINT_INORDER)
bt_print(leaf->left);
puts("LEAF_START");
printf("%s\n", leaf->name);
ll_print(leaf->ll);
puts("LEAF_END\n");
#else
puts("LEAF_START");
printf("%s\n", leaf->name);
ll_print(leaf->ll);
puts("LEAF_END\n");
bt_print(leaf->left);
bt_print(leaf->right);
#endif
}
}
// (simple) implementation of an AVL tree
// or use a macro if "inline" is not possible
// get height
inline int height(node * tree)
{
return (tree == NULL) ? 0 : tree->height;
}
inline int diff_height(node * tree)
{
return (tree == NULL) ? 0 : height(tree->left) - height(tree->right);
}
// rotation of branch for balancing
node *rot_left(node * tree)
{
node *right = tree->right;
// temporary variable for the swap, pardon, rotation
node *left;
// rotate
left = right->left;
right->left = tree;
tree->right = left;
// increment heights
tree->height = MAX(height(tree->left), height(tree->right)) + 1;
right->height = MAX(height(right->left), height(right->right)) + 1;
return right;
}
// rotation of branch for balancing
node *rot_right(node * tree)
{
node *left = tree->left;
node *right;
right = left->right;
left->right = tree;
tree->left = right;
tree->height = MAX(height(tree->left), height(tree->right)) + 1;
left->height = MAX(height(left->left), height(left->right)) + 1;
return left;
}
// insert a key/value pair into the tree
int insertion(node ** r, char *key, char *value)
{
int heightdiff = 0;
if (*r == NULL) {
// allocate memory for a new node
*r = malloc(sizeof(node));
if (r == NULL) {
return 0;
}
// the caller gave us nothing but pointers to the actual content
// so allocate memory...
(*r)->name = malloc(strlen(key) + 1);
if ((*r)->name == NULL) {
return 0;
}
// and make a copy
strcpy((*r)->name, key);
// Must be NULL for ll_sorted_insert() to work
(*r)->ll = NULL;
// insert value into a linked list either sorted or
// or as it comes
#ifdef SORTED_LIST
ll_sorted_insert(&(*r)->ll, value);
#else
ll_insert(&(*r)->ll, value);
#endif
// terminate node
(*r)->left = NULL;
(*r)->right = NULL;
// set height, used for the AVL-tree balancing, to one
(*r)->height = 1;
}
// search a place for the key
// Checks of returns omitted for clarity
else if (strcmp(key, (*r)->name) < 0) {
insertion(&(*r)->left, key, value);
} else if (strcmp(key, (*r)->name) > 0) {
insertion(&(*r)->right, key, value);
} else {
// insert the data for the key into the linked list
#ifdef SORTED_LIST
ll_sorted_insert(&(*r)->ll, value);
#else
ll_insert(&(*r)->ll, value);
#endif
return 1;
}
// And now balance the tree
// see your textbook or even wikipedia for an explanation
// (they have colored pictures!)
(*r)->height = MAX(height((*r)->left), height((*r)->right)) + 1;
heightdiff = diff_height(*r);
// left-left
if (heightdiff > 1 && strcmp(key, (*r)->left->name) < 0) {
*r = rot_right(*r);
return 1;
}
// right-right
if (heightdiff < -1 && strcmp(key, (*r)->right->name) > 0) {
*r = rot_left(*r);
return 1;
}
// left-right
if (heightdiff > 1 && strcmp(key, (*r)->left->name) > 0) {
*r = rot_right(*r);
return 1;
}
// right-left
if (heightdiff < -1 && strcmp(key, (*r)->right->name) < 0) {
*r = rot_left(*r);
return 1;
}
return 1;
}
// insert data into a linked list. Sorted here...
int ll_sorted_insert(list ** head, char *value)
{
list *current;
list *llnode;
llnode = malloc(sizeof(list));
if (llnode == NULL) {
return 0;
}
// as with the key, we only got a pointer and need to take care of the
// memory here
llnode->data = malloc(strlen(value) + 1);
if (llnode->data == NULL) {
return 0;
}
strcpy(llnode->data, value);
if (*head == NULL || strcmp((*head)->data, value) >= 0) {
llnode->next = *head;
*head = llnode;
} else {
// put before
current = *head;
while (current->next != NULL &&
strcmp(current->next->data, llnode->data) < 0) {
current = current->next;
}
llnode->next = current->next;
current->next = llnode;
}
return 1;
}
// ... and here as it comes in
int ll_insert(list ** head, char *value)
{
list *llnode;
llnode = malloc(sizeof(list));
if (llnode == NULL) {
return 0;
}
llnode->data = malloc(strlen(value) + 1);
if (llnode->data == NULL) {
return 0;
}
// put in front
strcpy(llnode->data, value);
llnode->next = *head;
*head = llnode;
return 1;
}
// traverse the tree and delete everything
void deltree(node * tree)
{
if (tree) {
deltree(tree->left);
deltree(tree->right);
ll_free(tree->ll);
free(tree->name);
free(tree);
tree = NULL;
}
}
node *search(node ** tree, char *key)
{
node *tmp;
if (!(*tree)) {
// search exhausted (or no tree at all)
printf("NOT FOUND %s\n", key);
return NULL;
} else {
if (strcmp(key, (*tree)->name) < 0) {
tmp = search(&(*tree)->left, key);
return tmp;
} else if (strcmp(key, (*tree)->name) > 0) {
tmp = search(&(*tree)->right, key);
return tmp;
} else {
// found, print the linked list
printf("FOUND %s\n", key);
ll_print((*tree)->ll);
return *tree;
}
}
}
// Helpers for the linked list
// free the linked list
void ll_free(list * knot)
{
list *cur;
while (knot != NULL) {
cur = knot;
free(knot->data);
knot = knot->next;
free(cur);
}
}
// print the linked list to stdout
void ll_print(list * knot)
{
while (knot != NULL) {
printf("\t%s\n", knot->data);
knot = knot->next;
}
}
// search list, not used here
/*
int ll_search(list * knot, char *value)
{
while (knot != NULL) {
if(strcmp(knot->data, value) == 0){
return 1;
}
knot = knot->next;
}
return 0;
}
*/
// getline() from http://stackoverflow.com/documentation/c/507/files-and-i-o-streams/8274/get-lines-from-a-file-using-getline
#include <errno.h>
#include <stdint.h>
#if !(defined _POSIX_C_SOURCE)
typedef long int ssize_t;
#endif
/* Only include our version of getline() if the POSIX version isn't available. */
#if !(defined _POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809L
# if !(defined SSIZE_MAX)
# define SSIZE_MAX (SIZE_MAX >> 1)
# endif
ssize_t getline(char **pline_buf, size_t * pn, FILE * fin)
{
const size_t INITALLOC = 16;
const size_t ALLOCSTEP = 16;
size_t num_read = 0;
if ((NULL == pline_buf) || (NULL == pn) || (NULL == fin)) {
errno = EINVAL;
return -1;
}
if (NULL == *pline_buf) {
*pline_buf = malloc(INITALLOC);
if (NULL == *pline_buf) {
return -1;
} else {
*pn = INITALLOC;
}
}
{
int c;
while (EOF != (c = getc(fin))) {
num_read++;
if (num_read >= *pn) {
size_t n_realloc = *pn + ALLOCSTEP;
char *tmp = realloc(*pline_buf, n_realloc + 1);
if (NULL != tmp) {
*pline_buf = tmp;
*pn = n_realloc;
} else {
return -1;
}
if (SSIZE_MAX < *pn) {
errno = ERANGE;
return -1;
}
}
(*pline_buf)[num_read - 1] = (char) c;
if (c == '\n') {
break;
}
}
if (EOF == c) {
errno = 0;
return -1;
}
}
(*pline_buf)[num_read] = '\0';
return (ssize_t) num_read;
}
#endif
Some of the necessary checks are omitted for brevity!
$ gcc-4.9 -O3 -g3 -W -Wall -Wextra -std=c11 balancbst.c -o balancbst
$ wc < yelp_academic_dataset_user_alternative.csv
552275 17307225 93975270
$ valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes ./balancbst yelp_academic_dataset_user_alternative.csv
==9100== Memcheck, a memory error detector
==9100== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==9100== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==9100== Command: ./Yelp1M yelp_academic_dataset_user_alternative.csv
==9100==
Processing line 1000
Processing line 2000
...
Processing line 550000
Processing line 551000
Processing line 552000
Searching...
aron
FOUND aron
yelping_since: 2008 01 || votes: funny 17 useful 19 cool 15 || name: aron || type: user || compliments: funny 2 plain 4 writer 3 note 4 hot 2 cool 1 || fans: 2 || average_stars: 4 28 || review_count: 45 ||
Aron
FOUND Aron
yelping_since: 2011 01 || votes: funny 2 useful 14 cool 4 || name: Aron || type: user || compliments: note 1 || fans: 0 || average_stars: 3 91 || review_count: 35 ||
yelping_since: 2009 07 || votes: funny 0 useful 5 cool 2 || name: Aron || type: user || fans: 0 || average_stars: 3 6 || review_count: 5 ||
yelping_since: 2012 07 || votes: funny 0 useful 4 cool 0 || name: Aron || type: user || fans: 0 || average_stars: 5 0 || review_count: 5 ||
yelping_since: 2011 03 || votes: funny 1 useful 16 cool 3 || name: Aron || type: user || fans: 0 || average_stars: 3 75 || review_count: 8 ||
yelping_since: 2012 10 || votes: funny 0 useful 2 cool 1 || name: Aron || type: user || fans: 0 || average_stars: 5 0 || review_count: 1 ||
yelping_since: 2013 07 || votes: funny 0 useful 0 cool 0 || name: Aron || type: user || fans: 0 || average_stars: 5 0 || review_count: 1 ||
yelping_since: 2010 01 || votes: funny 0 useful 0 cool 0 || name: Aron || type: user || fans: 0 || average_stars: 5 0 || review_count: 2 ||
yelping_since: 2012 11 || votes: funny 0 useful 5 cool 1 || name: Aron || type: user || fans: 0 || average_stars: 4 29 || review_count: 14 ||
yelping_since: 2011 11 || votes: funny 8 useful 9 cool 2 || name: Aron || type: user || fans: 0 || average_stars: 2 57 || review_count: 21 ||
yelping_since: 2012 03 || votes: funny 0 useful 9 cool 1 || name: Aron || type: user || compliments: plain 1 || fans: 2 || average_stars: 4 92 || review_count: 21 ||
yelping_since: 2008 03 || votes: funny 11 useful 44 cool 13 || name: Aron || type: user || compliments: profile 1 writer 1 note 2 hot 2 cool 1 more 1 || fans: 0 || average_stars: 3 7 || review_count: 19 ||
yelping_since: 2009 05 || votes: funny 0 useful 8 cool 1 || name: Aron || type: user || fans: 0 || average_stars: 3 83 || review_count: 6 ||
yelping_since: 2015 02 || votes: funny 0 useful 0 cool 0 || name: Aron || type: user || fans: 0 || average_stars: 5 0 || review_count: 2 ||
yelping_since: 2010 11 || votes: funny 9 useful 20 cool 16 || name: Aron || type: user || compliments: writer 1 || fans: 0 || average_stars: 3 71 || review_count: 6 ||
yelping_since: 2009 10 || votes: funny 12 useful 83 cool 12 || name: Aron || type: user || compliments: funny 1 || fans: 0 || average_stars: 3 8 || review_count: 42 ||
yelping_since: 2014 10 || votes: funny 17 useful 57 cool 47 || name: Aron || elite: 2015 || type: user || compliments: funny 3 cute 2 plain 17 writer 9 list 1 note 3 photos 5 hot 3 more 1 cool 15 || fans: 6 || average_stars: 4 39 || review_count: 32 ||
yelping_since: 2012 04 || votes: funny 0 useful 0 cool 0 || name: Aron || type: user || fans: 0 || average_stars: 5 0 || review_count: 2 ||
yelping_since: 2012 11 || votes: funny 3 useful 4 cool 1 || name: Aron || type: user || fans: 0 || average_stars: 4 57 || review_count: 7 ||
yelping_since: 2014 08 || votes: funny 2 useful 6 cool 3 || name: Aron || type: user || fans: 0 || average_stars: 3 5 || review_count: 4 ||
yelping_since: 2010 03 || votes: funny 121 useful 465 cool 157 || name: Aron || elite: 2012 2013 2014 2015 || type: user || compliments: funny 2 plain 12 writer 9 note 2 photos 1 hot 7 more 1 cool 5 || fans: 10 || average_stars: 3 78 || review_count: 233 ||
yelping_since: 2014 08 || votes: funny 3 useful 37 cool 5 || name: Aron || type: user || fans: 1 || average_stars: 3 93 || review_count: 14 ||
yelping_since: 2012 04 || votes: funny 0 useful 5 cool 0 || name: Aron || type: user || fans: 0 || average_stars: 1 0 || review_count: 1 ||
yelping_since: 2011 01 || votes: funny 123 useful 492 cool 213 || name: Aron || elite: 2012 2013 2014 2015 || type: user || compliments: profile 3 cute 3 funny 3 plain 27 writer 13 list 1 note 8 hot 16 more 7 cool 27 || fans: 12 || average_stars: 4 24 || review_count: 378 ||
qqqqqqqqqqqqqqqqq
NOT FOUND qqqqqqqqqqqqqqqqq
exi==9100==
==9100== HEAP SUMMARY:
==9100== in use at exit: 0 bytes in 0 blocks
==9100== total heap usage: 1,212,396 allocs, 1,212,396 frees, 101,900,376 bytes allocated
==9100==
==9100== All heap blocks were freed -- no leaks are possible
==9100==
==9100== For counts of detected and suppressed errors, rerun with: -v
==9100== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)t
$ time echo "exit" | ./balancbst yelp_academic_dataset_user_alternative.csv
Processing line 1000
Processing line 2000
...
Processing line 551000
Processing line 552000
Searching...
real 0m1.391s
user 0m1.279s
sys 0m0.097s