2

I am really trying to learn if someone wouldn't mind to educate me in the principles I may be missing out on here. I thought I had everything covered but it seems I am doing something incorrectly.

The following code gives me a segmentation fault, and I cannot figure out why? I am adding the & in front of the arguments name being passed in to fscanf.

int word_size = 0;

#define HASH_SIZE 65536

#define LENGTH = 45

node* global_hash[HASH_SIZE] = {NULL};

typedef struct node {
  char word[LENGTH + 1];
  struct node* next;
} node;

int hash_func(char* hash_val){
    int h = 0;
    for (int i = 0, j = strlen(hash_val); i < j; i++){
        h = (h << 2) ^ hash_val[i];
    }
    return h % HASH_SIZE;
}

bool load(const char *dictionary)
{
    char* string;
    FILE* dic = fopen(dictionary, "r");
    if(dic == NULL){
        fprintf(stdout, "Error: File is NULL.");
        return false;
    }
    while(fscanf(dic, "%ms", &string) != EOF){
        node* new_node = malloc(sizeof(node));
        if(new_node == NULL){
            return false;
        }
        strcpy(new_node->word, string);
        new_node->next = NULL;
        int hash_indx = hash_func(new_node->word);
        node* first = global_hash[hash_indx];
        if(first == NULL){
            global_hash[hash_indx] = new_node;
        } else {
            new_node->next = global_hash[hash_indx];
            global_hash[hash_indx] = new_node;
        }
        word_size++;
        free(new_node);
    }
    fclose(dic);
    return true;
}

dictionary.c:25:16: runtime error: left shift of 2127912344 by 2 places cannot be represented in type 'int'
dictionary.c:71:23: runtime error: index -10167 out of bounds for type 'node *[65536]'
dictionary.c:73:13: runtime error: index -10167 out of bounds for type 'node *[65536]'
dictionary.c:75:30: runtime error: index -22161 out of bounds for type 'node *[65536]'
dictionary.c:76:13: runtime error: index -22161 out of bounds for type 'node *[65536]'

Segmentation fault
rebbailey
  • 714
  • 1
  • 7
  • 16
  • 1
    What line does the debugger tell us the error occurs on? – Ed Heal Feb 24 '18 at 08:48
  • 1
    Post a [mcve].. – user2736738 Feb 24 '18 at 08:49
  • Please enlighten us to the structure of `new_node` – Ed Heal Feb 24 '18 at 08:51
  • regarding: `fprintf(stdout, "Error: File is NULL.");` Error messages should be output to `stderr`, not `stdout` AND the value of `errno` can be used to select a line of text that indicates why the system thinks the error occurred. Strongly suggest calling: `perror( "fopen failed" );` as that function will perform all the desired functionality for you. – user3629249 Feb 24 '18 at 08:56
  • 1
    Are you sure that `new_node->word` is long enough – Ed Heal Feb 24 '18 at 08:59
  • 1
    regarding: `while(fscanf(dic, "%ms", &string) != EOF){` 1) '%ms' is not valid, 2) when using the '%s' input format specifier, always include a MAX CHARACTERS modifier that is one less than the length of the input buffer because '%s" always appends a NUL byte to the input. This avoids any possibility of a buffer overflow. Such overflow is undefined behavior and can lead to a seg fault event. BTW: the posted code is missing the necessary allocation (probably via `malloc()`) of any buffer for the pointer `string` to point to. – user3629249 Feb 24 '18 at 09:01
  • @user3629249 - Check this for `%ms` - http://man7.org/linux/man-pages/man3/scanf.3.html – Support Ukraine Feb 24 '18 at 09:03
  • Note: in C, referencing the name of an array degrades to the address of the first byte of the array,, so don't use a '&' on `string` in the call to `fscinf()` – user3629249 Feb 24 '18 at 09:03
  • @user3629249 `%ms` requires a `char**` so it must be `&string` – Support Ukraine Feb 24 '18 at 09:05
  • every time any call to any of the heap allocation functions is performed, there must be a matching call to `free()`. However, in the posted code there is absolutely no need to every call `malloc()` instead use statements like `char string[ sizeof( node ) ];` and `char new_node[ sizeof node ];` – user3629249 Feb 24 '18 at 09:07
  • regarding a call to `fscanf()` there are other returned values other than `EOF` that indicate an error occurred. In the posted code there is one 'input/format specifier' in the call to `fscanf()` so anything returned other than 1 is indicating of any error – user3629249 Feb 24 '18 at 09:10
  • @user3629249 https://stackoverflow.com/questions/38685724/difference-between-ms-and-s-scanf – Support Ukraine Feb 24 '18 at 09:20
  • 2
    Your error is not in the code shown. Understand that the `'m'` modifier to `%s` is non-portable. It is implementation defined even among Linux based compilers. – David C. Rankin Feb 24 '18 at 09:37
  • 1
    What is happening in `hash_func()`? The error messages seem to suggest that there is bit-shifting code someplace, and the negative array indices shown in error messages suggest that overflow is happening. – ad absurdum Feb 24 '18 at 09:38
  • I have tried many of your suggestions but none have caused a solution. I have updated the code with more information that may help. Thank you all for your input! – rebbailey Feb 24 '18 at 09:45
  • you still have not posted a Minimal, Complete, and Verifiable example! I looked at your question updates. When updating a question, keep the old text and write a EDIT section for the changes. Otherwise many comments will be obsolete and confusing to the reader. – user3629249 Feb 24 '18 at 10:11
  • the hash_func() is not working as you are expecting. Suggest re-designing that function. – user3629249 Feb 24 '18 at 10:12
  • the latest version of the posted code has the definition of the struct 'node' AFTER the first use of the struct. This is an error – user3629249 Feb 24 '18 at 10:19
  • there is no `=` in a `#define`. a #define does NOT end in a semicolon. so this code does not compile – user3629249 Feb 24 '18 at 10:22
  • regarding: `for (int i = 0, j = strlen(hash_val); i < j; i++)` the returned type from `strlen()` is `size_t` which is a `long unsigned`, not an `int` so the statement would be better written as: `for ( size_t i = 0, j = strlen(hash_val); i < j; i++) – user3629249 Feb 24 '18 at 10:25
  • when compiling, always enable the warnings, then fix those warnings. ( for `gcc`, at a minimum use: `-Wall -Wextra -Wconversion -pedantic -std=gnu11` ) – user3629249 Feb 24 '18 at 10:47

3 Answers3

4

Update after OP posted more code

The problem is that your hash_func works with signed integers and that it overflows. Therefore you get a negative return value (or rather undefined behavior).

That is also what these lines tell you:

dictionary.c:25:16: runtime error: left shift of 2127912344 by 2 places cannot be represented in type 'int'

Here it tells you that you have a signed integer overflow

dictionary.c:71:23: runtime error: index -10167 out of bounds for type 'node *[65536]'

Here it tells you that you use a negative index into an array (i.e. global_hash)

Try using unsigned integer instead

unsigned int hash_func(char* hash_val){
    unsigned int h = 0;
    for (int i = 0, j = strlen(hash_val); i < j; i++){
        h = (h << 2) ^ hash_val[i];
    }
    return h % HASH_SIZE;
}

and call it like:

unsigned int hash_indx = hash_func(new_node->word);

Original answer

I'm not sure this is the root cause of all problems but it seems you have some problems with memory allocation.

Each time you call fscanf you get new dynamic memory allocated for string du to %ms. However, you never free that memory so you have a leak.

Further, this looks like a major problem:

        global_hash[hash_indx] = new_node;  // Here you save new_node
    } else {
        new_node->next = global_hash[hash_indx];
        global_hash[hash_indx] = new_node;  // Here you save new_node
    }
    word_size++;
    free(new_node);  // But here you free the memory

So it seems your table holds pointers to memory that have been free'd already.

That is a major problem that may cause seg faults when you use the pointers.

Maybe change this

free(new_node); 

to

free(string);

In general I'll suggest that you avoid %ms and also avoid fscanf. Use char string[LENGTH + 1] and fgets instead.

Support Ukraine
  • 42,271
  • 4
  • 38
  • 63
  • Thank you for you answer, but this did not work for me. I still received the `runtime errors` and also an infinite loop. – rebbailey Feb 24 '18 at 09:07
  • @rebbailey - Are you sure you get it inside the posted function? – Support Ukraine Feb 24 '18 at 09:08
  • @rebbailey Maybe try this change: `(fscanf(dic, "%ms", &string) == 1)` – Support Ukraine Feb 24 '18 at 09:11
  • Yes, positive it's inside. I tried the suggested but it gave the same exact outcome. – rebbailey Feb 24 '18 at 09:14
  • @rebbailey - have you checked how much data you get from `fscanf` ? Like: `if (strlen(string) > LENGTH ) printf("oh dear... this is bad\n")` just after the `fscanf` – Support Ukraine Feb 24 '18 at 09:17
  • I added: `if(strlen(string) > LENGTH){ printf("Oh dear...this is bad\n"); }else{ printf("%lu\n", strlen(string)); }` and got back all numbers. – rebbailey Feb 24 '18 at 09:23
  • @rebbailey ok - but in an infinite loop?? Or... how may numbers did you get? – Support Ukraine Feb 24 '18 at 09:26
  • Ok, I changed the `fcanf` to `fgets`, removed the `free(new_node)` completely and now the program partially works, but I still get the `runtime error`. – rebbailey Feb 24 '18 at 09:35
  • 1
    @rebbailey - I'm not sure what "partially" works means. Maybe the "new" problem is unrelated to the problem described here. If that is the case, I think you should ask a new question where you post the updated code and describes what goes wrong. – Support Ukraine Feb 24 '18 at 09:38
  • The updated answer resulted in the following: `dictionary.c:25:16: runtime error: left shift of 2115501435 by 2 places cannot be represented in type 'int' dictionary.c:71:23: runtime error: index 4294964710 out of bounds for type 'node *[65536]' Segmentation fault` – rebbailey Feb 24 '18 at 10:01
  • @rebbailey That sounds strange - are you sure you change `h` to unsigned? – Support Ukraine Feb 24 '18 at 10:06
  • Changing to `unsigned int` worked! Thank you so much! – rebbailey Feb 24 '18 at 10:13
2

There are multiple issues in the code posted. Here are the major ones:

  • you should use unsigned arithmetic for the hash code computation to ensure that the hash value is positive. The current implementation has undefined behavior as words longer than 15 letters cause an arithmetic overflow, which may produce a negative value and cause the modulo to be negative as well, indexing outside the bounds of global_hash.

  • You free the newly allocated node with free(new_node);. It has been stored into the global_hash array: later dereferencing it for another word with the same hash value will cause undefined behavior. You probably meant to free the parsed word instead with free(string);.

Here are the other issues:

  • you should check the length of the string before copying it to the node structure array with strcpy(new_node->word, string);

  • fscanf(dic, "%ms", &string) is not portable. the m modifier causes fscanf to allocate memory for the word, but it is an extension supported by the glibc that may not be available in other environments. You might want to write a simple function for better portability.

  • the main loop should test for successful conversion with while(fscanf(dic, "%ms", &string) == 1) instead of just end of file with EOF. It may not cause a problem in this specific case, but it is a common cause of undefined behavior for other conversion specifiers.

  • the definition #define HASH_SIZE 65536; has a extra ; which may cause unexpected behavior if HASH_SIZE is used in expressions.

  • the definition #define LENGTH = 45; is incorrect: the code does not compile as posted.

Here is a modified version:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

#define HASH_SIZE 65536
#define LENGTH 45

typedef struct node {
    char word[LENGTH + 1];
    struct node *next;
} node;

int word_size = 0;
node *global_hash[HASH_SIZE];

unsigned hash_func(const char *hash_val) {
    unsigned h = 0;
    for (size_t i = 0, j = strlen(hash_val); i < j; i++) {
        h = ((h << 2) | (h >> 30)) ^ (unsigned char)hash_val[i];
    }
    return h % HASH_SIZE;
}

/* read a word from fp, skipping initial whitespace.
   return the length of the word read or EOF at end of file
   store the word into the destination array, truncating it as needed
*/
int get_word(char *buf, size_t size, FILE *fp) {
    int c;
    size_t i;
    while (isspace(c = getc(fp)))
        continue;
    if (c == EOF)
        return EOF;
    for (i = 0;; i++) {
        if (i < size)
           buf[i] = c;
        c = getc(fp);
        if (c == EOF)
            break;
        if (isspace(c)) {
            ungetc(c, fp);
            break;
        }
    }
    if (i < size)
        buf[i] = '\0';
    else if (size > 0)
        buf[size - 1] = '\0';
    return i;
}

bool load(const char *dictionary) {
    char buf[LENGTH + 1];
    FILE *dic = fopen(dictionary, "r");
    if (dic == NULL) {
        fprintf(stderr, "Error: cannot open dictionary file %s\n", dictionary);
        return false;
    }
    while (get_word(buf, sizeof buf, dic) != EOF) {
        node *new_node = malloc(sizeof(node));
        if (new_node == NULL) {
            fprintf(stderr, "Error: out of memory\n");
            fclose(dic);
            return false;
        }
        unsigned hash_indx = hash_func(buf);
        strcpy(new_node->word, buf);
        new_node->next = global_hash[hash_indx];
        global_hash[hash_indx] = new_node;
        word_size++;
    }
    fclose(dic);
    return true;
}
chqrlie
  • 131,814
  • 10
  • 121
  • 189
0

the following proposed code:

  1. cleanly compiles
  2. still has a major problem with the function: hash_func()
  3. separates the definition of the struct from the typedef for that struct for clarity and flexibility.
  4. properly formats the #define statements
  5. properly handles errors from fopen() and malloc()
  6. properly limits the length of the string read from the 'dictionary' file
  7. assumes that no text from the 'dictionary' file will be greater than 45 bytes.

and now, the proposed code:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

//prototypes
bool load(const char *dictionary);
int hash_func(char* hash_val);


#define HASH_SIZE 65536
#define LENGTH  45


struct node
{
    char word[LENGTH + 1];
    struct node* next;
};
typedef struct node node;


node* global_hash[HASH_SIZE] = {NULL};
int word_size = 0;

int hash_func(char* hash_val)
{
    int h = 0;
    for ( size_t i = 0, j = strlen(hash_val); i < j; i++)
    {
        h = (h << 2) ^ hash_val[i];
    }
    return h % HASH_SIZE;
}


bool load(const char *dictionary)
{
    char string[ LENGTH+1 ];
    FILE* dic = fopen(dictionary, "r");
    if(dic == NULL)
    {
        perror( "fopen failed" );
        //fprintf(stdout, "Error: File is NULL.");
        return false;
    }

    while( fscanf( dic, "%45s", string) == 1 )
    {
        node* new_node = malloc(sizeof(node));
        if(new_node == NULL)
        {
            perror( "malloc failed" );
            return false;
        }

        strcpy(new_node->word, string);
        new_node->next = NULL;

        int hash_indx = hash_func(new_node->word);

        // following statement for debug:
        printf( "index returned from hash_func(): %d\n", hash_indx );

        if( !global_hash[hash_indx] )
        {
            global_hash[hash_indx] = new_node;
        }

        else
        {
            new_node->next = global_hash[hash_indx];
            global_hash[hash_indx] = new_node;
        }

        word_size++;
    }
    fclose(dic);
    return true;
}
user3629249
  • 16,402
  • 1
  • 16
  • 17