-2

I wrote this code to return the top n most frequent words in a .txt file:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_WORD_LENGTH 100
#define MAX_LINE_LENGTH 1000

struct WordFrequency {
    char word[MAX_WORD_LENGTH];
    int frequency;
};

char** find_frequent_words(const char* path, int32_t n) {
    FILE* file = fopen(path, "r");
    if (file == NULL) {
        fprintf(stderr, "Failed to open file: %s\n", path);
        return NULL;
    }

    // Create a hash table to count word frequencies
    struct WordFrequency* wordFrequencies = NULL;
    int uniqueWordCount = 0;
    int maxWordCount = 10000;

    char line[MAX_LINE_LENGTH];
    while (fgets(line, sizeof(line), file) != NULL) {
        char* word = strtok(line, " \t\n");
        while (word != NULL) {
            int existingIndex = -1;
            for (int i = 0; i < uniqueWordCount; i++) {
                if (strcmp(wordFrequencies[i].word, word) == 0) {
                    existingIndex = i;
                    break;
                }
            }

            if (existingIndex >= 0) {
                wordFrequencies[existingIndex].frequency++;
            } else {
                if (uniqueWordCount == maxWordCount) {
                    // Increase the size of the word frequencies array
                    maxWordCount *= 2;
                    struct WordFrequency* newWordFrequencies = realloc(
                        wordFrequencies, maxWordCount * sizeof(struct WordFrequency)
                    );
                    if (newWordFrequencies == NULL) {
                        fprintf(stderr, "Memory allocation failed.\n");
                        fclose(file);
                        free(wordFrequencies);
                        return NULL;
                    }
                    wordFrequencies = newWordFrequencies;
                }
                strncpy(wordFrequencies[uniqueWordCount].word, word, sizeof(wordFrequencies[uniqueWordCount].word) - 1);
                wordFrequencies[uniqueWordCount].word[sizeof(wordFrequencies[uniqueWordCount].word) - 1] = '\0';
                wordFrequencies[uniqueWordCount].frequency = 1;
                uniqueWordCount++;
            }

            word = strtok(NULL, " \t\n");
        }
    }

    fclose(file);

    // Sort word frequencies in descending order
    for (int i = 0; i < uniqueWordCount - 1; i++) {
        for (int j = 0; j < uniqueWordCount - i - 1; j++) {
            if (wordFrequencies[j].frequency < wordFrequencies[j + 1].frequency) {
                struct WordFrequency temp = wordFrequencies[j];
                wordFrequencies[j] = wordFrequencies[j + 1];
                wordFrequencies[j + 1] = temp;
            }
        }
    }

    // Create the result array with the most frequent words
    int resultCount = (n < uniqueWordCount) ? n : uniqueWordCount;
    char** frequentWords = malloc((resultCount + 1) * sizeof(char*));
    if (frequentWords == NULL) {
        fprintf(stderr, "Memory allocation failed.\n");
        free(wordFrequencies);
        return NULL;
    }

    for (int i = 0; i < resultCount; i++) {
        frequentWords[i] = strdup(wordFrequencies[i].word);
        if (frequentWords[i] == NULL) {
            fprintf(stderr, "Memory allocation failed.\n");
            for (int j = 0; j < i; j++) {
                free(frequentWords[j]);
            }
            free(frequentWords);
            free(wordFrequencies);
            return NULL;
        }
    }
    frequentWords[resultCount] = NULL;

    free(wordFrequencies);

    return frequentWords;
}

/*
 * Usage: frequent_words <file_path> <n>
 * Prints the n most frequent words of the given text file, one per line.
 * Exits with status 1 on bad arguments or processing failure.
 */
int main(int argc, char* argv[]) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <file_path> <n>\n", argv[0]);
        return 1;
    }

    const char* inputPath = argv[1];
    int32_t topCount = atoi(argv[2]);
    if (topCount <= 0) {
        fprintf(stderr, "Invalid value for n: %s\n", argv[2]);
        return 1;
    }

    char** words = find_frequent_words(inputPath, topCount);
    if (words == NULL) {
        return 1;
    }

    printf("The %d most frequent words:\n", topCount);

    /* Print every entry, then release it; the array is NULL-terminated. */
    size_t count = 0;
    while (words[count] != NULL) {
        printf("%s\n", words[count]);
        count++;
    }
    for (size_t i = 0; i < count; i++) {
        free(words[i]);
    }
    free(words);

    return 0;
}

And this is how I'm compiling and running it:

gcc -o frequent_words frequent_words.c
./frequent_words tiny_shakespeare.txt 5

But this is the error I'm getting:

Segmentation fault (core dumped)

I don't know what line is causing this. Where am I going wrong?

273K
  • 29,503
  • 10
  • 41
  • 64
Onur-Andros Ozbek
  • 2,998
  • 2
  • 29
  • 78
  • 1
    Where does the debugger say the problem is? – Raymond Chen May 25 '23 at 04:11
  • Don't tag with C++ the C code unless you accept C++ solutions. – 273K May 25 '23 at 04:13
  • 1
    At first call of `strncpy(wordFrequencies[uniqueWordCount].word...`, `wordFrequencies` appears to be `NULL`. – chux - Reinstate Monica May 25 '23 at 04:14
  • Unrelated but you are missing a stdint.h include – Allan Wind May 25 '23 at 05:09
  • Consider breaking find_frequent_words() into smaller functions. It currently does a lot: opens and reads a file, tokenizes the content, builds a table, sort the table and extract the top n results. – Allan Wind May 25 '23 at 05:35
  • Maybe OT: beware of `strncpy`, it does not what you think it does. – Jabberwocky May 25 '23 at 06:01
  • 2
    @273K [What is a debugger and how can it help me diagnose problems?](https://stackoverflow.com/questions/25385173/what-is-a-debugger-and-how-can-it-help-me-diagnose-problems) is _not_ a duplicate of this question. The OP did not ask how to use a debugger. If you feel that you think they should have made a better job at narrowing down the problem or posting a minimal example, then vote to close -> needs debugging details. _Not_ close as duplicate. I will switch the link to something that's actually relevant for the OP's problem. – Lundin May 25 '23 at 07:47

1 Answer

2

It segfaults on:

 strncpy(wordFrequencies[uniqueWordCount].word, word, sizeof(wordFrequencies[uniqueWordCount].word) - 1);

as wordFrequencies == NULL because you only allocate space in the array when uniqueWordCount == maxWordCount. I suggest you initialize maxWordCount to the correct capacity:

#define INITIAL_WORD_COUNT 10000
// ...
int maxWordCount = 0;
// ...
                if (uniqueWordCount == maxWordCount) {
                    // Increase the size of the word frequencies array
                    maxWordCount = maxWordCount ? 2 * maxWordCount : INITIAL_WORD_COUNT;

Consider eliminating the batch logic and just grow it one entry at a time. If it's a proven performance issue then reintroduce the batch logic.

Allan Wind
  • 23,068
  • 5
  • 28
  • 38