2

So this is my second time adapting my code to fscanf to get what I want. I threw some comments next to the output. The main issue I am having is that the one null character or space is getting added into the array. I have tried to check for the null char and the space in the string variable and it does not catch it. I am a little stuck and would like to know why my code is letting that one null character through?

Part where it is slipping up "Pardon, O King," output:King -- 1; -- 1 so here it parses king a word and then ," goes through the strip function and becomes \0, then my check later down the road allows it through??

Input: a short story containing apostrophes and commas (the lion's rock. First, the lion woke up)

//Output: Every unique word that shows up with how many times it shows up.
//Lion -- 1
//s - 12
//lion -- 8
//tree -- 2
//-- 1   //this is the line that prints a null char?
//cub -- //3 it is not a space! I even check if it is \0 before entering
         //it into the array. Any ideas (this is my 2nd time)?
         //trying to rewrite my code around a fscanf function.


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>

//Remove non-alpha numeric characters
void strip_word(char* string)
{
    char* string_two = calloc(80, sizeof(char));
    int i;
    int c = 0;
    for(i = 0; i < strlen(string); i++)
    {
        if(isalnum(string[i]))
        {
            string_two[c] = string[i];
            ++c;
        }
    }
    string_two[i] = '\0';
    strcpy(string, string_two);
    free(string_two);
}

//Parse through file
void file_parse(FILE* text_file, char*** word_array, int** count_array, int* total_count, int* unique_count)
{
    int mem_Size = 8;
    int is_unique = 1;

    char** words = calloc(mem_Size, sizeof(char *)); //Dynamically allocate array of size 8 of char*
    if (words == NULL)
    {
        fprintf(stderr, "ERROR: calloc() failed!");
    }

    int* counts = calloc(mem_Size, sizeof(int)); //Dynamically allocate array of size 8 of int
    if (counts == NULL)
    {
        fprintf(stderr, "ERROR: calloc() failed!");
    }

    printf("Allocated initial parallel arrays of size 8.\n");
    fflush(stdout);

    char* string;

    while('A')
    {
        is_unique = 1;

        fscanf(text_file, " ,");
        fscanf(text_file, " '");

        while(fscanf(text_file, "%m[^,' \n]", &string) == 1) //%m length modifier 
        {
            is_unique = 1;
            strip_word(string);
            if(string == '\0') continue; //if the string is empty move to next iteration
            else
            {
                int i = 0;              
                ++(*total_count);
                for(i = 0; i < (*unique_count); i++)
                {
                    if(strcmp(string, words[i]) == 0)
                    {
                        counts[i]++;
                        is_unique = 0;
                        break;
                    }
                }
                if(is_unique)
                {
                    ++(*unique_count);
                    if((*unique_count) >= mem_Size)
                    {
                        mem_Size = mem_Size*2;
                        words = realloc(words, mem_Size * sizeof(char *));
                        counts = realloc(counts, mem_Size * sizeof(int));
                        if(words == NULL || counts == NULL)
                        {
                            fprintf(stderr, "ERROR: realloc() failed!");
                        }
                        printf("Re-allocated parallel arrays to be size %d.\n", mem_Size);
                        fflush(stdout);
                    }
                    words[(*unique_count)-1] = calloc(strlen(string) + 1, sizeof(char));
                    strcpy(words[(*unique_count)-1], string);
                    counts[(*unique_count) - 1] = 1;
                }
            }
            free(string);
        }
        if(feof(text_file)) break;
    }
    printf("All done (successfully read %d words; %d unique words).\n", *total_count, *unique_count);
    fflush(stdout);
    *word_array = words;
    *count_array = counts;

}

int main(int argc, char* argv[])
{
    if(argc < 2 || argc > 3) //Checks if too little or too many args
    {
        fprintf(stderr, "ERROR: Invalid Arguements\n");
        return EXIT_FAILURE;
    }

    FILE * text_file = fopen(argv[1], "r");
    if (text_file == NULL)
    {
        fprintf(stderr, "ERROR: Can't open file");

    }

    int total_count = 0;
    int unique_count = 0;
    char** word_array;
    int* count_array;

    file_parse(text_file, &word_array, &count_array, &total_count, &unique_count);

    fclose(text_file);

    int i;

    if(argv[2] == NULL)
    {
        printf("All words (and corresponding counts) are:\n");
        fflush(stdout);
        for(i = 0; i < unique_count; i++)
        {
            printf("%s -- %d\n", word_array[i], count_array[i]);
            fflush(stdout);
        }
    }

    else
    {
        printf("First %d words (and corresponding counts) are:\n", atoi(argv[2]));
        fflush(stdout);
        for(i = 0; i < atoi(argv[2]); i++)
        {
            printf("%s -- %d\n", word_array[i], count_array[i]);
            fflush(stdout);
        }
    }

    for(i = 0; i < unique_count; i++)
    {
        free(word_array[i]);
    }
    free(word_array);
    free(count_array);

    return EXIT_SUCCESS;
}
train55255
  • 49
  • 6
  • Why not simply use `char string_two[80];` in the function `strip_word()` and avoid `calloc()` and `free()`? There's really no need to worry about 80 bytes on the stack on any except the very smallest of embedded systems, where you wouldn't be doing this analysis anyway. Also, in principle, the loop `for(i = 0; i < strlen(string); i++)` is a bad idea because it is quadratic in the length of the string rather than linear. An optimizer might optimize it for you, but it is not wise to rely on that. – Jonathan Leffler Jan 31 '17 at 22:49
  • You never allocated any memory for `string`. And `if(string == '\0')` is wrong, since `string` is a pointer, not a character -- I guess you meant `if (*string == '\0')` – Barmar Jan 31 '17 at 22:50
  • 1
    @Barmar: in `fscanf(text_file, "%m[^,' \n]", &string)` the `m` means 'allocate memory' (a POSIX extension in [`fscanf()`](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fscanf.html) over standard C). – Jonathan Leffler Jan 31 '17 at 22:52
  • I just changed it and it still prints " -- 1" and Jonathan is right %m allocates so I dont have to – train55255 Jan 31 '17 at 22:53
  • @JonathanLeffler It's a POSIX extension I wasn't familiar with. http://stackoverflow.com/questions/11926883/scanf-dynamic-allocation – Barmar Jan 31 '17 at 22:53
  • Your comments about the words shown discuss 'tree' and 'cub', but your putative sample sentences are "the lion's rock. First, the lion woke up". How are we to know what to input to reproduce your output? – Jonathan Leffler Jan 31 '17 at 22:57
  • FYI: You have `if(feof(text_file)) break;` in a position where it is more or less valid, except that the condition won't be true because you won't have entered the loop if you've reached EOF. (See [`while (!feof(file))` is always wrong](http://stackoverflow.com/questions/5431941/while-feof-file-is-always-wrong) for a discussion of the normal pitfalls of using `feof()` — but you managed to avoid the worst problem.) – Jonathan Leffler Jan 31 '17 at 22:59
  • @JonathanLeffler that did not fix it, but now I have � -- 1 instead of a blank spot. (changing the strip_word function) Here is the part where it is messing up. Input : "Pardon, O King," and the Output: King -- 1; -- 1 so that ," is messing it up, but strip word should just return a null I am not sure why it is add the null to the array – train55255 Jan 31 '17 at 22:59
  • I take it back. Because you have a `while ('A')` loop, the `if (feof(text_file)) break;` is necessary to terminate the otherwise infinite loop. However, I can't help but feel that you could improve the loop controls so that the infinite loop was not there. – Jonathan Leffler Jan 31 '17 at 23:12
  • I probably could, but it was the easiest thing for me conceptually to have. Also I added to the post the input where it is messing up. I am baffled to see that its not working. Cause I did my best to handle all of these 'special' cases. and thanks for the extra tips! – train55255 Jan 31 '17 at 23:14
  • Thanks! I got it. I needed fscaf(text_file, " "); – train55255 Jan 31 '17 at 23:23

1 Answers1

1

I'm not sure quite what's going wrong with your code. I'm working on macOS Sierra 10.12.3 with GCC 6.3.0, and the local fscanf() does not support the m modifier. Consequently, I modified the code to use a fixed size string of 80 bytes. When I do that (and only that), your program runs without obvious problem (certainly on the input "the lion's rock. First, the lion woke up").

I also think that the while ('A') loop (which should be written conventionally while (1) if it is used at all) is undesirable. I wrote a function read_word() which gets the next 'word', including skipping blanks, commas and quotes, and use that to control the loop. I left your memory allocation in file_parse() unchanged. I did get rid of the memory allocation in strip_word() (eventually — it worked OK as written too).

That left me with:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>

static void strip_word(char *string)
{
    char string_two[80];
    int i;
    int c = 0;
    int len = strlen(string);
    for (i = 0; i < len; i++)
    {
        if (isalnum(string[i]))
            string_two[c++] = string[i];
    }
    string_two[c] = '\0';
    strcpy(string, string_two);
}

static int read_word(FILE *fp, char *string)
{
    if (fscanf(fp, " ,") == EOF ||
        fscanf(fp, " '") == EOF ||
        fscanf(fp, "%79[^,' \n]", string) != 1)
        return EOF;
    return 0;
}

static void file_parse(FILE *text_file, char ***word_array, int **count_array, int *total_count, int *unique_count)
{
    int mem_Size = 8;

    char **words = calloc(mem_Size, sizeof(char *));
    if (words == NULL)
    {
        fprintf(stderr, "ERROR: calloc() failed!");
    }

    int *counts = calloc(mem_Size, sizeof(int));
    if (counts == NULL)
    {
        fprintf(stderr, "ERROR: calloc() failed!");
    }

    printf("Allocated initial parallel arrays of size 8.\n");
    fflush(stdout);

    char string[80];

    while (read_word(text_file, string) != EOF)
    {
        int is_unique = 1;
        printf("Got [%s]\n", string);
        strip_word(string);
        if (string[0] == '\0')
            continue;
        else
        {
            int i = 0;
            ++(*total_count);
            for (i = 0; i < (*unique_count); i++)
            {
                if (strcmp(string, words[i]) == 0)
                {
                    counts[i]++;
                    is_unique = 0;
                    break;
                }
            }
            if (is_unique)
            {
                ++(*unique_count);
                if ((*unique_count) >= mem_Size)
                {
                    mem_Size = mem_Size * 2;
                    words = realloc(words, mem_Size * sizeof(char *));
                    counts = realloc(counts, mem_Size * sizeof(int));
                    if (words == NULL || counts == NULL)
                    {
                        fprintf(stderr, "ERROR: realloc() failed!");
                        exit(EXIT_FAILURE);
                    }
                    printf("Re-allocated parallel arrays to be size %d.\n", mem_Size);
                    fflush(stdout);
                }
                words[(*unique_count) - 1] = calloc(strlen(string) + 1, sizeof(char));
                strcpy(words[(*unique_count) - 1], string);
                counts[(*unique_count) - 1] = 1;
            }
        }
    }
    printf("All done (successfully read %d words; %d unique words).\n", *total_count, *unique_count);
    fflush(stdout);
    *word_array = words;
    *count_array = counts;
}

int main(int argc, char *argv[])
{
    if (argc < 2 || argc > 3)
    {
        fprintf(stderr, "ERROR: Invalid Arguements\n");
        return EXIT_FAILURE;
    }

    FILE *text_file = fopen(argv[1], "r");
    if (text_file == NULL)
    {
        fprintf(stderr, "ERROR: Can't open file");
        return EXIT_FAILURE;
    }

    int total_count = 0;
    int unique_count = 0;
    char **word_array = 0;
    int *count_array = 0;

    file_parse(text_file, &word_array, &count_array, &total_count, &unique_count);

    fclose(text_file);

    if (argv[2] == NULL)
    {
        printf("All words (and corresponding counts) are:\n");
        fflush(stdout);
        for (int i = 0; i < unique_count; i++)
        {
            printf("%s -- %d\n", word_array[i], count_array[i]);
            fflush(stdout);
        }
    }
    else
    {
        printf("First %d words (and corresponding counts) are:\n", atoi(argv[2]));
        fflush(stdout);
        for (int i = 0; i < atoi(argv[2]); i++)
        {
            printf("%s -- %d\n", word_array[i], count_array[i]);
            fflush(stdout);
        }
    }

    for (int i = 0; i < unique_count; i++)
        free(word_array[i]);
    free(word_array);
    free(count_array);

    return EXIT_SUCCESS;
}

When run on the data file:

the lion's rock. First, the lion woke up

the output was:

Allocated initial parallel arrays of size 8.
Got [the]
Got [lion]
Got [s]
Got [rock.]
Got [First]
Got [the]
Got [lion]
Got [woke]
Got [up]
All done (successfully read 9 words; 7 unique words).
All words (and corresponding counts) are:
the -- 2
lion -- 2
s -- 1
rock -- 1
First -- 1
woke -- 1
up -- 1

When the code was run on your text, including double quotes, like this:

$ echo '"Pardon, O King,"' | cw37 /dev/stdin
Allocated initial parallel arrays of size 8.
Got ["Pardon]
Got [O]
Got [King]
Got ["]
All done (successfully read 3 words; 3 unique words).
All words (and corresponding counts) are:
Pardon -- 1
O -- 1
King -- 1
$

It took a little finnagling of the code. If there isn't an alphabetic character, your code still counts it (because of subtle problems in strip_word()). That would need to be handled by checking strip_word() more carefully; you test if (string == '\0') which checks (belatedly) whether memory was allocated where you need if (string[0] == '\0') to test whether the string is empty.

Note that the code in read_word() would be confused into reporting EOF if there were two commas in a row, or an apostrophe followed by a comma (though it handles a comma followed by an apostrophe OK). Fixing that is fiddlier; you'd probably be better off using a loop with getc() to read a string of characters. You could even use that loop to strip non-alphabetic characters without needing a separate strip_word() function.

I am assuming you've not yet covered structures yet. If you had covered structures, you'd use an array of a structure such as struct Word { char *word; int count; }; and allocate the memory once, rather than needing two parallel arrays.

Jonathan Leffler
  • 730,956
  • 141
  • 904
  • 1,278