1

I have a text file that contains a list of words in a precise order. I'm trying to write a function that returns an array of words from this file. I managed to retrieve the words in the same order as they appear in the file, like this:

/*
 * readDict - read up to 100 whitespace-separated words from a text file,
 * echo each one to stdout as "index: word", and return the first word.
 *
 * fileName: path of the word list to open for reading.
 *
 * Returns a malloc'd, NUL-terminated copy of the first word in the file;
 * the caller owns it and must free() it.  If the file cannot be opened,
 * holds no word, or memory runs out before the first word is stored, the
 * string literal "NULL" is returned instead (kept so existing callers see
 * the same sentinel); that literal must NOT be passed to free().
 *
 * Words longer than 127 characters are split into several words, because
 * each read is bounded by the 128-byte buffer.
 */
char *readDict(char *fileName) {
    enum { DICT_MAX_WORDS = 100, DICT_WORD_CAP = 128 };

    FILE *pf = fopen(fileName, "r");
    if (pf == NULL) {
        printf("Unable to open the file");
        return "NULL";
    }

    char *first = NULL;

    for (int i = 0; i < DICT_MAX_WORDS; i++) {
        char *word = malloc(DICT_WORD_CAP);
        if (word == NULL) {
            break;
        }

        /* %127s leaves room for the terminator in a DICT_WORD_CAP buffer;
         * anything other than 1 means end of file or a read error. */
        if (fscanf(pf, "%127s", word) != 1) {
            free(word);
            break;
        }

        printf("%d: %s\n", i, word);

        /* Only the first word is handed back to the caller, so every
         * later buffer is released as soon as it has been printed. */
        if (first == NULL) {
            first = word;
        } else {
            free(word);
        }
    }

    fclose(pf);

    return first != NULL ? first : "NULL";
}

My question is: how can I return an array of words picked at random from the text file, rather than in the order in which they appear in the file?

The file looks like this:

exemple1
exemple2
exemple3
exemple4
Paul Bénéteau
  • 775
  • 2
  • 12
  • 36
  • Well, reading them in a random order doesn't make sense. So just randomize the array after you read. See [this link](http://stackoverflow.com/q/6127503/1133626) – jakeehoffmann Apr 04 '17 at 17:53
  • Yeah i thought about that but it makes me read 40k words in the file... – Paul Bénéteau Apr 04 '17 at 17:54
  • If this is the only solution, how can I get the number of words in the file? – Paul Bénéteau Apr 04 '17 at 17:56
  • Count them as you read them in. – jakeehoffmann Apr 04 '17 at 17:57
  • Why do you `return *lines;` having `freed` everything? – Weather Vane Apr 04 '17 at 17:58
  • I was doing it... I removed the `free()`.. – Paul Bénéteau Apr 04 '17 at 18:04
  • I think you should prepare a [Minimal, Complete, and Verifiable example](http://stackoverflow.com/help/mcve) that can use the file's contents *outside of the function* before trying to randomize, since you attempt to return something to do with the array of pointers. That local array you build will go out of life after the function returns. BTW you don't use the filename you pass. – Weather Vane Apr 04 '17 at 18:16
  • There are 239 000 words... – Paul Bénéteau Apr 04 '17 at 18:20
  • So what? Work with the four words in the question first. – Weather Vane Apr 04 '17 at 18:26
  • I'm assuming a lot here. I'm assuming that you want a maximum of 100 words. Also, assuming the 239,000 words are random length, you'll have to do a massive sequential read and parse (like you are doing, since seeking is out of the question), and decide whether or not to keep them as you go. You could create a "toss the dice" function that decides to keep a word or not (using rand), and stop when you reach 100 words chosen. – Code4aliving Apr 04 '17 at 18:27
  • That is what I want... But how? – Paul Bénéteau Apr 04 '17 at 18:32

2 Answers

3

Reservoir sampling allows you to select a fixed number of elements uniformly at random from a stream of indeterminate size, in a single pass. Something like this could work (although untested):

/*
 * reservoir_sample - pick `count` lines at random from a text file in a
 * single pass over the file (reservoir sampling, "Algorithm R").
 *
 * filename: path of the file to read, one candidate per line.
 * count:    number of lines to sample; must be positive.
 *
 * Returns a calloc'd array of `count` pointers.  Each non-NULL entry is a
 * strdup'd line exactly as delivered by fgets (trailing newline included);
 * entries stay NULL when the file has fewer than `count` lines.  The caller
 * must free() every entry and then the array itself.
 * Returns NULL if `count` is not positive, the file cannot be opened, or an
 * allocation fails (nothing is leaked in that case).
 *
 * random() is not seeded here; call srandom() beforehand for a different
 * sample on each run.  Lines longer than LINE_MAX - 1 bytes are seen as
 * several candidates.
 *
 * Needs <limits.h> (LINE_MAX), <stdio.h>, <stdlib.h> and <string.h>;
 * LINE_MAX, random() and strdup() are POSIX rather than ISO C.
 */
char **reservoir_sample(const char *filename, int count) {
    FILE *file;
    char **lines;
    char buf[LINE_MAX];
    long n;

    if (count <= 0) {
        return NULL;
    }

    file = fopen(filename, "r");
    if (file == NULL) {
        return NULL;
    }

    lines = calloc((size_t)count, sizeof *lines);
    if (lines == NULL) {
        fclose(file);
        return NULL;
    }

    for (n = 1; fgets(buf, sizeof buf, file); n++) {
        if (n <= count) {
            /* The first `count` lines fill the reservoir in order. */
            lines[n - 1] = strdup(buf);
            if (lines[n - 1] == NULL) {
                goto fail;
            }
        } else {
            /* Line n replaces a reservoir slot with probability count / n. */
            long i = random() % n;
            if (i < count) {
                /* Duplicate first so the old entry survives a failure. */
                char *copy = strdup(buf);
                if (copy == NULL) {
                    goto fail;
                }
                free(lines[i]);
                lines[i] = copy;
            }
        }
    }
    fclose(file);

    return lines;

fail:
    for (int k = 0; k < count; k++) {
        free(lines[k]);
    }
    free(lines);
    fclose(file);
    return NULL;
}

This is "Algorithm R":

  • Read the first count lines into the sample array.
  • For each subsequent line, replace a random element of the sample array with probability count / n, where n is the line number.
  • At the end, the sample contains a set of random lines. (The order is not uniformly random, but you can fix that with a shuffle.)
ephemient
  • 198,619
  • 38
  • 280
  • 391
1

If each line of the file contains one word, one possibility would be to open the file and count the number of lines first. Then rewind() the file stream and select a random number, sel, in the range of the number of words in the file. Next, call fgets() in a loop to read sel words into a buffer. The last word read can be copied into an array that stores the results. Rewind and repeat for each word desired.

Here is a program that uses the /usr/share/dict/words file that is typical on Linux systems. Note that if the number of lines in the file is greater than RAND_MAX (the largest number that can be returned by rand()), words with greater line numbers will be ignored. This number can be as small as 32767. In the GNU C Library RAND_MAX is 2147483647.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_WORD   100
#define NUM_WORDS  10

/*
 * Print NUM_WORDS lines chosen at random (with replacement) from
 * /usr/share/dict/words.
 *
 * The file is read once to count its lines; then, for each word wanted,
 * the stream is rewound and a randomly chosen number of lines is read,
 * keeping the last one.
 *
 * Lines are read in chunks of at most MAX_WORD - 1 characters, so a longer
 * line is counted (and may be selected) as several pieces.
 *
 * Exits with EXIT_FAILURE if the file cannot be opened, is empty, or cannot
 * be read; returns 0 otherwise.
 */
int main(void)
{
    /* Open words file */
    FILE *fp = fopen("/usr/share/dict/words", "r");

    if (fp == NULL) {
        perror("Unable to locate word list");
        exit(EXIT_FAILURE);
    }

    /* Count words in file */
    char word[MAX_WORD];
    long wc = 0;
    while (fgets(word, sizeof word, fp) != NULL) {
        ++wc;
    }
    if (ferror(fp)) {
        perror("Error reading word list");
        fclose(fp);
        exit(EXIT_FAILURE);
    }
    /* An empty list would make `rand() % wc` a division by zero. */
    if (wc == 0) {
        fprintf(stderr, "Word list is empty\n");
        fclose(fp);
        exit(EXIT_FAILURE);
    }

    /* Store random words in array */
    char randwords[NUM_WORDS][MAX_WORD];
    srand((unsigned) time(NULL));
    for (size_t i = 0; i < NUM_WORDS; i++) {
        rewind(fp);
        long sel = rand() % wc + 1;     /* 1-based line number to keep */
        for (long j = 0; j < sel; j++) {
            if (fgets(word, sizeof word, fp) == NULL) {
                /* Stop rather than copy a stale buffer into the results. */
                if (ferror(fp)) {
                    perror("Error in fgets()");
                } else {
                    fprintf(stderr, "Unexpected end of word list\n");
                }
                fclose(fp);
                exit(EXIT_FAILURE);
            }
        }
        strcpy(randwords[i], word);
    }

    if (fclose(fp) != 0) {
        perror("Unable to close file");
    }

    /* Display results; each entry still carries the newline read by fgets */
    for (size_t i = 0; i < NUM_WORDS; i++) {
        printf("%s", randwords[i]);
    }

    return 0;
}

Program output:

biology's
lists
revamping
slitter
loftiness's
concur
solemnity's
memories
winch's
boosting

If blank lines in input are a concern, the selection loop can test for them and reset to select another word when they occur:

/* Store random words in array */
char randwords[NUM_WORDS][MAX_WORD];
srand((unsigned) time(NULL));
for (size_t i = 0; i < NUM_WORDS; i++) {
    rewind(fp);
    int sel = rand() % wc + 1;
    for (int j = 0; j < sel; j++) {
        if (fgets(word, sizeof word, fp) == NULL) {
            perror("Error in fgets()");
        }
    }
    if (word[0] == '\n') {      // if line is blank
        --i;                    // reset counter
        continue;               // and select another one
    }

    strcpy(randwords[i], word);
}

Note that if a file contains only blank lines, with the above modification the program would loop forever; it may be safer to count the number of blank lines selected in a row and skip until some reasonable threshold is reached. Better yet to verify that at least one line of the input file is not blank during the initial line-count:

/* Count words in file */
char word[MAX_WORD];
long wc = 0;
long nonblanks = 0;
while (fgets(word, sizeof word, fp) != NULL) {
    ++wc;
    if (word[0] != '\n') {
        ++nonblanks;
    }
}
if (nonblanks == 0) {
    fprintf(stderr, "Input file contains only blank lines\n");
    exit(EXIT_FAILURE);
}
ad absurdum
  • 19,498
  • 5
  • 37
  • 60
  • That looks good but does it count the blank lines in the file? Thanks – Paul Bénéteau Apr 05 '17 at 10:00
  • @greenpoisononeTV-- yes, blank lines may be selected. They need to be counted since this method works by knowing the number of lines in the file, but they need not be selected. You can add three lines to the selection loop to accomplish this. I have updated my answer. – ad absurdum Apr 05 '17 at 13:33