0

My program emulates the behavior of the grep command. That is, when executing ./main grep <pattern> <file.txt>, it will search line by line using a buffer and write the line number where the match was found, as well as the content of the line. The program works correctly when executed without threads, but when executed with threads, it writes the matches found in an infinite loop and the thread counter grows infinitely.

You only need to compile the program with gcc -o main main.c and execute it with ./main grep <pattern> <file.txt>, where the pattern is the word or part of it that you want to search for and see in which line number and line it appears, and the file can be any .txt file that contains information, such as sample file.

#include "regex.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUFFER_SIZE 100
#define NUM_THREADS 5

typedef struct {
  char *pattern;
  char *filename;
} Parameter_Handler;

typedef struct {
  char buffer[BUFFER_SIZE];
  int current_offset;
  FILE *file;
  int num_line;
  int char_count;
  regex_t regex;
  Parameter_Handler ph;
  pthread_mutex_t mutex;
  int stop;
} File_Handler;

typedef struct {
  int id;
  File_Handler *fh;
} Thread_manager;

void init_parameter_handler(Parameter_Handler *ph, char *args[], int n) {
  if (n < 4) {
    printf("Parameters number is inconsistent");
    exit(1);
  }

  if (strcmp(args[1], "grep")) {
    printf("Invalid command %s", args[1]);
    exit(1);
  }

  ph->pattern = args[2];
  ph->filename = args[3];
}

void init_file_handler(File_Handler *fh, int argc, char *argv[]) {
  fh->current_offset = fh->num_line = fh->char_count = fh->stop = 0;

  fh->ph = (Parameter_Handler){"", ""};

  init_parameter_handler(&fh->ph, argv, argc);

  pthread_mutex_init(&fh->mutex, NULL);

  fh->file = fopen(fh->ph.filename, "r");

  if (fh->file == NULL) {
    printf("Could not open the file\n");
    exit(1);
  }
}

void check_match(File_Handler *fh) {
  int ret = regcomp(&fh->regex, fh->ph.pattern, REG_EXTENDED);

  if (ret != 0) {
    printf("Failed to compile regex.\n");
    exit(1);
  }
  char *token = strtok(fh->buffer, "\n");

  while (token != NULL) {
    fh->num_line++;

    ret = regexec(&fh->regex, token, 0, NULL, 0);

    if (ret == 0)
      printf("[%d] %s\n", fh->num_line, token);

    token = strtok(NULL, "\n");
  }

  regfree(&fh->regex);
}

void restore_offset(File_Handler *fh, int offset) {
  do {
    offset--;
  } while (fh->buffer[offset - 1] != '\n');

  fh->current_offset += offset;
  strrchr(fh->buffer, '\n')[1] = '\0';

  fseek(fh->file, fh->current_offset, SEEK_SET);
}

int read_fragment(File_Handler *fh) {
  memset(fh->buffer, 0, sizeof(fh->buffer));

  int offset = fread(fh->buffer, sizeof(char), BUFFER_SIZE, fh->file);

  if (offset == BUFFER_SIZE) {
    if (fh->buffer[offset - 1] != '\n') {
      restore_offset(fh, offset);
    } else
      fh->current_offset += offset;
    check_match(fh);
    return 0;
  } else {
    check_match(fh);
    return 1;
  }
}

void *func(void *arg) {
  File_Handler *fh = (File_Handler *)arg;

  int status;

  while (1) {
    pthread_mutex_lock(&fh->mutex);

    status = read_fragment(fh);

    pthread_mutex_unlock(&fh->mutex);

    if (status) {
      break;
    }
  }
}

int main(int argc, char *argv[]) {
  File_Handler fh;

  init_file_handler(&fh, argc, argv);

  pthread_t thread[NUM_THREADS];

  int t_id[NUM_THREADS];

  for (int i = 0; i < NUM_THREADS; i++) {
    pthread_create(&thread[i], NULL, func, (void *)&fh);
  }

  for (int i = 0; i < NUM_THREADS; i++)
    pthread_join(thread[i], NULL);

  pthread_mutex_destroy(&fh.mutex);

  return 0;
}

I want the threads to access the File_Handler, process a part of the file each, and when the file is finished, all of them together generate and show the results.

  • You only need to compile the program with gcc -o main main.c and execute it with ./main grep , where the pattern is the word or part of it that you want to search for and see in which line number and line it appears, and the file can be any .txt file that contains information, such as https://www.gutenberg.org/cache/epub/57303/pg57303.txt. – Fabián Vargas Apr 22 '23 at 15:37
  • 1
    If you have an infinite loop, that makes it easy to debug: just attach the debugger once it's no longer making progress and step through. – Useless Apr 22 '23 at 15:46
  • Practically the first thing you should have tried is setting `NUM_THREADS` to 1 so it's effectively single-threaded, and see if that has the same problem – Useless Apr 22 '23 at 15:47
  • 2
    What do you mean when you say it works without threads? You change `NUM_THREADS` to 1? Or you rewrite `main` to not use threads at all and call `func` directly? – Kevin Apr 22 '23 at 15:48
  • 2
    This multi-threading arrangement does absolutely nothing, as the entire duration of the thread function is spend with the mutex locked. More importantly, "The program works correctly when executed without threads" is a lie, [demo](https://godbolt.org/z/oYjMM8vKq) – n. m. could be an AI Apr 22 '23 at 15:50
  • FWIW it looks to me like the `restore_offset` system will lock you into an infinite loop if the last line is longer than `BUFFER_SIZE` and does not end with a newline. But you should take this opportunity to practice debugging anyway – Useless Apr 22 '23 at 15:50
  • I rewrote main to not use threads at all. When I add threads, it loses the expected behavior – Fabián Vargas Apr 22 '23 at 15:52
  • Simply put, the specifications call for it. – Fabián Vargas Apr 22 '23 at 15:54
  • 1
    There are ways to run grep in parallel, but this won't do it because your mutex prevents it. What you have now is just a very expensive single-threaded implementation. You've been given two debugging approaches to start off with, and I've pointed out one potential bug. Have you tried using the debugger at all? – Useless Apr 22 '23 at 15:58
  • "Specifications" normally describe user-visible behaviour. When a specification requires you to use a specific internal mechanism, such as threads, it is usually called "assignment". Trying to use your required mechanism in a trivial way (e.g. such that only one thread ever runs at any given time) may or may not get your marks lowered. If I were your professor, I would give it an F (but I would define this rule beforehand, so no surprises). – n. m. could be an AI Apr 22 '23 at 22:08

1 Answers1

0

A number of issues ...

  1. You have only one instance of File_Handler instead of one for each thread.
  2. You try to mask this with mutexes instead of using an array of File_Handler structs. With the array, no mutex is needed.
  3. You use strtok which is not thread-safe instead of strtok_r
  4. Your use of fread in read_fragment is semi-broken. Better to use fgets [initially]. With fgets, no need to use strtok. More on this below.
  5. Using the mutex, the performance is no better (and may be worse) than a single thread.
  6. You call regcomp repeatedly instead of just once per thread.
  7. Because of the way File_Handler is set up, all threads read all lines. This is duplicated effort.
  8. You attempt to mitigate this with read_fragment and restore_offset. [I think] you're trying to get each thread to scan a unique segment of the file. The concept is good but the implementation is incorrect. Again, see below.

Here is the [first pass of] the corrected code. It is annotated with the bugs and fixes:

#include "regex.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUFFER_SIZE 100

#ifndef NUM_THREADS
#define NUM_THREADS 5
#endif

typedef struct {
    char *pattern;
    char *filename;
} Parameter_Handler;

typedef struct {
    char buffer[BUFFER_SIZE];
    int current_offset;
    FILE *file;
    int num_line;
    int char_count;
    regex_t regex;
    Parameter_Handler ph;
    pthread_mutex_t mutex;
    int stop;
#if 1
    int match_count;
#endif
} File_Handler;

typedef struct {
    int id;
    File_Handler *fh;
} Thread_manager;

#if DEBUG
#define dbgprt(_fmt...) \
    printf(_fmt)
#else
#define dbgprt(_fmt...) \
    do { } while (0)
#endif

void
init_parameter_handler(Parameter_Handler *ph, char *args[], int n)
{
    if (n < 4) {
        printf("Parameters number is inconsistent");
        exit(1);
    }

    if (strcmp(args[1], "grep")) {
        printf("Invalid command %s", args[1]);
        exit(1);
    }

    ph->pattern = args[2];
    ph->filename = args[3];
}

void
init_file_handler(File_Handler *fh, int argc, char *argv[])
{
    fh->current_offset = fh->num_line = fh->char_count = fh->stop = 0;

#if 0
// NOTE/BUG: this is extraneous -- all values replaced by init_parameter_handler
    fh->ph = (Parameter_Handler) {
    "", ""};
#endif

    init_parameter_handler(&fh->ph, argv, argc);

#if 0
// NOTE/BUG: not needed with array of File_Handler
    pthread_mutex_init(&fh->mutex, NULL);
#endif

    fh->file = fopen(fh->ph.filename, "r");
    if (fh->file == NULL) {
        printf("Could not open the file\n");
        exit(1);
    }

#if 1
// NOTE/FIX -- this only needs to be done once per thread
    int ret = regcomp(&fh->regex, fh->ph.pattern, REG_EXTENDED);
    if (ret != 0) {
        printf("Failed to compile regex.\n");
        exit(1);
    }
#endif
}

void
check_match(File_Handler *fh)
{
#if 0
// NOTE/BUG: this is expensive -- it should be moved to init_file_handler
// function
    int ret = regcomp(&fh->regex, fh->ph.pattern, REG_EXTENDED);
    if (ret != 0) {
        printf("Failed to compile regex.\n");
        exit(1);
    }
#endif

#if 0
// NOTE/BUG: strtok is _not_ thread-safe -- use strtok_r
    char *token = strtok(fh->buffer, "\n");
#else
    char *save;
    char *token = strtok_r(fh->buffer, "\n", &save);
#endif

    while (token != NULL) {
        fh->num_line++;

        int ret = regexec(&fh->regex, token, 0, NULL, 0);
        if (ret == 0) {
            printf("[%d] %s\n", fh->num_line, token);
            fh->match_count += 1;
        }

#if 0
        token = strtok(NULL, "\n");
#else
        token = strtok_r(NULL, "\n", &save);
#endif
    }

#if 0
    regfree(&fh->regex);
#endif
}

void
restore_offset(File_Handler *fh, int offset)
{
    do {
        offset--;
    } while (fh->buffer[offset - 1] != '\n');

    fh->current_offset += offset;
    strrchr(fh->buffer, '\n')[1] = '\0';

    fseek(fh->file, fh->current_offset, SEEK_SET);
}

int
read_fragment(File_Handler *fh)
{
    memset(fh->buffer, 0, sizeof(fh->buffer));

    int offset = fread(fh->buffer, sizeof(char), BUFFER_SIZE, fh->file);

    if (offset == BUFFER_SIZE) {
        if (fh->buffer[offset - 1] != '\n') {
            restore_offset(fh, offset);
        }
        else
            fh->current_offset += offset;
        check_match(fh);
        return 0;
    }
    else {
        check_match(fh);
        return 1;
    }
}

int
read_line(File_Handler *fh)
{
    int eof = 0;

    do {
        char *cp = fgets(fh->buffer, BUFFER_SIZE, fh->file);

        eof = (cp == NULL);
        if (eof)
            break;

        cp = strchr(fh->buffer,'\n');
        if (cp != NULL)
            *cp = 0;

        check_match(fh);
    } while (0);

    return eof;
}

void *
func(void *arg)
{
#if 0
    File_Handler *fh = (File_Handler *) arg;
#else
    File_Handler *fh = arg;
#endif

    int status;

    while (1) {
#if 0
// NOTE/BUG: doing this just masks the problem that only one File_Handler is
// used by all threads
        pthread_mutex_lock(&fh->mutex);
#endif

#if 0
// NOTE/BUG: read_fragment hangs
        status = read_fragment(fh);
#else
        status = read_line(fh);
#endif

#if 0
        pthread_mutex_unlock(&fh->mutex);
#endif

        if (status) {
            break;
        }
    }

#if 1
    return (void *) 0;
#endif
}

int
main(int argc, char *argv[])
{
#if 0
// NOTE/BUG: we need one of these for each thread
    File_Handler fh;
    init_file_handler(&fh, argc, argv);
#else
    File_Handler *fh;
    File_Handler fharray[NUM_THREADS];
#endif

    pthread_t thread[NUM_THREADS];

#if 0
// NOTE/BUG: this is unused
    int t_id[NUM_THREADS];
#endif

    for (int i = 0; i < NUM_THREADS; i++) {
#if 0
        pthread_create(&thread[i], NULL, func, (void *) &fh);
#else
        fh = &fharray[i];
        init_file_handler(fh, argc, argv);
        pthread_create(&thread[i], NULL, func, fh);
#endif
    }

    for (int i = 0; i < NUM_THREADS; i++)
        pthread_join(thread[i], NULL);

    for (int i = 0; i < NUM_THREADS; i++) {
        fh = &fharray[i];
        regfree(&fh->regex);
        printf("%d: %d matches\n",i,fh->match_count);
    }

#if 0
    pthread_mutex_destroy(&fh.mutex);
#endif

    return 0;
}

In the code above, I've used cpp conditionals to denote old vs. new code:

#if 0
// old code
#else
// new code
#endif

#if 1
// new code
#endif

Note: this can be cleaned up by running the file through unifdef -k


The idea of doing fread and then backing up to find the newline boundary is a reasonable idea.

But, it is much better to use open once. Then, use mmap to map the entire file.

For some background, see my answers:

  1. read line by line in the most efficient way platform specific
  2. Copying the lines from a file to another backwards in c
  3. Sorting with shared memory with child processes
  4. How does mmap improve file reading speed?

During the setup for each thread, the chunks are the file size divided by the number of threads. And, similar for the starting offset for each thread.

Here is some pseudo code for getting segment offsets and lengths:

// do a stat to get the file size ...
struct stat st;

// map the entire file
char *bigbuffer = mmap(...);

off_t chunksize;
off_t offset = 0;

for (int i = 0;  i < NUM_THREADS;  ++i) {
    fh = &fhlist[i];

    // last thread must use the remainder of the file
    if (i == (NUM_THREADS - 1)) {
        chunksize = st.st_size - offset;
        fh->starting_offset = offset;
        fh->chunk_size = chunksize;
        break;
    }

    // get the chunk size
    chunksize = st.st_size / NUM_Threads;

    // back up integral line boundary
    char *cp = bigbuffer + offset + chunksize - 1;
    for (;  cp >= bigbuffer;  --cp, --chunksize) {
        if (*cp == '\n')
            break;
    }

    // set starting offset and length for this thread
    fh->starting_offset = offset;
    fh->chunk_size = chunksize;

    // get starting offset for _next_ thread
    offset += chunksize;
}
Craig Estey
  • 30,627
  • 4
  • 24
  • 48