0

I have been working on a program that reads through multiple text files, records the number of words in them, and writes all of the words and their frequencies to a file. However, I have encountered a segmentation fault somewhere in my code. I have tried using tools such as Valgrind to help me debug it, but it only points to the line `int i = 0` at the top of main. I apologize for posting a large portion of my code, but I have spent hours trying to find the bug and cannot find it for the life of me. The issues began when I started passing a structure in pthread_exit().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

// Create a structure that we can store information in
// Result record handed from a worker thread back to main().
struct info {
    int words;                       // count reported alongside `dictionary`
    std::string dictionary[500000];  // fixed-capacity word table
};

// Counts the number of words in the text file so we know how big to make our array
// Counts the whitespace-separated words in the text file at path `arg`.
//
// Prints "Word Count: <n>" to standard output and returns n.
// A file that cannot be opened, or that is empty, yields 0.
int countWord(char *arg){
    std::ifstream check(arg);
    std::string word;   // grows as needed, so no fixed-size buffer can overflow
    int count = 0;

    // Loop on the extraction itself rather than on !eof(): eof() only becomes
    // true after a read has already failed, which counted one phantom word,
    // and it never becomes true at all when the file failed to open.
    while(check >> word){
        ++count;
    }

    std::cout << "Word Count: " << count << '\n';
    return count;
}

// Checks to see if the word exists in our dictionary or not
// Membership test: returns 1 when `target` equals one of the first
// `wordCount` entries of `array`, and 0 otherwise (including wordCount <= 0).
int findWord(std::string array[], std::string target, int wordCount){
    if(wordCount <= 0){
        return 0;
    }
    std::string *const last = array + wordCount;
    return (std::find(array, last, target) != last) ? 1 : 0;
}


// Checks to see how many times a word is repeated
// Occurrence count: returns how many of the first `wordCount` entries of
// `array` are equal to `target` (0 when wordCount <= 0).
int checkWord(std::string array[], std::string target, int wordCount){
    if(wordCount <= 0){
        return 0;
    }
    return static_cast<int>(std::count(array, array + wordCount, target));
}


void *threads(void *arg){
    info information;
    char *fileName = (char *)arg;
    ifstream myfile (fileName);
    string line;
    string fullText[15000];
    string dictionary[500000];
    int wordCount = countWord(fileName);
    int i = 0;
    int find;
    int check;
    int x = 0;
    int checkingStart = 0;


    // Opens and reads the file word by word removing any symbols that we dislike
    if (myfile.is_open()){
        while(myfile >> line){
            transform(line.begin(), line.end(), line.begin(), ::tolower);
            line.erase(remove(line.begin(), line.end(), ','), line.end());
            fullText[i] = line;
            i++;
        }
    }

    else cout << "Unable to Open the File";
    myfile.close();

    // Goes through and adds all the words to our dictionary
    for(i = 0; i < wordCount; ++i){
        find = findWord(dictionary, fullText[i], wordCount);
        if(find == 0){
            dictionary[x] = {fullText[i]};
            ++x;
            checkingStart = 1;
        }
    }

    // Sets each section of dictionary equal to the one in the structure
    for(i = 0; i < wordCount; ++i){
        information.dictionary[i] = dictionary[i];
    }

    // Sets words equal to word count and then passes the structure information out of the thread
    information.words = wordCount;
    pthread_exit(&information);
    return NULL;
}

int main(){
    int i = 0;
    int x = 0;
    int y = 0;
    int z = 0;
    int a = 0;
    int b = 0;
    int add = 0;
    int currentSize = 0;
    int checkingStart = 0;
    int wordCount;
    int find;
    string fullDictionary[500000];
    string dict[500000];
    ofstream writeFile;
    info information;
    char *fileName;
    char *fileList[2];
    pthread_t threadCount[2];
    int frequency[500000];
    int check;
    fileList[0] = "text1";
    fileList[1] = "text2";

    // Creates a loop that creates and joins threads for each text file
    for(a = 0; a < 1; ++a){
        fileName = fileList[a];
        pthread_create(&threadCount[a], NULL, threads, &fileName);
        pthread_join(threadCount[a], (void **)&information);
        wordCount = information.words;

        // Sets each part of dict equal to the same slot on info.dict
        for(b = 0; b < wordCount; ++b){
            dict[b] = information.dictionary[b];
        }

        // Adds to a complete list of all the text files added together
        for(y = 0, z = currentSize; z < wordCount; ++z, ++y){
            fullDictionary[z] = dict[y];
        }
        currentSize = (currentSize + wordCount);
    }

    // Goes through and adds all the words to our dictionary
        for(i = 0; i < wordCount; ++i){
            find = findWord(dict, fullDictionary[i], currentSize);
            if(find == 0){
                dict[x] = {fullDictionary[i]};
                cout << "Added the Word: " << fullDictionary[i] << "\n";
                add = 1;
                checkingStart = 1;
            }
            // Checks the number of times each word appears in the text file
            if(checkingStart == 1){
                    check = checkWord(fullDictionary, dict[x], wordCount);
                    frequency[x] = {check};

                }
            // Checks to see if it needs to move to the next open dictionary spot
            if(add == 1){
            ++x;
            add = 0;
                }
        }
return 0;
  }
  • 1
    If you run your program under the debugger, it should show you exactly where the segmentation fault is occurring. Then you can inspect variable values at the point of the failure to understand what's happening. – Jim Lewis Jun 28 '16 at 19:56
  • 1
    Why are you not simply using `std::vector` instead of potentially blowing out your stack space with things like `string dictionary[500000];`? You even have `#include <vector>` in your code, but you didn't use it. – PaulMcKenzie Jun 28 '16 at 19:57
  • 1
    Probably because the `info information` you are returning from the thread is a local variable; that could be causing it. Try allocating the structure dynamically and returning its address. – Rahul Menon Jun 28 '16 at 20:02
  • 1
    probably similar issue http://stackoverflow.com/questions/2251452/how-to-return-a-value-from-thread-in-c – Rahul Menon Jun 28 '16 at 20:03
  • 1
    Off topic: Take a look into [std::map](http://en.cppreference.com/w/cpp/container/map). You maybe able to reduce a lot of your code to `dictionarymap[word]++;` – user4581301 Jun 28 '16 at 20:16

1 Answers1

0

These were the changes that were needed to get the program working.

1) One issue seems to be the size of the local variables in the function `threads`. Every thread that is spawned has a default stack-size limit (you could read up on `pthread_attr_setstacksize`), but the simplest solution was to reduce the size of the string arrays in the thread function. The size of those variables is why you get a segmentation fault as soon as the `threads` function is called. As already mentioned in the comments above, using the vector/map classes will remove the need for large local variables.

2) The returned variable must not be a local variable of the thread function; otherwise the return value does not make it back successfully.

3) I just noticed that the main loop (variable `a`) runs only once. Also, once a thread is launched (`pthread_create`), the loop immediately waits for the join, which serializes the threads. The creates can be done first, and the joins can then be called in a separate loop after that.

Changes are given below ..

In function - threads

  info *information;  
  //changed to pointer
  // info information;                                                                                                                                                                                         
  char *fileName = (char *)arg;                                                                      
  ifstream myfile (fileName);                                                                        
  string line;                                                                                       
  string fullText[1500];                                                                                                                                                                                      
  string dictionary[5000];  
  // reduced size                                                                                                                                                                                   
  //string fullText[15000]; 
  //string dictionary[500000];   

.....

 information = new info;    // create an instance 

........

// change to pointer 
     information->dictionary[i] = dictionary[i];                                                                                                                                                              
  }                                                                                                  

  // Sets words equal to word count and then passes the structure information out of the thread      
  information->words = wordCount;                                                                                                                                                                             
  pthread_exit(information); // return pointer 

in function - main

info *information; // change to pointer 

....

 for(a = 0; a < 2; ++a){      // loop to 2 

.....

pthread_create(&threadCount[a], NULL, threads, (void *)fileName);  // changed file name                               
// pthread_create(&threadCount[a], NULL, threads, &fileName);   



wordCount = information->words; // changed for pointer 
...

dict[b] = information->dictionary[b]  // changed for pointer      

After the edits you should be able to run to debug the rest of the functionality.

Rahul Menon
  • 792
  • 7
  • 12
  • Thank you so much for helping me with my code, however if I use a debugger such as valgrind it always says there is "Invalid Write Size of 4" in the same spot as I stated before - int i = 0; in the main loop. Any ideas on what is going on? –  Jul 05 '16 at 18:13
  • I am guessing you have a similar problem, this time in main(), because you have too many huge local variables there. They are probably overflowing your stack; you could move them to global scope (outside main) or reduce their sizes. A better option would be to convert the variables (as suggested in the comments above) into vector/map etc. If this does not solve your issue, you need to post the latest code and the full error. – Rahul Menon Jul 05 '16 at 20:07