I try to count word in huge file. I want to use max of CPU resources and i try to split input data and count words in threads. But i have a problem, when i split data it can split the words and in the end i have wrong answer. How can i split data from file to avoid spliting words? Can somebody help me?
#include <iostream>
#include <fstream>
#include <set>
#include <string>
#include <thread>
#include <mutex>
#include <sstream>
#include <vector>
#include <algorithm>
#define BUFER_SIZE 1024
using namespace std;
std::mutex mtx;
void worker(int n, set<std::string> &mySet, std::string path)
{
mtx.lock();
ifstream file (path, ios::in);
if (file.is_open())
{
char *memblock = new char [BUFER_SIZE];
file.seekg (n * (BUFER_SIZE - 1), ios::beg);
file.read(memblock, BUFER_SIZE - 1);
std::string blockString(memblock);
std::string buf;
stringstream stream(blockString);
while(stream >> buf) mySet.insert(buf);
memblock[BUFER_SIZE] = '\0';
file.close();
delete[] memblock;
}
else
cout << "Unable to open file";
mtx.unlock();
}
int main(int argc, char *argv[])
{
set<std::string> uniqWords;
int threadCount = 0;
ifstream file(argv[1], ios::in);
if(!file){
std::cout << "Bad path.\n";
return 1;
}
file.seekg(0, ios::end);
int fileSize = file.tellg();
file.close();
std::cout << "Size of the file is" << " " << fileSize << " " << "bytes\n";
threadCount = fileSize/BUFER_SIZE + 1;
std::cout << "Thread count: " << threadCount << std::endl;
std::vector<std::thread> vec;
for(int i=0; i < threadCount; i++)
{
vec.push_back(std::thread(worker, i, std::ref(uniqWords), argv[1]));
}
std::for_each(vec.begin(), vec.end(), [](std::thread& th)
{
th.join();
});
std::cout << "Count: " << uniqWords.size() << std::endl;
return 0;
}