I am trying to read multiple text files from a folder and store the beginning position of each word. I am using Boost's tokenizer to strip the punctuation from the text.
The problem appears when the words contain special characters such as Õ, Ø, æ, etc. In that case I get a debug assertion with the message: "Expression: (unsigned)(c+1)<=256".
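For reference, here is a minimal sketch that reproduces the same assertion for me. It assumes the MSVC debug runtime and Boost's default tokenizer; the escaped byte is just a stand-in for a character like Õ as it would be read from a Windows-1252 encoded file:

// Minimal reproduction sketch (assumptions: MSVC debug CRT, single-byte Windows-1252 text).
#include <iostream>
#include <string>
#include <boost/tokenizer.hpp>

int main() {
    // "\xD5" is the Windows-1252 byte for 'Õ'; stored in a plain (signed) char it is negative.
    std::string text = "word \xD5" "ne more";
    boost::tokenizer<> tok(text);   // the default tokenizer classifies characters with std::isspace/std::ispunct
    for (boost::tokenizer<>::iterator it = tok.begin(); it != tok.end(); ++it)
        std::cout << *it << std::endl;   // the assertion fires while the tokenizer scans the non-ASCII byte
    return 0;
}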
Here is the code for the application I've mentioned:
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>    // needed for std::vector
#include <cstring>   // needed for strcpy/strcat
#include "/../dirent.h/dirent.h"
#include <boost/tokenizer.hpp>

using namespace std;
using namespace boost;
int main() {
    DIR* dir;
    dirent* pdir;
    dir = opendir("D:/../dataset/");

    int number_of_words = 0;
    int text_length = 30;
    char filename[300];
    int i = 0;

    while ((pdir = readdir(dir)))
    {
        string fileString;
        cout << "-------------------------------------------" << endl;
        cout << "Name of text file: " << pdir->d_name << endl;

        strcpy(filename, "D:/.../dataset/");
        strcat(filename, pdir->d_name);

        // First pass: count the words in the file.
        ifstream file(filename);
        std::istream_iterator<std::string> beg(file), end;
        number_of_words = distance(beg, end);
        //cout << "Number of words in file: " << number_of_words << endl;

        // Second pass: read the file word by word and concatenate everything.
        ifstream files(filename);
        if (file.is_open())
        {
            string output;
            while (files >> output)   // loop on the extraction itself instead of eof()
            {
                fileString += " ";
                fileString += output;
                //cout << output << endl;
            }
            //cout << fileString << endl;
            cout << "Number of characters: " << fileString.size() << endl;
            cout << "-------------------------------------------" << endl;

            // Tokenize the concatenated text and record each word's starting position.
            string fileStringTokenized;
            tokenizer<> tok(fileString);
            int indice_cuvant_curent = 0;
            int index = 0;
            vector<int> myvector;
            for (tokenizer<>::iterator beg = tok.begin(); beg != tok.end(); ++beg)
            {
                string currentWord = *beg;
                myvector.push_back(index);   // beginning position of the current word
                index += currentWord.size();
                //cout << index << "\t";
                //cout << *beg << endl;
                fileStringTokenized += *beg;
            }
        }
        file.close();
    }
    closedir(dir);
    return 0;
}
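One direction I was considering (I am not sure whether it is the right fix) is to give the tokenizer an explicit boost::char_separator, so that it compares characters against a fixed delimiter list instead of calling the ctype classification functions. A sketch of what I mean, with a delimiter set I picked just for illustration:

#include <iostream>
#include <string>
#include <boost/tokenizer.hpp>

int main() {
    std::string text = "word \xD5" "ne, more...";
    // With explicit delimiters, char_separator only does character comparisons,
    // so (as far as I understand) no character is passed to std::isspace/std::ispunct.
    boost::char_separator<char> sep(" \t\r\n.,;:!?\"()");
    typedef boost::tokenizer<boost::char_separator<char> > my_tokenizer;
    my_tokenizer tok(text, sep);
    for (my_tokenizer::iterator it = tok.begin(); it != tok.end(); ++it)
        std::cout << *it << std::endl;
    return 0;
}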
Why does this problem appear and how can I solve it?