-2

I am trying to read multiple text files from a folder, and store each word's beginning position. I am using Boost to clear the text from punctuation.

I encounter a problem when the words contain special characters such as Õ, Ø, or æ. In this case, I get an error with the message: "Expression: (unsigned)(c+1)<=256".

Here is the code for the application I've mentioned:

#include "stdafx.h"

#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include <boost/tokenizer.hpp>

#include "/../dirent.h/dirent.h"

using namespace std;
using namespace boost;

int main() {

    DIR*     dir;
    dirent*  pdir;

    dir = opendir("D:/../dataset/"); 

    int number_of_words=0;
    int text_length = 30;
    char filename[300];
    int i=0;
    while (pdir = readdir(dir)) 
    {
        string fileString;

        cout<<"-------------------------------------------"<<endl;
        cout<<"Name of text file: "<<pdir->d_name << endl;
        strcpy(filename, "D:/.../dataset/");
        strcat(filename, pdir->d_name);
        ifstream file(filename);
        std::istream_iterator<std::string> beg(file), end;

        number_of_words = distance(beg,end);

        //cout<<"Number of words in file: "<<number_of_words<<endl;
        ifstream files(filename);
         //char output[200];

         if (file.is_open()) 
         {

             string output;

             while (!files.eof())
             {

                    files >> output;
                    fileString += " ";
                    fileString += output;
                    //cout<<output<<endl;

             }
             //cout<<fileString<<endl;
             cout<<"Number of characters: "<<fileString.size()<<endl;
             cout<<"-------------------------------------------"<<endl;


            string fileStringTokenized;
            tokenizer<>tok (fileString);

            int indice_cuvant_curent = 0;
            int index = 0;
            vector<int> myvector;

            for(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
            {
                string currentWord;
                currentWord = *beg;

                myvector.push_back(index);
                index+=currentWord.size();
                //cout<<index<<"\t";

                //cout<<*beg<<endl;
                fileStringTokenized += *beg;
            }

         }
         file.close();
    }
    closedir(dir);
    return 0;
}

Why does this problem appear and how can I solve it?

Alex Iacob
  • 33
  • 3
  • 12
  • Use unicode? Create a minimal example? I would really copy-paste it onto my machine and made an example, but I don't have time to stripe your own code and do your work. – luk32 Nov 14 '14 at 11:07
  • Maybe try `std::wstring` (wide character strings). Aside: don't use `while (!files.eof())` like that, use `while (files >> output)` - see [here](http://stackoverflow.com/questions/5605125/why-is-iostreameof-inside-a-loop-condition-considered-wrong). – Tony Delroy Nov 14 '14 at 11:08

2 Answers2

1

Something like this should work:

#include <iostream>
#include <string>
#include <vector>
#include <boost/tokenizer.hpp>

using String = std::wstring;
using Tokenizer = boost::tokenizer< boost::char_delimiters_separator<String::value_type>, String::const_iterator, String>;
int main()
{
    String str(L"Õ, Ø, æ");
    Tokenizer tok (str);

    for(Tokenizer::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
    {
        std::wcout << (*beg) << L'\n';
    }
}

It uses a tokenizer for wide characters.

luk32
  • 15,812
  • 38
  • 62
-2

Use a UTF-16 string; it will help you solve your problem.

Ali Kazmi
  • 1,460
  • 9
  • 22
  • 1
    Better yet, convert anything coming in to utf-8 and convert back to whatever the system expects when printing out stuff. You can keep using plain normal string functions (except for counting characters) then. utf-16 is the worst possible choice, combining the disadvantages of utf-32 and utf-8. – Damon Nov 14 '14 at 11:13