I have been struggling for some time with trying to extract an int from a UTF8 file:
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
// Reads the first line of "UTF8.txt" as an int and prints it, followed by the
// second line of the file. Prints "Error parsing" if the int extraction fails.
int main()
{
    std::ifstream file("UTF8.txt");
    if (!file.is_open())
        return 0; // nothing to do when the file cannot be opened

    // One buffer is deliberately reused for both reads.
    std::string text;

    // First line: expected to contain the number.
    std::getline(file, text);
    std::istringstream parser(text);
    int a;
    parser >> a;
    if (parser.fail())
    {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(file, text);
    std::cout << a << std::endl << text << std::endl;

    file.close();
    return 0;
}
The file contains 2 lines: "42" and "è_é", and is saved in Notepad as UTF-8. The above works when the file is saved as ANSI, but fails when it is saved as Unicode (UTF-8). I've tried a number of things, the most promising one being to set the locale, but I would like the program to be independent of the locale of the computer (i.e. read Chinese characters even if the PC is a US one). Honestly, I'm out of ideas now. I'd like to avoid using CStrings from Qt if possible.
UPDATE
The following displays "0", "Error parsing" because of one weird character at the very beginning of the file. An empty line, discarded when read, just before the number makes it work but I can't change the file in the final program. Accents are not displayed properly in the console, but when I write the output to a file all is well and that's all I need. So it's only that issue with the beginning of the file!
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Opens "UTF8.srt" with a codecvt_utf8 facet (consume_header) imbued on the
// stream, parses the first line as an int, then prints that int followed by
// the second line. Prints "Error parsing" if the int extraction fails.
int main()
{
    std::ifstream file("UTF8.srt");

    // Install the conversion facet before checking whether the open succeeded.
    const std::locale utf8Locale(
        file.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    file.imbue(utf8Locale);

    if (!file.is_open())
        return 0;

    // One buffer is deliberately reused for both reads.
    std::string text;

    // First line: expected to contain the number.
    std::getline(file, text);
    std::istringstream parser(text);
    int a;
    parser >> a;
    if (parser.fail())
    {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(file, text);
    std::cout << a << std::endl << text << std::endl;

    file.close();
    return 0;
}
SOLUTION
The following works, with the input file content as follows:
5
bla bla é_è
6
truc è_é
Code:
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF) when
// the first `size` bytes of `s` start with one; otherwise returns `s` unchanged.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Too short to hold the three-byte mark.
    if (size < 3)
        return s;

    const bool starts_with_bom =
        static_cast<unsigned char>(s[0]) == 0xEF &&
        static_cast<unsigned char>(s[1]) == 0xBB &&
        static_cast<unsigned char>(s[2]) == 0xBF;

    return starts_with_bom ? s + 3 : s;
}
// Parses an int from `line` (skipping a leading UTF-8 BOM via skip_utf8_bom),
// prints it to std::cout prefixed by `label`, and writes it on its own line
// to `out`. When the extraction fails, "Error parsing" is printed first; the
// value is still printed and written in that case.
static void copy_number(const std::string& line, std::ostream& out, const char* label)
{
    const char* linePtr = skip_utf8_bom(line.c_str(), line.size());
    std::istringstream input(linePtr);
    int a = -1;
    input >> a;
    if( ! input) {
        std::cout << "Error parsing" << std::endl;
    }
    std::cout << label << a << std::endl;
    out << a << std::endl;
}

// Copies "UTF8.txt" to "UTF8_copy.txt", treating the input as:
//   number / text line / one line that is discarded / number / text line.
// The two numbers are parsed (and echoed to std::cout); the text lines are
// copied byte-for-byte; the discarded line is replaced by an empty line.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if(!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
    }
    else {
        std::string line;

        // Parse the first number (this line may start with the BOM).
        std::getline(file, line);
        copy_number(line, fileO, "Number 1: ");

        // Copy the following line as is.
        std::getline(file, line);
        fileO << line << std::endl;

        // Read and discard one line; emit an empty line in its place.
        std::getline(file, line);
        fileO << std::endl;

        // Parse the second number. The label now says "Number 2"; it was
        // previously a copy-paste of "Number 1".
        std::getline(file, line);
        copy_number(line, fileO, "Number 2: ");

        // Copy the following line as is.
        std::getline(file, line);
        fileO << line << std::endl;

        file.close();
        fileO.close();
    }
    return 0;
}