Given a string std::string str = "google谷歌"
, I traverse it and print each character:
// Print each element of str on its own line. For a std::string holding
// multi-byte text this emits one byte per line, not one character per line.
// The bound is `<` so that the terminating '\0' at str[str.length()] is not
// printed as an extra (invisible) line.
for (uint32 i = 0; i < str.length(); ++i)
{
    std::cout << str[i] << std::endl;
}
, which prints:
g
o
o
g
l
e
�
�
�
�
�
�
This is obviously wrong, so I changed it to use std::wstring
:
for (uint32 i = 0; i <= str.length(); ++i)
std::cout << str[i] << std::endl;
, which prints:
103
111
111
103
108
101
35895
27468
0
The above are the raw integer values of each character, which are correct. I could use the utf8cpp
library to convert them to UTF-8 and print them correctly.
The question is: is there an easy way to traverse a std::string
containing variable-length characters without using std::wstring
?
I also have some ugly code here:
/// Converts a UTF-8 encoded string to UTF-16 code units stored in a wide string.
///
/// @param utf8str  Input bytes, passed to utf8cpp as UTF-8.
/// @param wstr     Receives the converted UTF-16 code units; previous contents
///                 are discarded.
/// @return Always true.
///
/// NOTE(review): utf8cpp's checked API is expected to throw on malformed input;
/// nothing is caught here, so such an exception reaches the caller — confirm.
bool Utf8toWStr(const std::string& utf8str, std::wstring& wstr)
{
    // Size the buffer by input byte count rather than by code-point count:
    // a code point above U+FFFF becomes a surrogate pair (two UTF-16 units),
    // so a code-point count can be too small. A UTF-8 sequence never yields
    // more UTF-16 units than it has bytes, so the byte count is a safe bound.
    wstr.resize(utf8str.size());
    if (!utf8str.empty())
    {
        wchar_t* const out_begin = &wstr[0];
        wchar_t* const out_end =
            utf8::utf8to16(utf8str.c_str(), utf8str.c_str() + utf8str.size(), out_begin);
        wstr.resize(out_end - out_begin); // drop the unused tail
    }
    return true;
}
/// Converts a wide string of UTF-16 code units to a UTF-8 encoded string.
///
/// @param wstr     Input code units, passed to utf8cpp as UTF-16. Taken by
///                 const reference to avoid copying the whole string per call.
/// @param utf8str  Receives the encoded bytes. It is only replaced after the
///                 conversion has finished, because the work is done in a
///                 temporary buffer.
/// @return Always true.
///
/// NOTE(review): utf8cpp's checked API is expected to throw on invalid UTF-16
/// (e.g. an unpaired surrogate); nothing is caught here — confirm.
bool WStrToUtf8(const std::wstring& wstr, std::string& utf8str)
{
    std::string encoded;
    encoded.resize(wstr.size() * 4); // allocate for the longest possible case
    if (!wstr.empty())
    {
        char* const out_begin = &encoded[0];
        char* const out_end =
            utf8::utf16to8(wstr.c_str(), wstr.c_str() + wstr.size(), out_begin);
        encoded.resize(out_end - out_begin); // drop the unused tail
    }
    utf8str.swap(encoded); // hand over the buffer instead of copying it
    return true;
}
std::string m_text;
std::wstring textWStr;
Utf8toWStr(m_text, textWStr);
auto textLen = textWStr.length();
// Print every prefix of the text, growing by one wide element per iteration:
// each prefix is cut from the wide string and converted back to UTF-8 so it
// can be written to std::cout.
for (uint32 i = 1; i <= textLen; ++i)
{
    std::wstring subWStr = textWStr.substr(0, i);
    std::string subStr;
    WStrToUtf8(subWStr, subStr);
    std::cout << "subStr = " << subStr << std::endl;
}