UTF8 and UTF16 store text in completely different ways. Casting wchar_t* to char* does not convert anything; it is as meaningless as casting float to char*.
Use WideCharToMultiByte to convert UTF16 to UTF8 before sending the text to a network function.
When receiving UTF8 from network functions, use MultiByteToWideChar to convert it back to UTF16 so that it can be used with Windows functions.
Example:
#include <iostream>
#include <string>
#include <windows.h>
// Converts a UTF-16 wide string to a UTF-8 encoded std::string.
// Returns an empty string when wstr is empty or when the conversion fails.
// The source length is passed explicitly (rather than -1) so that the
// terminating null is not counted and does not end up as an extra
// character at the end of the result.
std::string get_utf8(const std::wstring &wstr)
{
    if (wstr.empty()) return std::string();

    // The API takes the length as an int; inputs longer than INT_MAX
    // code units are not handled here.
    const int src_len = static_cast<int>(wstr.size());

    // First call, with no output buffer, reports the number of bytes needed.
    const int sz = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), src_len,
                                       nullptr, 0, nullptr, nullptr);
    if (sz <= 0) return std::string();

    // Second call writes the converted bytes into the pre-sized string.
    std::string res(static_cast<size_t>(sz), '\0');
    WideCharToMultiByte(CP_UTF8, 0, wstr.data(), src_len,
                        &res[0], sz, nullptr, nullptr);
    return res;
}
// Converts a UTF-8 encoded std::string to a UTF-16 wide string.
// Returns an empty string when str is empty or when the conversion fails.
// The source length is passed explicitly (rather than -1) so that the
// terminating null is not counted and does not end up as an extra
// character at the end of the result.
std::wstring get_utf16(const std::string &str)
{
    if (str.empty()) return std::wstring();

    // The API takes the length as an int; inputs longer than INT_MAX
    // bytes are not handled here.
    const int src_len = static_cast<int>(str.size());

    // First call, with no output buffer, reports the number of wide
    // characters needed.
    const int sz = MultiByteToWideChar(CP_UTF8, 0, str.data(), src_len,
                                       nullptr, 0);
    if (sz <= 0) return std::wstring();

    // Second call writes the converted characters into the pre-sized string.
    std::wstring res(static_cast<size_t>(sz), L'\0');
    MultiByteToWideChar(CP_UTF8, 0, str.data(), src_len, &res[0], sz);
    return res;
}
int main()
{
std::wstring greek = L"ελληνικά";
std::string utf8 = get_utf8(greek);
//use utf8.data() for network function...
//convert utf8 back to utf16 so it can be displayed in Windows:
std::wstring utf16 = get_utf16(utf8);
MessageBoxW(0, utf16.c_str(), 0, 0);
return 0;
}
Edit
Another example to show the difference between UTF16 and UTF8. This example looks at the byte values of UTF16 and UTF8.
Note that for ASCII characters (the basic Latin alphabet) the UTF8 and ANSI bytes are exactly the same.
Also, for ASCII characters UTF8 and UTF16 look similar, except that UTF16 has an extra zero byte for each character.
For Greek and Chinese characters there is a noticeable difference.
//(Windows example)
// Prints each byte of a null-terminated narrow string (ANSI or UTF-8) as
// two uppercase hex digits followed by a space, then ends the line.
void printbytes_char(const char* ANSI_or_UTF8)
{
    // View the data as unsigned bytes so values >= 0x80 print as 80..FF
    // without needing a mask.
    const unsigned char *bytes =
        reinterpret_cast<const unsigned char*>(ANSI_or_UTF8);

    // size_t matches strlen's return type: no narrowing to int and no
    // signed/unsigned comparison in the loop condition.
    const size_t len = strlen(ANSI_or_UTF8);
    for (size_t i = 0; i < len; i++)
        printf("%02X ", static_cast<unsigned>(bytes[i]));
    printf("\n");
}
// Prints the raw bytes of a null-terminated wide string, in memory order,
// as two uppercase hex digits followed by a space, then ends the line.
void printbytes_wchar_t(const wchar_t* UTF16)
{
    // Reinterpret the wide characters as unsigned bytes so values >= 0x80
    // print as 80..FF without needing a mask.
    const unsigned char *bytes =
        reinterpret_cast<const unsigned char*>(UTF16);

    // Note, in Windows wchar_t length is 2 bytes; sizeof(wchar_t) is used
    // instead of a hard-coded 2 so every byte of each element is covered.
    const size_t len = wcslen(UTF16) * sizeof(wchar_t);
    for (size_t i = 0; i < len; i++)
        printf("%02X ", static_cast<unsigned>(bytes[i]));
    printf("\n");
}
// Dumps the bytes of the same text written as narrow, u8 and wide literals
// so the encodings can be compared side by side.
int main()
{
    // "ABC" as a plain narrow literal, a u8 literal and a wide literal.
    const char *abc_narrow = "ABC";
    const char *abc_utf8 = u8"ABC";
    const wchar_t *abc_wide = L"ABC";
    printbytes_char(abc_narrow);
    printbytes_char(abc_utf8);
    printbytes_wchar_t(abc_wide);

    // Greek text as a u8 literal and a wide literal.
    const char *greek_utf8 = u8"ελληνικά";
    const wchar_t *greek_wide = L"ελληνικά";
    printbytes_char(greek_utf8);
    printbytes_wchar_t(greek_wide);

    // Chinese text as a u8 literal and a wide literal.
    const char *chinese_utf8 = u8"汉字/漢字";
    const wchar_t *chinese_wide = L"汉字/漢字";
    printbytes_char(chinese_utf8);
    printbytes_wchar_t(chinese_wide);

    return 0;
}
Output:
"ABC":
41 42 43 //ANSI
41 42 43 //UTF8
41 00 42 00 43 00 //UTF16 (little endian: the low-order byte of each 16-bit unit comes first)
"ελληνικά"
CE B5 CE BB CE BB CE B7 CE BD CE B9 CE BA CE AC //UTF8
B5 03 BB 03 BB 03 B7 03 BD 03 B9 03 BA 03 AC 03 //UTF16
"汉字/漢字"
E6 B1 89 E5 AD 97 2F E6 BC A2 E5 AD 97 //UTF8
49 6C 57 5B 2F 00 22 6F 57 5B //UTF16