I am trying to convert a C program to use the Windows UTF-8 feature and am having trouble getting it to work properly. The input messages received by the window procedure don't seem to carry the correct UTF-8 values.
I've created the test program below which can be run in UTF-8 (using the A API versions) or Unicode (using the W API versions) modes. The UTF-8 mode is default and displays the values received by the WM_IME_CHAR and WM_CHAR messages in hex. To use the Unicode mode, the letter w is added as a command line argument. The Unicode mode also displays the values received by the WM_IME_CHAR and WM_CHAR messages but additionally displays the UTF-8 value converted from the Unicode (UTF-16LE) value.
#include <windows.h>
#include <stdio.h>
// Window procedure installed by both createWindowA() and createWindowW(); defined after main().
LRESULT CALLBACK WndProc( HWND, UINT, WPARAM, LPARAM );
// Column ruler for the two header strings below (columns of the string contents).
// The header columns match szFormat: an 18-wide message name, a space, a 15-wide
// key name, a space, then the character value(s).
//                                    1         2         3         4         5         6
//                           123456789012345678901234567890123456789012345678901234567890
static const char szTop[] = "Message            Key             Char value(s)";
static const char szUnd[] = "------------------ --------------- ------------------";

// Row format for key messages: message name (left-justified, 18 wide) and
// key name (left-justified, 15 wide, truncated to 15 characters).
static const char *szFormat = "%-18s %-15.15s\n";

// Row format for character messages: message name, then wParam printed via %p.
static const char *szInputFmt = "%-18s 0x%p\n";

// Label format for the follow-up UTF-16 / UTF-8 rows (right-justified, 34 wide).
static const char *szInput2Fmt = "%34s";

// Message names indexed by (message - WM_KEYFIRST); the table covers 0x100..0x107.
static const char *const szMsgName[] = {
    "WM_KEYDOWN",    "WM_KEYUP",          // 0x100, 0x101,
    "WM_CHAR",       "WM_DEADCHAR",       // 0x102, 0x103,
    "WM_SYSKEYDOWN", "WM_SYSKEYUP",       // 0x104, 0x105,
    "WM_SYSCHAR",    "WM_SYSDEADCHAR" };  // 0x106, 0x107

// Nonzero when the program runs in Unicode (W API) mode; set once in main().
static int Wide = 0;
static HWND createWindowA()
{
DWORD err;
// Register window class
WNDCLASSA WindowClass;
WindowClass.style = CS_HREDRAW | CS_VREDRAW | CS_OWNDC;
WindowClass.lpfnWndProc = (WNDPROC)WndProc;
WindowClass.cbClsExtra = 0;
WindowClass.cbWndExtra = 0;
WindowClass.hInstance = (HINSTANCE)NULL;
WindowClass.hIcon = LoadIconA( NULL, (LPCSTR)IDI_APPLICATION );
WindowClass.hCursor = LoadCursorA( NULL, (LPCSTR)IDC_ARROW );
WindowClass.hbrBackground = GetStockObject( WHITE_BRUSH );
WindowClass.lpszMenuName = NULL;
WindowClass.lpszClassName = "MY_CLASS";
if (!RegisterClassA( &WindowClass )) {
err = GetLastError();
printf( "RegisterClass() Error= 0x%08x, %d\n", err, err );
return NULL;
}
// create window
HWND hwnd = CreateWindowA(
"MY_CLASS", /* window class name */
"UTF-8 Keyboard Message Viewer",/* window name */
WS_OVERLAPPEDWINDOW,
CW_USEDEFAULT, CW_USEDEFAULT, /* initial position */
CW_USEDEFAULT, CW_USEDEFAULT, /* size (w, h) */
NULL, NULL, /* parent, menu */
(HINSTANCE)GetModuleHandleA( NULL ),
NULL );
if (!hwnd) {
err = GetLastError();
printf( "CreateWindow() Error= 0x%08x, %d\n", err, err );
return NULL;
}
return hwnd;
}
static HWND createWindowW()
{
DWORD err;
// Register window class
WNDCLASSW WindowClass;
WindowClass.style = CS_HREDRAW | CS_VREDRAW | CS_OWNDC;
WindowClass.lpfnWndProc = (WNDPROC)WndProc;
WindowClass.cbClsExtra = 0;
WindowClass.cbWndExtra = 0;
WindowClass.hInstance = (HINSTANCE)NULL;
WindowClass.hIcon = LoadIconW( NULL, (LPCWSTR)IDI_APPLICATION );
WindowClass.hCursor = LoadCursorW( NULL, (LPCWSTR)IDC_ARROW );
WindowClass.hbrBackground = GetStockObject( WHITE_BRUSH );
WindowClass.lpszMenuName = NULL;
WindowClass.lpszClassName = L"MY_CLASS";
if (!RegisterClassW( &WindowClass )) {
err = GetLastError();
printf( "RegisterClass() Error= 0x%08x, %d\n", err, err );
return NULL;
}
// create window
HWND hwnd = CreateWindowExW(
WS_EX_CLIENTEDGE,
L"MY_CLASS", /* window class name */
L"UTF-8 Keyboard Message Viewer",/* window name */
WS_OVERLAPPEDWINDOW,
CW_USEDEFAULT, CW_USEDEFAULT, /* initial position */
CW_USEDEFAULT, CW_USEDEFAULT, /* size (w, h) */
NULL, NULL, /* parent, menu */
(HINSTANCE)GetModuleHandleW( NULL ),
NULL );
if (!hwnd) {
err = GetLastError();
printf( "CreateWindowExW() Error= 0x%08x, %d\n", err, err );
return NULL;
}
return hwnd;
}
/*
 * Entry point. Selects Unicode (W API) mode when the first command-line
 * argument starts with 'w' or 'W', otherwise multi-byte (A API) mode, creates
 * the viewer window, prints the process ANSI code page and the table header,
 * then runs the message loop with the matching GetMessage/DispatchMessage
 * variants.
 *
 * Returns the WM_QUIT exit code, or 1 if the window could not be created or
 * GetMessage reported an error.
 */
int main( int argc, char *argv[] )
{
    Wide = ((argc > 1) && ('w' == *argv[1] || 'W' == *argv[1]));

    HWND hwnd = Wide ? createWindowW()   // Unicode version
                     : createWindowA();  // Multi-byte version
    if (!hwnd) {
        printf( "Failed to register Window Class.\n" );
        return 1;
    }

    printf( "Process ACP: %u\n\n", GetACP() );
    printf( "%s\n", szTop );
    printf( "%s\n", szUnd );

    ShowWindow( hwnd, SW_SHOWDEFAULT );
    UpdateWindow( hwnd );

    // GetMessage returns >0 for a normal message, 0 for WM_QUIT and -1 on
    // error; testing "> 0" keeps an error from being dispatched as a message
    // and from spinning the loop forever.
    MSG msg;
    int status;
    if (Wide) {
        while ((status = GetMessageW( &msg, NULL, 0, 0 )) > 0) {
            TranslateMessage( &msg );
            DispatchMessageW( &msg );
        }
    }
    else {
        while ((status = GetMessageA( &msg, NULL, 0, 0 )) > 0) {
            TranslateMessage( &msg );
            DispatchMessageA( &msg );
        }
    }
    if (status < 0) {
        // msg was not filled in, so msg.wParam must not be used as the exit code
        printf( "GetMessage() failed.\n" );
        return 1;
    }
    return (int)msg.wParam;  // WM_QUIT
}
LRESULT CALLBACK WndProc( HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam )
{
static wchar_t utf16_char[3];
static int surrogate_count = 0;
HDC hdc;
char szKeyName[32];
PAINTSTRUCT ps;
switch (message)
{
case WM_SIZE: // 0x5
InvalidateRect( hwnd, NULL, TRUE );
return 0;
// MESSAGES TO REPORT
case WM_INPUTLANGCHANGE: // 0x51
{
wchar_t szLocaleName[LOCALE_NAME_MAX_LENGTH];
HKL hkl = (HKL)lParam;
LANGID langId = LOWORD(HandleToUlong( hkl ));
LCID lcid = MAKELCID(langId, SORT_DEFAULT);
LCIDToLocaleName( lcid, szLocaleName, LOCALE_NAME_MAX_LENGTH,
LOCALE_ALLOW_NEUTRAL_NAMES );
printf( "\nWM_INPUTLANGCHANGE 0x%08x 0x%08x=(%S)\n",
(DWORD)wParam, lcid, szLocaleName );
printf( "%s\n", szTop );
printf( "%s\n", szUnd );
return TRUE;
}
// KEYBOARD MESSAGES
case WM_KEYDOWN:
case WM_KEYUP:
case WM_SYSKEYDOWN:
case WM_SYSKEYUP:
GetKeyNameTextA( (LONG)lParam, szKeyName, _countof(szKeyName) );
printf( szFormat,
szMsgName[message - WM_KEYFIRST], // 1
szKeyName ); // 2
break; // call DefWindowProc so Sys messages work
// INPUT MESSAGES
case WM_CHAR:
case WM_DEADCHAR:
case WM_SYSCHAR:
case WM_SYSDEADCHAR:
printf( szInputFmt,
szMsgName[message - WM_KEYFIRST], // 1
wParam ); // 2 [character value(s)]
if (WM_CHAR == message) {
if (Wide) {
char utf8[7];
wchar_t wchar = (wchar_t)wParam;
// handle UTF-16 surrogates
if (IS_HIGH_SURROGATE(wchar)) {
utf16_char[0] = wchar;
utf16_char[1] = L'\0';
surrogate_count += 1;
}
else if (IS_LOW_SURROGATE(wchar)) {
utf16_char[1] = wchar;
utf16_char[2] = L'\0';
surrogate_count += 1;
}
else {
// normal UTF-16 character
utf16_char[0] = wchar;
}
if ((0 == surrogate_count) // not supplementary character
|| (2 == surrogate_count)) { // supplementary character completed
// UTF-16
int i = 0;
printf( szInput2Fmt, surrogate_count > 0 ? "UTF-16 Supplementary" : "UTF-16" );
surrogate_count = 0;
while (utf16_char[i]) {
printf( " 0x%02x", (int)(unsigned short)utf16_char[i++] );
}
printf( "\n" );
// UTF-8
int len = WideCharToMultiByte( CP_UTF8, 0, utf16_char, -1, utf8, 7, NULL, NULL);
if (!len) {
printf( "WideCharToMultiByte error\n" );
}
printf( szInput2Fmt, " UTF-8");
for (i = 0; i < len - 1; ++i) {
printf( " 0x%02x", (int)(unsigned char)utf8[i] );
}
printf( "\n" );
}
}
return 0; // WM_CHAR was processed
}
break; // call DefWindowProc so Sys messages work
case WM_IME_CHAR:
printf( szInputFmt,
"WM_IME_CHAR", // 1
wParam ); // 2 [character value(s)]
break; // call DefWindowProc so we'll get WM_CHAR messages
case WM_PAINT: // 0xF
hdc = BeginPaint( hwnd, &ps );
EndPaint( hwnd, &ps );
return 0;
case WM_DESTROY: // 0x2
PostQuitMessage( 0 );
return 0;
}
if (Wide) {
return DefWindowProcW( hwnd, message, wParam, lParam );
}
else {
return DefWindowProcA( hwnd, message, wParam, lParam );
}
}
The Visual Studio project property "Configuration Properties" > "Advanced" > "Character Set" was set to "Use Multi-Byte Character Set". A manifest file was used to set the process ACP to UTF-8 (65001), which bypasses the need to select the UTF-8 beta option under "Administrative language settings" > "Language for non-Unicode programs" and reboot. Information on specifying UTF-8 in an application manifest can be found in the Microsoft article "Use UTF-8 code pages in Windows apps".
I tested Hebrew and Japanese with both the UTF-8 mode and Unicode mode running on Windows 11, 64 bit. With the input language set to Hebrew, I tried the characters Alef and He which are the keys "T" and "V" respectively. The Hebrew keyboard layout is at Microsoft keyboards. With the input language set to Japanese, the IME option was switched to あ then the "A" key was typed and then the character あ was clicked in the IME.
The Hebrew and Japanese characters are not correct when the program is run in UTF-8 (multi-byte) mode as listed below.
WM_INPUTLANGCHANGE 0x000000b1 0x0000040d=(he-IL)
WM_KEYDOWN T
WM_CHAR 0x00000000000000E0
WM_KEYUP T
WM_KEYDOWN V
WM_CHAR 0x00000000000000E4
WM_KEYUP V
WM_INPUTLANGCHANGE 0x00000080 0x00000411=(ja-JP)
WM_KEYDOWN A
WM_KEYUP A
WM_IME_CHAR 0x000000000000E381
WM_CHAR 0x0000000000004581
WM_CHAR 0x0000000000004581
The UTF-16LE values and UTF-8 converted values are correct when the program is run in Unicode mode as listed below.
WM_INPUTLANGCHANGE 0x000000b1 0x0000040d=(he-IL)
WM_KEYDOWN T
WM_CHAR 0x00000000000005D0
0xd7 0x90
WM_KEYUP T
WM_KEYDOWN V
WM_CHAR 0x00000000000005D4
0xd7 0x94
WM_KEYUP V
WM_INPUTLANGCHANGE 0x00000080 0x00000411=(ja-JP)
WM_KEYDOWN A
WM_KEYUP A
WM_IME_CHAR 0x0000000000003042
WM_CHAR 0x0000000000003042
0xe3 0x81 0x82
The correct UTF-8 values (from online UTF-8 tables) are:
Aleph 0xd7 0x90
He 0xd7 0x94
あ 0xe3 0x81 0x82
Is there something I'm missing when using the UTF-8 mode? Or does it only work for certain languages? The Microsoft documentation for WM_CHAR states, "the system provides characters in the current process code page, which can be set to UTF-8 in Windows Version 1903 (May 2019 Update) and newer."
Edit
I have modified the UTF-16 WM_CHAR handling so that it handles surrogates as mentioned by Remy in the comments.
This question is similar to, but different from, the question "Windows producing UTF8 sequence for ANSI version WM_CHAR? Why I cannot see it?". The solution for that question is that "the three UTF8 bytes are sent in a single WM_CHAR message, not across three WM_CHAR messages." That solution is not applicable to this question. The problem presented here is that some incorrect or incomplete UTF-8 values are received by the WM_CHAR/WM_IME_CHAR messages, so there doesn't seem to be a way to get the UTF-8 characters directly from the window procedure messages in this case.