0

I am trying to convert a C program to use the Windows UTF-8 feature and am having trouble getting it to work properly. The input messages received by the window procedure don't seem to carry the correct UTF-8 values.

I've created the test program below which can be run in UTF-8 (using the A API versions) or Unicode (using the W API versions) modes. The UTF-8 mode is default and displays the values received by the WM_IME_CHAR and WM_CHAR messages in hex. To use the Unicode mode, the letter w is added as a command line argument. The Unicode mode also displays the values received by the WM_IME_CHAR and WM_CHAR messages but additionally displays the UTF-8 value converted from the Unicode (UTF-16LE) value.

#include <windows.h>
#include <stdio.h>

// Forward declaration of the window procedure; both window-class
// registration functions below store its address in lpfnWndProc.
LRESULT CALLBACK WndProc( HWND, UINT, WPARAM, LPARAM );

// Column ruler for the header strings below (character positions 1..60).
//                              1         2         3         4         5         6
//                     123456789012345678901234567890123456789012345678901234567890
static char szTop[] = "Message            Key             Char value(s)";
static char szUnd[] = "------------------ --------------- ------------------";

// printf formats for the report lines. They point at string literals, so
// both the pointers and the pointed-to characters are const.
static const char *const szFormat    = "%-18s %-15.15s\n";               // message name, key name
static const char *const szInputFmt  = "%-18s                 0x%p\n";   // message name, character value
static const char *const szInput2Fmt = "%34s";                           // right-aligned encoding label

// Message names indexed by (message - WM_KEYFIRST), i.e. 0x100..0x107.
static const char *const szMsgName[] = {
    "WM_KEYDOWN",    "WM_KEYUP",    // 0x100, 0x101,
    "WM_CHAR",       "WM_DEADCHAR", // 0x102, 0x103,
    "WM_SYSKEYDOWN", "WM_SYSKEYUP", // 0x104, 0x105,
    "WM_SYSCHAR",    "WM_SYSDEADCHAR" };    // 0x106, 0x107

// Non-zero when the program runs with the W (UTF-16) APIs; set once in main().
static int Wide = 0;

static HWND createWindowA()
{
    DWORD err;

    // Register window class

    WNDCLASSA   WindowClass;
    WindowClass.style = CS_HREDRAW | CS_VREDRAW | CS_OWNDC;
    WindowClass.lpfnWndProc = (WNDPROC)WndProc;
    WindowClass.cbClsExtra = 0;
    WindowClass.cbWndExtra = 0;
    WindowClass.hInstance = (HINSTANCE)NULL;
    WindowClass.hIcon = LoadIconA( NULL, (LPCSTR)IDI_APPLICATION );
    WindowClass.hCursor = LoadCursorA( NULL, (LPCSTR)IDC_ARROW );
    WindowClass.hbrBackground = GetStockObject( WHITE_BRUSH );
    WindowClass.lpszMenuName = NULL;
    WindowClass.lpszClassName = "MY_CLASS";
    if (!RegisterClassA( &WindowClass )) {
        err = GetLastError();
        printf( "RegisterClass() Error= 0x%08x, %d\n", err, err );
        return NULL;
    }

    // create window

    HWND hwnd = CreateWindowA(
        "MY_CLASS",     /* window class name */
        "UTF-8 Keyboard Message Viewer",/* window name */
        WS_OVERLAPPEDWINDOW,
        CW_USEDEFAULT, CW_USEDEFAULT,   /* initial position */
        CW_USEDEFAULT, CW_USEDEFAULT,   /* size (w, h) */
        NULL, NULL,         /* parent, menu */
        (HINSTANCE)GetModuleHandleA( NULL ),
        NULL );
    if (!hwnd) {
        err = GetLastError();
        printf( "CreateWindow() Error= 0x%08x, %d\n", err, err );
        return NULL;
    }

    return hwnd;
}

static HWND createWindowW()
{
    DWORD err;

    // Register window class

    WNDCLASSW   WindowClass;
    WindowClass.style = CS_HREDRAW | CS_VREDRAW | CS_OWNDC;
    WindowClass.lpfnWndProc = (WNDPROC)WndProc;
    WindowClass.cbClsExtra = 0;
    WindowClass.cbWndExtra = 0;
    WindowClass.hInstance = (HINSTANCE)NULL;
    WindowClass.hIcon = LoadIconW( NULL, (LPCWSTR)IDI_APPLICATION );
    WindowClass.hCursor = LoadCursorW( NULL, (LPCWSTR)IDC_ARROW );
    WindowClass.hbrBackground = GetStockObject( WHITE_BRUSH );
    WindowClass.lpszMenuName = NULL;
    WindowClass.lpszClassName = L"MY_CLASS";
    if (!RegisterClassW( &WindowClass )) {
        err = GetLastError();
        printf( "RegisterClass() Error= 0x%08x, %d\n", err, err );
        return NULL;
    }

    // create window

    HWND hwnd = CreateWindowExW(
        WS_EX_CLIENTEDGE,
        L"MY_CLASS",        /* window class name */
        L"UTF-8 Keyboard Message Viewer",/* window name */
        WS_OVERLAPPEDWINDOW,
        CW_USEDEFAULT, CW_USEDEFAULT,   /* initial position */
        CW_USEDEFAULT, CW_USEDEFAULT,   /* size (w, h) */
        NULL, NULL,         /* parent, menu */
        (HINSTANCE)GetModuleHandleW( NULL ),
        NULL );
    if (!hwnd) {
        err = GetLastError();
        printf( "CreateWindowExW() Error= 0x%08x, %d\n", err, err );
        return NULL;
    }

    return hwnd;
}

int main( int argc, char *argv[] )
{
    HWND hwnd = NULL;

    Wide = ((argc > 1) && ('w' == *argv[1] || 'W' == *argv[1]));

    if (Wide) {
        hwnd = createWindowW(); // Unicode version
    }
    else {
        hwnd = createWindowA(); // Multi-byte version
    }
    if (!hwnd) {
        printf( "Failed to register Window Class.\n" );
        exit( 1 );
    }

    printf( "Process ACP: %u\n\n", GetACP() );
    printf( "%s\n", szTop );
    printf( "%s\n", szUnd );

    ShowWindow( hwnd, SW_SHOWDEFAULT );
    UpdateWindow( hwnd );

    MSG msg;
    if (Wide) {
        while (GetMessageW( &msg, NULL, 0, 0 )) {
            TranslateMessage( &msg );
            DispatchMessageW( &msg );
        }
    }
    else {
        while (GetMessageA( &msg, NULL, 0, 0 )) {
            TranslateMessage( &msg );
            DispatchMessageA( &msg );
        }
    }

    return (int)msg.wParam;  // WM_QUIT
}

LRESULT CALLBACK WndProc( HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam )
{
    static wchar_t utf16_char[3];
    static int surrogate_count = 0;
    HDC hdc;
    char szKeyName[32];
    PAINTSTRUCT ps;

    switch (message)
    {
    case WM_SIZE:   // 0x5
        InvalidateRect( hwnd, NULL, TRUE );
        return 0;

        // MESSAGES TO REPORT
    case WM_INPUTLANGCHANGE:    // 0x51
    {
        wchar_t szLocaleName[LOCALE_NAME_MAX_LENGTH];
        HKL hkl = (HKL)lParam;
        LANGID langId = LOWORD(HandleToUlong( hkl ));
        LCID lcid = MAKELCID(langId, SORT_DEFAULT);
        LCIDToLocaleName( lcid, szLocaleName, LOCALE_NAME_MAX_LENGTH,
            LOCALE_ALLOW_NEUTRAL_NAMES );

        printf( "\nWM_INPUTLANGCHANGE 0x%08x 0x%08x=(%S)\n",
            (DWORD)wParam, lcid, szLocaleName );
        printf( "%s\n", szTop );
        printf( "%s\n", szUnd );
        return TRUE;
    }

        // KEYBOARD MESSAGES
    case WM_KEYDOWN:
    case WM_KEYUP:
    case WM_SYSKEYDOWN:
    case WM_SYSKEYUP:
        GetKeyNameTextA( (LONG)lParam, szKeyName, _countof(szKeyName) );
        printf( szFormat,
            szMsgName[message - WM_KEYFIRST],   // 1
            szKeyName );                // 2

        break;      // call DefWindowProc so Sys messages work

        // INPUT MESSAGES
    case WM_CHAR:
    case WM_DEADCHAR:
    case WM_SYSCHAR:
    case WM_SYSDEADCHAR:
        printf( szInputFmt,
            szMsgName[message - WM_KEYFIRST],   // 1
            wParam );               // 2 [character value(s)]

        if (WM_CHAR == message) {
            if (Wide) {
                char utf8[7];
                wchar_t wchar = (wchar_t)wParam;

                // handle UTF-16 surrogates

                if (IS_HIGH_SURROGATE(wchar)) {
                    utf16_char[0] = wchar;
                    utf16_char[1] = L'\0';
                    surrogate_count += 1;
                }
                else if (IS_LOW_SURROGATE(wchar)) {
                    utf16_char[1] = wchar;
                    utf16_char[2] = L'\0';
                    surrogate_count += 1;
                }
                else {
                    // normal UTF-16 character
                    utf16_char[0] = wchar;
                }

                if ((0 == surrogate_count)      // not supplementary character
                    || (2 == surrogate_count)) {    // supplementary character completed
                    // UTF-16
                    int i = 0;
                    printf( szInput2Fmt, surrogate_count > 0 ? "UTF-16 Supplementary" : "UTF-16" );
                    surrogate_count = 0;
                    while (utf16_char[i]) {
                        printf( " 0x%02x", (int)(unsigned short)utf16_char[i++] );
                    }
                    printf( "\n" );

                    // UTF-8
                    int len = WideCharToMultiByte( CP_UTF8, 0, utf16_char, -1, utf8, 7, NULL, NULL);
                    if (!len) {
                        printf( "WideCharToMultiByte error\n" );
                    }
                    printf( szInput2Fmt, " UTF-8");
                    for (i = 0; i < len - 1; ++i) {
                        printf( " 0x%02x", (int)(unsigned char)utf8[i] );
                    }
                    printf( "\n" );
                }
            }

            return 0;   // WM_CHAR was processed
        }
        break;      // call DefWindowProc so Sys messages work

    case WM_IME_CHAR:
        printf( szInputFmt,
            "WM_IME_CHAR",              // 1
            wParam );               // 2 [character value(s)]
        break;      // call DefWindowProc so we'll get WM_CHAR messages

    case WM_PAINT:  // 0xF
        hdc = BeginPaint( hwnd, &ps );
        EndPaint( hwnd, &ps );
        return 0;

    case WM_DESTROY:    // 0x2
        PostQuitMessage( 0 );
        return 0;
    }

    if (Wide) {
        return DefWindowProcW( hwnd, message, wParam, lParam );
    }
    else {
        return DefWindowProcA( hwnd, message, wParam, lParam );
    }
}

The program project property "Configuration Properties", "Advanced", "Character Set" in Visual Studio was set to "Use Multi-Byte Character Set". A manifest file was used to set the process ACP to UTF-8 (65001), which bypasses the need to select the UTF-8 beta option under "Administrative language setting", "Language for non-Unicode programs" and reboot. Information on specifying UTF-8 in an application manifest can be found in the Microsoft article "Use UTF-8 code pages in Windows apps".

I tested Hebrew and Japanese with both the UTF-8 mode and Unicode mode running on Windows 11, 64 bit. With the input language set to Hebrew, I tried the characters Alef and He which are the keys "T" and "V" respectively. The Hebrew keyboard layout is at Microsoft keyboards. With the input language set to Japanese, the IME option was switched to あ then the "A" key was typed and then the character あ was clicked in the IME.

The Hebrew and Japanese characters are not correct when the program is run in UTF-8 (multi-byte) mode as listed below.

WM_INPUTLANGCHANGE 0x000000b1 0x0000040d=(he-IL)
WM_KEYDOWN         T
WM_CHAR                            0x00000000000000E0
WM_KEYUP           T
WM_KEYDOWN         V
WM_CHAR                            0x00000000000000E4
WM_KEYUP           V

WM_INPUTLANGCHANGE 0x00000080 0x00000411=(ja-JP)
WM_KEYDOWN         A
WM_KEYUP           A
WM_IME_CHAR                        0x000000000000E381
WM_CHAR                            0x0000000000004581
WM_CHAR                            0x0000000000004581

The UTF-16LE values and UTF-8 converted values are correct when the program is run in Unicode mode as listed below.

WM_INPUTLANGCHANGE 0x000000b1 0x0000040d=(he-IL)
WM_KEYDOWN         T
WM_CHAR                            0x00000000000005D0
                                   0xd7 0x90
WM_KEYUP           T
WM_KEYDOWN         V
WM_CHAR                            0x00000000000005D4
                                   0xd7 0x94
WM_KEYUP           V

WM_INPUTLANGCHANGE 0x00000080 0x00000411=(ja-JP)
WM_KEYDOWN         A
WM_KEYUP           A
WM_IME_CHAR                        0x0000000000003042
WM_CHAR                            0x0000000000003042
                                   0xe3 0x81 0x82

The correct UTF-8 values (from online UTF-8 tables) are:

Aleph 0xd7 0x90
He 0xd7 0x94
あ 0xe3 0x81 0x82

Is there something I'm missing when using the UTF-8 mode? Or does it only work for certain languages? The Microsoft documentation for WM_CHAR states, "the system provides characters in the current process code page, which can be set to UTF-8 in Windows Version 1903 (May 2019 Update) and newer."

Edit

I have modified the UTF-16 WM_CHAR handling so that it handles surrogates as mentioned by Remy in the comments.

This question is similar to, but different from, the question "Windows producing UTF8 sequence for ANSI version WM_CHAR? Why I cannot see it?". The solution for that question is that "the three UTF8 bytes are sent in a single WM_CHAR message, not across three WM_CHAR messages." That solution is not applicable to this question. The problem presented here is that some incorrect or incomplete UTF-8 values are received by the WM_CHAR/WM_IME_CHAR messages, so there doesn't seem to be a way to get the UTF-8 characters directly from the window procedure messages in this case.

Daniel LB
  • 25
  • 6
  • 1
    One problem I see is in wide mode, you are treating the `wParam` of `WM_CHAR` as a null-terminated string, but it is not. It is a single UTF-16 codeunit instead. And if it carries a UTF-16 surrogate, you won't be able to convert it to UTF-8 from a single message – Remy Lebeau Jun 22 '23 at 02:13
  • FWIW, same for french. This is the same issue as this https://stackoverflow.com/questions/75191880/windows-producing-utf8-sequence-for-ansi-version-wm-char-why-i-cannot-see-it The problem happens in TranslateMessage and I think it's somehow a keyboard (driver) issue – Simon Mourier Jun 22 '23 at 08:48
  • Your question is similar to this [thread](https://stackoverflow.com/questions/75191880/windows-producing-utf8-sequence-for-ansi-version-wm-char-why-i-cannot-see-it?rq=2). For some languages, it is temporarily unable to get UTF8 sequences directly from WM_CHAR. – Junjie Zhu - MSFT Jun 22 '23 at 10:29
  • @RemyLebeau Thank you, you are correct! I will work on fixing that part of the code. – Daniel LB Jun 22 '23 at 15:26
  • @SimonMourier Thank you. I looked at that issue while working on this, but that case was different since the UTF-8 values were there, just in a different way than expected. I could try to deal with special cases like that, but the case I ran into, it doesn't present any correct UTF-8 values (except when doing the conversion explicitly in the Unicode case). Even in the Japanese case, the WM_IME_CHAR does return a partial UTF-8 value, but not the entire value. – Daniel LB Jun 22 '23 at 15:41
  • @JunjieZhu-MSFT Does Microsoft plan to fix this any time soon? It doesn't seem like it would be that difficult to fix. It kind of sounds like I need to go the Unicode route and convert to UTF-8. The software I am converting is multi-platform and we have supported UTF-8 for several years on Linux. – Daniel LB Jun 22 '23 at 15:47
  • 1
    I'd suggest you submit a feature request about this feature in the Feedback Hub. You could find the Feedback Hub in the Start Menu. Please select **Developer Platform**-> **API Feedback** as the category when you submit your request. After submitting, please click the share button and send me the short link. – Junjie Zhu - MSFT Jun 23 '23 at 07:33
  • @JunjieZhu-MSFT Thank you. The problem report link is https://aka.ms/AAlayv6. – Daniel LB Jun 23 '23 at 15:16
  • I remember seeing a question a while ago about using UTF-8 with console output (https://stackoverflow.com/questions/73246952/how-to-support-utf-8-on-windows-without-using-chcp-65001-command-in-c). Because of that I wondered if the test program might be getting incorrect values because it is a console program. I rewrote the test program to be a Windows program instead, but it still received the same, incorrect character values. – Daniel LB Jun 27 '23 at 03:52

0 Answers0