I did not come up with a better solution than the one already hinted at.
So I will just share the solution based on streambuf here for anyone who is interested in it.
Hopefully, someone will come up with a better solution and share it here.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <string>
/* Character type of the argv elements handed to test_main():
 * wchar_t when _WIN32 and _UNICODE are defined together with either
 * __MSVCRT__ or _MSC_VER, plain char in every other configuration.
 */
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
#define TEST_ARG_TYPE wchar_t
#else /* not windows, unicode */
#define TEST_ARG_TYPE char
#endif /* windows, unicode */
/* Fallback value for the mode flag that main() passes to _setmode(), used
 * only when no header seen so far has defined it.
 */
#ifndef _O_U16TEXT
#define _O_U16TEXT 0x20000
#endif
/**
 * Counts how many leading bytes of a UTF-8 buffer can be consumed without
 * splitting a character.
 *
 * The buffer is walked one character at a time; the length of each character
 * is derived from its lead byte alone (continuation bytes of a multi-byte
 * character are not validated, and the legacy 5- and 6-byte forms are
 * accepted). The walk stops at the end of the buffer, at the first NUL byte,
 * or at a trailing character whose bytes are not all present yet.
 *
 * A continuation byte (0x80..0xBF) found where a lead byte is expected is
 * skipped as a single byte, so a following lead byte is still recognised.
 *
 * @param buf  start of the byte buffer; a null pointer yields 0
 * @param size number of bytes available in buf
 * @return number of bytes, counted from buf, that end on a character
 *         boundary; never larger than size
 */
static size_t countValidUtf8Bytes(const unsigned char * buf, const size_t size) {
    if (buf == 0) {
        return 0;
    }
    size_t i = 0;
    while (i < size && buf[i] != 0) {
        const unsigned char lead = buf[i];
        size_t charSize;
        if (lead >= 0xFC) {
            charSize = 6;
        } else if (lead >= 0xF8) {
            charSize = 5;
        } else if (lead >= 0xF0) {
            charSize = 4;
        } else if (lead >= 0xE0) {
            charSize = 3;
        } else if (lead >= 0xC0) {
            charSize = 2;
        } else {
            /* ASCII character, or a stray continuation byte (should never
             * happen) which is skipped on its own so that the next byte is
             * classified independently.
             */
            charSize = 1;
        }
        /* The last character is incomplete: leave it for the next call. */
        if ((i + charSize) > size) {
            break;
        }
        i += charSize;
    }
    return i;
}
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
#include <locale>
#include <streambuf>
#include <boost/locale.hpp>
extern "C" {
#include <fcntl.h>
#include <io.h>
#include <windows.h>
/* Passed as the fourth argument of __wgetmainargs() in main(); it has no
 * initializer, so it starts out as zero.
 * NOTE(review): presumably the runtime's wildcard-expansion switch --
 * confirm against the C runtime in use.
 */
int _CRT_glob;
/* Runtime routine main() calls to obtain the wide-character argument count,
 * argument vector and environment.
 * NOTE(review): the prototype is written by hand here; verify that it
 * matches the declaration of the targeted C runtime.
 */
extern void __wgetmainargs(int *, wchar_t ***, wchar_t ***, int, int *);
}
/**
 * Stream buffer that stages UTF-8 encoded narrow characters and hands them,
 * converted to wide characters, to a C FILE stream.
 *
 * The object owns its staging buffer but not the FILE it writes to. Because
 * of that ownership it must not be copied: a copy would share, and later
 * free, the same buffer.
 */
class Utf8ToUtf16Buffer : public std::basic_streambuf< char, std::char_traits<char> > {
private:
    char * outBuf; /* staging buffer of BUFFER_SIZE chars, released in the destructor */
    FILE * outFd;  /* destination stream; stored only, never closed by this class */
    /* Copying is disabled: both members are declared but intentionally left
     * undefined, so accidental copies fail at compile or link time instead of
     * deleting outBuf twice.
     */
    Utf8ToUtf16Buffer(const Utf8ToUtf16Buffer &);
    Utf8ToUtf16Buffer & operator=(const Utf8ToUtf16Buffer &);
public:
    /* Capacity of the staging buffer in chars. */
    static const size_t BUFFER_SIZE = 1024;
    typedef std::char_traits<char> traits_type;
    typedef traits_type::int_type int_type;
    typedef traits_type::pos_type pos_type;
    typedef traits_type::off_type off_type;
    /**
     * Creates a buffer that writes its converted output to the given stream.
     *
     * @param o destination FILE stream
     */
    explicit Utf8ToUtf16Buffer(FILE * o) : outBuf(new char[BUFFER_SIZE]), outFd(o) {
        /* Initialize the put pointer. Overflow won't get called until this
         * buffer is filled up, so we need to use valid pointers. The last
         * slot is kept outside the put area for the character that is
         * passed to overflow().
         */
        this->setp(outBuf, outBuf + BUFFER_SIZE - 1);
    }
    /* Releases the staging buffer; data still pending in it is not written. */
    ~Utf8ToUtf16Buffer() {
        delete[] outBuf;
    }
protected:
    /* Converts and writes the staged data; called when the put area is full. */
    virtual int_type overflow(int_type c);
    /* Writes out the staged data on an explicit flush. */
    virtual int_type sync();
};
/**
 * Converts the staged UTF-8 data (plus the overflowing character, if any) to
 * wide characters and writes it to the output stream.
 *
 * Only the bytes that end on a character boundary are written; the bytes of
 * a trailing, still incomplete character are moved to the front of the
 * buffer and kept for the next call.
 *
 * @param c character that did not fit into the put area, or EOF when the
 *          call is only meant to flush
 * @return EOF if writing failed or unconvertible data had to be dropped,
 *         otherwise a value different from EOF
 */
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::overflow(Utf8ToUtf16Buffer::int_type c) {
    char * const bufBegin = this->outBuf;
    char * bufEnd = this->pptr();
    int_type result = traits_type::not_eof(c);
    /* Append the overflowing character. Every setp() call keeps the last
     * buffer slot outside the put area, so there is always room for it.
     */
    if ( ! traits_type::eq_int_type(c, traits_type::eof()) ) {
        *bufEnd = traits_type::to_char_type(c);
        ++bufEnd;
    }
    /* Split the staged bytes into complete characters and a remainder. */
    const size_t pending = static_cast<size_t>(bufEnd - bufBegin);
    const size_t complete = countValidUtf8Bytes(
        reinterpret_cast<const unsigned char *>(bufBegin), pending
    );
    if (complete > 0) {
        const std::wstring wide = boost::locale::conv::utf_to_utf<wchar_t>(
            std::string(bufBegin, bufBegin + complete)
        );
        /* The text is passed as an argument, never as the format string, so
         * a '%' in the output is printed literally.
         */
        if (fwprintf(this->outFd, L"%ls", wide.c_str()) < 0) {
            /* Failed to write data to output file descriptor. */
            result = traits_type::eof();
        }
    }
    size_t leftover = pending - complete;
    if (leftover >= BUFFER_SIZE - 1) {
        /* The remainder fills the whole put area, so no further input could
         * ever complete it. Drop it and report the failure instead of
         * letting the put pointer run past the buffer.
         */
        leftover = 0;
        result = traits_type::eof();
    } else if (leftover > 0) {
        /* Move incomplete UTF-8 characters remaining in buffer. */
        std::memmove(bufBegin, bufBegin + complete, leftover);
    }
    /* Reset the put area behind the kept bytes, again reserving the last
     * slot of the allocation for the next overflowing character.
     */
    this->setp(bufBegin + leftover, bufBegin + BUFFER_SIZE - 1);
    return result;
}
/**
 * Writes out the staged data by running overflow() with the EOF marker.
 *
 * @return 0 on success, -1 if overflow() reported EOF
 */
Utf8ToUtf16Buffer::int_type Utf8ToUtf16Buffer::sync() {
    const int_type flushed = this->overflow(traits_type::eof());
    if (traits_type::eq_int_type(flushed, traits_type::eof())) {
        return -1;
    }
    return 0;
}
#endif /* windows, unicode */
int test_main(int argc, TEST_ARG_TYPE ** argv);
#if defined(_WIN32) && defined(_UNICODE) && (defined(__MSVCRT__) ||defined(_MSC_VER))
/**
 * Windows/Unicode entry point.
 *
 * Fetches the wide-character command line, switches standard output to
 * UTF-16 text mode, routes std::cout through a Utf8ToUtf16Buffer for the
 * duration of test_main() and restores the original buffer afterwards.
 *
 * @return the value returned by test_main()
 */
int main(/*int argc, char ** argv*/) {
    wchar_t ** wenvp = 0;
    wchar_t ** wargv = 0;
    int wargc = 0;
    int startInfo = 0;
    /* this also creates the global variable __wargv */
    __wgetmainargs(&wargc, &wargv, &wenvp, _CRT_glob, &startInfo);
    /* enable UTF-16 output to standard output console */
    _setmode(_fileno(stdout), _O_U16TEXT);
    std::locale::global(boost::locale::generator().generate("UTF-8"));
    Utf8ToUtf16Buffer u8cout(stdout);
    std::streambuf * const out = std::cout.rdbuf();
    std::cout.rdbuf(&u8cout);
    /* process user defined main function */
    const int result = test_main(wargc, wargv);
    /* Write out whatever is still staged in the conversion buffer; it is
     * destroyed without flushing, so skipping this would lose output that
     * test_main() did not flush itself.
     */
    std::cout.flush();
    /* revert stream buffers to let cout clean up remaining memory correctly */
    std::cout.rdbuf(out);
    return result;
}
#else /* not windows or unicode */
/**
 * Default entry point: forwards the narrow command line to test_main().
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return the value returned by test_main()
 */
int main(int argc, char ** argv) {
    return test_main(argc, argv);
}
#endif /* windows, unicode */
/**
 * Demonstration run for countValidUtf8Bytes() and the console output path.
 *
 * For every prefix length of a sample string made of one-, two- and
 * three-byte UTF-8 characters it prints the prefix length and the number of
 * bytes reported as complete, then the complete part through std::cout, and
 * finally a hex dump of the whole prefix.
 *
 * Both parameters are unused.
 *
 * @return EXIT_SUCCESS
 */
int test_main(int /*argc*/, TEST_ARG_TYPE ** /*argv*/) {
    const std::string str("\x61\x62\x63\xC3\xA4\xC3\xB6\xC3\xBC\xE3\x81\x82\xE3\x81\x88\xE3\x81\x84\xE3\x82\xA2\xE3\x82\xA8\xE3\x82\xA4\xE4\xBA\x9C\xE6\xB1\x9F\xE6\x84\x8F");
    for (size_t i = 1; i <= str.size(); i++) {
        const std::string part(str.begin(), str.begin() + i);
        const size_t validByteCount = countValidUtf8Bytes(
            reinterpret_cast<const unsigned char *>(part.c_str()), part.size()
        );
        /* %u expects unsigned int; size_t may be wider, so convert
         * explicitly instead of passing a mismatched variadic argument.
         */
        wprintf(L"i = %u, v = %u\n",
                static_cast<unsigned int>(i),
                static_cast<unsigned int>(validByteCount));
        const std::string valid(str.begin(), str.begin() + validByteCount);
        /* std::endl flushes, which keeps this line in order with the
         * wprintf() output around it.
         */
        std::cout << valid << std::endl;
        for (size_t j = 0; j < part.size(); j++) {
            wprintf(L"%02X", static_cast<unsigned int>(static_cast<unsigned char>(part[j])));
        }
        wprintf(L"\n");
    }
    return EXIT_SUCCESS;
}