So we get a std::string like Новая папка
which is the UTF-8 representation of a UTF-16 encoded line (Новая папка
in UTF-16). We want to turn this string into a std::wstring without changing the encoding, meaning we literally move all the data from the string into the wstring without any conversion. So we would get a wstring with Новая папка
as its contents. How can this be done?
Update:
What I meant to say: we have all the data for a correct UTF-16 string inside a std::string. All we need is to put that data into a wstring. That means that, since a wstring consists of wchar_t elements, one of which could happen to be 0000,
we would have to put two string chars, 00
and 00,
together to get it. That is what I do not know how to do.
Update 2: How I got here: a C++ library I am obliged to use on my server is a C-style parser, and it returns the user's request address to me as a std::string, while I make my clients send me requests in the following format:
url_encode(UTF16toUTF8(wstring)) //pseudocode.
where
// Append the UTF-8 encoding of a single Unicode scalar value to `out`.
// The caller guarantees codepoint <= 0x10FFFF and that it is not a surrogate.
static void appendUtf8CodePoint(std::string & out, unsigned long codepoint)
{
    if (codepoint <= 0x7f)
    {
        out.push_back(static_cast<char>(codepoint));
    }
    else if (codepoint <= 0x7ff)
    {
        out.push_back(static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
        out.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else if (codepoint <= 0xffff)
    {
        out.push_back(static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
        out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else
    {
        out.push_back(static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
        out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
        out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.push_back(static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
}

// Convert a wide string to UTF-8.
//
// Each element of `in` is treated as a UTF-16 code unit; a high surrogate
// (0xD800-0xDBFF) immediately followed by a low surrogate (0xDC00-0xDFFF) is
// combined into one supplementary code point. Elements above 0xFFFF (possible
// where wchar_t is wider than 16 bits) are taken as code points directly.
//
// Malformed input never produces invalid UTF-8 and never reads uninitialized
// state: every unpaired surrogate, and every value above 0x10FFFF, is replaced
// by U+FFFD (REPLACEMENT CHARACTER).
//
// Returns the UTF-8 encoded bytes as a std::string.
std::string UTF16toUTF8(const std::wstring & in)
{
    const unsigned long REPLACEMENT_CHAR = 0xfffdUL;

    std::string out;
    out.reserve(in.size()); // lower bound; non-ASCII input grows it further

    bool haveHighSurrogate = false;
    unsigned long highSurrogate = 0;

    for (std::wstring::const_iterator p = in.begin(); p != in.end(); ++p)
    {
        // A negative wchar_t (where the type is signed) becomes a huge value
        // here and is rejected by the range check below.
        const unsigned long unit = static_cast<unsigned long>(*p);
        const bool isHigh = (unit >= 0xd800 && unit <= 0xdbff);
        const bool isLow = (unit >= 0xdc00 && unit <= 0xdfff);

        if (haveHighSurrogate)
        {
            haveHighSurrogate = false;
            if (isLow)
            {
                appendUtf8CodePoint(out,
                    0x10000UL + ((highSurrogate - 0xd800) << 10) + (unit - 0xdc00));
                continue;
            }
            // The pending high surrogate was not followed by a low one.
            appendUtf8CodePoint(out, REPLACEMENT_CHAR);
            // Fall through: the current unit still has to be processed.
        }

        if (isHigh)
        {
            haveHighSurrogate = true;
            highSurrogate = unit;
        }
        else if (isLow || unit > 0x10ffffUL)
        {
            // Low surrogate without a preceding high one, or not a code point.
            appendUtf8CodePoint(out, REPLACEMENT_CHAR);
        }
        else
        {
            appendUtf8CodePoint(out, unit);
        }
    }

    // Input ended in the middle of a surrogate pair.
    if (haveHighSurrogate)
        appendUtf8CodePoint(out, REPLACEMENT_CHAR);

    return out;
}
// Percent-encode a byte string for use in a URL.
//
// ASCII letters and digits (0-9, A-Z, a-z) are copied unchanged; every other
// byte, including '-', '_', '.', '~' and bytes >= 0x80, is written as "%XX"
// with two upper-case hexadecimal digits. Embedded NUL bytes are encoded too,
// because the string's length() is used rather than a terminator.
//
// Returns the encoded string (at most three times the input length).
std::string url_encode( std::string sSrc )
{
    static const char DEC2HEX[16 + 1] = "0123456789ABCDEF";

    std::string sResult;
    // Worst case: every input byte expands to "%XX".
    sResult.reserve(sSrc.length() * 3);

    for (std::string::size_type i = 0; i < sSrc.length(); ++i)
    {
        // Work on the unsigned value so bytes >= 0x80 index DEC2HEX correctly.
        const unsigned char c = static_cast<unsigned char>(sSrc[i]);
        const bool isSafe = (c >= '0' && c <= '9')
                         || (c >= 'A' && c <= 'Z')
                         || (c >= 'a' && c <= 'z');
        if (isSafe)
        {
            sResult.push_back(static_cast<char>(c));
        }
        else
        {
            // escape this char
            sResult.push_back('%');
            sResult.push_back(DEC2HEX[c >> 4]);
            sResult.push_back(DEC2HEX[c & 0x0F]);
        }
    }
    return sResult;
}
// Return the numeric value (0-15) of an ASCII hexadecimal digit, or -1 if `c`
// is not one. The result is an int so that -1 stays distinguishable even on
// platforms where plain char is unsigned.
static int urlDecodeHexValue(unsigned char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    return -1;
}

// Decode "%XX" escape sequences in a URL-encoded string.
//
// Note from RFC1630: "Sequences which start with a percent sign
// but are not followed by two hexadecimal characters (0-9, A-F) are reserved
// for future extension"
// Such sequences (including a '%' too close to the end of the input) are
// copied through unchanged. Both upper- and lower-case hex digits are
// accepted. '+' is NOT converted to a space.
//
// Returns the decoded byte string (never longer than the input).
std::string url_decode( std::string sSrc )
{
    const std::string::size_type srcLen = sSrc.length();

    std::string sResult;
    sResult.reserve(srcLen);

    std::string::size_type i = 0;
    while (i < srcLen)
    {
        // A '%' is only decodable when two more characters follow it.
        if (sSrc[i] == '%' && srcLen - i >= 3)
        {
            const int hi = urlDecodeHexValue(static_cast<unsigned char>(sSrc[i + 1]));
            const int lo = urlDecodeHexValue(static_cast<unsigned char>(sSrc[i + 2]));
            if (hi != -1 && lo != -1)
            {
                sResult.push_back(static_cast<char>((hi << 4) + lo));
                i += 3;
                continue;
            }
        }
        sResult.push_back(sSrc[i]);
        ++i;
    }
    return sResult;
}
Of course I call url_decode, but what I get back is a std::string, not a wstring. I hope my problem is clearer now.