mb_check_encoding, as suggested by another user, seems to be the way to go. At least, it is the easiest way in PHP.
I've actually done a lot of this before in C++! C++ has no mb_check_encoding function, so I had to write my own.
Don't use this code in PHP, it's just for curiosity's sake ;) Use mb_check_encoding.
Also, the claim by another user that "what you call binary gibberish is still valid UTF-8" is TOTALLY WRONG. You CAN CHECK UTF-8 with a HIGH DEGREE OF ACCURACY, assuming of course that the input is not a tiny string (say, 4 bytes) and that it contains a lot of non-ASCII characters. UTF-8 has a specific pattern that is hard to get right by accident.
This code also checks for "non-shortest form" UTF-8, which is a security problem. Non-shortest-form UTF-8 can lead to a situation where a program that is meant to filter out bad commands actually lets them through, perhaps leading to SQL injection holes.
No idea how PHP handles non-shortest form UTF-8 though ;) Best to check it yourself if it worries you.
// Checks that the bytes in [source, sourceEnd) form valid UTF-8.
//
// Bytes below 0x80 are accepted as-is; every other byte is treated as the
// lead byte of a multi-byte sequence and handed to LegalUTF8_.
//
// Returns 0 when the whole range is valid. On failure, returns the number of
// bytes from the first byte of the offending sequence up to sourceEnd (always
// >= 1), so the caller can locate the error as sourceEnd - result.
long VerifyUTF8(u8* source, u8* sourceEnd) {
    while (source < sourceEnd) {
        u8 c = *source++;
        if (c >= 0x80) {
            u8* PrevPos = source - 1;

            // Number of continuation bytes the lead byte promises. LegalUTF8_
            // has no end pointer, so a sequence cut off by the end of the
            // buffer must be rejected here, before it is allowed to read.
            // (Lead bytes outside 0xC2..0xF4 are rejected by LegalUTF8_
            // itself without reading, so the estimate for them is harmless.)
            long need = (c <= 0xDF) ? 1 : (c <= 0xEF) ? 2 : 3;
            if (sourceEnd - source < need) {
                return sourceEnd - PrevPos;
            }

            source = LegalUTF8_(c, source);
            // Test for the null "failed" result first so the relational
            // comparison only ever sees pointers into the buffer.
            if ( !source or source > sourceEnd ) {
                return sourceEnd - PrevPos;
            }
        }
    }
    return 0;
}
// Validates a single multi-byte UTF-8 sequence.
//
// FirstChar is the lead byte that was already consumed; source points at the
// byte immediately after it (the 2nd byte of the sequence).
//
// Returns a pointer just past the last byte of the sequence on success, or 0
// on failure. ASCII lead bytes are rejected too: there is no point calling
// this for ASCII.
//
// Bytes are read one at a time and each is validated before the next one is
// fetched; there is no end-of-buffer parameter, so the caller must make sure
// the bytes this function may read are accessible.
u8* LegalUTF8_(u8 FirstChar, u8* source) {
    // 0x80..0xC1 are stray continuation bytes or overlong 2-byte leads;
    // anything above 0xF4 would encode a value beyond U+10FFFF.
    if (!(FirstChar >= 0xC2 and FirstChar <= 0xF4)) {
        return 0;
    }

    // How many continuation bytes follow, and the sum of the lead/continuation
    // marker bits that must be subtracted from the raw accumulated value.
    int remaining;
    u32 bias;
    if (FirstChar <= 0xDF) {
        remaining = 1;
        bias = 0x00003080;
    } else if (FirstChar <= 0xEF) {
        remaining = 2;
        bias = 0x000E2080;
    } else {
        remaining = 3;
        bias = 0x03C82080;
    }

    u8 next = *source++;

    // Lead-specific limits on the second byte: reject non-shortest forms
    // (0xE0, 0xF0) and values above U+10FFFF (0xF4).
    if (FirstChar == 0xE0 and next < 0xA0) {
        return 0;
    }
    if (FirstChar == 0xF0 and next < 0x90) {
        return 0;
    }
    if (FirstChar == 0xF4 and next > 0x8F) {
        return 0;
    }

    u32 acc = FirstChar;
    for (;;) {
        acc = (acc << 6) + next;
        // Every trailing byte must be of the form 10xxxxxx.
        if (next < 0x80 or next > 0xBF) {
            return 0;
        }
        if (--remaining == 0) {
            break;
        }
        next = *source++;
    }

    return UniValid(acc - bias) ? source : 0;
}
// Tells whether c is a code point this validator accepts.
//
// Accepted: everything below 0xD800, and everything from 0xE000 through
// 0x10FFFF except 0xFFFE and 0xFFFF. Rejected: the surrogate range
// 0xD800..0xDFFF, 0xFFFE, 0xFFFF, and anything above 0x10FFFF. Because c is
// unsigned, a "negative" result from the caller's subtraction shows up as a
// huge value and lands in that last category.
bool UniValid( u32 c ) {
    if ( c >= 0xD800 ) {
        if ( c <= 0xDFFF or c > 0x0010FFFF ) {
            return false;
        }
        if ( c == 0xFFFE or c == 0xFFFF ) {
            return false;
        }
    }
    return true;
}