Kind of related: We had a web application that had to send data to a legacy system that could only deal with the first 128 characters of the ASCII character set.
The solution we had to use was something that would "translate" as many characters as possible into close-matching ASCII equivalents, but leave anything that could not be translated alone.
Normally I would do something like this:
<?php
// Transliterate UTF-8 text to US-ASCII, but only when the iconv extension
// is actually available; otherwise $text is left exactly as it was.
$iconvAvailable = function_exists('iconv');
if ($iconvAvailable) {
    $text = iconv('utf-8', 'us-ascii//TRANSLIT', $text);
}
?>
... but that replaces every character that can't be transliterated with a question mark (?).
So we ended up doing the following. At the end of the function there is a (commented-out) PHP regex that simply strips out any remaining non-ASCII characters.
<?php
/**
 * Converts a UTF-8 string into a close ASCII approximation.
 *
 * Processing order:
 *  1. Named HTML entities are decoded, markup is removed with strip_tags(),
 *     and any entities still present are decoded with html_entity_decode().
 *  2. Accented Latin, Greek and Cyrillic letters, typographic punctuation and
 *     assorted symbols are replaced with look-alike ASCII text.
 *
 * Markup and entities are handled BEFORE transliteration on purpose:
 *  - characters produced by entity decoding (e.g. "&eacute;") must still be
 *    transliterated, otherwise they leak into the result as non-ASCII;
 *  - transliteration emits ASCII "<" and ">" (from "≤", "‹", ...), which
 *    strip_tags() would otherwise treat as the start of a tag and delete
 *    together with the text that follows.
 *
 * Characters that have no mapping are left untouched (they are NOT replaced
 * with "?" and NOT removed).
 *
 * @param string $orig_text UTF-8 text, possibly containing HTML tags/entities.
 *
 * @return string The text with every mapped character replaced by ASCII.
 */
public function cleanNonAsciiCharactersInString($orig_text) {
    $text = (string) $orig_text;

    // 1) Decode named entities (entity => character), drop tags, then decode
    //    whatever is left (numeric entities, quotes).
    $entities = get_html_translation_table(HTML_ENTITIES);
    ksort($entities);
    $text = str_replace(array_values($entities), array_keys($entities), $text);
    $text = strip_tags($text);
    $text = html_entity_decode($text);

    // 2a) Character classes: every character of a class maps to the same ASCII text.
    $classMap = array(
        // Single letters
        '/[∂άαáàâãªä]/u' => 'a',
        '/[∆лДΛдАÁÀÂÃÄ]/u' => 'A',
        '/[ЂЪЬБъь]/u' => 'b',
        '/[βвВ]/u' => 'B',
        '/[çς©с]/u' => 'c',
        '/[ÇС]/u' => 'C',
        '/[δ]/u' => 'd',
        '/[éèêëέëèεе℮ёєэЭ]/u' => 'e',
        '/[ÉÈÊË€ξЄ€Е∑]/u' => 'E',
        '/[₣]/u' => 'F',
        '/[НнЊњ]/u' => 'H',
        '/[ђћЋ]/u' => 'h',
        '/[ÍÌÎÏ]/u' => 'I',
        '/[íìîïιίϊі]/u' => 'i',
        '/[Јј]/u' => 'j',
        '/[ΚЌК]/u' => 'K',
        '/[ќк]/u' => 'k',
        '/[ℓ∟]/u' => 'l',
        '/[Мм]/u' => 'M',
        '/[ñηήηπⁿ]/u' => 'n',
        '/[Ñ∏пПИЙийΝЛ]/u' => 'N',
        '/[óòôõºöοФσόо]/u' => 'o',
        '/[ÓÒÔÕÖθΩθОΩ]/u' => 'O',
        '/[ρφрРф]/u' => 'p',
        '/[®яЯ]/u' => 'R',
        '/[ГЃгѓ]/u' => 'r',
        '/[Ѕ]/u' => 'S',
        '/[ѕ]/u' => 's',
        '/[Тт]/u' => 'T',
        '/[τ†‡]/u' => 't',
        '/[úùûüџμΰµυϋύ]/u' => 'u',
        '/[√]/u' => 'v',
        '/[ÚÙÛÜЏЦц]/u' => 'U',
        '/[Ψψωώẅẃẁщш]/u' => 'w',
        '/[ẀẄẂШЩ]/u' => 'W',
        '/[ΧχЖХж]/u' => 'x',
        '/[ỲΫ¥]/u' => 'Y',
        '/[ỳγўЎУуч]/u' => 'y',
        '/[ζ]/u' => 'Z',
        // Punctuation
        '/[‚‚]/u' => ',',
        '/[`‛′’‘]/u' => "'",
        '/[″“”«»„]/u' => '"',
        '/[—–―−–‾⌐─↔→←]/u' => '-',
        // Non-breaking space, written as an escape so it cannot be mistaken
        // for (or silently turned into) a plain space.
        '/\x{00A0}/u' => ' ',
        '/[‗≈≡]/u' => '=',
        // Exciting combinations
        '/[ыЫ]/u' => 'bl',
        '/[∙•]/u' => '*',
        '/[Љљ]/u' => 'Ab',
        '/[Юю]/u' => 'IO',
        // Latin ligatures, written as escapes: a literal "fi"/"fl" here would
        // be a class of plain ASCII letters and corrupt ordinary text.
        '/\x{FB01}/u' => 'fi',
        '/\x{FB02}/u' => 'fl',
        '/[зЗ]/u' => '3',
        '/[↨↕↓↑│]/u' => '|',
        // Symbols with no sensible ASCII stand-in are dropped.
        '/[∞∩∫⌂⌠⌡]/u' => '',
    );
    $replaced = preg_replace(array_keys($classMap), array_values($classMap), $text);
    if ($replaced !== null) {
        // preg_replace() yields null on a PCRE error (e.g. malformed UTF-8);
        // in that case keep the text instead of silently emptying it.
        $text = $replaced;
    }

    // 2b) Single characters that expand to their own ASCII text.
    $literalMap = array(
        '…' => '...',
        '≠' => '!=',
        '≤' => '<=',
        '≥' => '>=',
        '℅' => 'c/o',
        '₧' => 'Pts',
        '™' => 'tm',
        '№' => 'No',
        'Ч' => '4',
        '‰' => '%',
        '‹' => '<',
        '›' => '>',
        '‼' => '!!',
        '⁄' => '/',
        '∕' => '/',
        '⅞' => '7/8',
        '⅝' => '5/8',
        '⅜' => '3/8',
        '⅛' => '1/8',
        '£' => '(pounds)',
        '₤' => '(lira)',
        // CP1252 leftovers not covered by the classes above
        'ƒ' => 'f',  // Latin Small Letter F With Hook
        '˜' => '-',  // Small Tilde
        'Š' => 'S',  // Latin Capital Letter S With Caron
        'š' => 's',  // Latin Small Letter S With Caron
        'Œ' => 'OE', // Latin Capital Ligature OE
        'œ' => 'oe', // Latin Small Ligature OE
        'ˆ' => "'",  // Modifier Letter Circumflex Accent
        'Ÿ' => 'Y',  // Latin Capital Letter Y With Diaeresis
    );
    $text = strtr($text, $literalMap);

    // Optional stricter post-processing, left disabled so that characters
    // without a mapping survive unchanged:
    // transliterate
    // if (function_exists('iconv')) {
    //     $text = iconv('utf-8', 'us-ascii//TRANSLIT', $text);
    // }
    // remove non ascii characters
    // $text = preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $text);
    return $text;
}
?>