4

I stumble on a strange behaviour while trying to replace diacritics and special character of a filename.

The function works correctly with a string, but when I try this exact same sequence with a filename, it doesn't

.

$(document).on('change', 'input[type=file]', function() {
  var files = this.files;
  for (var i = 0; i < files.length; i++) {
    (function(file) {
      // Assuming the file name is áñǽŦõş
      var _string = 'áñǽŦõş.jpg',
        _filename = file.name;
      $('.string .result').html(convertAscii(_string));
      $('.filename .result').html(convertAscii(_filename));
    })(files[i]);
  }
});

function convertAscii(str) {
  str = str.replace(/Ä/g, 'Ae');
  str = str.replace(/æ|ǽ|ä/g, 'ae');
  str = str.replace(/À|Á|Â|Ã|Å|Ǻ|Ā|Ă|Ą|Ǎ|А/g, 'A');
  str = str.replace(/à|á|â|ã|å|ǻ|ā|ă|ą|ǎ|ª|а/g, 'a');
  str = str.replace(/Б/g, 'B');
  str = str.replace(/б/g, 'b');
  str = str.replace(/Ç|Ć|Ĉ|Ċ|Č|Ц/g, 'C');
  str = str.replace(/ç|ć|ĉ|ċ|č|ц/g, 'c');
  str = str.replace(/Ð|Ď|Đ/g, 'Dj');
  str = str.replace(/ð|ď|đ/g, 'dj');
  str = str.replace(/Д/g, 'D');
  str = str.replace(/д/g, 'd');
  str = str.replace(/È|É|Ê|Ë|Ē|Ĕ|Ė|Ę|Ě|Е|Ё|Э/g, 'E');
  str = str.replace(/è|é|ê|ë|ē|ĕ|ė|ę|ě|е|ё|э/g, 'e');
  str = str.replace(/Ф/g, 'F');
  str = str.replace(/ƒ|ф/g, 'f');
  str = str.replace(/Ĝ|Ğ|Ġ|Ģ|Г/g, 'G');
  str = str.replace(/ĝ|ğ|ġ|ģ|г/g, 'g');
  str = str.replace(/Ĥ|Ħ|Х/g, 'H');
  str = str.replace(/ĥ|ħ|х/g, 'h');
  str = str.replace(/Ì|Í|Î|Ï|Ĩ|Ī|Ĭ|Ǐ|Į|İ|И/g, 'I');
  str = str.replace(/ì|í|î|ï|ĩ|ī|ĭ|ǐ|į|ı|и/g, 'i');
  str = str.replace(/Ĵ|Й/g, 'J');
  str = str.replace(/ĵ|й/g, 'j');
  str = str.replace(/Ķ|К/g, 'K');
  str = str.replace(/ķ|к/g, 'k');
  str = str.replace(/Ĺ|Ļ|Ľ|Ŀ|Ł|Л/g, 'L');
  str = str.replace(/ĺ|ļ|ľ|ŀ|ł|л/g, 'l');
  str = str.replace(/М/g, 'M');
  str = str.replace(/м/g, 'm');
  str = str.replace(/Ñ|Ń|Ņ|Ň|Н/g, 'N');
  str = str.replace(/ñ|ń|ņ|ň|ʼn|н/g, 'n');
  str = str.replace(/Ö/g, 'Oe');
  str = str.replace(/œ|ö/g, 'oe');
  str = str.replace(/Ò|Ó|Ô|Õ|Ō|Ŏ|Ǒ|Ő|Ơ|Ø|Ǿ|О/g, 'O');
  str = str.replace(/ò|ó|ô|õ|ō|ŏ|ǒ|ő|ơ|ø|ǿ|º|о/g, 'o');
  str = str.replace(/П/g, 'P');
  str = str.replace(/п/g, 'p');
  str = str.replace(/Ŕ|Ŗ|Ř|Р/g, 'R');
  str = str.replace(/ŕ|ŗ|ř|р/g, 'r');
  str = str.replace(/Ś|Ŝ|Ş|Ș|Š|С/g, 'S');
  str = str.replace(/ś|ŝ|ş|ș|š|ſ|с/g, 's');
  str = str.replace(/Ţ|Ț|Ť|Ŧ|Т/g, 'T');
  str = str.replace(/ţ|ț|ť|ŧ|т/g, 't');
  str = str.replace(/Ü/g, 'Ue');
  str = str.replace(/ü/g, 'ue');
  str = str.replace(/Ù|Ú|Û|Ũ|Ū|Ŭ|Ů|Ű|Ų|Ư|Ǔ|Ǖ|Ǘ|Ǚ|Ǜ|У/g, 'U');
  str = str.replace(/ù|ú|û|ũ|ū|ŭ|ů|ű|ų|ư|ǔ|ǖ|ǘ|ǚ|ǜ|у/g, 'u');
  str = str.replace(/В/g, 'V');
  str = str.replace(/в/g, 'v');
  str = str.replace(/Ý|Ÿ|Ŷ|Ы/g, 'Y');
  str = str.replace(/ý|ÿ|ŷ|ы/g, 'y');
  str = str.replace(/Ŵ/g, 'W');
  str = str.replace(/ŵ/g, 'w');
  str = str.replace(/Ź|Ż|Ž|З/g, 'Z');
  str = str.replace(/ź|ż|ž|з/g, 'z');
  str = str.replace(/Æ|Ǽ/g, 'AE');
  str = str.replace(/ß/g, 'ss');
  str = str.replace(/IJ/g, 'IJ');
  str = str.replace(/ij/g, 'ij');
  str = str.replace(/Œ/g, 'OE');
  str = str.replace(/Ч/g, 'Ch');
  str = str.replace(/ч/g, 'ch');
  str = str.replace(/Ю/g, 'Ju');
  str = str.replace(/ю/g, 'ju');
  str = str.replace(/Я/g, 'Ja');
  str = str.replace(/я/g, 'ja');
  str = str.replace(/Ш/g, 'Sh');
  str = str.replace(/ш/g, 'sh');
  str = str.replace(/Щ/g, 'Shch');
  str = str.replace(/щ/g, 'shch');
  str = str.replace(/Ж/g, 'Zh');
  str = str.replace(/ж/g, 'zh');
  return str;
}
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<input type="file" name="pic" accept="image/*">

<div>
  <div class="string">Converted string : <span class="result"></span></div>
  <div class="filename">Converted filename : <span class="result"></span></div>
</div>

I also made a fiddle showing the issue, you'll see what I mean when uploading a jpg named áñaéTõş.

I'm getting the file name from an input. The weird behaviour is this :

console.log(convertAscii(_string)); // Correct => anaeTos
console.log(convertAscii(_filename)); // Wrong => áñaéTõş

I believe it's because diacritics are interpreted as independent characters, but does someone have an idea of how to fix it?

mplungjan
  • 169,008
  • 28
  • 173
  • 236
  • I cannot reproduce this: the filename is correctly translated. Which browser are you running in, and which operating system? (I ran on FF, Windows) – trincot Aug 24 '17 at 14:22
  • I can reproduce it. Behaviour happens in Chrome. It's very odd - almost like the string is readonly – Rory McCrossan Aug 24 '17 at 14:23
  • I'm on macOS Sierra 10.12.4, it happens with FF / Chrome / Safari / Opera. – Sylvain Julé Aug 24 '17 at 14:27
  • 1
    It has something to do with different encodings (I believe). Check [your modified fiddle](https://jsfiddle.net/Lr7g32dy/1/) that simply print the URI encoded strings. You probably get the filename in Unicode and the string is UTF-8, or vice versa. – SamWhan Aug 24 '17 at 14:27
  • In actual fact it appears to me that Chrome OSX is actually making the first replacement then stopping: notice `ǽ`, has been replaced with `a` http://i.imgur.com/8kbYvUV.png. It's definitely some kind of encoding issue. – Rory McCrossan Aug 24 '17 at 14:36
  • It is indeed an encoding issue, both encoding differ. Is there a way to harmonize them ? – Sylvain Julé Aug 24 '17 at 15:01
  • 1
    Encoding, code points... I don't know the ins from the outs here, but maybe this guy points to what you need: https://stackoverflow.com/a/10805884/2064981 – SamWhan Aug 25 '17 at 06:44

1 Answers1

0

I am convinced your code does not work because of codeppoint problems. The characters in question may look like the specials chars you expect, but are ultimately not equal. Hence, the string replacement does not work

To solve this issue, as suggested by @ClasG, you can perform a Unicode normalization with an external library, since JavaScript doesn't have this functionality built in.

  • unorm is low-level lib that can do the normalization for you.
  • There are also more high-level libs like iconv lite, which have more dependencies.

It is also required to use Unicode characters in your code instead to make the comparisons with the normalized text work.

Here is my modified fiddle. I hope this solves the issue for you.

$(document).on('change', 'input[type=file]', function() {

  var files = this.files;

  for (var i = 0; i < files.length; i++) {
    (function(file) {
      // Assuming the file name is áñǽŦõş
      var _string = 'äöüß', // 'áñǽŦõş.jpg',
        _filename = file.name;

      $('.string .result').html(convertAscii(_string.normalize('NFC')));
      $('.filename .result').html(convertAscii(_filename.normalize('NFC')));

    })(files[i]);
  }

});

function convertAscii(str) {
  //convert German umlauts (normalized using nfc: Canonical Decomposition, followed by Canonical Composition) to Ascii
  tr = {
    "\u00e4": "ae",
    "\u00fc": "ue",
    "\u00f6": "oe",
    "\u00df": "ss"
  }
  str = str.replace(/[\u00e4|\u00fc|\u00f6|\u00df]/g, function($0) {
    return tr[$0]
  })
  //... add more..

  return str;
}
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script src='https://raw.githubusercontent.com/walling/unorm/master/lib/unorm.js'></script>
<input type="file" name="pic" accept="image/*">

</script>
<div>
  <div class="string">Converted string : <span class="result"></span></div>
  <div class="filename">Converted filename : <span class="result"></span></div>
</div>
mplungjan
  • 169,008
  • 28
  • 173
  • 236
wp78de
  • 18,207
  • 7
  • 43
  • 71