1

Why does the following code give a 'Range out of order in character class' error?

// Minimal reproduction: constructing the RegExp at the bottom of this snippet
// throws "Range out of order in character class".
var min_wordsafe_length = 1;
var max_length = 20;
var string = 'some-slug-like-string-with-!@£!%-special-chars-';

// Body of a character class, copied from Drupal's PREG_CLASS_UNICODE_WORD_BOUNDARY
// with the backslashes doubled so they survive the JavaScript string literals.
// NOTE: \x{h..h} is PCRE (PHP) code point syntax. A JavaScript RegExp built
// without the `u` flag only recognises \xHH (exactly two hex digits); in any
// other position \x matches a literal "x", so `\x{2F}` contributes the
// individual characters x, {, 2, F and } to the class.
var PREG_CLASS_UNICODE_WORD_BOUNDARY = [
  '\\x{0}-\\x{2F}\\x{3A}-\\x{40}\\x{5B}-\\x{60}\\x{7B}-\\x{A9}\\x{AB}-\\x{B1}\\x{B4}',
  '\\x{B6}-\\x{B8}\\x{BB}\\x{BF}\\x{D7}\\x{F7}\\x{2C2}-\\x{2C5}\\x{2D2}-\\x{2DF}',
  '\\x{2E5}-\\x{2EB}\\x{2ED}\\x{2EF}-\\x{2FF}\\x{375}\\x{37E}-\\x{385}\\x{387}\\x{3F6}',
  '\\x{482}\\x{55A}-\\x{55F}\\x{589}-\\x{58A}\\x{5BE}\\x{5C0}\\x{5C3}\\x{5C6}',
  '\\x{5F3}-\\x{60F}\\x{61B}-\\x{61F}\\x{66A}-\\x{66D}\\x{6D4}\\x{6DD}\\x{6E9}',
  '\\x{6FD}-\\x{6FE}\\x{700}-\\x{70F}\\x{7F6}-\\x{7F9}\\x{830}-\\x{83E}',
  '\\x{964}-\\x{965}\\x{970}\\x{9F2}-\\x{9F3}\\x{9FA}-\\x{9FB}\\x{AF1}\\x{B70}',
  '\\x{BF3}-\\x{BFA}\\x{C7F}\\x{CF1}-\\x{CF2}\\x{D79}\\x{DF4}\\x{E3F}\\x{E4F}',
  '\\x{E5A}-\\x{E5B}\\x{F01}-\\x{F17}\\x{F1A}-\\x{F1F}\\x{F34}\\x{F36}\\x{F38}',
  '\\x{F3A}-\\x{F3D}\\x{F85}\\x{FBE}-\\x{FC5}\\x{FC7}-\\x{FD8}\\x{104A}-\\x{104F}',
  '\\x{109E}-\\x{109F}\\x{10FB}\\x{1360}-\\x{1368}\\x{1390}-\\x{1399}\\x{1400}',
  '\\x{166D}-\\x{166E}\\x{1680}\\x{169B}-\\x{169C}\\x{16EB}-\\x{16ED}',
  '\\x{1735}-\\x{1736}\\x{17B4}-\\x{17B5}\\x{17D4}-\\x{17D6}\\x{17D8}-\\x{17DB}',
  '\\x{1800}-\\x{180A}\\x{180E}\\x{1940}-\\x{1945}\\x{19DE}-\\x{19FF}',
  '\\x{1A1E}-\\x{1A1F}\\x{1AA0}-\\x{1AA6}\\x{1AA8}-\\x{1AAD}\\x{1B5A}-\\x{1B6A}',
  '\\x{1B74}-\\x{1B7C}\\x{1C3B}-\\x{1C3F}\\x{1C7E}-\\x{1C7F}\\x{1CD3}\\x{1FBD}',
  '\\x{1FBF}-\\x{1FC1}\\x{1FCD}-\\x{1FCF}\\x{1FDD}-\\x{1FDF}\\x{1FED}-\\x{1FEF}',
  '\\x{1FFD}-\\x{206F}\\x{207A}-\\x{207E}\\x{208A}-\\x{208E}\\x{20A0}-\\x{20B8}',
  '\\x{2100}-\\x{2101}\\x{2103}-\\x{2106}\\x{2108}-\\x{2109}\\x{2114}',
  '\\x{2116}-\\x{2118}\\x{211E}-\\x{2123}\\x{2125}\\x{2127}\\x{2129}\\x{212E}',
  '\\x{213A}-\\x{213B}\\x{2140}-\\x{2144}\\x{214A}-\\x{214D}\\x{214F}',
  '\\x{2190}-\\x{244A}\\x{249C}-\\x{24E9}\\x{2500}-\\x{2775}\\x{2794}-\\x{2B59}',
  '\\x{2CE5}-\\x{2CEA}\\x{2CF9}-\\x{2CFC}\\x{2CFE}-\\x{2CFF}\\x{2E00}-\\x{2E2E}',
  '\\x{2E30}-\\x{3004}\\x{3008}-\\x{3020}\\x{3030}\\x{3036}-\\x{3037}',
  '\\x{303D}-\\x{303F}\\x{309B}-\\x{309C}\\x{30A0}\\x{30FB}\\x{3190}-\\x{3191}',
  '\\x{3196}-\\x{319F}\\x{31C0}-\\x{31E3}\\x{3200}-\\x{321E}\\x{322A}-\\x{3250}',
  '\\x{3260}-\\x{327F}\\x{328A}-\\x{32B0}\\x{32C0}-\\x{33FF}\\x{4DC0}-\\x{4DFF}',
  '\\x{A490}-\\x{A4C6}\\x{A4FE}-\\x{A4FF}\\x{A60D}-\\x{A60F}\\x{A673}\\x{A67E}',
  '\\x{A6F2}-\\x{A716}\\x{A720}-\\x{A721}\\x{A789}-\\x{A78A}\\x{A828}-\\x{A82B}',
  '\\x{A836}-\\x{A839}\\x{A874}-\\x{A877}\\x{A8CE}-\\x{A8CF}\\x{A8F8}-\\x{A8FA}',
  '\\x{A92E}-\\x{A92F}\\x{A95F}\\x{A9C1}-\\x{A9CD}\\x{A9DE}-\\x{A9DF}',
  '\\x{AA5C}-\\x{AA5F}\\x{AA77}-\\x{AA79}\\x{AADE}-\\x{AADF}\\x{ABEB}',
  '\\x{E000}-\\x{F8FF}\\x{FB29}\\x{FD3E}-\\x{FD3F}\\x{FDFC}-\\x{FDFD}',
  '\\x{FE10}-\\x{FE19}\\x{FE30}-\\x{FE6B}\\x{FEFF}-\\x{FF0F}\\x{FF1A}-\\x{FF20}',
  '\\x{FF3B}-\\x{FF40}\\x{FF5B}-\\x{FF65}\\x{FFE0}-\\x{FFFD}'].join('');

// Pattern shape: capture min..max arbitrary characters, then require one
// character from the class above. Because every \x is read as a literal "x",
// the class contains accidental ranges such as "}-x" (from `\x{0}-\x{2F}`);
// "}" (U+007D) sorts after "x" (U+0078), which is what makes this line throw.
new RegExp("^(.{" + min_wordsafe_length + ","+ max_length +"})[" + PREG_CLASS_UNICODE_WORD_BOUNDARY + "]");

http://jsfiddle.net/52zz0drz/

The error appears to be with the PREG_CLASS_UNICODE_WORD_BOUNDARY, however I can't work out where the range is out of order.

I'm trying to port the following line from Drupal's truncate_utf8 function to JavaScript:

// Find the last word boundary, if there is one within $min_wordsafe_length
// to $max_length characters. preg_match() is always greedy, so it will
// find the longest string possible.
// The trailing /u modifier puts PCRE into UTF-8 mode, which is what makes the
// \x{...} code point escapes in PREG_CLASS_UNICODE_WORD_BOUNDARY usable here.
// On a match, capture group 1 ($matches[1]) is the text before the boundary.
$found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']/u', $string, $matches);

The only changes I've made to Drupal's PREG_CLASS_UNICODE_WORD_BOUNDARY in the JavaScript are to double escape the \x to prevent the \ being lost before the regex, and to replace PHP's string concatenation with an array join. I haven't done anything to change the order of the ranges.

How can I fix this code? I need the JavaScript regex to match the behaviour of the PHP regex as closely as possible.

cweiske
  • 30,033
  • 14
  • 133
  • 194
tommarshall
  • 2,038
  • 5
  • 23
  • 36

1 Answer

3

JavaScript uses the syntax \uhhhh (exactly 4 hexadecimal digits) to specify a UTF-16 code unit in a regular expression. For characters in the BMP, except for the range D800-DFFF, the code point of the character maps directly to one UTF-16 code unit with the same value as the code point. This is exactly the case here, so we don't need to deal with surrogates and their quirks.

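JavaScript regular expressions have no \x{h...hh} syntax: \x must be followed by exactly two hexadecimal digits, otherwise it simply matches a literal x. The character class therefore ends up containing unintended ranges such as }-x (from \x{0}-\x{2F}), and since } comes after x, that range is out of order, hence the error. In this case, just replace the \x{h...hh} syntax from PCRE (in PHP) with the \uhhhh syntax: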

// Body of a RegExp character class (without the surrounding [ ]) listing the
// word-boundary characters as \uhhhh escapes and \uhhhh-\uhhhh ranges.
// String.raw keeps each backslash literal, so the escapes reach the RegExp
// constructor untouched without having to double every backslash.
// Usage: new RegExp('[' + PREG_CLASS_UNICODE_WORD_BOUNDARY + ']')
var PREG_CLASS_UNICODE_WORD_BOUNDARY =
    String.raw`\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u00A9\u00AB-\u00B1\u00B4` +
    String.raw`\u00B6-\u00B8\u00BB\u00BF\u00D7\u00F7\u02C2-\u02C5\u02D2-\u02DF` +
    String.raw`\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u037E-\u0385\u0387\u03F6` +
    String.raw`\u0482\u055A-\u055F\u0589-\u058A\u05BE\u05C0\u05C3\u05C6` +
    String.raw`\u05F3-\u060F\u061B-\u061F\u066A-\u066D\u06D4\u06DD\u06E9` +
    String.raw`\u06FD-\u06FE\u0700-\u070F\u07F6-\u07F9\u0830-\u083E` +
    String.raw`\u0964-\u0965\u0970\u09F2-\u09F3\u09FA-\u09FB\u0AF1\u0B70` +
    String.raw`\u0BF3-\u0BFA\u0C7F\u0CF1-\u0CF2\u0D79\u0DF4\u0E3F\u0E4F` +
    String.raw`\u0E5A-\u0E5B\u0F01-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38` +
    String.raw`\u0F3A-\u0F3D\u0F85\u0FBE-\u0FC5\u0FC7-\u0FD8\u104A-\u104F` +
    String.raw`\u109E-\u109F\u10FB\u1360-\u1368\u1390-\u1399\u1400` +
    String.raw`\u166D-\u166E\u1680\u169B-\u169C\u16EB-\u16ED` +
    String.raw`\u1735-\u1736\u17B4-\u17B5\u17D4-\u17D6\u17D8-\u17DB` +
    String.raw`\u1800-\u180A\u180E\u1940-\u1945\u19DE-\u19FF` +
    String.raw`\u1A1E-\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B6A` +
    String.raw`\u1B74-\u1B7C\u1C3B-\u1C3F\u1C7E-\u1C7F\u1CD3\u1FBD` +
    String.raw`\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF` +
    String.raw`\u1FFD-\u206F\u207A-\u207E\u208A-\u208E\u20A0-\u20B8` +
    String.raw`\u2100-\u2101\u2103-\u2106\u2108-\u2109\u2114` +
    String.raw`\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E` +
    String.raw`\u213A-\u213B\u2140-\u2144\u214A-\u214D\u214F` +
    String.raw`\u2190-\u244A\u249C-\u24E9\u2500-\u2775\u2794-\u2B59` +
    String.raw`\u2CE5-\u2CEA\u2CF9-\u2CFC\u2CFE-\u2CFF\u2E00-\u2E2E` +
    String.raw`\u2E30-\u3004\u3008-\u3020\u3030\u3036-\u3037` +
    String.raw`\u303D-\u303F\u309B-\u309C\u30A0\u30FB\u3190-\u3191` +
    String.raw`\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3250` +
    String.raw`\u3260-\u327F\u328A-\u32B0\u32C0-\u33FF\u4DC0-\u4DFF` +
    String.raw`\uA490-\uA4C6\uA4FE-\uA4FF\uA60D-\uA60F\uA673\uA67E` +
    String.raw`\uA6F2-\uA716\uA720-\uA721\uA789-\uA78A\uA828-\uA82B` +
    String.raw`\uA836-\uA839\uA874-\uA877\uA8CE-\uA8CF\uA8F8-\uA8FA` +
    String.raw`\uA92E-\uA92F\uA95F\uA9C1-\uA9CD\uA9DE-\uA9DF` +
    String.raw`\uAA5C-\uAA5F\uAA77-\uAA79\uAADE-\uAADF\uABEB` +
    String.raw`\uE000-\uF8FF\uFB29\uFD3E-\uFD3F\uFDFC-\uFDFD` +
    String.raw`\uFE10-\uFE19\uFE30-\uFE6B\uFEFF-\uFF0F\uFF1A-\uFF20` +
    String.raw`\uFF3B-\uFF40\uFF5B-\uFF65\uFFE0-\uFFFD`;
nhahtdh
  • 55,989
  • 15
  • 126
  • 162