Javascript word boundary regex with "\w" for words with double byte characters

Question

I do match of words in a text to retrieve the word offset begin and end. This normally works for both ascii and unicode texts when using an appropriate unicode-aware regex like '(?<=^|\\PL)$1(?=\\PL|$)'. When I have mixed text (like Korean and English here) there are some problems, while tokenizing:

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ");
  // commas if followed by space
  text = text.replace(/(,\s)/g, " $1");
  // single quotes if followed by a space
  text = text.replace(/('\s)/g, " $1");
  // single quotes if last char
  text = text.replace(/('$)/, " $1");
  text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
  // periods before newline or end of string
  text = text.replace(/\. *(\n|$)/g, " . ");
  // replace punct
  // ignore "-" since may be in slang scream
  text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
  text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
  // finally split remainings into words
  text = text.split(/\s+/)
  return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
  let item = {
    "word": token
  }
  var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
  var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  var wordRegex = new XRegExp(pattern.replace('$1', escaped), "g");

  // calculate token begin end 
  var match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    if (match.index > (seen.get(token) || -1)) {
      var wordStart = match.index;
      var wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
  return item;
});
indexes.forEach(index => {
  if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
    console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  } else {
    console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  }
});

<script src="https://unpkg.com/xregexp/xregexp-all.js"></script>

The problem is that I do some cleanup in tokenization like

text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ");

where \w is not unicode-aware, but if I replace it with \p{Alnum}:

text = text.replace(/([^\p{Alnum}\.\'\-\/\+\<\>,&])/g, " $1 ");

that it should be the equivalent for Unicode word, it does not work properly.

NOTE Please note that I do use XRegExp to support Unicode regex in JavaScript.

UPDATE According to the comments below, I have updated the code with the modified pattern regexp '(?<=^|\\PL)$1(?=\\PL|$)' by – Wiktor Stribiżew and replaced XRegExp with built-in RegExp, due to missing support for varied width lookbehind patterns (see comments). This solution works better, but I have identified an additional case where the char offset begin and end cannot be matched for the given input text: "점점 더 깊이 끌려가" the output will have a missing offset / match for

{
    "index": 2,
    "word": "점"
}

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/[^\w\.\-\/\+\<\>,&]/g, " $& ");
  // commas if followed by space
  text = text.replace(/(,\s)/g, " $1");
  // single quotes if followed by a space
  text = text.replace(/('\s)/g, " $1");
  // single quotes if last char
  text = text.replace(/('$)/, " $1");
  text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
  // periods before newline or end of string
  text = text.replace(/\. *(\n|$)/g, " . ");
  // replace punct
  // ignore "-" since may be in slang scream
  text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
  text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
  // finally split remainings into words
  text = text.split(/\s+/)
  return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
  let item = {
    "word": token
  }
  var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
  var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  var wordRegex = new RegExp(pattern.replace('$1', escaped), "g");

  // calculate token begin end 
  var match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    if (match.index > (seen.get(token) || -1)) {
      var wordStart = match.index;
      var wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
  return item;
});
indexes.forEach(index => {
  if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
    console.log("MISSING INDEXES " + index.word);
  } else if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
    console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  } else {
    console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  }
});

The Unicode aware `\w` is `[\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}]`, see [this post](https://stackoverflow.com/a/62772689/3832970). — Wiktor Stribiżew, Oct 07 '20 at 11:23
BTW, you only need XRegExp if you want to support all possible JS environments, the majority of them already support ECMAScript 2018+ standard and support Unicode property classes. — Wiktor Stribiżew, Oct 07 '20 at 11:25
If we put aside your "trick" with `$1` in the pattern (you might use a mere string concatenation rather than introducing another regex operation), if you need to support the word boundary between a non-letter+non-mark, you need to replace `var pattern = '(?<=^|\\PL)$1(?=\\PL|$)'` with `var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)'`. However, XRegExp does not seem to support varied width lookbehind patterns. — Wiktor Stribiżew, Oct 07 '20 at 11:29
@WiktorStribiżew thanks. Regarding `XRegExp` I'm using node `v12.16.1` and without that regex will fail. — loretoparisi, Oct 07 '20 at 13:04
@WiktorStribiżew so if I replace in the tokenizer the `\w` with `text = text.replace(/([^[\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}]\.\'\-\/\+\<\>,&])/g, " $1 "); ` I still have wrong tokens. Not sure if it is correct my change btw. — loretoparisi, Oct 07 '20 at 13:12
You must use `/u` flag in native JS. `/[^[\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}].'\/+<>,&-]/gu`. Do not wrap with a capturing group, you can use a `$&` backreference to the whole match. Replacement is `" $& "` — Wiktor Stribiżew, Oct 07 '20 at 13:17
@WiktorStribiżew In fact your note is correct. Using `RegExp` with `'(?<!\\pL\\pM*)$1(?!\\pL)'` plus the `/gu` modifier and the `\w` replacement `[^[\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}].'\/+<>,&-]` will work! Btw I have opened an issue here about `XRegExp` because it was supposed to work I think - https://github.com/slevithan/xregexp/issues/304 — loretoparisi, Oct 07 '20 at 13:33
`(?<!\\pL\\pM*)` is a negative lookbehind and the `*` inside it might be a problem for XRegExp. I am curious to hear from XRegExp team, too. There is a work around for such cases, but it will require some more code and changing the regex to `'(\\pL\\pM*)?$1(?!\\pL)'` — Wiktor Stribiżew, Oct 07 '20 at 13:38
@WiktorStribiżew I have found a very strange case using the update regex `'(?<=^|\\PL)$1(?=\\PL|$)'` with `RegExp`. Look at the updated question! — loretoparisi, Oct 08 '20 at 15:09
Not sure what your code is doing, but you did not follow me.You need `var pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';` and `var wordRegex = new RegExp(pattern.replace('$1', escaped), "gu");` — Wiktor Stribiżew, Oct 08 '20 at 22:33
@WiktorStribiżew, thanks I want to tokenize those double byte words as it would work for ascii chars, with the same regex (supporting both ascii and unicode). If I put your last regex with `gu` you will get an errore. This is the code to try all the `pattern`: https://gist.github.com/loretoparisi/f34a956a29091c3ff62507fda8d7efba — loretoparisi, Oct 15 '20 at 10:30
You are not using the regex right, it must be `pattern = '(?<!\\p{L}\\p{M}*)$1(?!\\p{L})';` and then `var wordRegex = new RegExp(pattern.replace('$1', escaped), "gu");`. Sorry, your logic is too complex for me to understand right now, I am rather busy. — Wiktor Stribiżew, Oct 15 '20 at 11:54
Just to point out that someone at `XRegExp` is looking at this issue! https://github.com/slevithan/xregexp/issues/304#issuecomment-718835263 — loretoparisi, Oct 29 '20 at 16:43
I have tried to check it, but the problem is not much with the regex, but with the logic here. — Wiktor Stribiżew, Oct 29 '20 at 16:45

Javascript word boundary regex with "\w" for words with double byte characters

0 Answers0