0

To combine one or more regex patterns in JavaScript I'm using the following function:

Tokenizer.prototype.combinePatterns = function() {
  return new RegExp('(' + [].slice.call(arguments).map(function (e) {
      var e = e.toString()
      return '(?:' + e.substring(1, e.length - 1) + ')'
  }).join('|') + ')', "gi")
};

This works fine. Now I want to "protect" some patterns, that means I want to exclude some patterns when executing the resulting regex. This means that I would like that the default_pattern not to be applied to any of the patterns defined in the protected_patterns array (this concept is taken from MOSES Tokenizer protected patterns option).

These protected patterns may or not be defined in the default patterns:

AggressiveTokenizer.prototype.tokenize = function(text, params = {}) {
    var options = {
        default_pattern: /[^a-z0-9äâàéèëêïîöôùüûœç]+/,
        protected_patterns: []
    };
    for (var attr in params) options[attr] = params[attr];
    var patterns = [].concat(options.protected_patterns).concat(options.default_pattern);
    // LP: pass along all regex patterns as argument
    patterns = this.combinePatterns.apply(this,patterns);
    // break a string up into an array of tokens by anything non-word
    return this.trim(text.split(patterns));

};

Following this approach, assumed to protect some pattern like

[ '\bla([- ]?la)+\b']

I get this combined regex from the result of combinePatterns method:

/((?:la([- ]?la)+)|(?:[^a-z0-9äâàéèëêïîöôùüûœç]+))/gi

The result is not as expected, so by example in the (french) text having salut comment allez-vous la-la-la, while I get the desidered la-la-la token as a whole, I'm getting undefined tokens, and a la- as well:

var combinePatterns = function() {
  return new RegExp('(' + [].slice.call(arguments).map(function(e) {
    var e = e.toString()
    return '(?:' + e.substring(1, e.length - 1) + ')'
  }).join('|') + ')', "gi")
};

var tokenize = function(text, params = {}) {
  var options = {
    default_pattern: /[^a-z0-9äâàéèëêïîöôùüûœç]+/,
    protected_patterns: []
  };
  for (var attr in params) options[attr] = params[attr];
  var patterns = [].concat(options.protected_patterns).concat(options.default_pattern);
  // LP: pass along all regex patterns as argument
  patterns = this.combinePatterns.apply(this, patterns);
  // break a string up into an array of tokens by anything non-word
  return text.trim().split(patterns);

}

var text = "salut comment allez-vous la-la-la";
var res = tokenize(text, {
  protected_patterns: ['\bla([- ]?la)+\b']
})
console.log(res)

My expected result should be

[
  "salut",
  "comment"
  "allez"
  "vous"
  "la-la-la"
]

What is wrong: the protected patterns combination approach or the regex in the protected_patterns array?

Tip: I have noticed that, the combinePatterns if applied only to the default_pattern generated this regex

return this.trim(text.split(/((?:[^a-z0-9äâàéèëêïîöôùüûœç]+))/gi));

that slightly changes the resulting tokens of the default pattern:

return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç]+/i));
loretoparisi
  • 15,724
  • 11
  • 102
  • 146
  • 1
    > to exclude some patterns to the default patterns This line is not clear. Do you mean "to replace each protected pattern with the default pattern"? – Alexander Mashin Sep 30 '20 at 09:15
  • @AlexanderMashin sorry, I mean literally avoid that the `default_pattern` would be applied to the each pattern in the `excluded_patterns` array. This concept is taken from MOSES tokenizer protected patterns - https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl#L82 – loretoparisi Sep 30 '20 at 09:18
  • You expect your result to include "la-la-la", but you effectively made that string a separator. Did you mean that "la-la-la" must not be split into "la"s? Oh, yes that is obvious from your previous comment. Then your approach, ORing the "indivisible" patterns with the default separators, is fundamentally wrong. – Alexander Mashin Sep 30 '20 at 09:41
  • @AlexanderMashin I would expect that `la-la-la` would be kept as a whole string. This works if I do `"salut comment allez-vous la-la-la".match(/\bla([- ]?la)+\b/g);`. The question here is about combining patterns and "protected" patterns in the right way. – loretoparisi Sep 30 '20 at 09:45

1 Answers1

0

let tokenize = function( str, preserve ) {
    const separators = /[^a-z0-9äâàéèëêïîöôùüûœç]+/i;
    // Convert the array of preserved patterns into one RegExp:
    const screen = new RegExp( '(?:' + preserve.map( s => '(?:' + s + ')' ).join( '|' ) + ')' );
    // Screening delimiter. Must not be split by separators:
    const scr = 'SSS';
    // Regular expression for unscreening:
    const scr_regex = new RegExp( scr + '(\\d)' + scr );
    // Initialise the array of preserved substrings:
    let screened = [];
    return str
    // Temporarily screen preserved patterns:
    .replace( screen, s => scr + (screened.push( s ) - 1) + scr )
    // Split into tokens:
    .split( separators )
    // Restore screened substrings in each token:
    .map( s => s.replace( scr_regex, (_, i) => screened[parseInt( i )] ) );
};
let text = 'salut comment allez-vous la-la-la';
let res = tokenize( text, ['\\bla(?:[- ]?la)+\\b'] );
console.log( res );
Alexander Mashin
  • 3,892
  • 1
  • 9
  • 15