1

I am trying to create a Javascript function that would find all positions for a set of patterns inside a UTF-8 string. For example:

I have a string "detaj" (it's a transcription written with International Phonetic Alphabet symbols, so I need full UTF-8 support).

And I have an array of patterns: ["(?!dʒ)d", "(?!tʃ)t"] (each string is also UTF-8 encoded).

I need to find the position of each pattern and obtain the following array:

[0] => [0, "(?!dʒ)d"],
[1] => [2, "(?!tʃ)t"]

0 - is the position of the symbol "d", 2 - is the position of the symbol "t".

I started with this function: https://stackoverflow.com/a/3410557/2006215

// Collect the start offset of every case-insensitive occurrence of "le" in str.
var str = "I learned to play the Ukulele in Lebanon.";
var regex = /le/gi;
var result;
var indices = [];
// With the "g" flag each exec() call resumes after the previous match and
// yields null once nothing further is found, which ends the loop.
for (result = regex.exec(str); result; result = regex.exec(str)) {
    indices.push(result.index);
}

And I changed it to something like this:

/**
 * Finds every position at which any of the given sound patterns matches the
 * transcription.
 *
 * @param {string[]} sounds_regex_array - Regular-expression sources, one per sound.
 * @param {string} word_transcription - The string to search.
 * @returns {Array} Array of [index, patternSource] pairs, grouped by pattern in
 *     the order the patterns were given.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
    var allIndices = [];
    for (var i = 0; i < sounds_regex_array.length; i++) {
        var currentSoundRegex = sounds_regex_array[i];
        // Build the regex straight from the pattern source. JavaScript strings
        // are sequences of UTF-16 code units, so the pattern must NOT be UTF-8
        // encoded first: that would turn each non-ASCII symbol into several
        // byte-valued characters that can never match the unencoded transcription.
        var pattern = new RegExp(currentSoundRegex, "g");
        var indices = getIndicesOfRegex (pattern, word_transcription);
        for (var j = 0; j < indices.length; j++) {
            allIndices.push([indices[j], currentSoundRegex]);
        }
    }
    return allIndices;
}

/**
 * Returns the start index of every match of regex in str.
 *
 * @param {RegExp} regex - Pattern to search for. Its lastIndex is reset to 0
 *     before scanning and is advanced while scanning. Without the "g" (or "y")
 *     flag only the first match is reported.
 * @param {string} str - The string to search.
 * @returns {number[]} Match start offsets in ascending order.
 */
function getIndicesOfRegex (regex, str) {
    var result, indices = [];
    // Always scan from the beginning, even if the regex object was used before.
    regex.lastIndex = 0;
    while ((result = regex.exec(str)) !== null) {
        indices.push(result.index);
        // Without "g"/"y" exec() ignores lastIndex and would return the same
        // first match forever.
        if (!regex.global && !regex.sticky) {
            break;
        }
        // An empty match leaves lastIndex where it was; step past it so the
        // loop makes progress.
        if (result[0].length === 0) {
            regex.lastIndex++;
        }
    }
    return indices;
}

Does anybody have any ideas?

UPDATE: I take both the transcription and the regex patterns from a JSON file that I generate with PHP from UTF-8 strings. I am not sure what to call this encoding, but it's not UTF-8. In any case, it doesn't work with my JavaScript function.

// Sample question record. The \uXXXX escapes are ordinary characters once the
// literal is evaluated (e.g. "\u0254" is the single character "ɔ").
var questions = [
    {
        "word": "sorte",
        "word_transcription": "s\u0254\u0281t",
        "sounds_array": ["d", "t"],
        "sounds_regex_array": ["(?!d\u0292)d", "(?!t\u0283)t"]
    }
];
  • How are you representing UTF8 strings in JavaScript, as buffers? JS uses UTF16 for all strings. – Bergi May 22 '17 at 23:46

1 Answer

0

I found where the problem was.

The error was triggered because I tried to use a lookbehind assertion in a JavaScript regular expression, which is not supported.

The workaround for custom lookbehind functions is proposed here - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind

In the end, though, I just made my own modifications to the code, because the functions above require the XRegExp library, which is pretty heavy.

My solution:

/**
 * Returns the start index of every match of pattern in str, emulating the
 * negative lookbehinds "(?<!d)ʒ" and "(?<!t)ʃ": a "ʒ" directly preceded by "d"
 * and a "ʃ" directly preceded by "t" are skipped.
 *
 * @param {string} currentSoundRegex - Source of the pattern. Only the exact
 *     values "ʒ" and "ʃ" enable the preceding-character check.
 * @param {RegExp} pattern - Compiled pattern. Its lastIndex is reset to 0 before
 *     scanning. Without the "g" (or "y") flag only the first match is examined.
 * @param {string} str - The string to search.
 * @returns {number[]} Accepted match start offsets in ascending order.
 */
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
    var result, indices = [];
    // Character that must not directly precede a match; undefined = no check.
    var forbiddenBefore;
    if (currentSoundRegex === "ʒ") {
        forbiddenBefore = "d";
    } else if (currentSoundRegex === "ʃ") {
        forbiddenBefore = "t";
    }
    // Always scan from the beginning, even if the regex object was used before.
    pattern.lastIndex = 0;
    while ((result = pattern.exec(str)) !== null) {
        // charAt(-1) is "" for a match at index 0, so that case is never rejected.
        if (forbiddenBefore === undefined || str.charAt(result.index - 1) !== forbiddenBefore) {
            indices.push(result.index);
        }
        // Without "g"/"y" exec() ignores lastIndex and would return the same
        // first match forever.
        if (!pattern.global && !pattern.sticky) {
            break;
        }
        // An empty match leaves lastIndex where it was; step past it so the
        // loop makes progress.
        if (result[0].length === 0) {
            pattern.lastIndex++;
        }
    }
    return indices;
}

/**
 * Pairs every match position of each sound pattern with the sound it stands for.
 *
 * @param {string[]} sounds_array - Sound labels, parallel to sounds_regex_array.
 * @param {string[]} sounds_regex_array - Regular-expression sources, one per sound.
 * @param {string} word_transcription - The string to search.
 * @returns {Array} Array of [index, sound] pairs, grouped by pattern in the
 *     order the patterns were given.
 */
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
    var pairs = [];
    for (var k = 0; k < sounds_regex_array.length; k += 1) {
        var source = sounds_regex_array[k];
        // The two known lookbehind patterns are replaced by their bare symbol;
        // the bare symbol is also handed to getIndicesOfRegex, which is left to
        // deal with the preceding character.
        switch (source) {
            case "(?<!d)ʒ":
                source = "ʒ";
                break;
            case "(?<!t)ʃ":
                source = "ʃ";
                break;
        }
        var compiled = new RegExp(source, "g");
        var positions = getIndicesOfRegex(source, compiled, word_transcription);
        var sound = sounds_array[k];
        positions.forEach(function (position) {
            pairs.push([position, sound]);
        });
    }
    return pairs;
}