3

I would like to split a string using a regular expression and have the separators / matched info included in the resulting array.

In java I used:

theString.split("(?<=[!><=}{])|(?=[!><=}{])|(?<= AND )|(?= AND )|(?<= OR )|(?= OR )")

But JavaScript doesn't support lookbehind (`?<=`).

For example I want string:

"Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes"

To split:

Reason,=,{,Existing problem, or ,fault,},{,Bestaande probleem of vout,},{,Other,},{,Ander,}, and ,Required,!,=,No, and ,Results,>,=,10, and ,Results,<,=,25, and ,Tst,>,5, and ,Tst,<,80, and ,Info,=,test this, or ,that, and ,those, and ,Success,!,=,Yes

Example of what I've got:

// Demo script from the question: three attempts at splitting `thestr` on the
// operator characters and on the AND/OR keywords, each written to the page.
var thestr = "Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes";

// Attempt 1: split on a pattern with three capturing groups.
document.write("::SPLIT::<br>");
// NOTE(review): there is a literal space between "( OR )" and the closing "/",
// so the last alternative only matches " OR " followed by one more space - confirm
// whether that is intended.
var patt1=new RegExp(/([!><=}{])|( AND )|( OR ) /gi);

var x = thestr.split(patt1);
// Author's observation: this splits correctly, but the result does not include
// the separators / matched characters.
document.write("length="+x.length+"<br>");
// `c` is assigned without a declaration, so it is not local to this snippet.
for (c=0;c<x.length;c++) {
    document.write(c+" - "+ x[c]+" |");
}

// Attempt 2: list the separators themselves with match() on the same pattern.
document.write("<br><br>::MATCH::<br>");

var y = thestr.match(patt1);

// Author's observation: this shows the matched characters; the open question is
// how to combine the output of split() and match().
document.write("length="+y.length+"<br>");
// `d` is assigned without a declaration, like `c` above.
for (d=0;d<y.length;d++) {
    document.write(d+" - "+ y[d]+" |");
}

// Attempt 3: split on zero-width lookaheads so that nothing is consumed.
document.write("<br><br>::INCLUDE SEPERATORS::<br>");
// The negative lookaheads (?!...) are zero-width and succeed at almost every
// position, which is consistent with the result described below.
// NOTE(review): same stray space before the closing "/" as in patt1.
var patt2=new RegExp(/(?![!><=}{])|(?=[!><=}{])|(?! AND )|(?= AND )|(?! OR )|(?= OR ) /gi);
// Author's observation: this puts everything in the array, but each character
// is a separate array element. Not what I wanted to achieve.
var bits = thestr.split(patt2);
document.write("length="+bits.length+"<br>");
// `r` is assigned without a declaration, like `c` and `d` above.
for (r=0;r<bits.length;r++) {
    document.write(r+" - "+ bits[r]+" |");
}
BenMorel
  • 34,448
  • 50
  • 182
  • 322
Cazz
  • 31
  • 1
  • 2
  • So you basically want to split on ` or `, ` and `, and basically between any two characters except between alphanumeric characters or spaces? – Tim Pietzcker Nov 17 '10 at 12:35

6 Answers6

5

If you put the whole pattern in a group, you will also get the separators:

thestr.split(/([!><=}{]| (?:AND|OR) )/)

This returns an array like:

["Reason", "=", "", "{", "Existing problem or fault", "}", "", "{", "Bestaande probleem of vout", "}", "", "{", "Other", "}", "", "{", "Ander", "}", " and Required", "!", "", "=", "No and Results ", ">", "", "=", "10 and Results ", "<", "", "=", "25 and Tst", ">", "5 and Tst", "<", "80 and Info", "=", "test this or that and those and Success", "!", "", "=", "Yes"]

Then you just need to filter the empty strings and you’re done:

thestr.split(/([!><=}{]| (?:AND|OR) )/).filter(Boolean)

Edit    Since Internet Explorer and possibly other browsers do not take a grouped separator into the result array, you could do this instead:

// Work-around for engines whose split() drops captured separators:
// split() yields the text between separators, match() yields the separators,
// and the loop interleaves both into `matches` in their original order.
// Adjacent separators produce empty strings between them, exactly like the
// grouped split() does.
var pieces = thestr.split(/(?:[!><=}{]| (?:AND|OR) )/),
    // match() returns null when nothing matches; fall back to an empty list
    separators = thestr.match(/(?:[!><=}{]| (?:AND|OR) )/g) || [],
    matches = [pieces[0]];
for (var i=0; i<separators.length; ++i) {
    // separator i sits between piece i and piece i+1
    matches.push(separators[i], pieces[i+1]);
}

This basically separates the separators from the other parts and then combines both.

Gumbo
  • 643,351
  • 109
  • 780
  • 844
  • Unfortunately (for all of us), Internet Explorer supports neither saving captured groups from *split()* nor the *filter()* method, so this isn't an out-of-the-box cross-browser solution. – Andy E Nov 17 '10 at 12:58
  • 1
    +1 for your edit. There's also Steven Levithan's [cross browser split](http://blog.stevenlevithan.com/archives/cross-browser-split) function which follows the specification a little better. – Andy E Nov 17 '10 at 13:32
2

Without getting too deep into your query structure, I would suggest using the replace method with a function as the replacement, which collects the terms into an array:

/**
 * Template for collecting the terms of a query string by using
 * String.prototype.replace purely as a "for each match" loop.
 *
 * The pattern below is a placeholder: replace it with a regular expression
 * whose capturing groups match the terms of interest, and extend the callback
 * with one `if` per group.
 *
 * @param {string} sQuery - the text to scan
 * @returns {Array} the collected terms, in match order
 */
function parse(sQuery) {
    var aParsed = [];
    // Placeholder pattern - define the real terms here.
    // Until it contains capturing groups, $1 and $2 below receive the match
    // offset and the whole input string rather than captured text.
    var oReTerms = /.../gim;
    sQuery.replace(oReTerms, function($0, $1, $2 /*, $3, ... */) {
        //...
        if ($1) {
            aParsed.push($1);
        }
        if ($2) {
            aParsed.push($2);
        }
        //...
        // The result of replace() is discarded, so any string may be returned
        return $0;
    });
    return aParsed;
}

I did this previously to parse HTML tags and attributes. I hope the idea is clear. You just need to define your regular expression so that it matches all terms in the query.

And you can have another replacing within the replacement function for specific cases.

Aidas Bendoraitis
  • 3,965
  • 1
  • 30
  • 45
  • More information on this technique can be found on John Resig's blog - http://ejohn.org/blog/search-and-dont-replace/. – Andy E Nov 17 '10 at 13:34
1

Gumbo's split function above is a good idea but it doesn't work. It should be:

/**
 * Splits `str` on `regex` and keeps the separators in the result, so the
 * returned array alternates text, separator, text, ...
 *
 * @param {string} str - the text to split
 * @param {RegExp} regex - separator pattern; global or not. It should not
 *     contain capturing groups, because split() would then insert the captured
 *     text into the pieces and break the alternation.
 * @returns {Array<string>} pieces and separators in their original order
 */
function split(str, regex) {
    // match() only lists every separator for a global regex; for a non-global
    // one it returns just the first match, which would drop trailing pieces.
    var allSeparators = regex;
    if (!regex.global) {
        var flags = typeof regex.flags === 'string'
            ? regex.flags
            : (regex.ignoreCase ? 'i' : '') + (regex.multiline ? 'm' : '');
        allSeparators = new RegExp(regex.source, flags + 'g');
    }
    var matches    = str.split(regex),
        separators = str.match(allSeparators),
        ret        = [ matches[0] ];
    // No separator found: the whole string is the only piece
    if (!separators) return ret;
    for (var i = 0; i < separators.length; ++i) {
        // separator i sits between piece i and piece i + 1
        ret[2 * i + 1] = separators[i];
        ret[2 * i + 2] = matches[i + 1];
    }
    return ret;
}

split('a,b,c', /,/g); // returns ["a", ",", "b", ",", "c"]
blah
  • 11
  • 1
1

I'm not sure about how JavaScript behaves if a regex split contains a capturing group. I know that in Python, a splitting delimiter becomes part of the match if it is enclosed in capturing parentheses.

Try

// Splits `subject` (expected to be defined by the caller) on " or ", " and "
// and punctuation, case-insensitively. The capturing groups are there in the
// hope that split() returns the delimiters too - an experiment, as the
// surrounding text says, not a verified solution.
result = subject.split(/( or )|( and )|([^\w\s])\b|(?=[^\w\s])/i);

and see what happens.

Tim Pietzcker
  • 328,213
  • 58
  • 503
  • 561
1
/**
 * Splits `str` on the matches of `re`, keeping each separator as its own
 * element of the result.
 *
 * Empty text between two adjacent separators (or before the first one) is
 * skipped; the remainder after the last separator is always appended, even
 * when it is empty.
 *
 * @param {string} str - the text to split
 * @param {RegExp} re - separator pattern; global or not. A global regex has
 *     its lastIndex reset before scanning.
 * @returns {Array<string>} text pieces and separators in their original order
 */
function split2(str, re) {
    var scanner = re;
    if (re.global) {
        // Reset to start of string
        re.lastIndex = 0;
    } else {
        // exec() on a non-global regex restarts at index 0 on every call, so
        // the loop below would never end; scan with a global copy instead.
        var flags = typeof re.flags === 'string'
            ? re.flags
            : (re.ignoreCase ? 'i' : '') + (re.multiline ? 'm' : '');
        scanner = new RegExp(re.source, flags + 'g');
    }
    var result = [];
    var lastEnd = 0;
    var match = scanner.exec(str);
    while (match !== null) {
        if (match.index > lastEnd) {
            result.push(str.substring(lastEnd, match.index));
        }
        result.push(match[0]);
        lastEnd = match.index + match[0].length;
        if (match[0].length === 0) {
            // A zero-length match does not advance lastIndex by itself;
            // step forward manually so the scan always terminates.
            scanner.lastIndex++;
        }
        match = scanner.exec(str);
    }
    result.push(str.substring(lastEnd));
    return result;
}

var thestr = "Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes";

var patt = /[!><=}{]| AND | OR /gi;

split2(thestr,patt):

Output:

["Reason", "=", "{", "Existing problem", " or ", "fault", "}", "{",
"Bestaande probleem of vout", "}", "{", "Other", "}", "{", "Ander", "}", " and ",
"Required", "!", "=", "No", " and ", "Results ", ">", "=", "10", " and ",
"Results ", "<", "=", "25", " and ", "Tst", ">", "5", " and ", "Tst", "<", "80",
" and ", "Info", "=", "test this", " or ", "that", " and ", "those", " and ",
"Success", "!", "=", "Yes"]
Markus Jarderot
  • 86,735
  • 21
  • 136
  • 138
0

To support most of the browsers in use, you can match the pieces of the string instead of splitting it.

This pattern matches either a run of one or more characters that are not separators (<>!{}=) or a single separator character.

// Tokenizer: each match is either a run of non-separator characters
// or exactly one of the separator characters < > ! { } =
var rx = /([^<>!{}=]+|[<>!{}=])/g;

// Sample input, assembled from shorter fragments for readability
var str = [
    'Reason={Existing problem or fault}{Bestaande probleem of vout}',
    '{Other}{Ander} and Required!=No and Results >=10 and Results <=25 ',
    'and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes'
].join('');

// One token per line
str.match(rx).join('\n');

//returned value:
Reason
=
{
Existing problem or fault
}
{
Bestaande probleem of vout
}
{
Other
}
{
Ander
}
 and Required
!
=
No and Results 
>
=
10 and Results 
<
=
25 and Tst
>
5 and Tst
<
80 and Info
=
test this or that and those and Success
!
=
Yes

// I concatenated the string and joined the result for readability

kennebec
  • 102,654
  • 32
  • 106
  • 127