0

I have strings in which I want to find two words: 'start' and 'end'.

'start' and 'end' always come as a pair (there may be other characters between them, but if a string contains 'start', it also contains its 'end').

I am trying to write a regex that finds the first 'start' and then its own matching 'end', and returns the substring between them.


Examples of strings: [in these examples I added an index to every pair of 'start' and 'end' just for clarity (the real strings will not have these indexes) - the answer is always the text between the pair with index (1)]

  1. something start something_needed end something // print 'something_needed'
  2. start(1) something start(2) something end(2) something end(1) start something end // print 'something start(2) something end(2) something'
  3. start(1) something start(2) start(3) something end(3) something start(4) end(4) something end(2) something end(1) something start(5) something end(5) // print 'something start(2) start(3) something end(3) something start(4) end(4) something end(2) something'

This is my solution in JavaScript, but I would prefer an answer that uses regex only.

I find all the 'start' positions and then all the 'end' positions. Then, walking through them in order, for every 'start' I do count++ and for every 'end' I do count--. When count == 0, I have reached the position of the matching 'end'.

/**
 * Returns the text enclosed by the first 'start' keyword and its own
 * matching 'end' keyword, allowing further start/end pairs to be nested
 * in between.
 *
 * A keyword only counts when it is surrounded by delimiter characters
 * (digits, whitespace, commas or brackets); the string boundaries count
 * as delimiters too. Matching is case-insensitive.
 *
 * The returned text begins immediately after the opening keyword (so it
 * normally starts with the delimiter that follows 'start') and stops at
 * the beginning of the delimiter run that precedes the matching 'end'.
 *
 * @param {string} str - The text to search.
 * @returns {?string} The enclosed text, or null when the input contains
 *     no 'start' or the first 'start' is never closed by an 'end'.
 */
function getStartEnd(str) {
    // Pad both sides so that a keyword at the very beginning or end of
    // the input still has a delimiter next to it.
    var padded = " " + str + " ";

    // One pass over both keywords, in document order. The trailing
    // delimiter is only looked at (lookahead), so it can also serve as
    // the leading delimiter of the next keyword.
    var tokenRe = /[\d\s\r\n,\(\)\[\]\{\}]+(START+|END+)(?=[\d\s\r\n,\(\)\[\]\{\}])/gi;

    var depth = 0;          // number of currently open 'start' keywords
    var contentStart = -1;  // index just past the outermost 'start'
    var token;

    while ((token = tokenRe.exec(padded)) !== null) {
        if (/^s/i.test(token[1])) {
            if (depth === 0) {
                // Use the real end of the match instead of a fixed
                // offset: the delimiter run in front of the keyword
                // may be longer than one character.
                contentStart = tokenRe.lastIndex;
            }
            depth++;
        } else if (depth > 0) {
            depth--;
            if (depth === 0) {
                // This 'end' closes the outermost 'start'.
                return padded.substring(contentStart, token.index);
            }
        }
        // An 'end' seen before any 'start' is ignored.
    }

    // No balanced pair was found. Keep the original user-facing alert
    // where one exists (browsers) without crashing elsewhere.
    if (typeof alert === "function") {
        alert('error');
    }
    return null;
}
/**
 * Collects the start index of every match of `regEx` in `str`.
 *
 * @param {string} str - The text to search.
 * @param {RegExp} regEx - The pattern to look for. With the "g" flag
 *     every match is reported; without it only the first match is.
 * @returns {number[]} The match start indexes, in ascending order.
 */
function matchPosArr(str, regEx) {
    var pos = [];
    // Declared locally: assigning to an undeclared name creates an
    // implicit global and throws in strict mode / ES modules.
    var match;

    // A regex that is neither global nor sticky never advances
    // lastIndex, so looping on exec() would never terminate.
    if (!regEx.global && !regEx.sticky) {
        match = regEx.exec(str);
        return match === null ? pos : [match.index];
    }

    while ((match = regEx.exec(str)) !== null) {
        pos.push(match.index);
        // A zero-length match leaves lastIndex where it was; step over
        // it manually so the scan keeps moving forward.
        if (match[0].length === 0) {
            regEx.lastIndex++;
        }
    }
    return pos;
}

// Show the extracted text; `str` must hold the text to search.
// (The function defined above is getStartEnd, not getSelectFrom.)
alert( getStartEnd(str) );
ldoroni
  • 629
  • 1
  • 7
  • 18

3 Answers3

3

Here is a possible solution from Matching Nested Constructs in JavaScript, Part 2.

Example usage:

matchRecursiveRegExp("START text START text END text more END text", "START", "END");

// (c) 2007 Steven Levithan <stevenlevithan.com>
// MIT License

/*** matchRecursiveRegExp
 Accepts a string to search, a left and right format delimiter
 as regex patterns, and optional regex flags. Returns an array
 of matches, allowing nested instances of left/right delimiters.
 Use the "g" flag to return all matches, otherwise only the
 first is returned. Be careful to ensure that the left and
 right format delimiters produce mutually exclusive matches.
 Backreferences are not supported within the right delimiter
 due to how it is internally combined with the left delimiter.
 When matching strings whose format delimiters are unbalanced
 to the left or right, the output is intentionally as a
 conventional regex library with recursion support would
 produce, e.g. "<<x>" and "<x>>" both produce ["x"] when using
 "<" and ">" as the delimiters (both strings contain a single,
 balanced instance of "<x>").

 examples:
  matchRecursiveRegExp("test", "\\(", "\\)")
   returns: []
  matchRecursiveRegExp("<t<<e>><s>>t<>", "<", ">", "g")
   returns: ["t<<e>><s>", ""]
  matchRecursiveRegExp("<div id=\"x\">test</div>", "<div\\b[^>]*>", "</div>", "gi")
   returns: ["test"]

*/
/**
 * @param {string} str - The text to search.
 * @param {string} left - Regex source of the opening delimiter.
 * @param {string} right - Regex source of the closing delimiter.
 * @param {string} [flags] - Regex flags; "g" returns every outermost
 *     match instead of only the first.
 * @returns {string[]} The text found between balanced delimiters.
 */
function matchRecursiveRegExp (str, left, right, flags) {
 var f = flags || "",
  g = f.indexOf("g") > -1,
  // The scanner below always needs "g". Drop any caller-supplied "g"
  // first: duplicate flags ("gg") make the RegExp constructor throw.
  baseFlags = f.replace(/g/g, ""),
  x = new RegExp(left + "|" + right, "g" + baseFlags),
  l = new RegExp(left, baseFlags),
  a = [],
  t, s, m;

 do {
  t = 0; // current nesting depth
  while (m = x.exec(str)) {
   if (l.test(m[0])) {
    // Left delimiter: remember where the outermost content begins.
    if (!t++) s = x.lastIndex;
   } else if (t) {
    // Right delimiter while inside at least one left delimiter.
    if (!--t) {
     a.push(str.slice(s, m.index));
     if (!g) return a;
    }
   }
  }
  // Input ended with unclosed left delimiters: rescan from just after
  // the outermost one so balanced pairs nested inside it are found.
 } while (t && (x.lastIndex = s));

 return a;
}
// Write the first balanced start/end match of each sample string from
// the question into the page, one result per line.
[
 "something start something_needed end something",
 "start something start something end something end start something end",
 "start something start start something end something start end something end something end something start something end"
].forEach(function (sample) {
 document.write(matchRecursiveRegExp(sample, "start", "end") + "<br/>");
});
Wiktor Stribiżew
  • 607,720
  • 39
  • 448
  • 563
  • 1
    @NarendraSisodia: It would work if the indices were present. I understand these indices are only for illustrative purpose here: *in this examples every couple of 'start' and 'end' have his own index- the answer always between index (1)*. – Wiktor Stribiżew Aug 13 '15 at 13:58
0

I'm having a hard time understanding exactly what you want, but if I understand correctly: you cannot do this with pure regex in JavaScript because lookbehind (positive (?<=...) and negative (?<!...)) is not supported, and thus you would not be able to match the 'start(n)' that precedes the match result.


Instead, you can use capture groups (capture groups aren't fully supported in JavaScript, so you'll need to use replace):

var string = "something start(1) something_needed end(1) something";
// Group 1 captures the index after "start"; \1 requires the same index
// after "end". Group 2 captures everything between the two.
var regex = /start\((\d+)\)(.*)end\(\1\)/;

(function () {
    var found = regex.exec(string);
    if (found === null) {
        return;
    }

    var result = found[2];
    console.log(found[2]);
    //do stuff with the captured text here
})();

$0 is the original match (start\((\d+)\)(.*)end\(\1\))

$1 and $2 are the groups that are outputted by the regex.

$1 refers to (\d+). It's already used to 'store' the number behind start (1 in this case). But here's where the magic happens: it gets loaded again and matched against with \1 inside the regex.

$2 is where the info you need is stored. it refers to (.*)

L0laapk3
  • 890
  • 10
  • 26
  • thanks for your answer. but i think you still not understand me as well, the indexes `(1)`,`(2)`,`(3)`- is just for let you see clarity what `start` and `end` i want. in the real strings, i will not have them- example for real string: something start something_needed end something – ldoroni Aug 13 '15 at 14:08
0

What you are looking for is to find 'start', count the number of times another 'start' is found, and then ignore an equal number of 'end's. This is something that cannot be done with regex.

It's impossible to compare the number of times two strings match with pure regex.


Instead, here is a semi-regex solution to this problem:

    var string = "start(1) something start(2) start(3) something end(3) something start(4) end(4) something end(2) something end(1) something start(5) something end(5)";
    // Becomes true once a pass over the string finds nothing to replace.
    var stop = false;
    while (!stop) {
        // Assume this is the last pass; the callback clears the flag
        // again whenever it actually runs.
        stop = true;
        // The pattern only matches a start/end pair with no other
        // "start" in between, i.e. an innermost pair.
        string = string.replace(/start((?:[^s]|s(?!tart))*?)end/, function (whole, inner) {
            stop = false;
            var result = inner;

            //do stuff with result here..
            console.log(result);

            // Remove the pair so the enclosing pair becomes innermost.
            return "";
        });
    }

What's good about this method is that it is simple, and you can have an unlimited number of nested statements.

L0laapk3
  • 890
  • 10
  • 26