1

I'm writing a regular expression in JavaScript that removes whitespace, except when:

  1. Some specific syntax is in front of the whitespace
  2. It's surrounded by either single or double quotes (escaped quotes inside the quoted text don't count as closing quotes)

Now, I've got a big part working. It matches every pattern that doesn't have the specific syntax in front of the whitespace; however, I'm stuck on the quote part.

// Remove every whitespace character, unless one of the listed keywords sits directly in front of it.
return str.replace(/(function|new|return|var)?\s/g, function($0, $1) {
    // $1 is the captured keyword (if any); when it is present the whole match $0 is put back unchanged.
    return $1 ? $0 : '';
});

I've done quite some testing, but I just can't figure it out. Thanks in advance.

Qtax
  • 33,241
  • 9
  • 83
  • 121
Tim S.
  • 13,597
  • 7
  • 46
  • 72
  • I am surprised it even works so far. I expected you would have to put your keywords in braces. I.e., ((function)|(new)|(return)|(var)). – Nodebody Jul 08 '11 at 11:06
  • 1
    Note that this kind of simple parsing will never work in the general case. For example even if you get the simple quoting to work it will still break on comments, regex quoting, etc. – Qtax Jul 08 '11 at 11:14
  • Could this help with the "surrounded by quotes" issues ? http://stackoverflow.com/questions/366202/regex-for-splitting-a-string-using-space-when-not-surrounded-by-single-or-double – phtrivier Jul 08 '11 at 11:16
  • @phtrivier, it's Java (JS doesn't have lookbehind), and (most of) the answer(s) there do not handle escapes. – Qtax Jul 08 '11 at 11:24
  • If you'd provide an example text together with the desired matches, I am sure, we will find a pattern. – Nodebody Jul 08 '11 at 12:03
  • @QTax: The [accepted answer](http://stackoverflow.com/questions/366202/regex-for-splitting-a-string-using-space-when-not-surrounded-by-single-or-double/366532#366532) in that Q/A doesn't use lookbehind, and it correctly matches quoted sequences with escaped quotes in them. – Alan Moore Jul 08 '11 at 15:26

2 Answers2

2

You can use:

// Sample input: mixes double- and single-quoted strings, escaped quotes and keywords.
var str = "foo  \"b a \\\" r\" new y 'l o l' foo lol; var x = new 'fo \\' o' ";

// Remove runs of whitespace that lie outside quoted strings, keeping a run when one of the
// keywords directly precedes it. The lookahead only succeeds when the rest of the line is
// "unquoted text, then any number of (complete quoted string + unquoted text)", i.e. when the
// whitespace is not inside a quoted string. The unquoted run has to be part of the repeated
// group; otherwise two quoted strings separated by ordinary text can never be matched.
// Only works when every quote on the line is closed.
var result = str.replace(/(function|new|return|var)?\s+(?=(?:[^\\"']|\\.)*(?:(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')(?:[^\\"']|\\.)*)*$)/gm,
function($0, $1) { return $1 ? $0 : ''; });

See http://jsfiddle.net/qCeC4/

Lookahead part in Perl /x form:

s/
\s+
(?=
    (?:[^\\"']|\\.)*            # unquoted text, backslash escapes consumed as pairs
    (?:
        (?:
            "(?:[^\\"]|\\.)*"   # one complete double-quoted string
            |
            '(?:[^\\']|\\.)*'   # one complete single-quoted string
        )
        (?:[^\\"']|\\.)*        # unquoted text after that string, inside the repeated group
    )*
    $
)
//xmg;

Note: As I said before, this is not a good way to parse JS, and will break on comments, regex quoting, and who knows what else.

Note2: Forgot to add that this only works for "valid" quoting, all quotes must be closed.

Qtax
  • 33,241
  • 9
  • 83
  • 121
  • Bravo for such regex wizardry! But it's not _quite_ there. This regex assumes that all quotes are closed (e.g. it fails if str = "x ' asdf " (http://jsfiddle.net/uAcse/) ). All that needs to happen then is another test to first see if the string has properly enclosed quotes. If it doesn't then .... I don't know what to do exactly – Alexander Bird Jul 08 '11 at 12:38
  • Thanks. :-) Yes, it only works with valid JS (single line quotes), forgot to say that. (Don't know if there is any way to do multi-line quotes in JS; if there is, then that won't work either.) – Qtax Jul 08 '11 at 12:46
  • As said, you're VERY close. I thank you for your time and your awesome regex magic. It nearly fixed my problem but it's okay. Definitely getting a vote up. I can't believe JS doesn't support lookbehinds :/ – Tim S. Jul 11 '11 at 07:41
1

My suggestions:

  • mimic lookbehind in javascript (though this hack may not be perfect).

  • use a recursive descent parser (maybe antlr)?

  • Or manually write code to do it for you. Below is my first draft version of what I'm thinking (there's still some pseudo-code ):


/**
 * Locates every quoted section of `str`.
 *
 * A section starts at an unescaped-by-context `"` or `'` and runs to the next
 * occurrence of the same quote character that is not preceded by a backslash
 * escape (a backslash always consumes the character after it, so `\\"` ends
 * the string while `\"` does not). An unmatched opening quote is treated as
 * if it were closed at the end of the string.
 *
 * @param {string} str - text to scan
 * @returns {Array<Array<number>>} `[start, end]` index pairs (both inclusive,
 *     quote characters included), in ascending order and never overlapping
 */
function findQuotedRanges(str) {
    var ranges = [];
    var i = 0;

    while (i < str.length) {
        var quote = str.charAt(i);

        if (quote === '"' || quote === "'") {
            var start = i;
            i++;
            while (i < str.length && str.charAt(i) !== quote) {
                if (str.charAt(i) === "\\") {
                    i++; // skip the escaped character so an escaped quote cannot close the section
                }
                i++;
            }
            // i is the closing quote, or past the end when the quote was never closed.
            ranges.push([start, Math.min(i, str.length - 1)]);
        }
        i++;
    }
    return ranges;
}

/**
 * Removes whitespace characters from `str`, except
 *   - whitespace directly preceded by `function`, `new`, `return` or `var`, and
 *   - anything inside a single- or double-quoted section.
 *
 * @param {string} str - text to compact
 * @returns {string} the compacted text
 */
function go(str) {
    // Inclusive index ranges within which nothing may be changed because it is quoted.
    var quotedRanges = findQuotedRanges(str);
    // Matches arrive in ascending offset order, so one forward-moving cursor over the
    // (sorted) ranges is enough to answer "is this match quoted?".
    var cursor = 0;

    return str.replace(/(function|new|return|var)?\s/g, function($0, $statement, offset) {
        while (cursor < quotedRanges.length && quotedRanges[cursor][1] < offset) {
            cursor++;
        }
        // A match never contains a quote character, so it lies either completely
        // inside or completely outside a range; checking its start is sufficient.
        if (cursor < quotedRanges.length && quotedRanges[cursor][0] <= offset) {
            return $0;
        }
        return $statement ? $0 : '';
    });
}

// Expected behaviour of go(). Each check has its own number so that a failure
// message identifies exactly one case. (assert is a function declaration further
// down, so it is already callable here.)
assert(1, go("") === "");                     // empty input stays empty
assert(2, go("function ") === "function ");   // whitespace after a keyword is kept
assert(3, go(" ") === "");                    // bare whitespace is removed
assert(4, go('" "') === '" "');               // whitespace inside double quotes is kept
assert(5, go('" ') === '" ');                 // an unclosed quote runs to the end of the string
assert(6, go('"x x"') === '"x x"');
assert(7, go('"new x"') === '"new x"');       // keywords inside quotes are left alone
assert(8, go(' "x x"') === '"x x"');          // whitespace before a quoted section is removed
assert(9, go("' '") === "' '");               // single quotes behave like double quotes
assert(10, go("' \\' '") === "' \\' '");      // an escaped quote does not close the section


/**
 * Tiny check helper: stays silent when `statement` is truthy, otherwise
 * writes a failure line naming the test into the document.
 *
 * @param {number} num - identifier of the check, shown in the failure message
 * @param {*} statement - outcome of the check; any falsy value counts as a failure
 */
function assert(num, statement) {
    if (statement) {
        return;
    }
    document.write('test #' + num + ' failed! <br/>');
}
Bart Kiers
  • 166,582
  • 36
  • 299
  • 288
Alexander Bird
  • 38,679
  • 42
  • 124
  • 159