I have an input string like:
'lambda' '(' VARIABLE (',' VARIABLE)* ')' EXPRESSION (EXPRESSION)+
and need to split it into tokens at spaces and at the characters (, ), [ and ], except when a ( or ) is immediately surrounded by single quotes.
I would like to create a regular expression to use with C#'s Regex.Split() method that will split the string into the following tokens:
['lambda', '(', VARIABLE, (, ',', VARIABLE, ), *, ')', EXPRESSION, (, EXPRESSION, ), +]
I was previously using the following regex:
(?=[ \(\)\|\[\]])|(?<=[ \(\)\|\[\]])
which worked great except for when ( or ) is surrounded by single quotes, in which case
'('
gets separated into
[', (, ']
Help is greatly appreciated.
EDIT
Well, I now have one less problem. Here was my eventual solution without using regex at all:
/// <summary>
/// Consumes the next token from the front of <c>INPUT</c>: the token is stored in
/// <c>CURRENT_TOKEN</c> and removed, together with any whitespace that follows it,
/// from <c>INPUT</c>.
/// </summary>
/// <remarks>
/// A token is either a single delimiter character (<c>(</c>, <c>)</c>, <c>[</c>,
/// <c>]</c>, <c>|</c>) found at the very start of the input, or the run of characters
/// up to the next delimiter or the end of the input. Spaces separate tokens but are
/// never returned as tokens. A bracket or pipe that has a single quote directly on
/// both sides (for example <c>'('</c>) is treated as ordinary text, so the quoted
/// literal stays in one piece.
/// When <c>INPUT</c> is null, empty, or contains only whitespace, <c>CURRENT_TOKEN</c>
/// is left unchanged.
/// </remarks>
private void Scan()
{
    if (string.IsNullOrEmpty(INPUT))
        return;

    // Drop leading whitespace first. The previous implementation produced no token
    // for input that began with a space and then dereferenced a null CURRENT_TOKEN.
    INPUT = INPUT.TrimStart();
    if (INPUT.Length == 0)
        return;

    // Advance to the first position where the input must be split.
    int index = 0;
    while (index < INPUT.Length && !IsSplitPoint(INPUT, index))
    {
        index++;
    }

    // A split point at position 0 means the delimiter itself is the token (it cannot
    // be a space, because leading whitespace was trimmed above). Otherwise the token
    // is everything that precedes the split point.
    int tokenLength = index == 0 ? 1 : index;

    CURRENT_TOKEN = INPUT.Substring(0, tokenLength);
    INPUT = INPUT.Substring(tokenLength).TrimStart();
}

/// <summary>
/// Returns true when the character at <paramref name="index"/> in
/// <paramref name="text"/> ends the token being scanned.
/// </summary>
/// <param name="text">The text being scanned.</param>
/// <param name="index">A valid index into <paramref name="text"/>.</param>
private static bool IsSplitPoint(string text, int index)
{
    char c = text[index];

    // A space always splits, regardless of what surrounds it.
    if (c == ' ')
        return true;

    if (c != '(' && c != ')' && c != '[' && c != ']' && c != '|')
        return false;

    // A delimiter counts as quoted only when a single quote sits directly on both sides.
    bool quoted = index > 0
        && index < text.Length - 1
        && text[index - 1] == '\''
        && text[index + 1] == '\'';

    return !quoted;
}