I have read "How do I get the name of captured groups in a C# Regex?" and "How do I access named capturing groups in a .NET Regex?" to try to understand how to find the result of a matched group in regular expressions.
I've also read everything in the MSDN at http://msdn.microsoft.com/en-us/library/30wbz966.aspx
What seems strange to me is that C# (or .NET) appears to be the only implementation of regular expressions that makes you iterate the groups to find which group matched (especially if you need the name), and that the name isn't stored with the group result. PHP and Python, for example, will give you the name of the group that matched as part of the regex match result.
I have to iterate the groups and check each one for a match, and I have to keep my own list of group names because the names aren't in the result.
Here is my code to demonstrate:
/// <summary>
/// Splits source text into tokens using a single combined regular expression in
/// which every token type is a named group: <c>(?&lt;NAME&gt;pattern)|...</c>.
/// </summary>
public class Tokenizer
{
    // Token definitions in priority order. An ordered list is used instead of a
    // Dictionary because Dictionary enumeration order is documented as undefined,
    // while both the alternation order of the combined pattern and the group
    // lookup order in parse() depend on a stable, known order.
    private readonly List<KeyValuePair<string, string>> tokens;

    // The combined pattern, built once in the constructor.
    private readonly Regex re;

    /// <summary>
    /// Builds the combined token pattern from the built-in token definitions.
    /// </summary>
    public Tokenizer()
    {
        tokens = new List<KeyValuePair<string, string>>();
        tokens.Add(new KeyValuePair<string, string>("NUMBER", @"\d+(\.\d*)?")); // Integer or decimal number
        tokens.Add(new KeyValuePair<string, string>("STRING", @""".*"""));      // String
        tokens.Add(new KeyValuePair<string, string>("COMMENT", @";.*"));        // Comment
        tokens.Add(new KeyValuePair<string, string>("COMMAND", @"[A-Za-z]+"));  // Identifiers
        tokens.Add(new KeyValuePair<string, string>("NEWLINE", @"\n"));         // Line endings
        tokens.Add(new KeyValuePair<string, string>("SKIP", @"[ \t]"));         // Skip over spaces and tabs

        List<string> alternatives = new List<string>();
        foreach (KeyValuePair<string, string> definition in tokens)
        {
            alternatives.Add(String.Format("(?<{0}>{1})", definition.Key, definition.Value));
        }

        // ExplicitCapture turns unnamed parentheses inside the token patterns
        // (such as the fractional part of NUMBER) into non-capturing groups, so
        // the only groups in a match are the named token groups.
        re = new Regex(String.Join("|", alternatives), RegexOptions.ExplicitCapture);
    }

    /// <summary>
    /// Scans <paramref name="pSource"/> and returns one Token per regex match, in
    /// the order the matches occur. NEWLINE and SKIP matches are included in the
    /// result. Text that matches none of the token patterns is passed over without
    /// an error, because the regex searches forward for the next match.
    /// </summary>
    /// <param name="pSource">The source text to tokenize.</param>
    /// <returns>The tokens found, each carrying its group name and matched text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pSource"/> is null.</exception>
    public List<Token> parse(string pSource)
    {
        if (pSource == null)
        {
            throw new ArgumentNullException("pSource");
        }

        List<Token> result = new List<Token>();
        for (Match match = re.Match(pSource); match.Success; match = match.NextMatch())
        {
            // .NET does not report which named group produced the match, so the
            // known token names are probed in order; exactly one alternative of
            // the combined pattern can have succeeded.
            foreach (KeyValuePair<string, string> definition in tokens)
            {
                Group group = match.Groups[definition.Key];
                if (group.Success)
                {
                    result.Add(new Token(definition.Key, group.Value));
                    break;
                }
            }
        }
        return result;
    }
}
In the line
foreach (string gname in this.tokens.Keys)
That should not be necessary but it is.
Is there any way to find the matching group and its name without having to iterate over all the groups?
EDIT: To compare implementations. Here is the same code that I wrote for a Python implementation.
class xTokenizer(object):
    """
    xTokenizer converts a text source code file into a collection of xToken objects.
    """

    # (name, pattern) pairs in priority order: the patterns are joined into one
    # alternation, so an earlier entry wins when several could match at the
    # same position (e.g. VAR before COMMAND).
    TOKENS = [
        ('NUMBER', r'\d+(\.\d*)?'),  # Integer or decimal number
        ('STRING', r'".*"'),  # String
        ('COMMENT', r';.*'),  # Comment
        ('VAR', r':[A-Za-z]+'),  # Variables
        ('COMMAND', r'[A-Za-z]+'),  # Identifiers
        ('OP', r'[+*\/\-]'),  # Arithmetic operators
        ('NEWLINE', r'\n'),  # Line endings
        ('SKIP', r'[ \t]'),  # Skip over spaces and tabs
        ('SLIST', r'\['),  # Start a list of commands
        ('ELIST', r'\]'),  # End a list of commands
        ('SARRAY', r'\{'),  # Start an array
        ('EARRAY', r'\}'),  # End an array
    ]

    def __init__(self, tokens=None):
        """
        Constructor
        Args:
            tokens - sequence of (name, pattern) tuples used to match tokens;
                     defaults to TOKENS when omitted.
        """
        if tokens is None:
            tokens = self.TOKENS
        self.tokens = tokens
        # One named group per token type, combined into a single alternation.
        self.tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in tokens)

    def parse(self, source):
        """
        Converts the source code into a list of xToken objects.
        Args:
            source - The source code as a string.
        Returns:
            list of xToken objects, each built from the token type, the matched
            text, the 1-based line number and the 0-based column of the match.
            NEWLINE and SKIP matches produce no token.
        Raises:
            xParserError if a position is reached where no token pattern matches.
        """
        get_token = re.compile(self.tok_regex).match
        line = 1
        pos = line_start = 0
        mo = get_token(source)
        result = []
        while mo is not None:
            # lastgroup is the name of the alternative that matched.
            typ = mo.lastgroup
            if typ == 'NEWLINE':
                # The next line starts after the newline character, so columns
                # stay 0-based on every line, not just the first.
                line_start = mo.end()
                line += 1
            elif typ != 'SKIP':
                val = mo.group(typ)
                result.append(xToken(typ, val, line, mo.start() - line_start))
            pos = mo.end()
            mo = get_token(source, pos)
        # Matching stopped before the end of the input: nothing matches at pos.
        if pos != len(source):
            raise xParserError('Unexpected character %r on line %d' %(source[pos], line))
        return result
As you can see, Python doesn't require you to iterate the groups; a similar thing can be done in PHP and, I assume, Java.