I am trying to find the most efficient way to create a generic tokenizer that retains complex delimiters / separators as extra tokens.
And yes... I looked at some SO questions, such as "How can I use string#split to split a string with the delimiters + - * / ( ) and space and retain them as an extra token?", but so far the answers are too specific. I need a solution that works against any generic string.
In my case, I am looking to tokenize strings such as
"   A brown bear     A red firetruck  A white horse   "
and as a result, I am expecting the following tokens:
"   ", //3 spaces
"A brown bear",
"     ", //5 spaces
"A red firetruck",
"  ", //2 spaces
"A white horse",
"   " //3 spaces
So here is the code that I came up with. It works as expected, but I am wondering if there is any way to improve on it...
public static class StringExtension
{
    /// <summary>
    /// Splits <paramref name="input"/> on every match of <paramref name="separatorRegexPattern"/>
    /// and returns the non-blank pieces, optionally interleaved with the separator text
    /// that stood between them, in their original order.
    /// </summary>
    /// <param name="input">The string to tokenize.</param>
    /// <param name="separatorRegexPattern">Regular expression that matches a separator.</param>
    /// <param name="includeSeparatorsAsToken">
    /// When <c>true</c> (the default) the text between two consecutive tokens (and any
    /// leading or trailing separator text) is returned as a token of its own.
    /// When <c>false</c> only the non-blank results of <c>Regex.Split</c> are returned.
    /// </param>
    /// <returns>
    /// The tokens in input order. Pieces that are empty or whitespace-only are never
    /// returned as tokens; when separators are included, such pieces are folded into
    /// the surrounding separator text so that concatenating the result reproduces
    /// <paramref name="input"/>.
    /// </returns>
    /// <remarks>
    /// Token positions are taken from the regex match positions rather than by searching
    /// for the token text, so repeated tokens and tokens whose text also occurs inside a
    /// separator are located correctly. When separators are included, each separator is
    /// returned as matched, regardless of any capture groups in the pattern.
    /// </remarks>
    /// <exception cref="System.ArgumentNullException">
    /// Thrown by the underlying <c>Regex</c> call when <paramref name="input"/> or
    /// <paramref name="separatorRegexPattern"/> is <c>null</c>.
    /// </exception>
    public static List<string> TokenizeUsingRegex(this string input, string separatorRegexPattern, bool includeSeparatorsAsToken = true)
    {
        if (!includeSeparatorsAsToken)
        {
            return Regex.Split(input, separatorRegexPattern)
                .Where(t => !string.IsNullOrWhiteSpace(t))
                .ToList();
        }

        var tokens = new List<string>();

        // Start of the separator text that has been seen but not yet emitted.
        var separatorStart = 0;
        // End of the previous separator match, i.e. where the next candidate token begins.
        var cursor = 0;

        foreach (Match separator in Regex.Matches(input, separatorRegexPattern))
        {
            AppendToken(input, tokens, cursor, separator.Index, ref separatorStart);
            cursor = separator.Index + separator.Length;
        }

        // Whatever follows the last separator is the final candidate token.
        AppendToken(input, tokens, cursor, input.Length, ref separatorStart);

        // Trailing separator text, if any.
        if (separatorStart < input.Length)
        {
            tokens.Add(input.Substring(separatorStart));
        }

        return tokens;
    }

    // Emits the pending separator text followed by input[tokenStart..tokenEnd) when that
    // slice is a real (non-blank) token; blank slices stay part of the pending separator.
    private static void AppendToken(string input, List<string> tokens, int tokenStart, int tokenEnd, ref int separatorStart)
    {
        var token = input.Substring(tokenStart, tokenEnd - tokenStart);
        if (string.IsNullOrWhiteSpace(token))
        {
            return;
        }

        if (separatorStart < tokenStart)
        {
            tokens.Add(input.Substring(separatorStart, tokenStart - separatorStart));
        }

        tokens.Add(token);
        separatorStart = tokenEnd;
    }
}
Live example at: https://dotnetfiddle.net/l3mesr