While the other contributors to this question provided some clues, I needed an answer. My test is a rules engine that is driven by a regex that is built up from file input, so hard coding the logic into C# is not an option.
However, I did learn here that
- the .NET
Regex
class does not support surrogate pairs and
- you can fake support for surrogate pair ranges by using regex alternation
But of course, in my data-driven case I can't manually change the regexes to a format that .NET will accept - I need to automate it. So, I created the below Utf32Regex
class that accepts UTF32 characters directly in the constructor and internally converts them to regexes that .NET understands.
For example, it will convert the regex
"[abc\\U00011DEF-\\U00013E07]"
To
"(?:[abc]|\\uD807[\\uDDEF-\\uDFFF]|[\\uD808-\\uD80E][\\uDC00-\\uDFFF]|\\uD80F[\\uDC00-\\uDE07])"
Or
"([\\u0000-\\u0009\\u000B\\u000C\\u000E-\\u001F\\u007F-\\u009F\\u00AD" +
"\\u061C\\u180E\\u200B\\u200E\\u200F\\u2028-\\u202E\\u2060-\\u206F\\uD800-" +
"\\uDFFF\\uFEFF\\uFFF0-\\uFFFB\\U0001BCA0-\\U0001BCA3\\U0001D173-" +
"\\U0001D17A\\U000E0000-\\U000E001F\\U000E0080-\\U000E00FF\\U000E01F0-\\U000E0FFF] " +
"| [\\u000D] | [\\u000A]) ()"
To
"((?:[\\u0000-\\u0009\\u000B\\u000C\\u000E-\\u001F\\u007F-\\u009F\\u00AD\\u061C\\u180E" +
"\\u200B\\u200E\\u200F\\u2028-\\u202E\\u2060-\\u206F\\uD800-\\uDFFF\\uFEFF\\uFFF0-\\uFFFB]|" +
"\\uD82F[\\uDCA0-\\uDCA3]|\\uD834[\\uDD73-\\uDD7A]|\\uDB40[\\uDC00-\\uDC1F]|" +
"\\uDB40[\\uDC80-\\uDCFF]|\\uDB40[\\uDDF0-\\uDFFF]|[\\uDB41-\\uDB42][\\uDC00-\\uDFFF]|" +
"\\uDB43[\\uDC00-\\uDFFF]) | [\\u000D] | [\\u000A]) ()"
Utf32Regex.cs
using System;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// A <see cref="Regex"/> subclass that rewrites UTF-32 escapes written like
/// <c>\U00010000</c>, and UTF-32 ranges written like <c>\U00010000-\U00010001</c>
/// inside a character class, into equivalent UTF-16 constructs before the pattern
/// is handed to the base <see cref="Regex"/> constructor.
/// </summary>
/// <remarks>
/// <para>
/// Inside a character class every UTF-32 escape or range is pulled out of the
/// class and re-expressed as an alternation of surrogate-pair sequences; the
/// class and the alternatives are wrapped together in a non-capturing group.
/// Outside a character class a supplementary code point becomes a non-capturing
/// group around its surrogate pair so that a following quantifier applies to
/// the whole code point.
/// </para>
/// <para>
/// Known limitations: the leading <c>^</c> of a negated character class is not
/// inspected, so UTF-32 escapes inside a negated class are still emitted as
/// additional alternatives. Escape detection only looks at the single preceding
/// character, so an escaped backslash directly in front of <c>[</c>, <c>]</c>
/// or <c>\U</c> is not recognized as such.
/// </para>
/// </remarks>
public class Utf32Regex : Regex
{
    private const char MinLowSurrogate = '\uDC00';
    private const char MaxLowSurrogate = '\uDFFF';
    private const int MaxBmpCodePoint = 0xFFFF;
    private const int MinSupplementaryCodePoint = 0x10000;
    private const int MaxCodePoint = 0x10FFFF;

    // Regex source for one UTF-32 escape: \U followed by 8 hex digits whose
    // first two are 00, or by 6 hex digits.
    private const string Utf32Escape = "\\\\U(?:00)?[0-9A-Fa-f]{6}";

    // Match any character class such as [A-z]
    private static readonly Regex characterClass = new Regex(
        "(?<!\\\\)(\\[.*?(?<!\\\\)\\])",
        RegexOptions.Compiled);

    // Match a UTF32 range such as \U000E01F0-\U000E0FFF
    // or an individual character such as \U000E0FFF
    private static readonly Regex utf32Range = new Regex(
        "(?<begin>" + Utf32Escape + ")-(?<end>" + Utf32Escape + ")|(?<begin>" + Utf32Escape + ")",
        RegexOptions.Compiled);

    // Match an individual UTF32 character such as \U000E0FFF
    private static readonly Regex utf32Character = new Regex(
        Utf32Escape,
        RegexOptions.Compiled);

    /// <summary>
    /// Initializes a new instance without a pattern (mirrors the protected
    /// parameterless <see cref="Regex"/> constructor).
    /// </summary>
    public Utf32Regex()
        : base()
    {
    }

    /// <summary>
    /// Initializes a new instance for <paramref name="pattern"/> after converting
    /// its UTF-32 escapes to UTF-16.
    /// </summary>
    /// <param name="pattern">The regular expression, optionally containing <c>\U</c> escapes.</param>
    /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">
    /// The pattern contains a code point above U+10FFFF, a reversed UTF-32 range,
    /// or is rejected by <see cref="Regex"/> after conversion.
    /// </exception>
    public Utf32Regex(string pattern)
        : base(ConvertUTF32Characters(pattern))
    {
    }

    /// <summary>
    /// Initializes a new instance for <paramref name="pattern"/> with the given
    /// <paramref name="options"/> after converting its UTF-32 escapes to UTF-16.
    /// </summary>
    /// <param name="pattern">The regular expression, optionally containing <c>\U</c> escapes.</param>
    /// <param name="options">Options forwarded unchanged to <see cref="Regex"/>.</param>
    public Utf32Regex(string pattern, RegexOptions options)
        : base(ConvertUTF32Characters(pattern), options)
    {
    }

    /// <summary>
    /// Initializes a new instance for <paramref name="pattern"/> with the given
    /// <paramref name="options"/> and <paramref name="matchTimeout"/> after
    /// converting its UTF-32 escapes to UTF-16.
    /// </summary>
    /// <param name="pattern">The regular expression, optionally containing <c>\U</c> escapes.</param>
    /// <param name="options">Options forwarded unchanged to <see cref="Regex"/>.</param>
    /// <param name="matchTimeout">Time-out forwarded unchanged to <see cref="Regex"/>.</param>
    public Utf32Regex(string pattern, RegexOptions options, TimeSpan matchTimeout)
        : base(ConvertUTF32Characters(pattern), options, matchTimeout)
    {
    }

    /// <summary>
    /// Rewrites every UTF-32 escape in <paramref name="regexString"/> into UTF-16.
    /// </summary>
    /// <param name="regexString">The pattern as supplied by the caller.</param>
    /// <returns>A pattern that contains only <c>\uXXXX</c> escapes.</returns>
    private static string ConvertUTF32Characters(string regexString)
    {
        if (regexString == null)
        {
            // Report the name of the public constructor parameter.
            throw new ArgumentNullException("pattern");
        }

        // Pass 1: escapes and ranges that live inside a character class have to
        // be moved out of the class, so each class is converted as a unit.
        string converted = characterClass.Replace(
            regexString,
            m => ConvertUTF32CharacterRangesToUTF16Characters(m.Groups[1].Value));

        // Pass 2: whatever is left was written outside of a character class.
        return utf32Character.Replace(
            converted,
            m => ConvertStandaloneUTF32Character(m.Value.Substring(2)));
    }

    /// <summary>
    /// Converts one character class (including its square brackets).
    /// </summary>
    /// <param name="characterClass">Text of the class, e.g. <c>[abc\U00011DEF-\U00013E07]</c>.</param>
    /// <returns>
    /// The class unchanged when it holds no UTF-32 escapes; otherwise a
    /// non-capturing group combining what is left of the class with one
    /// alternative per converted escape or range.
    /// </returns>
    private static string ConvertUTF32CharacterRangesToUTF16Characters(string characterClass)
    {
        var remainder = new StringBuilder(characterClass.Length);
        var alternatives = new StringBuilder();
        int lastEnd = 0;

        for (Match match = utf32Range.Match(characterClass); match.Success; match = match.NextMatch())
        {
            // Keep the text in front of the escape, drop the escape itself.
            remainder.Append(characterClass, lastEnd, match.Index - lastEnd);
            lastEnd = match.Index + match.Length;

            // Substring(2) strips the leading "\U".
            string rangeBegin = match.Groups["begin"].Value.Substring(2);
            Group rangeEnd = match.Groups["end"];
            alternatives.Append(rangeEnd.Success
                ? UTF32RangeToUTF16Chars(rangeBegin, rangeEnd.Value.Substring(2))
                : UTF32ToUTF16Chars(rangeBegin));
        }

        if (alternatives.Length == 0)
        {
            // Nothing to convert. Returning early also keeps a literal "[]"
            // prefix (as in the class "[]a]") from being mistaken for a class
            // that was emptied by the conversion.
            return characterClass;
        }

        remainder.Append(characterClass, lastEnd, characterClass.Length - lastEnd);
        string remainingClass = remainder.ToString();

        if (remainingClass == "[]")
        {
            // Every member was a UTF-32 escape: drop the now empty class and
            // the "|" that precedes the first alternative.
            return "(?:" + alternatives.ToString(1, alternatives.Length - 1) + ")";
        }

        // Wrap the character class and the UTF-16 alternation in a
        // non-capturing group so they behave as one unit.
        return "(?:" + remainingClass + alternatives + ")";
    }

    /// <summary>
    /// Converts a UTF-32 escape found outside of a character class.
    /// </summary>
    /// <param name="hex">Hex digits of the code point (without <c>\U</c>).</param>
    /// <returns>
    /// <c>\uXXXX</c> for a BMP code point, or <c>(?:\uHHHH\uLLLL)</c> for a
    /// supplementary code point.
    /// </returns>
    private static string ConvertStandaloneUTF32Character(string hex)
    {
        int codePoint = ParseCodePoint(hex);
        var result = new StringBuilder();
        if (codePoint >= MinSupplementaryCodePoint)
        {
            // Group the pair so a trailing quantifier covers both code units.
            result.Append("(?:");
            AppendUTF16CodePoint(result, codePoint);
            result.Append(')');
        }
        else
        {
            AppendUTF16CodePoint(result, codePoint);
        }
        return result.ToString();
    }

    /// <summary>
    /// Builds the alternatives (each prefixed with <c>|</c>) that together match
    /// every code point from <paramref name="hexBegin"/> to <paramref name="hexEnd"/>.
    /// </summary>
    /// <param name="hexBegin">Hex digits of the first code point.</param>
    /// <param name="hexEnd">Hex digits of the last code point (inclusive).</param>
    /// <exception cref="ArgumentException">The range is in reverse order.</exception>
    private static string UTF32RangeToUTF16Chars(string hexBegin, string hexEnd)
    {
        int beginCodePoint = ParseCodePoint(hexBegin);
        int endCodePoint = ParseCodePoint(hexEnd);
        if (beginCodePoint > endCodePoint)
        {
            throw new ArgumentException(string.Format(
                CultureInfo.InvariantCulture,
                "UTF-32 range \\U{0:X8}-\\U{1:X8} is in reverse order.",
                beginCodePoint,
                endCodePoint));
        }

        var result = new StringBuilder();
        if (beginCodePoint < MinSupplementaryCodePoint)
        {
            // The part of the range inside the BMP is an ordinary UTF-16 range.
            result.Append('|');
            AppendUTF16Range(result, (char)beginCodePoint, (char)Math.Min(endCodePoint, MaxBmpCodePoint));
            if (endCodePoint < MinSupplementaryCodePoint)
            {
                return result.ToString();
            }
            beginCodePoint = MinSupplementaryCodePoint;
        }

        AppendSupplementaryRange(result, beginCodePoint, endCodePoint);
        return result.ToString();
    }

    /// <summary>
    /// Appends the alternatives for a range whose two ends are both
    /// supplementary code points.
    /// </summary>
    private static void AppendSupplementaryRange(StringBuilder text, int beginCodePoint, int endCodePoint)
    {
        string beginChars = char.ConvertFromUtf32(beginCodePoint);
        string endChars = char.ConvertFromUtf32(endCodePoint);
        char beginHigh = beginChars[0];
        char endHigh = endChars[0];

        if (beginHigh == endHigh)
        {
            // Same high surrogate: \uD807[\uDDEF-\uDFFF]
            text.Append('|');
            AppendUTF16Character(text, beginHigh);
            AppendUTF16Range(text, beginChars[1], endChars[1]);
            return;
        }

        // Different high surrogates, up to three pieces are needed.
        // 1. The remainder of the first high surrogate's block.
        text.Append('|');
        AppendUTF16Character(text, beginHigh);
        AppendUTF16Range(text, beginChars[1], MaxLowSurrogate);

        // 2. Every high surrogate strictly in between, each with all low surrogates.
        int wholeBlocks = endHigh - beginHigh - 1;
        if (wholeBlocks > 0)
        {
            text.Append('|');
            if (wholeBlocks == 1)
            {
                AppendUTF16Character(text, (char)(beginHigh + 1));
            }
            else
            {
                AppendUTF16Range(text, (char)(beginHigh + 1), (char)(endHigh - 1));
            }
            AppendUTF16Range(text, MinLowSurrogate, MaxLowSurrogate);
        }

        // 3. The start of the last high surrogate's block.
        text.Append('|');
        AppendUTF16Character(text, endHigh);
        AppendUTF16Range(text, MinLowSurrogate, endChars[1]);
    }

    /// <summary>
    /// Converts a single code point found inside a character class into an
    /// alternative (prefixed with <c>|</c>).
    /// </summary>
    /// <param name="hex">Hex digits of the code point (without <c>\U</c>).</param>
    private static string UTF32ToUTF16Chars(string hex)
    {
        // Use regex alternation so the surrogate pair is matched as one unit
        // next to the remaining character class.
        var result = new StringBuilder("|");
        AppendUTF16CodePoint(result, ParseCodePoint(hex));
        return result.ToString();
    }

    /// <summary>
    /// Parses the hex digits of a <c>\U</c> escape.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The value is above U+10FFFF.</exception>
    private static int ParseCodePoint(string hex)
    {
        int codePoint = int.Parse(hex, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
        if (codePoint < 0 || codePoint > MaxCodePoint)
        {
            throw new ArgumentOutOfRangeException(
                "pattern",
                "\\U" + hex + " is not a valid Unicode code point.");
        }
        return codePoint;
    }

    /// <summary>
    /// Appends <paramref name="cp"/> as one (BMP) or two (supplementary)
    /// <c>\uXXXX</c> escapes.
    /// </summary>
    private static void AppendUTF16CodePoint(StringBuilder text, int cp)
    {
        if (cp < MinSupplementaryCodePoint)
        {
            // A direct cast keeps lone surrogate values such as \U0000D800 usable.
            AppendUTF16Character(text, (char)cp);
            return;
        }

        string chars = char.ConvertFromUtf32(cp);
        AppendUTF16Character(text, chars[0]);
        AppendUTF16Character(text, chars[1]);
    }

    /// <summary>
    /// Appends the character class <c>[\uFIRST-\uLAST]</c>.
    /// </summary>
    private static void AppendUTF16Range(StringBuilder text, char first, char last)
    {
        text.Append('[');
        AppendUTF16Character(text, first);
        text.Append('-');
        AppendUTF16Character(text, last);
        text.Append(']');
    }

    /// <summary>
    /// Appends <paramref name="c"/> as a <c>\uXXXX</c> escape.
    /// </summary>
    private static void AppendUTF16Character(StringBuilder text, char c)
    {
        text.Append(@"\u");
        // The \u escape requires exactly four hex digits, so pad with zeros.
        text.Append(((int)c).ToString("X4", CultureInfo.InvariantCulture));
    }
}
StringBuilderExtensions.cs
/// <summary>
/// Extension methods that add substring search and index-based replacement
/// to <see cref="StringBuilder"/>.
/// </summary>
public static class StringBuilderExtensions
{
    /// <summary>
    /// Searches for the first occurrence of the specified string. The search
    /// starts at the beginning and moves towards the end.
    /// </summary>
    /// <param name="text">This <see cref="StringBuilder"/>.</param>
    /// <param name="value">The string to find.</param>
    /// <returns>The index of the first occurrence, or -1 if the string isn't found.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text"/> or <paramref name="value"/> is <c>null</c>.
    /// </exception>
    public static int IndexOf(this StringBuilder text, string value)
    {
        return IndexOf(text, value, 0);
    }

    /// <summary>
    /// Searches for the first occurrence of the specified string. The search
    /// starts at the specified offset and moves towards the end.
    /// </summary>
    /// <param name="text">This <see cref="StringBuilder"/>.</param>
    /// <param name="value">The string to find.</param>
    /// <param name="startIndex">The starting offset.</param>
    /// <returns>
    /// The index of the first occurrence at or after <paramref name="startIndex"/>,
    /// or -1 if the string isn't found. An empty <paramref name="value"/> is found
    /// at <paramref name="startIndex"/> as long as that offset is not past the end.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="text"/> or <paramref name="value"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="startIndex"/> is negative.
    /// </exception>
    public static int IndexOf(this StringBuilder text, string value, int startIndex)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }
        if (value == null)
        {
            throw new ArgumentNullException(nameof(value));
        }
        if (startIndex < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(startIndex));
        }

        int length = value.Length;
        if (length == 0)
        {
            // There is no first character to compare; an empty string matches
            // wherever the search starts.
            return startIndex <= text.Length ? startIndex : -1;
        }

        // Last position at which value still fits completely.
        int lastStart = text.Length - length;
        for (int i = startIndex; i <= lastStart; ++i)
        {
            int matched = 0;
            while (matched < length && text[i + matched] == value[matched])
            {
                ++matched;
            }
            if (matched == length)
            {
                return i;
            }
        }
        return -1;
    }

    /// <summary>
    /// Replaces the specified subsequence in this builder with the specified
    /// string.
    /// </summary>
    /// <param name="text">this builder.</param>
    /// <param name="start">the inclusive begin index.</param>
    /// <param name="end">
    /// the exclusive end index; values past the end of the builder are clamped
    /// to <see cref="StringBuilder.Length"/>.
    /// </param>
    /// <param name="str">the replacement string.</param>
    /// <returns>this builder.</returns>
    /// <exception cref="IndexOutOfRangeException">
    /// if <paramref name="start"/> is negative, greater than the current
    /// <see cref="StringBuilder.Length"/> or greater than <paramref name="end"/>.
    /// </exception>
    /// <exception cref="ArgumentNullException">
    /// if <paramref name="str"/> or <paramref name="text"/> is <c>null</c>.
    /// </exception>
    public static StringBuilder Replace(this StringBuilder text, int start, int end, string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException(nameof(str));
        }
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        if (end > text.Length)
        {
            end = text.Length;
        }
        // After clamping, start > end also covers start > text.Length.
        if (start < 0 || start > end)
        {
            throw new IndexOutOfRangeException();
        }

        // When start == end nothing is removed and this is a plain insert.
        text.Remove(start, end - start);
        text.Insert(start, str);
        return text;
    }
}
Do note this is not very well tested and probably not very robust, but for testing purposes it should be fine.