This should return the maximal substring that starts at index startIndex,
has a length of at most length,
and consists only of "complete" graphemes. So surrogate pairs split at the start or end are removed, initial combining marks are removed, and final characters that would lose their combining marks are removed.
Note that this probably isn't exactly what you asked for: you seem to want to use graphemes as the unit of measure (or perhaps you want to include the last grapheme even if that makes the result longer than the length
parameter).
/// <summary>
/// String extensions for extracting substrings without splitting text elements (graphemes).
/// </summary>
public static class StringEx
{
    /// <summary>
    /// Returns the longest run of complete text elements that lies entirely inside the
    /// UTF-16 range [<paramref name="startIndex"/>, <paramref name="startIndex"/> + <paramref name="length"/>).
    /// </summary>
    /// <remarks>
    /// A leading lone low surrogate or leading combining marks (left over when
    /// <paramref name="startIndex"/> falls inside a text element) are dropped, and a trailing
    /// text element that would extend past the end of the range is dropped as well, so the
    /// result may be shorter than <paramref name="length"/>.
    /// </remarks>
    /// <param name="str">The source string.</param>
    /// <param name="startIndex">Zero-based index of the first UTF-16 code unit of the range.</param>
    /// <param name="length">Maximum number of UTF-16 code units in the result.</param>
    /// <returns>The extracted substring; <see cref="string.Empty"/> when <paramref name="length"/> is 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="startIndex"/> or <paramref name="length"/> is negative, or the range
    /// does not fit inside <paramref name="str"/>.
    /// </exception>
    public static string UnicodeSafeSubstring(this string str, int startIndex, int length)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (startIndex < 0 || startIndex > str.Length)
        {
            throw new ArgumentOutOfRangeException("startIndex");
        }

        if (length < 0)
        {
            throw new ArgumentOutOfRangeException("length");
        }

        // Written as a subtraction so that a huge length cannot overflow the sum
        // and slip through the range check.
        if (length > str.Length - startIndex)
        {
            throw new ArgumentOutOfRangeException("length");
        }

        if (length == 0)
        {
            return string.Empty;
        }

        var sb = new StringBuilder(length);

        // Exclusive end of the requested range, as an absolute index into str.
        int end = startIndex + length;

        // Absolute index just past the text element currently being examined.
        int position = startIndex;

        var enumerator = StringInfo.GetTextElementEnumerator(str, startIndex);

        while (enumerator.MoveNext())
        {
            string grapheme = enumerator.GetTextElement();
            position += grapheme.Length;

            // The element would cross the end of the range: drop it and stop.
            if (position > end)
            {
                break;
            }

            // Skip initial Low Surrogates/Combining Marks
            if (sb.Length == 0)
            {
                if (char.IsLowSurrogate(grapheme[0]))
                {
                    continue;
                }

                UnicodeCategory cat = char.GetUnicodeCategory(grapheme, 0);

                if (cat == UnicodeCategory.NonSpacingMark || cat == UnicodeCategory.SpacingCombiningMark || cat == UnicodeCategory.EnclosingMark)
                {
                    continue;
                }
            }

            sb.Append(grapheme);

            // The range is exactly filled; nothing more can fit.
            if (position == end)
            {
                break;
            }
        }

        return sb.ToString();
    }
}
Variant that simply includes "extra" characters at the end of the substring when they are needed to complete a grapheme:
/// <summary>
/// String extensions for extracting substrings without splitting text elements (graphemes).
/// </summary>
public static class StringEx
{
    /// <summary>
    /// Returns the text elements that start inside the UTF-16 range
    /// [<paramref name="startIndex"/>, <paramref name="startIndex"/> + <paramref name="length"/>),
    /// keeping the last one whole even if it extends past the end of the range.
    /// </summary>
    /// <remarks>
    /// A leading lone low surrogate or leading combining marks (left over when
    /// <paramref name="startIndex"/> falls inside a text element) are dropped. Because the
    /// final text element is never cut, the result may be longer than <paramref name="length"/>.
    /// </remarks>
    /// <param name="str">The source string.</param>
    /// <param name="startIndex">Zero-based index of the first UTF-16 code unit of the range.</param>
    /// <param name="length">Nominal number of UTF-16 code units in the range.</param>
    /// <returns>The extracted substring; <see cref="string.Empty"/> when <paramref name="length"/> is 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="startIndex"/> or <paramref name="length"/> is negative, or the range
    /// does not fit inside <paramref name="str"/>.
    /// </exception>
    public static string UnicodeSafeSubstring(this string str, int startIndex, int length)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        if (startIndex < 0 || startIndex > str.Length)
        {
            throw new ArgumentOutOfRangeException("startIndex");
        }

        if (length < 0)
        {
            throw new ArgumentOutOfRangeException("length");
        }

        // Written as a subtraction so that a huge length cannot overflow the sum
        // and slip through the range check.
        if (length > str.Length - startIndex)
        {
            throw new ArgumentOutOfRangeException("length");
        }

        if (length == 0)
        {
            return string.Empty;
        }

        var sb = new StringBuilder(length);

        // Exclusive end of the requested range, as an absolute index into str.
        int end = startIndex + length;

        // Absolute index of the next text element to be examined.
        int position = startIndex;

        var enumerator = StringInfo.GetTextElementEnumerator(str, startIndex);

        while (enumerator.MoveNext())
        {
            // Only elements that start inside the range are considered; an element
            // that starts inside but ends outside is still taken whole below.
            if (position >= end)
            {
                break;
            }

            string grapheme = enumerator.GetTextElement();
            position += grapheme.Length;

            // Skip initial Low Surrogates/Combining Marks
            if (sb.Length == 0)
            {
                if (char.IsLowSurrogate(grapheme[0]))
                {
                    continue;
                }

                UnicodeCategory cat = char.GetUnicodeCategory(grapheme, 0);

                if (cat == UnicodeCategory.NonSpacingMark || cat == UnicodeCategory.SpacingCombiningMark || cat == UnicodeCategory.EnclosingMark)
                {
                    continue;
                }
            }

            sb.Append(grapheme);
        }

        return sb.ToString();
    }
}
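This returns what you asked for: "Hello😀 world!".UnicodeSafeSubstring(0, 6) == "Hello😀" (the emoji is a surrogate pair, so the result is 7 UTF-16 code units long even though length is 6).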
Note: It's worth pointing out that both of these solutions rely on StringInfo.GetTextElementEnumerator. This method didn't work as expected prior to a fix in .NET 5, so if you're on an earlier version of .NET, it will split more complex multi-character emojis.