2

I'd like to find the fastest way to replace all reserved characters in a string with their escaped version.

There are two naive ways that come to my mind spontaneously (note that the set of reserved characters is just an example):

A: Using a lookup dictionary and String.Replace

private Dictionary<string, string> _someEscapeTokens = new Dictionary<string, string>()
{
    {"\t", @"\t"},
    {"\n", @"\n"},
    {"\r", @"\r"}
};

public string GetEscapedStringByNaiveLookUp(string s)
{
    foreach (KeyValuePair<string, string> escapeToken in _someEscapeTokens.Where(kvp => s.Contains(kvp.Key)))
    {
        s = s.Replace(escapeToken.Key, escapeToken.Value);
    }
    return s;
}

B: Traversing each character in the string

public string GetEscapedStringByTraversingCharArray(string s)
{
    char[] chars = s.ToCharArray();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < chars.Length; i++)
    {
        switch (chars[i])
        {
            case '\t':
                sb.Append(@"\t"); break;
            case '\n':
                sb.Append(@"\n"); break;
            case '\r':
                sb.Append(@"\r"); break;
            default:
                sb.Append(chars[i]); break;
         }
    }
    return sb.ToString();
}

As I've already tested, Version B outperforms the first one easily.

Note: I already considered Regex.Escape, but since the character set doesn't match mine, it doesn't fit.

However, are there other ways you would approach this problem (with performance in mind)?

More testing

See below for the code.

Testing was done on two different systems targeting the .NET Framework 4.0. Anyway, the results are pretty much the same:

Char Array (short string) average time: 38600 ns
Foreach (short string) average time: 26680 ns
Char Array (long string) average time: 48,1 ms
Foreach (long string) average time: 64,2 ms
Char Array (escaping only) average time: 13,6 ms
Foreach (escaping only) average time: 17,3 ms

Which leads me to the conclusion that the foreach version seems to be slightly faster for short strings, but it somehow "falls of" for longer strings. However, we're talking about really small differences here.

Testing code:

private static void Main(string[] args)
{
    //around 700 characters
    string shortString = new StackTrace().ToString();
    string longString;
    string pureEscape;
    //loading from a file with 1000000 words http://loremipsum.de/
    using (StreamReader sr = new StreamReader(@"C:\users\ekrueger\desktop\LoremIpsum.txt"))
    {
        longString = sr.ReadToEnd();
    }
    //text file containing only escapable characters (length ~1000000)
    using (StreamReader sr = new StreamReader(@"C:\users\ekrueger\desktop\PureEscape.txt"))
    {
        pureEscape = sr.ReadToEnd();
    }
    List<double> timesCharArrayShortString = new List<double>();
    List<double> timesForeachShortString = new List<double>();
    List<long> timesCharArrayLongString = new List<long>();
    List<long> timesForeachLongString = new List<long>();
    List<long> timesCharArrayPureEscape = new List<long>();
    List<long> timesForeachPureEscape = new List<long>();
    Stopwatch sw = new Stopwatch();

    for (int i = 0; i < 10; i++)
    {
        sw.Restart();
        GetEscapedStringByTraversingCharArray(shortString);
        sw.Stop();
        timesCharArrayShortString.Add(sw.Elapsed.TotalMilliseconds * 1000000);
    }

    for (int i = 0; i < 10; i++)
    {
        sw.Restart();
        GetEscapedStringForeach(shortString);
        sw.Stop();
        timesForeachShortString.Add(sw.Elapsed.TotalMilliseconds * 1000000);
    }

    for (int i = 0; i < 10; i++)
    {
        sw.Restart();
        GetEscapedStringByTraversingCharArray(longString);
        sw.Stop();
        timesCharArrayLongString.Add(sw.ElapsedMilliseconds);
    }

    for (int i = 0; i < 10; i++)
    {
        sw.Restart();
        GetEscapedStringForeach(longString);
        sw.Stop();
        timesForeachLongString.Add(sw.ElapsedMilliseconds);
    }

    for (int i = 0; i < 10; i++)
    {
        sw.Restart();
        GetEscapedStringByTraversingCharArray(pureEscape);
        sw.Stop();
        timesCharArrayPureEscape.Add(sw.ElapsedMilliseconds);
    }

    for (int i = 0; i < 10; i++)
    {
        sw.Restart();
        GetEscapedStringForeach(pureEscape);
        sw.Stop();
        timesForeachPureEscape.Add(sw.ElapsedMilliseconds);
    }

    Console.WriteLine("Char Array (short string) average time: {0} ns", timesCharArrayShortString.Average());
    Console.WriteLine("Foreach (short string) average time: {0} ns", timesForeachShortString.Average());
    Console.WriteLine("Char Array (long string) average time: {0} ms", timesCharArrayLongString.Average());
    Console.WriteLine("Foreach (long string) average time: {0} ms", timesForeachLongString.Average());
    Console.WriteLine("Char Array (escaping only) average time: {0} ms", timesCharArrayPureEscape.Average());
    Console.WriteLine("Foreach (escaping only) average time: {0} ms", timesForeachPureEscape.Average());

    Console.Read();
}

private static string GetEscapedStringByTraversingCharArray(string s)
{
    if (String.IsNullOrEmpty(s))
        return s;

    char[] chars = s.ToCharArray();
    StringBuilder sb = new StringBuilder(s.Length);
    for (int i = 0; i < chars.Length; i++)
    {
        switch (chars[i])
        {
            case '\t':
                sb.Append(@"\t"); break;
            case '\n':
                sb.Append(@"\n"); break;
            case '\r':
                sb.Append(@"\r"); break;
            case '\f':
                sb.Append(@"\f"); break;
            default:
                sb.Append(chars[i]); break;
        }
    }
    return sb.ToString();
}

public static string GetEscapedStringForeach(string s)
{
    if (String.IsNullOrEmpty(s))
        return s;

    StringBuilder sb = new StringBuilder(s.Length);
    foreach (Char ch in s)
    {
        switch (ch)
        {
            case '\t':
                sb.Append(@"\t"); break;
            case '\n':
                sb.Append(@"\n"); break;
            case '\r':
                sb.Append(@"\r"); break;
            default:
                sb.Append(ch); break;
        }
    }
    return sb.ToString();
}
Peter Mortensen
  • 30,738
  • 21
  • 105
  • 131
EKrueger
  • 730
  • 8
  • 20
  • Did you take a look at http://stackoverflow.com/questions/323640/can-i-convert-a-c-sharp-string-value-to-an-escaped-string-literal?rq=1? You can then run performance comparisons and see which implementation is the fastest. And if you could share the results we would all be thankful... – qqbenq May 13 '14 at 12:53
  • We spent a lot of time testing a similar problem. We didn't get any faster than the StringBuilder method which was pretty much identical to yours. – spender May 13 '14 at 12:56
  • 2
    Don't forget that a string is already addressable by index, so ToCharArray is redundant and chums memory for no gain. – spender May 13 '14 at 12:58
  • 1
    Yes, so change the for loop to: `foreach (char ch in s)` and then do `switch (ch)` – Matthew Watson May 13 '14 at 12:59
  • @qqbenq I didn't tried this solution because of Timwi's comment in this answer http://stackoverflow.com/a/324812/1409903. – EKrueger May 13 '14 at 13:10
  • As side comment, don't forget to escape escape (`'\'` -> `@"\\"`), otherwise you will have decoding problem if `t`, `n` or `r` will follow normal escape char. – Sinatr May 13 '14 at 13:31
  • @Sinatr Good point although this character set was just to showcase my problem. – EKrueger May 13 '14 at 13:40
  • @spender Whilst testing I also looked at this and there seems to be a slight advantage using char[] instead of string indexer (at least on my setup). – EKrueger May 14 '14 at 08:54
  • @EKrueger lesur has an answer there which fixes that problem (http://stackoverflow.com/a/14719643/1552016) – qqbenq May 14 '14 at 11:27

2 Answers2

3

You have no need to convert string into Char[], so the solution can be slightly improved:

public string GetEscapedStringByTraversingCharArray(string s) {
  // do not forget about null...
  if (String.IsNullOrEmpty(s))
    return s;

  // we can be sure, that it requires at least s.Length symbols, let's allocate then
  // in case we want to trade memory for speed we can even put
  //   StringBuilder sb = new StringBuilder(2 * s.Length);
  // for the worst case when all symbols should be escaped
  StringBuilder sb = new StringBuilder(s.Length);

  foreach(Char ch in s) {
    switch (ch) {
      case '\t':
        sb.Append(@"\t"); 

        break;
      case '\n':
        sb.Append(@"\n"); 

        break;
      case '\r':
        sb.Append(@"\r"); 

        break;
      default:
        sb.Append(ch); 

        break;
    }
  }

  return sb.ToString();
}
Dmitry Bychenko
  • 180,369
  • 20
  • 160
  • 215
  • I've added some test results to my answer. However i liked pointing out the String.IsNullOrEmpty and initialization of the StringBuilder. – EKrueger May 14 '14 at 08:55
2

It makes sense that the first option is slower since you're creating a lot of string objects with:

s = s.Replace(escapeToken.Key, escapeToken.Value);

In the second method, there's no need to create a char[] because string has an indexer too. Perhaps the only thing you can do to improve performance is initialize the StringBuilder with a capacity, so it doesn't need to resize. You can still use the Dictionary in the second method.

Dennis_E
  • 8,751
  • 23
  • 29