1

Suppose I have an input string:

 // Sample input: three comma-separated fields; the second one is quoted and contains a comma.
 String test = "\"item one\",\"item,2\",12345";
 // Split(',') breaks on every comma, including the one inside the quoted field.
 String[] arr = test.Split(',');

The result is:

[0]: "item one"
[1]: "item"
[2]: "2"
[3]: "12345"

But I want:

[0]: "item one"
[1]: "item,2"
[2]: "12345"

Basically, my input string uses quotes so that a comma inside a quoted field is treated as data, not as a delimiter.

What's the best way to parse this line into an array?

PhillyNJ
  • 3,859
  • 4
  • 38
  • 64

2 Answers

3

Below is a simple and complete CsvHelper which implements encoding and decoding of lines to/from CSV data lines. In my last project I wanted to use FastCSVReader, but it seemed quite complicated inside, and because I was not aiming for the fastest and most robust solution possible, I wrote my own little helper.

/// <summary>
/// Small CSV helper that encodes an array of fields into one CSV line and decodes
/// one CSV line back into its fields. The delimiter is a comma; a field may be
/// enclosed in double quotes, and a double quote inside a quoted field is escaped
/// by doubling it.
/// </summary>
public static class CsvHelper
{
    /// <summary>
    /// Characters that force a field to be written in quotes: the quote itself,
    /// the delimiter, and line breaks (an unquoted line break would split the record).
    /// </summary>
    private static readonly char[] CharsRequiringQuotes = { '"', ',', '\r', '\n' };

    /// <summary>
    /// States of the field reader used by <see cref="ReadField"/>.
    /// </summary>
    private enum FieldState
    {
        /// <summary>No character of the field has been consumed yet.</summary>
        Start,

        /// <summary>Inside a field that did not start with a quotation mark.</summary>
        Unquoted,

        /// <summary>Inside a field that started with a quotation mark.</summary>
        InQuotes,

        /// <summary>Just after the closing quotation mark of a quoted field.</summary>
        AfterClosingQuote
    }

    #region Public methods

    /// <summary>
    /// Encodes fields as a single line for a csv file.
    /// </summary>
    /// <param name="fields">Field values; a null element is written as an empty field.</param>
    /// <returns>
    /// The encoded line, or null when <paramref name="fields"/> is null or has no elements.
    /// </returns>
    public static string CodeLine(string[] fields)
    {
        if (fields == null || fields.Length == 0)
        {
            return null;
        }

        var sb = new StringBuilder(1024);
        for (int i = 0; i < fields.Length; i++)
        {
            if (i > 0)
            {
                sb.Append(',');
            }

            string value = fields[i];
            if (value == null)
            {
                // A null field contributes nothing between its delimiters.
                continue;
            }

            if (value.IndexOfAny(CharsRequiringQuotes) >= 0)
            {
                // Quote the field and double every embedded quotation mark.
                sb.Append('"');
                sb.Append(value.Replace("\"", "\"\""));
                sb.Append('"');
            }
            else
            {
                sb.Append(value);
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Decodes a line from a csv file into its fields.
    /// </summary>
    /// <param name="line">The line to decode.</param>
    /// <param name="fields">
    /// Receives the decoded fields on success; set to null when decoding fails.
    /// </param>
    /// <returns>
    /// True if decoding was successful; false when <paramref name="line"/> is null or
    /// empty, or when any field is malformed.
    /// </returns>
    public static bool DecodeLine(string line, out string[] fields)
    {
        fields = null;

        if (string.IsNullOrEmpty(line))
        {
            return false;
        }

        int index = 0;
        var res = new List<string>();
        while (index < line.Length)
        {
            string field;
            if (!ReadField(line, ref index, out field))
            {
                return false;
            }

            res.Add(field);
        }

        // ReadField consumes the delimiter that ends a field, so a line that ends
        // with a comma still has one more (empty) field after that comma.
        if (line[line.Length - 1] == ',')
        {
            res.Add(string.Empty);
        }

        fields = res.ToArray();
        return true;
    }

    #endregion

    #region Other methods

    /// <summary>
    /// Reads one field starting at <paramref name="index"/> and advances
    /// <paramref name="index"/> past the field and the comma that ends it, if any.
    /// </summary>
    /// <param name="line">The line being decoded.</param>
    /// <param name="index">Position of the first character of the field; updated while reading.</param>
    /// <param name="field">Receives the field text on success; null on failure.</param>
    /// <returns>
    /// True when a well-formed field was read. False when <paramref name="index"/> is
    /// at or past the end of the line, when an unquoted field contains a quotation mark,
    /// when a quoted field is not closed, or when a closing quotation mark is followed
    /// by anything other than a comma or the end of the line.
    /// </returns>
    private static bool ReadField(string line, ref int index, out string field)
    {
        field = null;

        if (index >= line.Length)
        {
            return false;
        }

        var sb = new StringBuilder(512);
        FieldState state = FieldState.Start;
        while (index < line.Length)
        {
            char c = line[index];
            index++;

            switch (state)
            {
                case FieldState.Start:
                    if (c == '"')
                    {
                        // Opening quotation mark: the field text is enclosed in quotes.
                        state = FieldState.InQuotes;
                    }
                    else if (c == ',')
                    {
                        // Delimiter right away: empty field.
                        field = sb.ToString();
                        return true;
                    }
                    else
                    {
                        state = FieldState.Unquoted;
                        sb.Append(c);
                    }

                    break;

                case FieldState.Unquoted:
                    if (c == '"')
                    {
                        // A quotation mark is not allowed in the middle of an unquoted field.
                        return false;
                    }

                    if (c == ',')
                    {
                        field = sb.ToString();
                        return true;
                    }

                    sb.Append(c);
                    break;

                case FieldState.InQuotes:
                    if (c != '"')
                    {
                        sb.Append(c);
                    }
                    else if (index < line.Length && line[index] == '"')
                    {
                        // Doubled quotation mark: emit one literal quote and consume both.
                        sb.Append('"');
                        index++;
                    }
                    else
                    {
                        state = FieldState.AfterClosingQuote;
                    }

                    break;

                case FieldState.AfterClosingQuote:
                    if (c != ',')
                    {
                        // Only a delimiter may follow the closing quotation mark.
                        return false;
                    }

                    field = sb.ToString();
                    return true;
            }
        }

        // End of line: the field is complete only if it was unquoted or its quotes were closed.
        if (state == FieldState.Unquoted || state == FieldState.AfterClosingQuote)
        {
            field = sb.ToString();
            return true;
        }

        return false;
    }

    #endregion
}
user2126375
  • 1,594
  • 12
  • 29
0

I would probably use a regular expression like (["])[^"]*\1|(\d+), but of course other formats might need to be considered with this approach.

Dalorzo
  • 19,834
  • 7
  • 55
  • 102
  • You need to deal with the field delimiter ",", which can appear in a data field if it is escaped, plus the special cases of quotation marks in a data field, empty data fields, etc. Writing a simple csv parser without support for quotation marks is trivial (string.Split is enough), but writing a fully functional csv writer is not so trivial (though still relatively easy :-)). In fact I tried a regular expression first, but a state machine was the fastest way to implement a functional csv parser without the side effects that are easily overlooked in a regex pattern, where all the rules live in one pattern string – user2126375 Apr 30 '14 at 04:40