2

I need to split a csv file by comma apart from where the columns is between quote marks. However, what I have here does not seem to be achieving what I need and comma's in columns are being split into separate array items.

    public List<string> GetData(string dataFile, int row)
    {
        try
        {
            var lines = File.ReadAllLines(dataFile).Select(a => a.Split(';'));
            var csv = from line in lines select (from piece in line select piece.Split(',')).ToList();
            var foo = csv.ToList();
            var result = foo[row][0].ToList();
            return result;
        }
        catch
        {
            return null;
        }
    }

    private const string QUOTE = "\"";
    private const string ESCAPED_QUOTE = "\"\"";
    private static char[] CHARACTERS_THAT_MUST_BE_QUOTED = { ',', '"', '\n' };

    public static string Escape(string s)
    {
        if (s.Contains(QUOTE))
            s = s.Replace(QUOTE, ESCAPED_QUOTE);

        if (s.IndexOfAny(CHARACTERS_THAT_MUST_BE_QUOTED) > -1)
            s = QUOTE + s + QUOTE;

        return s;
    }

I am not sure where I can use my escape function in this case.

Example:

Degree,Graduate,08-Dec-17,Level 1,"Advanced, Maths"

The string Advanced, Maths are being split into two different array items which I don't want

Em Jay
  • 63
  • 7

3 Answers3

0

Not sure how this performes - but you can solve that with Linq.Aggregate like this:

using System;
using System.Linq;
using System.Collections.Generic;

public class Program
{
    public static IEnumerable<string> SplitIt(
        char[] splitters, 
        string text, 
        StringSplitOptions opt = StringSplitOptions.None)
    {
        bool inside = false;
        var result = text.Aggregate(new List<string>(), (acc, c) =>
        {
            // this will check each char of your given text
            // and accumulate it in the (empty starting) string list
            // your splitting chars will lead to a new item put into 
            // the list if they are not inside. inside starst as false
            // and is flipped anytime it hits a "
            // at end we either return all that was parsed or only those
            // that are neither null nor "" depending on given opt's
            if (!acc.Any()) // nothing in yet
            {
                if (c != '"' && (!splitters.Contains(c) || inside))
                    acc.Add("" + c);

                else if (c == '"')
                    inside = !inside;

                else if (!inside && splitters.Contains(c)) // ",bla"
                    acc.Add(null);

                return acc;
            }

            if (c != '"' && (!splitters.Contains(c) || inside))
                acc[acc.Count - 1] = (acc[acc.Count - 1] ?? "") + c;

            else if (c == '"')
                inside = !inside;

            else if (!inside && splitters.Contains(c)) // ",bla"
                acc.Add(null);

            return acc;
        }

        );
        if (opt == StringSplitOptions.RemoveEmptyEntries)
            return result.Where(r => !string.IsNullOrEmpty(r));

        return result;
    }

    public static void Main()
    {
        var s = ",,Degree,Graduate,08-Dec-17,Level 1,\"Advanced, Maths\",,";
        var spl = SplitIt(new[]{','}, s);
        var spl2 = SplitIt(new[]{','}, s, StringSplitOptions.RemoveEmptyEntries);
        Console.WriteLine(string.Join("|", spl));
        Console.WriteLine(string.Join("|", spl2));
    }
}

Output:

|Degree|Graduate|08-Dec-17|Level 1|Advanced, Maths||
Degree|Graduate|08-Dec-17|Level 1|Advanced, Maths
Patrick Artner
  • 50,409
  • 9
  • 43
  • 69
0

You could use regex, linq or just loop through each character and use Booleans to figure out what the current behaviour should be. This question actually got me thinking, as I'd previously just looped through and acted on each character. Here is Linq way of breaking an entire csv document up, assuming the end of line can be found with ';':

private static void Main(string[] args)
    {
        string example = "\"Hello World, My name is Gumpy!\",20,male;My sister's name is Amy,29,female";

        var result1 = example.Split(';')
                             .Select(s => s.Split('"')) // This will leave anything in abbreviation marks at odd numbers
                             .Select(sl => sl.Select((ss, index) => index % 2 == 0 ? ss.Split(',') : new string[] { ss })) // if it's an even number split by a comma
                             .Select(sl => sl.SelectMany(sc => sc));

        Console.WriteLine("Press any key to continue.");
        Console.ReadKey();
    }
Monofuse
  • 735
  • 6
  • 14
0

The function gets comma separated fields within a string, excluding commas embedded in a quoted field

The assumptions

  • It should return empty fields ,,
  • There are no quotes within a quote field (as per the example)

The method

  • I uses a for loop with i as a place holder of the current field
  • It scans for the next comma or quote and if it finds a quote it scans for the next comma to create the field
  • It needed to be efficient otherwise we would use regex or Linq
  • The OP didn't want to use a CSV library

Note : There is no error checking, and scanning each character would be faster this was just easy to understand

Code

public List<string> GetFields(string line)
{
    var list = new List<string>();

    for (var i = 0; i < line.Length; i++)
    {
        var firstQuote = line.IndexOf('"', i);
        var firstComma = line.IndexOf(',', i);

        if (firstComma >= 0)
        {
            // first comma is before the first quote, then its just a standard field
            if (firstComma < firstQuote || firstQuote == -1)
            {
                list.Add(line.Substring(i, firstComma - i));
                i = firstComma;
                continue;
            }

            // We have found quote so look for the next comma afterwards
            var nextQuote = line.IndexOf('"', firstQuote + 1);
            var nextComma = line.IndexOf(',', nextQuote + 1);

            // if we found a comma, then we have found the end of this field
            if (nextComma >= 0)
            {
                list.Add(line.Substring(i, nextComma - i));
                i = nextComma;
                continue;
            }
        }

        list.Add(line.Substring(i)); // if were are here there are no more fields
        break;

    }
    return list;
}

Tests 1

Degree,Graduate,08-Dec-17,Level 1,"Advanced, Maths",another

Degree
Graduate
08-Dec-17
Level 1
"Advanced, Maths"
another

Tests 2

,Degree,Graduate,08-Dec-17,\"asdasd\",Level 1,\"Advanced, Maths\",another

<Empty Line>
Degree
Graduate
08-Dec-17
"asdasd"
Level 1
"Advanced, Maths"
another
TheGeneral
  • 79,002
  • 9
  • 103
  • 141