0

I am trying to convert an xlsx file to csv by doing the following:

    public static bool saveAsCsv(string excelFilePath, string destinationCsvPath)
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        
        using (var stream = new FileStream(excelFilePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            IExcelDataReader? reader = null;

            if (excelFilePath.EndsWith(".xls"))
            {
                reader = ExcelReaderFactory.CreateBinaryReader(stream);
            }
            else if (excelFilePath.EndsWith(".xlsx"))
            {
                reader = ExcelReaderFactory.CreateOpenXmlReader(stream);
            }

            if (reader == null)
                return false;

            var ds = reader.AsDataSet(new ExcelDataSetConfiguration()
            {
                ConfigureDataTable = (tableReader) => new ExcelDataTableConfiguration()
                {
                    UseHeaderRow = false
                }
            });

            var csvContent = string.Empty;
            int row_no = 0;
            while (row_no < ds.Tables[0].Rows.Count)
            {
                var arr = new List<string>();
                for (int i = 0; i < ds.Tables[0].Columns.Count; i++)
                {
#pragma warning disable CS8604 // Possible null reference argument.
                        arr.Add(ds.Tables[0].Rows[row_no][i].ToString());
#pragma warning restore CS8604 // Possible null reference argument.
                }
                row_no++;
                csvContent += string.Join(",", arr) + "\n";
            }

            StreamWriter csv = new StreamWriter(destinationCsvPath, false);
            csv.Write(csvContent);
            csv.Close();
            return true;
        }
    }

This does work, however, the issue I am having is that when it reaches a field that has 2 values separated by commas, it counts them as 2 separate fields.

So for example:

Instead of having:

Test A, test B in 1 field it has them in separate fields, what do I need to change so that this doesn't happen?

lross15
  • 147
  • 2
  • 14
  • I strongly recommend using a library like CsvHelper to produce your CSV output. It will automatically handle escaping the commas correctly, and will likely simplify your code significantly. – StriplingWarrior Jun 06 '22 at 15:29
  • Using a third-party library will also probably help you avoid other mistakes. For example, right now you're concatenating a string for every row in your CSV: if you have very many rows that's going to use _a lot_ of extra memory and CPU time that's totally unnecessary. You should be streaming rows to your output, instead. – StriplingWarrior Jun 06 '22 at 15:32
  • Did not realise you could convert xlsx to csv using that library, thank you, I'll look into that – lross15 Jun 06 '22 at 15:40
  • 1
    I was just referring to the CSV-writing portion of your code, not the XLSX-reading part. But honestly, there's a decent chance the library you're using to read from the Excel file already has a way to save to CSV: e.g. https://stackoverflow.com/a/2538091/120955 – StriplingWarrior Jun 06 '22 at 17:43
  • ah, my mistake. I'll have a look to what you linked, thanks for that, it's much appreciated – lross15 Jun 07 '22 at 08:06

1 Answers1

1

Wrap the values in quotes and escape any quotes in the values with another quote:

csvContent += string.Join(",", arr.Select(s => $"\"{s.Replace("\"", "\"\"")}\"") + "\n";

Alternatively, leave that line as it was and make the modifications when adding the values to arr:

arr.Add($"\"{ds.Tables[0].Rows[row_no][i].ToString().Replace("\"", "\"\"")}\"");

You could streamline that code quite a bit, once you have your DataSet:

var table = ds.Tables[0];
var lines = table.Rows
                 .Cast<DataRow>()
                 .Select(dr => string.Join(",",
                                           dr.ItemArray
                                             .Select(o => $"\"{o.ToString().Replace("\"", "\"\"")}\"")));

File.WriteAllLines(destinationCsvPath, lines);

Note that you can replace table.Rows.Cast<DataRow> with table.AsEnumerable().

user18387401
  • 2,514
  • 1
  • 3
  • 8