open xml reading from excel file

Question

I want to implement openXml sdk 2.5 into my project. I do everything in this link

using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using System.IO.Packaging;


static void Main(string[] args)
        {

            String fileName = @"C:\OPENXML\BigData.xlsx";
            // Comment one of the following lines to test the method separately.
            ReadExcelFileDOM(fileName);    // DOM
            //ReadExcelFileSAX(fileName);    // SAX
        }

        // The DOM approach.
        // Note that the code below works only for cells that contain numeric values.
        // 
        static void ReadExcelFileDOM(string fileName)
        {
            using (SpreadsheetDocument spreadsheetDocument = SpreadsheetDocument.Open(fileName, false))
            {
                WorkbookPart workbookPart = spreadsheetDocument.WorkbookPart;
                WorksheetPart worksheetPart = workbookPart.WorksheetParts.First();
                SheetData sheetData = worksheetPart.Worksheet.Elements<SheetData>().First();
                string text;

                int rowCount= sheetData.Elements<Row>().Count();

                foreach (Row r in sheetData.Elements<Row>())
                {
                    foreach (Cell c in r.Elements<Cell>())
                    {
                        text = c.CellValue.Text;
                        Console.Write(text + " ");
                    }
                }
                Console.WriteLine();
                Console.ReadKey();
            }
        }

But i am not getting any row. It hasn't entered loop. Note: I also set up openXml sdk 2.5 my computer

And I find below code this is work for numeric value.For string value it writes 0 1 2 ...

 private static void Main(string[] args)
            {
                var filePath = @"C:/OPENXML/BigData.xlsx";
                using (var document = SpreadsheetDocument.Open(filePath, false))
                {
                    var workbookPart = document.WorkbookPart;
                    var workbook = workbookPart.Workbook;

                    var sheets = workbook.Descendants<Sheet>();
                    foreach (var sheet in sheets)
                    {
                        var worksheetPart = (WorksheetPart)workbookPart.GetPartById(sheet.Id);
                        var sharedStringPart = workbookPart.SharedStringTablePart;
                        //var values = sharedStringPart.SharedStringTable.Elements<SharedStringItem>().ToArray();

                        string text;
                        var rows = worksheetPart.Worksheet.Descendants<Row>();
                        foreach (var row in rows)
                        {
                            Console.WriteLine();
                            int count = row.Elements<Cell>().Count();

                            foreach (Cell c in row.Elements<Cell>())
                            {

                                text = c.CellValue.InnerText;

                                Console.Write(text + " ");

                            }
                        }
                    }
                }

                Console.ReadLine();
            }

The reason it writes 0, 1, 2... is because of the shared string table that Excel uses. See my answer below for a basic approach to accessing the actual text. — shunty, Apr 16 '14 at 10:32

score 76 · Accepted Answer · edited Jan 11 '23 at 17:33

76

Your approach seemed to work ok for me - in that it did "enter the loop". Nevertheless you could also try something like the following:

void Main()
{
    string fileName = @"c:\path\to\my\file.xlsx";

    using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    {
        using (SpreadsheetDocument doc = SpreadsheetDocument.Open(fs, false))
        {
            WorkbookPart workbookPart = doc.WorkbookPart;
            SharedStringTablePart sstpart = workbookPart.GetPartsOfType<SharedStringTablePart>().First();
            SharedStringTable sst = sstpart.SharedStringTable;
        
            WorksheetPart worksheetPart = workbookPart.WorksheetParts.First();
            Worksheet sheet = worksheetPart.Worksheet;
            
            var cells = sheet.Descendants<Cell>();
            var rows = sheet.Descendants<Row>();
        
            Console.WriteLine("Row count = {0}", rows.LongCount());
            Console.WriteLine("Cell count = {0}", cells.LongCount());
        
            // One way: go through each cell in the sheet
            foreach (Cell cell in cells)
            {
                if ((cell.DataType != null) && (cell.DataType == CellValues.SharedString))
                {
                    int ssid = int.Parse(cell.CellValue.Text);
                    string str = sst.ChildElements[ssid].InnerText;
                    Console.WriteLine("Shared string {0}: {1}", ssid, str);
                }
                else if (cell.CellValue != null)
                {
                    Console.WriteLine("Cell contents: {0}", cell.CellValue.Text);
                }
             }
        
             // Or... via each row
             foreach (Row row in rows)
             {
                 foreach (Cell c in row.Elements<Cell>())
                 {
                     if ((c.DataType != null) && (c.DataType == CellValues.SharedString))
                     {
                         int ssid = int.Parse(c.CellValue.Text);
                         string str = sst.ChildElements[ssid].InnerText;
                         Console.WriteLine("Shared string {0}: {1}", ssid, str);
                     }
                     else if (c.CellValue != null)
                     {
                         Console.WriteLine("Cell contents: {0}", c.CellValue.Text);
                     }
                 }
             }
         }
     }
 }

I used the filestream approach to open the workbook because this allows you to open it with shared access - so that you can have the workbook open in Excel at the same time. The Spreadsheet.Open(... method won't work if the workbook is open elsewhere.

Perhaps that is why your code didn't work.

Note, also, the use of the SharedStringTable to get the cell text where appropriate.

EDIT 2018-07-11:

Since this post is still getting votes I should also point out that in many cases it may be a lot easier to use ClosedXML to manipulate/read/edit your workbooks. The documentation examples are pretty user friendly and the coding is, in my limited experience, much more straight forward. Just be aware that it does not (yet) implement all the Excel functions (for example INDEX and MATCH) which may or may not be an issue. [Not that I would want to be trying to deal with INDEX and MATCH in OpenXML anyway.]

edited Jan 11 '23 at 17:33

Wai Ha Lee

8,598
83
57
92

answered Apr 16 '14 at 08:38

shunty

3,699
1
22
27

I think your code absolutely work but When i copy above code to my project it doesn't work and i don't find why it doesn't work. – altandogan Apr 16 '14 at 09:36
1

Then you need to debug it or give a little more information than "it doesn't work". eg at what point does it fail, does it produce an exception, what are the row and cell counts. Also, for example, some Assert statements - like Assert(workbookPart != null); Assert(worksheetPart != null); Assert(sheet != null); etc etc. – shunty Apr 16 '14 at 10:33
it creates object about workbookPart worksheetPart. But when i want to check ResultViews i see it Empty = "Enumeration yielded no results" – altandogan Apr 17 '14 at 10:39
I change openxml version v2.0.50727 and Windowsbase v4.0.30319. could it wrong thing about version? – altandogan Apr 17 '14 at 10:41
hi shunty i try some come like this //var worksheetPart = (WorksheetPart)workbookPart.GetPartById("rId1"); this code gets worksheet i am implementing this your code write screen some cells but not all of them. – altandogan Apr 17 '14 at 11:48
The code above used OpenXML 2.5 and WindowsBase 4.0.30319 - but the 2.0 version of OpenXML would work fine too. OpenXML workbooks only store cell elements for cells that actually have (or had) real data and not all cell elements will have a non-null CellValue property - hence the null checks in the above code. Unused/null cells won't appear in the cells collection and so you won't see them output on screen. – shunty Apr 17 '14 at 13:18
Hi, thanks for advice i want to ask one question too. why do you use filestream reading excel. is it fast than other ? – altandogan Apr 17 '14 at 13:48
1

I mentioned it in the answer - most of the docs online suggest using the SpreadsheetDocument.Open(... approach and this works fine - but will only work if the workbook is not open elsewhere. For what I was doing I needed to be able to have the workbook open in Excel and still read it with OpenXML. The filestream approach allowed me to specify the file open and sharing modes. Obviously this would not be a good idea if trying to write to the workbook but I wanted read-only access via OpenXML. I doubt it's any faster as I presume OpenXML uses a filestream beneath the covers anyway. – shunty Apr 17 '14 at 13:56
Hi shunty, i want to use open xml because when i work with interop objects i get errors like this System.Runtime.InteropServices.COMException (0x80080005): Retrieving the COM class factory for component with CLSID this error occurs when many console application access excel services to read excel. if i use openXML library can i solve these problems ? – altandogan Apr 18 '14 at 12:33
I have no idea if it will solve your problems. I think the Excel COM stuff is not thread safe and I have used multiple threads with OpenXML to read several pages at once - but I do not guarantee that it is meant to work that way. I may just have been lucky! With OpenXML you do have the benefit of avoiding the COM stuff altogether - so it may well help just because it is one less layer to worry about. Maybe. – shunty Apr 22 '14 at 16:41
@shunty to work with OpenXml like read-only mode - you could load file to memory and work there - see http://stackoverflow.com/questions/22101338/openxml-spreadsheetdocument-return-byte-array-for-mvc-file-download – Dmitry Pavlov Aug 04 '15 at 15:41
For me, the main reason was the SharedStringTable that I was not using. In my opinion it should be clearer in the documentation that it doesn't read string values unless using the class mentioned. – Bazil Feb 18 '22 at 17:44

vik · Answer 2 · 2014-09-26T20:36:00.050

I had the same issue as the OP, and the answer above did not work for me.

I think this is the issue: when you create a document in Excel (not programmatically), you have 3 sheets by default and the WorksheetParts that has the row data for Sheet1 is the last WorksheetParts element, not the first.

I figured this out by putting a watch for document.WorkbookPart.WorksheetParts in Visual Studio, expanding Results, then looking at all of the sub elements until I found a SheetData object where HasChildren = true.

Try this:

// open the document read-only
SpreadSheetDocument document = SpreadsheetDocument.Open(filePath, false);
SharedStringTable sharedStringTable = document.WorkbookPart.SharedStringTablePart.SharedStringTable;
string cellValue = null;

foreach (WorksheetPart worksheetPart in document.WorkbookPart.WorksheetParts)
{
    foreach (SheetData sheetData in worksheetPart.Worksheet.Elements<SheetData>())
    {
        if (sheetData.HasChildren)
        {
            foreach (Row row in sheetData.Elements<Row>())
            {
                foreach (Cell cell in row.Elements<Cell>())
                {
                    cellValue = cell.InnerText;

                    if (cell.DataType == CellValues.SharedString)
                    {
                        Console.WriteLine("cell val: " + sharedStringTable.ElementAt(Int32.Parse(cellValue)).InnerText);
                    }
                    else
                    {
                        Console.WriteLine("cell val: " + cellValue);
                    }
                }
            }
        }
    }
}
document.Close();

Thanks, nice example! – Mikhail Zhuikov Feb 27 '19 at 12:00 — Mikhail Zhuikov, Feb 27 '19 at 12:00

Rouzbeh Zarandi · Answer 3 · 2021-06-09T14:10:23.383

Read Large Excel : openxml has two approaches of DOM and SAX to read an excel. the DOM one consume more RAM resource since it loads the whole xml content(Excel file) in Memory but its strong typed approach. SAX in other hand is event base parse. more here

so if you are facing large excel file its better to use SAX.

the below code sample uses SAX approach and also handle two important scenario in excel file reading.

open xml skips the empty cells so your dataset faces displacement and wrong index.
you need to skip the empty rows also.

this function returns the exact actual index of the cell at the time and handle the first scenario. from here

private static int CellReferenceToIndex(Cell cell)
        {
            int index = 0;
            string reference = cell.CellReference.ToString().ToUpper();
            foreach (char ch in reference)
            {
                if (Char.IsLetter(ch))
                {
                    int value = (int)ch - (int)'A';
                    index = (index == 0) ? value : ((index + 1) * 26) + value;
                }
                else
                    return index;
            }
            return index;
        }

code to read excel sax approach.

//i want to import excel to data table
            dt = new DataTable();

            using (SpreadsheetDocument document = SpreadsheetDocument.Open(path, false))
            {

                WorkbookPart workbookPart = document.WorkbookPart;
                WorksheetPart worksheetPart = workbookPart.WorksheetParts.First();

                OpenXmlReader reader = OpenXmlReader.Create(worksheetPart);

                //row counter
                int rcnt = 0;

                while (reader.Read())
                {


                    //find xml row element type 
                    //to understand the element type you can change your excel file eg : test.xlsx to test.zip
                    //and inside that you may observe the elements in xl/worksheets/sheet.xml
                    //that helps to understand openxml better
                    if (reader.ElementType == typeof(Row))
                    {

                        //create data table row type to be populated by cells of this row
                        DataRow tempRow = dt.NewRow();



                        //***** HANDLE THE SECOND SENARIO*****
                        //if row has attribute means it is not a empty row
                        if (reader.HasAttributes)
                        {
                            
                            //read the child of row element which is cells

                            //here first element
                            reader.ReadFirstChild();



                            do
                            {
                                //find xml cell element type 
                                if (reader.ElementType == typeof(Cell))
                                {
                                    Cell c = (Cell)reader.LoadCurrentElement();

                                    string cellValue;

                                    
                                    int actualCellIndex = CellReferenceToIndex(c);

                                    if (c.DataType != null && c.DataType == CellValues.SharedString)
                                    {
                                        SharedStringItem ssi = workbookPart.SharedStringTablePart.SharedStringTable.Elements<SharedStringItem>().ElementAt(int.Parse(c.CellValue.InnerText));

                                        cellValue = ssi.Text.Text;
                                    }
                                    else
                                    {
                                        cellValue = c.CellValue.InnerText;
                                    }



                                    //if row index is 0 its header so columns headers are added & also can do some headers check incase
                                    if (rcnt == 0)
                                    {
                                        dt.Columns.Add(cellValue);
                                    }
                                    else
                                    {
                                        // instead of tempRow[c.CellReference] = cellValue;
                                        tempRow[actualCellIndex] = cellValue;
                                    }

                                    

                                }


                            }
                            while (reader.ReadNextSibling());


                            //if its not the header row so append rowdata to the datatable
                            if (rcnt != 0)
                            {
                                dt.Rows.Add(tempRow);
                            }

                            rcnt++;


                        }


                    }





                }


            }

score 0 · Answer 4 · edited Oct 05 '22 at 10:14

Everything is explained in the accepted answer.
Here is just an extension method to solve the problem

public static string GetCellText(this Cell cell, in SharedStringTable sst)
{
    if (cell.CellValue is null)
        return string.Empty;

    if ((cell.DataType is not null) &&
        (cell.DataType == CellValues.SharedString))
    {
        int ssid = int.Parse(cell.CellValue.Text);
        return sst.ChildElements[ssid].InnerText;
    }

    return cell.CellValue.Text;
}

open xml reading from excel file

4 Answers4

Linked