1

We had a code which was loading the Excel XLSX document into the memory, doing some modifications with it and saving it back.

XmlDocument doc = new XmlDocument();
doc.Load(pp.GetStream());
XmlNode rootNode = doc.DocumentElement;

if (rootNode == null) return;
ProcessNode(rootNode);

if (this.fileModified)
{
    doc.Save(pp.GetStream(FileMode.Create, FileAccess.Write));
}

This was working good with small files, but throwing OutOfMemory exceptions with some large Excel files. So we decided to change the approach and use XmlReader class to not load the file into the memory at once.

PackagePartCollection ppc = this.Package.GetParts();
foreach (PackagePart pp in ppc)
{
     if (!this.xmlContentTypesXlsx.Contains(pp.ContentType)) continue;

     using (XmlReader reader = XmlReader.Create(pp.GetStream()))
     {
          reader.MoveToContent();
          while (reader.EOF == false)
          {
             XmlDocument doc;
             XmlNode rootNode;
             if (reader.NodeType == XmlNodeType.Element && reader.Name == "hyperlinks")
             {
                   doc = new XmlDocument();
                   rootNode = doc.ReadNode(reader);
                   if (rootNode != null)
                   {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode);  // how can I save updated changes back to the file?
                   }
              }
              else if (reader.NodeType == XmlNodeType.Element && reader.Name == "row")
              {
                    doc = new XmlDocument();
                    rootNode = doc.ReadNode(reader);

                    if (rootNode != null)
                    {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode); // how can I save updated changes back to the file?
                    }
              }
              else
              {
                    reader.Read();
              }
          }
     }
}

This reads the file node by node and processes nodes we need (and changes some values there). However, I'm not sure how we can update those values back to the original Excel file. I tried to use XmlWriter together with the XmlReader, but was not able to make it work. Any ideas?

UPDATE:

I tried to use @dbc's suggestions from the comments section, but it seems too slow to me. It probably will not throw OutOfMemory exceptions for huge files, but processing will take forever.

PackagePartCollection ppc = this.Package.GetParts();
foreach (PackagePart pp in ppc)
{
     if (!this.xmlContentTypesXlsx.Contains(pp.ContentType)) continue;

     StringBuilder strBuilder = new StringBuilder();
     
     using (XmlReader reader = XmlReader.Create(pp.GetStream()))
     {
        using (XmlWriter writer = this.Package.FileOpenAccess == FileAccess.ReadWrite ? XmlWriter.Create(strBuilder) : null)
        {
          reader.MoveToContent();
          while (reader.EOF == false)
          {
             XmlDocument doc;
             XmlNode rootNode;
             if (reader.NodeType == XmlNodeType.Element && reader.Name == "hyperlinks")
             {
                   doc = new XmlDocument();
                   rootNode = doc.ReadNode(reader);
                   if (rootNode != null)
                   {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode);
                        writer?.WriteRaw(rootNode.OuterXml);
                   }
              }
              else if (reader.NodeType == XmlNodeType.Element && reader.Name == "row")
              {
                    doc = new XmlDocument();
                    rootNode = doc.ReadNode(reader);

                    if (rootNode != null)
                    {
                        doc.AppendChild(rootNode);
                        ProcessNode(rootNode);
                        writer?.WriteRaw(rootNode.OuterXml);
                    }
              }
              else
              {
                    WriteShallowNode(writer, reader); // Used from the @dbc's suggested stackoverflow answers
                    reader.Read();
              }
            }

            writer?.Flush();
         }
      }
}

NOTE 1: I'm using StringBuilder for the test, but was planning to switch to a temp file in the end. NOTE 2: I tried flushing the XmlWriter after every 100 elements, but it's still slow.

Any ideas?

Chuck Norris
  • 15,207
  • 15
  • 92
  • 123
  • 1
    You could do streaming transformations from `XmlReader` to `XmlWriter` e.g. as shown in [Edit a large XML file](https://stackoverflow.com/q/48574631/3744182) and [Automating replacing tables from external files](https://stackoverflow.com/q/28891440/3744182). – dbc Jun 28 '21 at 16:33
  • You will have to stream to some temporary file then later replace the original with the modified version, you can't read from and write to the same stream at the same time with `XmlReader` and `XmlWriter`. – dbc Jun 28 '21 at 16:39
  • Do those two questions answer your also? Or do you need more specific help? If so might you [edit] your question to show where you specifically have trouble with the streaming transform? – dbc Jun 28 '21 at 16:40
  • @dbc Thanks, I need some time to investigate the answers in those questions to see how can I easily modify the current code to do that. I would like to make it work with minimal number of changes if possible. One more thing I want to mention is this.fileModified property - in the first code you can see that we update the file only when there are actual changes. I would like to use the same logic for the XmlReader method as well (I know that it can be complicated though...), so we do not change file modify date if there are no actual changes. – Chuck Norris Jun 28 '21 at 16:44
  • 1
    `XmlReader` is forward-only so if you want to create a temp output file if and only if necessary, you will need to make two passes. – dbc Jun 28 '21 at 16:57
  • On the other hand, it will be easy to keep track of whether you modified anything and only copy the temp file into the the XLSX file if a change was actually made, then delete the temp file either way. – dbc Jun 28 '21 at 17:09
  • @dbc I tried to re-use those answers and make the code to work with minimal changes. It seems to work, but it's super-slow. Any ideas? Do you see any drawback in the updated code above? Do you have any ideas how can we speed up the process? – Chuck Norris Jun 29 '21 at 13:26
  • 1
    I would get rid of `writer?.WriteRaw(rootNode.OuterXml);` and use `if (writer != null) { rootNode.WriteContentTo(writer); }`. Also LINQ-to-XML is a little faster than `XmlDocument` so you might switch to that. And why are you allowing a null `writer`? What is the purpose of the code if you don't write anything? Are you making two passes, one to check to see if anything will be modified? If so, on your first pass you could just return `true` as soon as a node requiring modification is found. – dbc Jun 29 '21 at 14:57
  • Other than that I think you need to profile it, you may be comparing apples to oranges because your old code worked entirely in memory. I believe that the XML inside a package part is compressed, so maybe the time is taken by compression? `XmlWriter` creates a `FileStream` with a buffer size of 4096 bytes so try creating your own `FileStream` with larger buffers. Or write to a temp file then later copy into the package. – dbc Jun 29 '21 at 15:00
  • Any why are you manually flushing the writer at all? Writing in small chunks will lead to worse compression results. Instead let the framework flush its buffer when necessary. According to [Always wrap GZipStream with BufferedStream](https://faithlife.codes/blog/2012/06/always-wrap-gzipstream-with-bufferedstream/) by Bradley Grainger, the optimum buffer size seems to be around 8192 bytes. You want to avoid buffers 80000 bytes or larger because the will go on the large object heap. – dbc Jun 29 '21 at 15:04
  • @dbc We have read-only processes too where we just need to read the file. That's why XmlWriter object is nullable. "rootNode.WriteContentTo(writer);" seems not correct, since it does not write "row" attributes and just writes inner text. "rootNode.WriteTo(writer);" seems much better. I will try your other suggestions today, thanks a lot! – Chuck Norris Jun 29 '21 at 16:16
  • 1
    Sorry, my mistake, I should have suggested [`XmlElement.WriteTo(XmlWriter)`](https://learn.microsoft.com/en-us/dotnet/api/system.xml.xmlelement.writeto?view=net-5.0#System_Xml_XmlElement_WriteTo_System_Xml_XmlWriter_). I'm so used to using `XElement` now that I'm a bit rusty with the old XML DOM. – dbc Jun 29 '21 at 16:20
  • @dbc I think I'm close to finish this up, thanks for your help a lot! I have one last question - is there any way I can keep the XML header attributes from XmlReader (encoding, version, standalone, etc.) when creating a XmlWriter and temp file? – Chuck Norris Jun 29 '21 at 16:21
  • 1
    You should probably ask another question for that. There is [`XmlWriterSettings.OmitXmlDeclaration`](https://learn.microsoft.com/en-us/dotnet/api/system.xml.xmlwritersettings.omitxmldeclaration?view=net-5.0) so one option might be to just copy the nodes prior to the root node manually with `WriteShallowNode()`. – dbc Jun 29 '21 at 16:35
  • @dbc I think I've figured out this finally with the help of your comments and based on my tests it works quite good now. I've added the working code as "Update 2" in the question. You can post it as an answer to this question (if you don't like something there, feel free to change) so I can mark it as accepted and reward a little bit for your help. Thanks. – Chuck Norris Jun 30 '21 at 17:03

2 Answers2

0

Try following. I've been use for a long time with huge xml files that give Out of Memory

           using (XmlReader reader = XmlReader.Create("File Stream", readerSettings))
            {
                while (!reader.EOF)
                {
                    if (reader.Name != "row")
                    {
                        reader.ReadToFollowing("row");

                    }
                    if (!reader.EOF)
                    {
                        XElement row = (XElement)XElement.ReadFrom(reader);
                    }
                }
              }
            }
jdweng
  • 33,250
  • 2
  • 15
  • 20
  • 1
    What is the difference between this code and my "read-only" code? How does this solve the issue of updating values in the XML and saving them back? – Chuck Norris Jun 28 '21 at 20:26
  • I know my code solves the Out Of Memory issue. I think you need to create a new XDocument and modify the XElement row and then add to new XDocument. – jdweng Jun 29 '21 at 01:04
0

I did some more modifications with @dbc's help and now it works as I wanted.

PackagePartCollection ppc = this.Package.GetParts();
foreach (PackagePart pp in ppc)
{
  try
  {
     if (!this.xmlContentTypesXlsx.Contains(pp.ContentType)) continue;

     string tempFilePath = GetTempFilePath();
     
     using (XmlReader reader = XmlReader.Create(pp.GetStream()))
     {
        using (XmlWriter writer = this.Package.FileOpenAccess == FileAccess.ReadWrite ? XmlWriter.Create(tempFilePath) : null)
        {
          while (reader.EOF == false)
          {
             if (reader.NodeType == XmlNodeType.Element && reader.Name == "hyperlinks")
             {
                   XmlDocument doc = new XmlDocument();
                   XmlNode rootNode = doc.ReadNode(reader);
                   if (rootNode != null)
                   {
                        ProcessNode(rootNode);
                        if (writer != null)
                        {
                            rootNode.WriteTo(writer);
                        }
                   }
              }
              else if (reader.NodeType == XmlNodeType.Element && reader.Name == "row")
              {
                    XmlDocument doc = new XmlDocument();
                    XmlNode rootNode = doc.ReadNode(reader);

                    if (rootNode != null)
                    {
                        ProcessNode(rootNode);
                        if (writer != null)
                        {
                            rootNode.WriteTo(writer);
                        }
                    }
              }
              else
              {
                    WriteShallowNode(writer, reader); // Used from the @dbc's suggested StackOverflow answers
                    reader.Read();
              }
            }
         }
      }


      if (this.packageChanged) // is being set in ProcessNode method
      {
          this.packageChanged = false;

          using (var tempFile = File.OpenRead(tempFilePath))
          {
               tempFile.CopyTo(pp.GetStream(FileMode.Create, FileAccess.Write));
          }
       }
   }
   catch (OutOfMemoryException)
   {
        throw;
   }
   catch (Exception ex)
   {
      Log.Exception(ex, @"Failed to process a file."); // our inner log method
   }
   finally
   {
       if (!string.IsNullOrWhiteSpace(tempFilePath))
       {
            // Delete temp file
       }
   }
}
Chuck Norris
  • 15,207
  • 15
  • 92
  • 123