4

I'm downloading a file from a 3rd party server, like so:

' Requests the remote XML document and buffers the complete response body
' into a single string (responseString).
Try
    req = DirectCast(HttpWebRequest.Create("https://www.example.com/my.xml"), HttpWebRequest)
    req.Timeout = 100000 '100 seconds
    Resp = DirectCast(req.GetResponse(), HttpWebResponse)
    reader = New StreamReader(Resp.GetResponseStream)
    ' ReadToEnd returns the whole response body as one String.
    responseString = reader.ReadToEnd()
Catch ex As Exception
    ' NOTE(review): the exception is swallowed here - nothing is logged or rethrown.
End Try

The file my.xml is 1.2GB and I'm getting the error "Exception of type 'System.OutOfMemoryException' was thrown." When I open Windows Task Manager I see memory usage is at just 70% of total available memory and IIS Worker Process is not growing in size to use full system memory. Then I found this: https://learn.microsoft.com/en-us/archive/blogs/tom/chat-question-memory-limits-for-32-bit-and-64-bit-processes, so the 70% failure sounds about right.

So now I'm considering splitting the file in more manageable smaller chunks. However, how can I do this without creating separate files? Is there a way to load for example 100MB into memory each time (respecting XML node endings) or perhaps by reading X number of XML nodes each time?

When I Google on "Read large XML file from webserver without splitting in smaller chunks" I get nothing but file splitting tools.

UPDATE 1

Based on Lex Li's suggestion I searched and found this tutorial: https://learn.microsoft.com/en-us/dotnet/standard/linq/perform-streaming-transform-large-xml-documents

So I translated the code, which works as per the tutorial:

' Lazily yields every "Item" element found below a "Customer" element of the
' document at `uri`, reading forward-only through an XmlReader.
' Each yielded Item is attached to a temporary "Root" element that also holds a
' copy of the customer's "Name" element, so callers can reach it via item.Parent.
Private Shared Iterator Function StreamCustomerItem(ByVal uri As String) As IEnumerable(Of XElement)
    Using reader As XmlReader = XmlReader.Create(uri)
        Dim name As XElement = Nothing
        Dim item As XElement = Nothing
        reader.MoveToContent()

        ' Outer scan: advance node by node until a Customer start tag is found.
        While reader.Read()

            If reader.NodeType = XmlNodeType.Element AndAlso reader.Name = "Customer" Then

                ' Step 1: read forward to the first Name element and materialize it.
                While reader.Read()

                    If reader.NodeType = XmlNodeType.Element AndAlso reader.Name = "Name" Then
                        name = TryCast(XElement.ReadFrom(reader), XElement)
                        Exit While
                    End If
                End While

                ' Step 2: materialize each following Item until an end tag is reached.
                While reader.Read()
                    ' The first EndElement seen here ends the item scan
                    ' (presumably the closing Customer tag - depends on the input layout).
                    If reader.NodeType = XmlNodeType.EndElement Then Exit While

                    If reader.NodeType = XmlNodeType.Element AndAlso reader.Name = "Item" Then
                        item = TryCast(XElement.ReadFrom(reader), XElement)

                        If item IsNot Nothing Then
                            ' Give the item a small parent holding a copy of Name before yielding it.
                            Dim tempRoot As XElement = New XElement("Root", New XElement(name))
                            tempRoot.Add(item)
                            Yield item
                        End If
                    End If
                End While
            End If
        End While
    End Using
End Function

' Streams the customer items from the remote source and writes one flattened
' <Item> element (customer name + key) per source item into files\Output.xml.
Private Shared Sub Main()
    ' Deferred projection: nothing is read until the For Each below pulls items.
    Dim flattened As IEnumerable(Of XElement) =
        StreamCustomerItem("https://www.example.com/source.xml").Select(
            Function(el) New XElement("Item",
                                      New XElement("Customer", CStr(el.Parent.Element("Name"))),
                                      New XElement(el.Element("Key"))))

    Dim writerSettings As New XmlWriterSettings() With {
        .OmitXmlDeclaration = True,
        .Indent = True
    }

    Dim outputPath As String = HttpContext.Current.Server.MapPath("files\") + "Output.xml"

    Using writer As XmlWriter = XmlWriter.Create(outputPath, writerSettings)
        writer.WriteStartElement("Root")

        For Each flatItem As XElement In flattened
            flatItem.WriteTo(writer)
        Next

        writer.WriteEndElement()
    End Using
End Sub

The example above transforms the source.xml in an output.xml, but all I want is to read product nodes exactly as is (no transformation needed) and in such a way that it reads in individual nodes so I can process large XML files.

I tried to rewrite it so it extracts values from my XML just like the structure. First I tried just getting something ready from my xml file like so:

' Attempt at streaming values out of mysource.xml: scans forward to the first
' "Id" element, tries to materialize it and the node after it, and yields the
' second one if it is an element.
Private Shared Iterator Function StreamCustomerItem(ByVal uri As String) As IEnumerable(Of XElement)
    Using reader As XmlReader = XmlReader.Create(uri)
        Dim name As XElement = Nothing
        Dim item As XElement = Nothing
        reader.MoveToContent()

        While reader.Read()
            If reader.NodeType = XmlNodeType.Element AndAlso reader.Name = "Id" Then
                ' Materializes the Id element; the reader moves on past it.
                name = TryCast(XElement.ReadFrom(reader), XElement)
                ' NOTE(review): at this point the reader is presumably on the whitespace
                ' that follows the closing Id tag, not on an element. If so, ReadFrom does
                ' not return an XElement, TryCast gives Nothing and nothing is yielded -
                ' TODO confirm against the XNode.ReadFrom documentation.
                item = TryCast(XElement.ReadFrom(reader), XElement)

                If item IsNot Nothing Then
                    Dim tempRoot As XElement = New XElement("Root", New XElement(name))
                    tempRoot.Add(item)
                    Yield item
                End If

                ' Leaves the scan after the first Id, so at most one element is ever yielded.
                Exit While
            End If
        End While
    End Using
End Function

' Projects every element streamed from mysource.xml into a nested
' <product><product>Id</product></product> element and writes the results
' below a <Root> element in files\Output.xml.
Private Shared Sub Main()
    ' Deferred projection: evaluated only while the For Each below enumerates it.
    Dim projected As IEnumerable(Of XElement) =
        StreamCustomerItem("https://www.example.com/mysource.xml").Select(
            Function(el) New XElement("product", New XElement("product", CStr(el.Parent.Element("Id")))))

    Dim writerSettings As New XmlWriterSettings() With {
        .OmitXmlDeclaration = True,
        .Indent = True
    }

    Dim outputPath As String = HttpContext.Current.Server.MapPath("files\") + "Output.xml"

    Using writer As XmlWriter = XmlWriter.Create(outputPath, writerSettings)
        writer.WriteStartElement("Root")

        For Each node As XElement In projected
            node.WriteTo(writer)
        Next

        writer.WriteEndElement()
    End Using
End Sub

That just writes <Root /> to my output.xml though

mysource.xml

<?xml version="1.0" encoding="UTF-8" ?>
<products>
    <product>
        <Id>
            <![CDATA[122854]]>
        </Id>
        <Type>
            <![CDATA[restaurant]]>
        </Type>
        <features>
            <wifi>
                <![CDATA[included]]>
            </wifi>
        </features>         
    </product>
</products>

So to summarize my question: how can I read individual product nodes as-is from "mysource.xml" without loading the full file into memory?

UPDATE 1

' Lazily yields every "product" element of the XML document at `uri`, one at a
' time, so that only the product currently being processed is held in memory.
'
' uri:     file path or URL accepted by XmlReader.Create.
' Returns: a deferred sequence; each item is a complete product element
'          including all of its child nodes.
Private Shared Iterator Function StreamCustomerItem(ByVal uri As String) As IEnumerable(Of XElement)
    Using reader As XmlReader = XmlReader.Create(uri)
        reader.MoveToContent()

        While Not reader.EOF
            If reader.NodeType = XmlNodeType.Element AndAlso reader.Name = "product" Then
                ' ReadFrom consumes the whole element and leaves the reader on the
                ' node that follows it, so Read() must NOT be called in this branch -
                ' otherwise a directly adjacent <product> would be skipped.
                Yield DirectCast(XElement.ReadFrom(reader), XElement)
            Else
                reader.Read()
            End If
        End While
    End Using
End Function


' Streams the product elements of source.xml and prints each one to the console.
Private Shared Sub Main()
    ' StreamCustomerItem yields LINQ to XML XElement objects (not System.Xml.XmlElement),
    ' so the sequence and the loop variable must both be typed as XElement.
    Dim products As IEnumerable(Of XElement) = StreamCustomerItem("source.xml")

    For Each product As XElement In products
        'here loop through `product` element
        Console.WriteLine(product)
    Next
End Sub

My full test file via Onion Share (use TOR browser to download):

http://jkntfybog2s5cc754sn7mujvyaawdqxd4q5imss66x3hsos34rrbjrid.onion Key: YLTDQSDHTBWGDGQ6FIADTN2K7GFOFT5R7SFKWKTDER3WETD7EMKA

Adam
  • 6,041
  • 36
  • 120
  • 208
  • 2
    `ReadToEnd` is the mistake you made here. As you got a stream object, utilize it to write to a local file in stream API (which internally handles the chunks for you). A lot of discussions are out there on stream reader and file stream which you can use as references. – Lex Li Aug 12 '22 at 01:13
  • @LexLi: thanks. I added update 1, thinking that is what you meant, maybe you could have a look? – Adam Aug 13 '22 at 20:10
  • 1
    You could use XNode.ReadFrom(XmlReader) Method https://learn.microsoft.com/en-us/dotnet/api/system.xml.linq.xnode.readfrom?redirectedfrom=MSDN&view=net-6.0#System_Xml_Linq_XNode_ReadFrom_System_Xml_XmlReader_ it will help you to read the data without loading the very large file into memory. – Jalpa Panchal Aug 17 '22 at 09:57
  • @JalpaPanchal thank you. That example looks much simpler. However it selects a string value, whereas I want to retrieve each full `product` element including all its child elements from a large file. Please see my update 1, what am I doing wrong? – Adam Aug 17 '22 at 20:56
  • @JalpaPanchal care to take another look at this? :) – Adam Aug 26 '22 at 10:27
  • 1
    Do you have a real 1.2G file to test? Or how can we create one that matches your test. – Simon Mourier Sep 30 '22 at 07:10
  • @SimonMourier good call, I shared the XML file link – Adam Oct 02 '22 at 11:15
  • @Adam - I can't download it (tried with Brave in Tor mode) – Simon Mourier Oct 02 '22 at 15:19
  • Hmm, I actually have Brave Browser in Tor mode but run into a bug that it can't connect to Tor network at all (https://community.brave.com/t/tor-status-is-always-disconnected/337876/27). In the Tor browser application itself it does work, could you maybe install and try that? – Adam Oct 02 '22 at 17:35
  • 1
    Ok, I've successfully downloaded the file and tested on a local web server. If you want each product split as an "inner" xml fragment, just use `ReadOuterXml` or `ReadInnerXml` on the `product` element. Here is code that works fine (it's C# but easily translatable to VB.NET using online converters) https://pastebin.com/raw/yTV42sPu it tells me you have exactly 36889 products in the file w/o allocating more than what's needed both on server and client – Simon Mourier Oct 03 '22 at 07:15
  • Thanks. I tried converting your code using https://converter.telerik.com/, but it throws an error. Also I see `var resp = req.GetResponse()` does that not load the full file (undesirably)? – Adam Oct 03 '22 at 08:00
  • Thanks! That seems to work :) Please add it as an answer so I can award the bounty. Quick question: which classes would you use if it were a JSON file rather than an XML? – Adam Oct 03 '22 at 09:54
  • 1
    For json, it's similar, use JsonReader for Newtonsoft.Json and Utf8JsonReader for System.Text.Json. Always use reader to avoid allocating memory. – Simon Mourier Oct 03 '22 at 10:28
  • @SimonMourier I made an attempt, but I am stuck. Could you have a look please at what I'm doing wrong? https://stackoverflow.com/questions/74027865/read-single-row-and-attributes-from-huge-json-file-via-jsontextreader – Adam Oct 11 '22 at 12:06

3 Answers3

1

The important thing is to make sure you never load the whole file, but "stream" (in the general sense, stream bytes, characters, xml nodes, etc.) everything from end to end (ie: server to client here).

For network bytes, it means you must use a raw Stream object.

For Xml nodes, it means you can use an XmlReader (not an XmlDocument which loads a full document object model from a stream). In this case, you can use an XmlTextReader which "Represents a reader that provides fast, non-cached, forward-only access to XML data".

Here is a C# piece of code (that can easily be translated to VB.NET) that does this, but can still build an intermediary small Xml document for each product in the big Gb file, using XmlReader methods ReadInnerXml and/or ReadOuterXml:

// Streams a very large XML document from the server and handles one <product>
// at a time; only the current product is ever materialized in memory.
var req = (HttpWebRequest)WebRequest.Create("https://www.yourserver.com/spotahome_1.xml");
using (var resp = req.GetResponse())
using (var stream = resp.GetResponseStream())
using (var xml = new XmlTextReader(stream))
{
    var count = 0;
    xml.MoveToContent();
    while (!xml.EOF)
    {
        if (xml.NodeType == XmlNodeType.Element && xml.Name == "product")
        {
            // using XmlDocument is ok here since we know
            // a product is not too big
            // but we could continue with the reader too
            var product = new XmlDocument();

            // ReadOuterXml leaves the reader positioned AFTER </product>, so we must
            // not call Read() again in this branch: doing so would swallow the next
            // node and skip every second product when products are directly adjacent
            // (no whitespace between </product> and <product>).
            product.LoadXml(xml.ReadOuterXml());
            Console.WriteLine(count++);
        }
        else
        {
            xml.Read();
        }
    }
}

PS: Ideally, you could use async / await code with Async counterparts methods ReadInnerXmlAsync / ReadOuterXmlAsync but this is another story and easy to setup.

Simon Mourier
  • 132,049
  • 21
  • 248
  • 298
-1

Did you check out this documentation from Microsoft yet? https://learn.microsoft.com/en-us/dotnet/standard/linq/stream-xml-fragments-xmlreader

I had a similar issue, but reading a large JSON file. What I did there was read the token representing the start of a product and iterate through those tokens. This way you won't load the entire file in memory. I believe the same solution can be achieved in XML also.

Hope it helps.

SimpForJS
  • 88
  • 9
-1

This is a bit of an old-school approach, but I usually keep track of the XPATH address of where I am inside the XML file, then use the XPATH to work out what to do with the value.

Imports System.Xml

' Reads the XML file forward-only with an XmlReader, tracking the path of the
' element currently open, and fills one Product per /products/product element.
Module Program
  ' Prints every product found in the file, followed by "FINISHED".
  Sub Main(args As String())
    Dim filename = "C:\Junk\Junk.xml"
    Using reader As XmlReader = XmlReader.Create(filename)
      ' Slash-separated path of the elements that are currently open.
      Dim xpath = ""
      Dim currentProduct As Product = Nothing
      Do While reader.Read
        Select Case reader.NodeType
          Case XmlNodeType.Element
            ' A self-closing element (<x/>) has no matching EndElement node, so it
            ' is never pushed onto the path.
            If Not reader.IsEmptyElement Then
              xpath &= "/" & reader.Name
              ' Only a freshly opened <product> starts a new record. This check must
              ' live inside the branch above: for an empty child such as <features/>
              ' the path is still "/products/product", which would otherwise flush
              ' and replace the product that is being filled.
              If xpath = "/products/product" Then
                If currentProduct IsNot Nothing Then
                  Console.WriteLine(currentProduct)
                End If
                currentProduct = New Product
              End If
            End If
          Case XmlNodeType.EndElement
            ' Pop the element that was just closed.
            xpath = xpath.Substring(0, xpath.LastIndexOf("/"))
          Case XmlNodeType.CDATA
            ' Values are taken from CDATA sections; the current path decides the target.
            Select Case xpath
              Case "/products/product/Id"
                currentProduct.Id = reader.Value
              Case "/products/product/Type"
                currentProduct.ProductType = reader.Value
              Case "/products/product/features/wifi"
                If reader.Value = "included" Then
                  currentProduct.Wifi = True
                End If
            End Select
        End Select
      Loop
      ' The last product is not followed by another <product>, so flush it here.
      If currentProduct IsNot Nothing Then
        Console.WriteLine(currentProduct)
      End If
    End Using
    Console.WriteLine("FINISHED")
  End Sub

  ' Values extracted for a single product element.
  Class Product
    Public Property Id As String
    Public Property ProductType As String
    Public Property Wifi As Boolean
    ' Formats the product as "Id-ProductType-Wifi".
    Public Overrides Function ToString() As String
      Return $"{Id}-{ProductType}-{Wifi}"
    End Function
  End Class
End Module
SSS
  • 4,807
  • 1
  • 23
  • 44
  • Thanks, I however don't want to create a class, I just want to extract the full `product` element as a new XMLNode as I read through the file. Also, your code does not work for large files, it does not use `reader.MoveToContent()`. The bounty will be awarded to a working code sample for large files – Adam Aug 27 '22 at 08:10