0

I have inherited an application whose input has been scaled up from 50K location records to 1.1 million location records. This has caused serious issues, as the entire file was previously de-serialized into a single object. The size of that object is ~1GB for a production-like file with 1.1 million records. Due to large-object GC issues, I want to keep each de-serialized object below the 85K large-object threshold.

I'm trying to parse out a single location object at a time and de-serialize it so I can control the number of objects that get de-serialized and in turn control the size of the object. I'm using the Json.Net libraries to do this.

Below is a sample of the JSON file that I'm receiving as a stream into my application.

    {
    "Locations": [{
        "LocationId": "",
        "ParentLocationId": "",
        "DisplayFlag": "Y",
        "DisplayOptions": "",
        "DisplayName": "",
        "Address": "",
        "SecondaryAddress": "",
        "City": "",
        "State": "",
        "PostalCode": "",
        "Country": "",
        "Latitude": 40.59485,
        "Longitude": -73.96174,
        "LatLonQuality": 99,
        "BusinessLogoUrl": "",
        "BusinessUrl": "",
        "DisplayText": "",
        "PhoneNumber": "",
        "VenueGroup": 7,
        "VenueType": 0,
        "SubVenue": 0,
        "IndoorFlag": "",
        "OperatorDefined": "",
        "AccessPoints": [{
            "AccessPointId": "",
            "MACAddress": "",
            "DisplayFlag": "",
            "DisplayOptions": "",
            "Latitude": 40.59485,
            "Longitude": -73.96174,
            "Status": "Up",
            "OperatorDefined": "",
            "RoamingGroups": [{
                "GroupName": ""
            },
            {
                "GroupName": ""
            }],
            "Radios": [{
                "RadioId": "",
                "RadioFrequency": "",
                "RadioProtocols": [{
                    "Protocol": ""
                }],
                "WifiConnections": [{
                    "BSSID": "",
                    "ServiceSets": [{
                        "SSID": "",
                        "SSID_Broadcasted": ""
                    }]
                }]
            }]
        }]
    },
    {
        "LocationId": "",
        "ParentLocationId": "",
        "DisplayFlag": "Y",
        "DisplayOptions": "",
        "DisplayName": "",
        "Address": "",
        "SecondaryAddress": "",
        "City": "",
        "State": "",
        "PostalCode": "",
        "Country": "",
        "Latitude": 40.59485,
        "Longitude": -73.96174,
        "LatLonQuality": 99,
        "BusinessLogoUrl": "",
        "BusinessUrl": "",
        "DisplayText": "",
        "PhoneNumber": "",
        "VenueGroup": 7,
        "VenueType": 0,
        "SubVenue": 0,
        "IndoorFlag": "",
        "OperatorDefined": "",
        "AccessPoints": [{
            "AccessPointId": "",
            "MACAddress": "",
            "DisplayFlag": "",
            "DisplayOptions": "",
            "Latitude": 40.59485,
            "Longitude": -73.96174,
            "Status": "Up",
            "OperatorDefined": "",
            "RoamingGroups": [{
                "GroupName": ""
            },
            {
                "GroupName": ""
            }],
            "Radios": [{
                "RadioId": "",
                "RadioFrequency": "",
                "RadioProtocols": [{
                    "Protocol": ""
                }],
                "WifiConnections": [{
                    "BSSID": "",
                    "ServiceSets": [{
                        "SSID": "",
                        "SSID_Broadcasted": ""
                    }]
                }]
            }]
        }]
    }]
}

I need to be able to pull out the individual Location objects, so that I would be looking at the following

    {
    "LocationId": "",
    "ParentLocationId": "",
    "DisplayFlag": "Y",
    "DisplayOptions": "",
    "DisplayName": "",
    "Address": "",
    "SecondaryAddress": "",
    "City": "",
    "State": "",
    "PostalCode": "",
    "Country": "",
    "Latitude": 40.59485,
    "Longitude": -73.96174,
    "LatLonQuality": 99,
    "BusinessLogoUrl": "",
    "BusinessUrl": "",
    "DisplayText": "",
    "PhoneNumber": "",
    "VenueGroup": 7,
    "VenueType": 0,
    "SubVenue": 0,
    "IndoorFlag": "",
    "OperatorDefined": "",
    "AccessPoints": [{
        "AccessPointId": "",
        "MACAddress": "",
        "DisplayFlag": "",
        "DisplayOptions": "",
        "Latitude": 40.59485,
        "Longitude": -73.96174,
        "Status": "Up",
        "OperatorDefined": "",
        "RoamingGroups": [{
            "GroupName": ""
        },
        {
            "GroupName": ""
        }],
        "Radios": [{
            "RadioId": "",
            "RadioFrequency": "",
            "RadioProtocols": [{
                "Protocol": ""
            }],
            "WifiConnections": [{
                "BSSID": "",
                "ServiceSets": [{
                    "SSID": "",
                    "SSID_Broadcasted": ""
                }]
            }]
        }]
    }]
}

I'm trying to use the Json.NET JsonTextReader to accomplish this; however, I cannot get the reader to contain an entire location in its buffer. Due to the size of the records in the stream, the reader initially holds data only as far as "RadioProtocols", which is midway through the object. By the time the stream reaches the end of the object, the reader has discarded the start of the object.

The code I'm using to try to get this functionality to work is

// Attempt to walk the JSON stream token by token and deserialize one
// entry of the "Locations" array at a time.
var ser = new JsonSerializer();
using (var reader = new JsonTextReader(new StreamReader(stream)))
{
    reader.SupportMultipleContent = true;

    while (reader.Read())
    {   
        // Look for the opening brace of an object at depth 2.
        if (reader.TokenType == JsonToken.StartObject && reader.Depth == 2)
        {                            
            // Reads at least one more token, then keeps reading only while
            // the current token is not EndObject AND the depth is still 2.
            // NOTE(review): this moves the reader off the StartObject token
            // matched above, so the Deserialize call below no longer starts
            // at the beginning of the entry. A commenter reports an exception
            // with the reader sitting on the "LocationId" property - confirm.
            do
            {
                reader.Read();                                
            } while (reader.TokenType != JsonToken.EndObject && reader.Depth == 2);

            // NOTE(review): presumably Deserialize should be called while the
            // reader is still on StartObject (i.e. without the loop above).
            var singleLocation = ser.Deserialize<Locations>(reader);
        }
    }
}

Any information on this or an alternative to doing it would be greatly appreciated. As a side note, the way our customers send the information cannot change at this time.

polydegmon
  • 575
  • 1
  • 7
  • 17
  • It sounds like you're going to have to roll your own serializer because the smallest reasonable unit of json that json.NET is going to deserialize will cause you an `OutOfMemoryException`. That being said I think it's completely the wrong approach. I would address the bigger problem, which is obviously your unwieldy data source or insufficient hardware. – evanmcdonnal Jun 12 '15 at 22:11
  • Sadly we can't change the approach at this time, we've basically been told to patch only, or more accurately, "just make it work without changing too much" – polydegmon Jun 12 '15 at 22:26
  • I tried running your code, but I found a problem. Assuming the `Locations` type corresponds to an entry in the `Locations` array, the code throws an exception because the reader is incorrectly positioned on a `"LocationId"` property. Is the idea to enumerate through each entry in the `Locations` array, loading each one individually? – dbc Jun 12 '15 at 23:04

2 Answers

0

When the reader is positioned at the beginning of the object you want to deserialize (an entry in the Locations array in your case), you can just call ser.Deserialize<T>(reader) and it will work, advancing to the end of the object at that level, and no further. Thus the following should iterate through the Location objects in your file, loading each one separately:

    /// <summary>
    /// Lazily deserializes every JSON object that starts at the given reader
    /// depth, one object at a time, so the whole document never has to be
    /// held in memory as a single object graph.
    /// </summary>
    /// <typeparam name="T">Type each matching JSON object is deserialized into.</typeparam>
    /// <param name="textReader">
    /// Source of the JSON text. It is wrapped in a JsonTextReader that is
    /// disposed when enumeration completes or the enumerator is disposed.
    /// </param>
    /// <param name="depth">
    /// Reader depth at which an object start token is treated as an item.
    /// Defaults to 2, which matches the entries of an array stored in a
    /// property of the root object, e.g. { "Locations": [ { ... }, { ... } ] }.
    /// </param>
    /// <returns>
    /// A deferred sequence; each item is deserialized only when the caller
    /// advances the enumeration.
    /// </returns>
    public static IEnumerable<T> DeserializeNestedItems<T>(TextReader textReader, int depth = 2)
    {
        var serializer = new JsonSerializer();
        using (var reader = new JsonTextReader(textReader))
        {
            reader.SupportMultipleContent = true;

            while (reader.Read())
            {
                // Only the opening brace of an object at the requested depth
                // marks an item; every other token is skipped.
                if (reader.TokenType == JsonToken.StartObject && reader.Depth == depth)
                {
                    // Deserialize consumes exactly this object and stops at its
                    // end, so the next Read() moves on to the following token
                    // rather than re-entering the same item.
                    yield return serializer.Deserialize<T>(reader);
                }
            }
        }
    }

And an example of use using your test string:

        // Both entries of the sample "Locations" array should be streamed back.
        Debug.Assert(
            DeserializeNestedItems<Location>(new StringReader(json)).Count() == 2); // Passes.

        // Flatten every location's access points into their coordinates.
        var list = (from location in DeserializeNestedItems<Location>(new StringReader(json))
                    from accessPoint in location.AccessPoints
                    select new { accessPoint.Latitude, accessPoint.Longitude }).ToList();

        // Dump the coordinates as indented JSON to the debug output.
        Debug.WriteLine(JsonConvert.SerializeObject(list, Formatting.Indented));

Which outputs:

[
  {
    "Latitude": 40.59485,
    "Longitude": -73.96174
  },
  {
    "Latitude": 40.59485,
    "Longitude": -73.96174
  }
]

Note - the Location class comes from posting your JSON to http://json2csharp.com/.

dbc
  • 104,963
  • 20
  • 228
  • 340
  • @polydegmon - I'm not 100% sure this answers your question. If an individual `Location` object won't fit in memory, this doesn't help. But from your code you may be having difficulty parsing the JSON incrementally. – dbc Jun 12 '15 at 23:08
0

Thanks for all the help, I've managed to get it doing what I want which is de-serializing individual location objects.

If the item is converted to a JObject, it will read in the full object and de-serialize it; this can be looped to get the solution.

This is the code that was settled on

// Walk the token stream; each object that starts at depth 2 is loaded,
// validated, and either yielded or recorded as an error.
while (reader.Read())
{
    // Anything other than the opening brace of a depth-2 object is skipped.
    if (reader.TokenType != JsonToken.StartObject || reader.Depth != 2)
    {
        continue;
    }

    // Load just this one object from the reader, then map it to a Location.
    location = JObject.Load(reader).ToObject<Location>();

    var locationValidator = new LocationValidator(location, FootprintInfo.OperatorId, FootprintInfo.RoamingGroups, true);
    var validation = locationValidator.IsValid();
    if (!validation.Successful)
    {
        errors.Add(new Error(elNumber, location.LocationId, validation.Error.Field, validation.Error.Detail));

        // Stop the whole enumeration once the error limit has been reached.
        if (errors.Count >= maxErrors)
        {
            yield break;
        }
    }
    else
    {
        yield return location;
    }

    // Counts every processed element, valid or not.
    ++elNumber;
}
polydegmon
  • 575
  • 1
  • 7
  • 17