1

Using JSON.NET I am reading JSON objects in an array from a large file. As the JSON object is read, it is conditionally converted to the destination class, and returned as an item in an IEnumerable.

I use an IEnumerable to allow me to "pull" objects from the file and process them as they are read, avoiding having to read all objects into memory.

I use a similar technique when reading rows from a CSV file, where I use CsvHelper ShouldSkipRecord() to conditionally process the row in the CSV file.

I have not found a way to filter the JSON objects as they are read from the array, so I end up deserializing the whole array into a List and then using LINQ Where to filter the objects before they are converted and added to the IEnumerable. The problem is that this reads all the objects into memory before the Where clause runs, defeating the purpose of using IEnumerable.

I know I can manually read each object and then process it, but I am looking for a more elegant way: some form of callback that lets me pull records while the callback filters out the records I do not want.

E.g. how I filter rows in a CSV file:

/// <summary>
/// Decides whether a parsed CSV row should be dropped before further processing.
/// </summary>
/// <param name="fields">The column values of the current CSV row.</param>
/// <returns>
/// <c>true</c> when the row has fewer than 10 columns or its tenth column is null or empty;
/// otherwise <c>false</c>.
/// </returns>
internal static bool ShouldSkipRecord(string[] fields)
{
    // A complete row has ten columns, e.g.
    //   2019-01-24 20:46:57 UTC,63165,4.43,6.23,6.80,189,-18,81.00,16.00,6.23
    // Temperature and humidity are optional, but the last column (air quality) is required:
    //   2019-01-24 20:47:40 UTC,63166,4.93,5.73,5.73,0,-20,,,5.73
    const int RequiredColumnCount = 10;

    bool isComplete = fields.Length >= RequiredColumnCount
                      && !string.IsNullOrEmpty(fields[RequiredColumnCount - 1]);

    return !isComplete;
}

E.g. how I filter JSON objects:

/// <summary>
/// Converts a deserialized feed entry into a <see cref="PurpleAirData"/> record.
/// </summary>
/// <param name="jsonData">The feed entry to convert; its Field8 text must be a parsable number.</param>
/// <returns>
/// A record with the timestamp and air quality set, plus temperature and humidity
/// when Field6 / Field7 parse as numbers.
/// </returns>
/// <exception cref="System.ArgumentNullException">Field8 is null.</exception>
/// <exception cref="System.FormatException">Field8 is not a valid number.</exception>
internal static PurpleAirData Convert(Feed jsonData)
{
    // The feed values are machine-generated text that uses '.' as the decimal separator
    // (e.g. "1.80"), so they must be parsed with the invariant culture. Parsing with the
    // current culture would mis-read or reject them on systems whose decimal separator is ','.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Same number styles that the culture-sensitive double.Parse/TryParse overloads use by default.
    System.Globalization.NumberStyles styles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

    PurpleAirData data = new PurpleAirData()
    {
        TimeStamp = jsonData.CreatedAt.DateTime,
        // Air quality is required: let a missing or malformed value throw.
        AirQuality = double.Parse(jsonData.Field8, styles, invariant)
    };

    // Temperature and humidity are optional: leave them unset when absent or unparsable.
    if (double.TryParse(jsonData.Field6, styles, invariant, out double val))
        data.Temperature = val;
    if (double.TryParse(jsonData.Field7, styles, invariant, out val))
        data.Humidity = val;

    return data;
}

/// <summary>
/// Reads the "channel" header from the reader and returns a lazily-evaluated sequence of
/// the entries in the following "feeds" array, skipping entries without a Field8 value.
/// </summary>
/// <remarks>
/// The header is consumed immediately; the feed entries are deserialized one at a time while
/// the returned sequence is enumerated, so the whole array is never held in memory.
/// Consequently the sequence can be enumerated only once, and <paramref name="jsonReader"/>
/// must stay open until enumeration has finished.
/// </remarks>
/// <param name="jsonReader">Reader positioned at the start of the JSON document.</param>
/// <returns>The converted feed entries, produced on demand.</returns>
/// <exception cref="JsonSerializationException">
/// The token following the channel object's property is not the start of an array.
/// </exception>
internal static IEnumerable<PurpleAirData> Load(JsonTextReader jsonReader)
{
    // Deserialize objects in parts
    jsonReader.SupportMultipleContent = true;
    JsonSerializer serializer = new JsonSerializer();

    // Read Channel
    // TODO : Add format checking
    // For the expected layout these three reads land on: root object start,
    // the "channel" property name, and the channel object start.
    jsonReader.Read();
    jsonReader.Read();
    jsonReader.Read();
    // The channel value itself is not used; deserializing it advances the reader past the object.
    serializer.Deserialize<Channel>(jsonReader);

    // Read the Feeds: the "feeds" property name, then the array start.
    jsonReader.Read();
    jsonReader.Read();
    if (jsonReader.TokenType != JsonToken.StartArray)
        throw new JsonSerializationException(string.Format("Expected the start of the feeds array but found {0} at path {1}", jsonReader.TokenType, jsonReader.Path));

    return StreamFeeds(jsonReader, serializer);
}

// Iterates the array the reader is positioned on, deserializing, filtering and converting one Feed per step.
private static IEnumerable<PurpleAirData> StreamFeeds(JsonTextReader jsonReader, JsonSerializer serializer)
{
    while (jsonReader.Read() && jsonReader.TokenType != JsonToken.EndArray)
    {
        if (jsonReader.TokenType == JsonToken.Comment)
            continue;

        // Only the current item is materialized; rejected items become garbage immediately.
        Feed feed = serializer.Deserialize<Feed>(jsonReader);
        if (feed != null && !string.IsNullOrEmpty(feed.Field8))
            yield return Convert(feed);
    }
}

Example JSON:

{
   "channel":{
      "id":622370,
      "name":"AirMonitor_e81a",
      "latitude":"0.0",
      "longitude":"0.0",
      "field1":"PM1.0 (ATM)",
      "field2":"PM2.5 (ATM)",
      "field3":"PM10.0 (ATM)",
      "field4":"Uptime",
      "field5":"RSSI",
      "field6":"Temperature",
      "field7":"Humidity",
      "field8":"PM2.5 (CF=1)",
      "created_at":"2018-11-09T00:35:34Z",
      "updated_at":"2018-11-09T00:35:35Z",
      "last_entry_id":65435
   },
   "feeds":[
      {
         "created_at":"2019-01-10T23:56:09Z",
         "entry_id":56401,
         "field1":"1.00",
         "field2":"1.80",
         "field3":"1.80",
         "field4":"369",
         "field5":"-30",
         "field6":"66.00",
         "field7":"59.00",
         "field8":"1.80"
      },
      {
         "created_at":"2019-01-10T23:57:29Z",
         "entry_id":56402,
         "field1":"1.08",
         "field2":"2.44",
         "field3":"3.33",
         "field4":"371",
         "field5":"-32",
         "field6":"66.00",
         "field7":"59.00",
         "field8":"2.44"
      },
      {
         "created_at":"2019-01-26T00:14:04Z",
         "entry_id":64400,
         "field1":"0.27",
         "field2":"0.95",
         "field3":"1.25",
         "field4":"213",
         "field5":"-27",
         "field6":"72.00",
         "field7":"40.00",
         "field8":"0.95"
      }
   ]
}

Example JSON:

[
{
    "monthlyrainin": 0.01,
    "humidityin": 42,
    "eventrainin": 0,
    "humidity": 29,
    "maxdailygust": 20.13,
    "dateutc": 1549476900000,
    "battout": "1",
    "lastRain": "2019-02-05T19:21:00.000Z",
    "dailyrainin": 0,
    "tempf": 52.2,
    "winddir": 286,
    "totalrainin": 0.01,
    "dewPoint": 20.92,
    "baromabsin": 29.95,
    "hourlyrainin": 0,
    "feelsLike": 52.2,
    "yearlyrainin": 0.01,
    "uv": 1,
    "weeklyrainin": 0.01,
    "solarradiation": 157.72,
    "windspeedmph": 0,
    "tempinf": 73.8,
    "windgustmph": 0,
    "battin": "1",
    "baromrelin": 30.12,
    "date": "2019-02-06T18:15:00.000Z"
},
{
    "dewPoint": 20.92,
    "tempf": 52.2,
    "maxdailygust": 20.13,
    "humidityin": 42,
    "windspeedmph": 4.03,
    "eventrainin": 0,
    "tempinf": 73.6,
    "feelsLike": 52.2,
    "dateutc": 1549476600000,
    "windgustmph": 4.92,
    "hourlyrainin": 0,
    "monthlyrainin": 0.01,
    "battin": "1",
    "humidity": 29,
    "totalrainin": 0.01,
    "baromrelin": 30.12,
    "winddir": 314,
    "lastRain": "2019-02-05T19:21:00.000Z",
    "yearlyrainin": 0.01,
    "baromabsin": 29.94,
    "dailyrainin": 0,
    "battout": "1",
    "uv": 1,
    "solarradiation": 151.86,
    "weeklyrainin": 0.01,
    "date": "2019-02-06T18:10:00.000Z"
}]

Is there a way in JSON.NET to filter objects as they are read?

dbc
  • 104,963
  • 20
  • 228
  • 340
PieterV
  • 555
  • 1
  • 4
  • 18
  • What does your JSON file format look like? – dbc Feb 08 '19 at 04:14
  • You might start with, say, [Issues parsing a 1GB json file using JSON.NET](https://stackoverflow.com/q/30812828/3744182) and [Deserialize json array stream one item at a time](https://stackoverflow.com/q/20374083/3744182) but without knowing your file format it's hard to say for sure. – dbc Feb 08 '19 at 04:31
  • I added some example JSON snippets. – PieterV Feb 11 '19 at 19:44

1 Answer

1

What you can do is to adopt the basic approaches of "Issues parsing a 1GB json file using JSON.NET" and "Deserialize json array stream one item at a time", which is to stream through the array and yield return each item; but in addition apply a where expression to filter incomplete items, or a select clause to transform some intermediate deserialized object such as a JObject or a DTO to your final data model. By applying the where clause during streaming, unwanted objects will never get added to the list being deserialized, and thus will get cleaned up by the garbage collector during streaming. Filtering array contents while streaming can be done at the root level, when the root JSON container is an array, or as part of some custom JsonConverter for List<T> when the array to be deserialized is nested within some outer JSON.

As a concrete example, consider your first JSON example. You would like to deserialize it to a data model that looks like:

/// <summary>
/// One air-quality reading. The two required values are supplied through the constructor;
/// the optional values default to <c>null</c>.
/// </summary>
public class PurpleAirData
{
    /// <summary>Required: the time stamp passed to the constructor.</summary>
    public DateTime CreatedAt { get; set; }

    /// <summary>Required: the air quality value passed to the constructor.</summary>
    public double AirQuality { get; set; }

    /// <summary>Optional temperature; <c>null</c> when not set.</summary>
    public double? Temperature { get; set; }

    /// <summary>Optional humidity; <c>null</c> when not set.</summary>
    public double? Humidity { get; set; }

    /// <summary>
    /// Creates a reading from its two required values.
    /// </summary>
    /// <param name="createdAt">Value stored in <see cref="CreatedAt"/>.</param>
    /// <param name="airQuality">Value stored in <see cref="AirQuality"/>.</param>
    public PurpleAirData(DateTime createdAt, double airQuality)
    {
        CreatedAt = createdAt;
        AirQuality = airQuality;
    }
}

/// <summary>
/// Root of the deserialized file: the "channel" object plus the "feeds" array.
/// </summary>
public class RootObject
{
    // Property names are lower-case, matching the "channel" and "feeds" keys of the JSON document.
    public Channel channel { get; set; } // Define this using http://json2csharp.com/
    public List<PurpleAirData> feeds { get; set; }
}

To do this, first introduce the following extension methods:

/// <summary>
/// Helpers for reading a JSON array one item at a time.
/// </summary>
public static partial class JsonExtensions
{
    /// <summary>
    /// Lazily deserializes each item of the JSON array the reader is positioned on.
    /// </summary>
    /// <typeparam name="T">Type each array item is deserialized to.</typeparam>
    /// <param name="serializer">Serializer used for the individual items.</param>
    /// <param name="reader">Reader positioned on (or before) the array; a null token yields an empty sequence.</param>
    /// <returns>The array items, produced as the sequence is enumerated.</returns>
    /// <exception cref="JsonSerializationException">The current token is neither null nor the start of an array.</exception>
    /// <exception cref="JsonReaderException">The input ends before the array is closed.</exception>
    public static IEnumerable<T> DeserializeArrayItems<T>(this JsonSerializer serializer, JsonReader reader)
    {
        MoveToContent(reader);

        JsonToken first = reader.TokenType;
        if (first == JsonToken.Null)
        {
            yield break;
        }
        if (first != JsonToken.StartArray)
        {
            throw new JsonSerializationException(string.Format("Current token {0} is not an array at path {1}", reader.TokenType, reader.Path));
        }

        // Walk the array contents until its closing bracket.
        while (reader.Read())
        {
            JsonToken current = reader.TokenType;
            if (current == JsonToken.EndArray)
            {
                yield break;
            }
            if (current == JsonToken.Comment)
            {
                continue;
            }
            yield return serializer.Deserialize<T>(reader);
        }

        // Reached only when the input ran out before the array was closed.
        throw new JsonReaderException(string.Format("Unclosed array at path {0}", reader.Path));
    }

    /// <summary>
    /// Advances an unstarted reader to its first token and then skips any comment tokens.
    /// </summary>
    /// <param name="reader">The reader to advance.</param>
    /// <returns>The same reader, to allow call chaining.</returns>
    public static JsonReader MoveToContent(this JsonReader reader)
    {
        if (reader.TokenType == JsonToken.None)
        {
            reader.Read();
        }

        while (reader.TokenType == JsonToken.Comment)
        {
            // Stop when the input is exhausted while still on a comment.
            if (!reader.Read())
            {
                break;
            }
        }

        return reader;
    }
}

Next, introduce the following JsonConverter for List<PurpleAirData>:

/// <summary>
/// Read-only converter for <c>List&lt;PurpleAirData&gt;</c> that streams through the JSON array,
/// deserializes each item to an intermediate DTO, and keeps only the items that carry
/// both required values.
/// </summary>
class PurpleAirListConverter : JsonConverter
{
    // Intermediate shape of one array item. Every member is nullable so that a value
    // missing from the JSON stays null and can be detected after deserialization.
    class PurpleAirDataDTO
    {
        // Required properties
        [JsonProperty("created_at")]
        public DateTime? CreatedAt { get; set; }
        [JsonProperty("Field8")]
        public double? AirQuality { get; set; }

        // Optional properties
        [JsonProperty("Field6")]
        public double? Temperature { get; set; }
        [JsonProperty("Field7")]
        public double? Humidity { get; set; }
    }

    /// <summary>Handles exactly <c>List&lt;PurpleAirData&gt;</c>.</summary>
    public override bool CanConvert(Type objectType)
    {
        return objectType == typeof(List<PurpleAirData>);
    }

    /// <summary>
    /// This converter only customizes reading. Returning false makes Json.NET fall back to
    /// its default serialization when writing, instead of calling <see cref="WriteJson"/>
    /// (which would throw).
    /// </summary>
    public override bool CanWrite { get { return false; } }

    /// <summary>
    /// Reads the array item by item and appends the valid items to the existing list, or to a new one.
    /// </summary>
    /// <returns>The populated list, or <c>null</c> when the JSON value is null.</returns>
    public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
    {
        if (reader.MoveToContent().TokenType == JsonToken.Null)
            return null;
        var list = existingValue as List<PurpleAirData> ?? new List<PurpleAirData>();

        // Lazily evaluated: each DTO is filtered and converted as soon as it has been read,
        // so rejected items are never added to the list.
        var query = from dto in serializer.DeserializeArrayItems<PurpleAirDataDTO>(reader)
                    where dto != null && dto.CreatedAt != null && dto.AirQuality != null
                    select new PurpleAirData(dto.CreatedAt.Value, dto.AirQuality.Value) { Humidity = dto.Humidity, Temperature = dto.Temperature };

        list.AddRange(query);

        return list;
    }

    /// <summary>Not implemented; not invoked by the serializer because <see cref="CanWrite"/> is false.</summary>
    public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
    {
        throw new NotImplementedException();
    }
}

The purpose of this converter is to stream through the "feeds" array, deserialize each JSON item to an intermediate PurpleAirDataDTO, check for the presence of required members, then convert the DTO to the final model.

Finally, deserialize the entire file as follows:

/// <summary>
/// Deserializes a whole PurpleAir file into a <see cref="RootObject"/>, using
/// <see cref="PurpleAirListConverter"/> for the list of readings.
/// </summary>
/// <param name="textReader">Source of the JSON text; it is left open for the caller to close.</param>
/// <returns>The deserialized root object.</returns>
static RootObject DeserializePurpleAirDataFile(TextReader textReader)
{
    JsonSerializerSettings purpleAirSettings = new JsonSerializerSettings();
    purpleAirSettings.Converters.Add(new PurpleAirListConverter());
    purpleAirSettings.NullValueHandling = NullValueHandling.Ignore;

    JsonSerializer purpleAirSerializer = JsonSerializer.CreateDefault(purpleAirSettings);

    // CloseInput = false: disposing the JSON reader must not close the caller's TextReader.
    using (JsonTextReader jsonReader = new JsonTextReader(textReader) { CloseInput = false })
    {
        RootObject root = purpleAirSerializer.Deserialize<RootObject>(jsonReader);
        return root;
    }
}

Demo fiddle here.

When the array to be filtered is the root container in the JSON file, the extension method JsonExtensions.DeserializeArrayItems() can be used directly, e.g. as follows:

/// <summary>
/// Placeholder filter for a deserialized weather record. As written it accepts every record;
/// the comments in the body mark where real validation is meant to go.
/// </summary>
/// <param name="data">The record to check (not inspected by this placeholder).</param>
/// <returns>Always <c>true</c> in this placeholder implementation.</returns>
static bool IsValid(WeatherData data)
{
    // Return false if certain fields are missing

    // Otherwise return true;
    return true;
}

/// <summary>
/// Streams the root-level JSON array from the reader and collects the items accepted by IsValid.
/// </summary>
/// <param name="textReader">Source of the JSON text; it is left open for the caller to close.</param>
/// <returns>A list of the items for which IsValid returned true.</returns>
static List<WeatherData> DeserializeFilteredWeatherData(TextReader textReader)
{
    JsonSerializer weatherSerializer = JsonSerializer.CreateDefault();

    // CloseInput = false: disposing the JSON reader must not close the caller's TextReader.
    using (JsonTextReader jsonReader = new JsonTextReader(textReader) { CloseInput = false })
    {
        // ToList() runs inside the using block, so the lazy sequence is fully
        // consumed before the reader is disposed.
        return weatherSerializer
            .DeserializeArrayItems<WeatherData>(jsonReader)
            .Where(item => IsValid(item))
            .ToList();
    }
}

Notes:

  • nullable types can be used to track whether or not value type members were actually encountered during deserialization.

  • Here the conversion from DTO to final data model is done manually, but for more complicated models something like AutoMapper could be used instead.

dbc
  • 104,963
  • 20
  • 228
  • 340