3

I have a large JSON file (around 16 GB) with the following structure:

{
  "Job": {
    "Keys": {
      "JobID": "test123",
      "DeviceID": "TEST01"
    },
    "Props": {
      "FileType": "Measurements",
      "InstrumentDescriptions": [
        {
          "InstrumentID": "1723007",
          "InstrumentType": "Actual1",
          "Name": "U",
          "DataType": "Double",
          "Units": "degC"
        },
        {
          "InstrumentID": "2424009",
          "InstrumentType": "Actual2",
          "Name": "VG03",
          "DataType": "Double",
          "Units": "Pa"
        }
      ]
    },
    "Steps": [
      {
        "Keys": {
          "StepID": "START",
          "StepResult": "NormalEnd"
        },
        "InstrumentData": [
          {
            "Keys": {
              "InstrumentID": "1723007"
            },
            "Measurements": [
              {
                "DateTime": "2021-11-16 21:18:37.000",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.100",
                "Value": 539
              },
              {
                "DateTime": "2021-11-16 21:18:37.200",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.300",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.400",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.500",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.600",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.700",
                "Value": 538
              },
              {
                "DateTime": "2021-11-16 21:18:37.800",
                "Value": 540
              }
            ]
          },
          {
            "Keys": {
              "InstrumentID": "2424009"
            },
            "Measurements": [
              {
                "DateTime": "2021-11-16 21:18:37.000",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.100",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.200",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.300",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.400",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.500",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.600",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.700",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.800",
                "Value": 1333.22
              }
            ]
          }
        ]
      }
    ]
  }
}

The problem

I would like to split this file into multiple files by splitting the array "InstrumentData" because this array will be holding the major chunk of the data. Splitting this file into smaller files would enable me to parse the file without getting an out of memory exception.

Current State

/// <summary>
/// Splits the JSON file <paramref name="filename"/> into one output file per element of the
/// root-level array property <paramref name="arrayPropertyName"/>. Works in two passes:
/// pass 1 streams the document once, copying everything except the target array into a
/// "template" file and diverting the target array into an "array" file; pass 2 re-reads the
/// array file item by item and merges each item back into the template.
/// NOTE(review): only properties at the root level of the document are matched (the stated
/// limitation of this approach), and the output file name reads template["name"] and
/// template["age"], which do not exist in the sample JSON above — those indexers return
/// null here. TODO: confirm the intended key names for the output file name.
/// </summary>
public static void SplitJson(string filename, string arrayPropertyName)
    {
        // Temporary scratch files used between the two passes.
        string templateFileName = @"C:\Temp\template.json";
        string arrayFileName = @"C:\Temp\array.json";

        CreateEmptyFile(templateFileName);
        CreateEmptyFile(arrayFileName);

        // Pass 1: stream the root object token by token, routing output to one of two writers.
        using (Stream stream = File.OpenRead(filename))
        using (JsonReader reader = new JsonTextReader(new StreamReader(stream)))
        using (JsonWriter templateWriter = new JsonTextWriter(new StreamWriter(templateFileName)))
        using (JsonWriter arrayWriter = new JsonTextWriter(new StreamWriter(arrayFileName)))
        {
            if (reader.Read() && reader.TokenType == JsonToken.StartObject)
            {
                templateWriter.WriteStartObject();
                while (reader.Read() && reader.TokenType != JsonToken.EndObject)
                {
                    // Reader is on a PropertyName token here; advance onto its value.
                    string propertyName = (string)reader.Value;
                    reader.Read();
                    templateWriter.WritePropertyName(propertyName);
                    if (propertyName == arrayPropertyName)
                    {
                        // Divert the huge array to its own file; leave a placeholder
                        // object in the template to be overwritten in pass 2.
                        arrayWriter.WriteToken(reader);
                        templateWriter.WriteStartObject();  // empty placeholder object
                        templateWriter.WriteEndObject();
                    }
                    else if (reader.TokenType == JsonToken.StartObject ||
                             reader.TokenType == JsonToken.StartArray)
                    {
                        // WriteToken copies the entire nested container in one call.
                        templateWriter.WriteToken(reader);
                    }
                    else
                    {
                        // Primitive value (string, number, bool, null, ...).
                        templateWriter.WriteValue(reader.Value);
                    }
                }
                templateWriter.WriteEndObject();
            }
        }

        // Now read the huge array file and combine each item in the array
        // with the template to make new files
        JObject template = JObject.Parse(File.ReadAllText(templateFileName));
        using (JsonReader arrayReader = new JsonTextReader(new StreamReader(arrayFileName)))
        {
            int counter = 0;
            while (arrayReader.Read())
            {
                if (arrayReader.TokenType == JsonToken.StartObject)
                {
                    counter++;
                    // JObject.Load consumes exactly one array item, so only one item
                    // (plus the template) is in memory at a time.
                    JObject item = JObject.Load(arrayReader);
                    template[arrayPropertyName] = item;
                    string fileName = string.Format(@"C:\Temp\output_{0}_{1}_{2}.json",
                                                    template["name"], template["age"], counter);

                    File.WriteAllText(fileName, template.ToString());
                }
            }
        }

        // Clean up temporary files
        File.Delete(templateFileName);
        File.Delete(arrayFileName);
    }

I am using this method to try to split the file into smaller files. However, this method can only split the file based on properties that are at the root level of the object.

The question

Am I on the right track to tackle this problem? Is this an efficient way to tackle it? How do I split the JSON into multiple files by splitting the array in an efficient way? The JSON file should be split so that there is one file for each element of the "InstrumentData" array. All the other properties and structures should be retained in the split files.

Ken White
  • 123,280
  • 14
  • 225
  • 444
mkr231
  • 68
  • 2
  • 16
  • 1
    What criteria do you want to use to split the array? It's not clear from your question as currently written. – dbc Jan 04 '22 at 21:34
  • @dbc : updated the last couple of lines of the question with criteria for splitting. I hope this makes it clear. – mkr231 Jan 05 '22 at 03:06
  • 1
    This seems similar: [Strategy for splitting a large JSON file](https://stackoverflow.com/q/31410187/3744182). – dbc Jan 05 '22 at 03:08
  • But this strategy seems to work only for arrays in the root level of the object. Mine is slightly different in the sense that the JSON has to be split based on an array deep inside the structure. – mkr231 Jan 05 '22 at 13:14
  • 2
    You want to split the array `InstrumentData` but it is nested inside an array `Steps`: `"Job.Steps[*].InstrumentData[*]"`. If the outer array contains multiple entries do you expect that to get split as well in the process of splitting the inner array? Or if not, what do you want to do when the outer array has more than one entry? – dbc Jan 09 '22 at 15:42

3 Answers3

3

It's not clear from your question what you mean by splitting the JSON by splitting the array "InstrumentData". The array in question is located at the path "Job.Steps[*].InstrumentData[*]", so are you also effectively going to split the containing array "Job.Steps[*]" as well? And what about prefix and postfix properties such as "Job.Keys" -- what do you want to do with them?

One approach to defining and implementing your split would be to adopt the approach from Strategy for splitting a large JSON file, suitably generalized for nested arrays. In that question, prefix and postfix properties were retained in each split file while the array to be split would be divided up in chunks. In that question the array property was at the root level, but in your case you need to specify a path to the array property. And if the array to be split is nested deeply within some other array values, those will need to get split as well.

Assuming this is what you want, the following extension method should do the trick:

public static partial class JsonExtensions
{
    /// <summary>
    /// Splits the JSON document in <paramref name="fileName"/> into multiple fragment files,
    /// one per item of the (possibly nested) array identified by <paramref name="splitPath"/>.
    /// Arrays encountered along the path are traversed — and therefore split as well.
    /// </summary>
    /// <param name="fileName">Path of the input JSON file.</param>
    /// <param name="splitPath">Property names leading from the root to the array to split,
    /// e.g. { "Job", "Steps", "InstrumentData" }.</param>
    /// <param name="nameCreator">Builds each output file name from
    /// (directory, base name, fragment index, extension).</param>
    /// <returns>The names of the fragment files that were created.</returns>
    public static string [] SplitJsonFile(string fileName, string [] splitPath, Func<string, string, int, string, string> nameCreator)
    {
        List<string> fileNames = new List<string>();
        
        var name = Path.GetFileNameWithoutExtension(fileName);
        var ext = Path.GetExtension(fileName);
        var directory = Path.GetDirectoryName(fileName);
        // Factory handed to the streaming splitter: called once per fragment file.
        Func<int, TextWriter> createStream = (i) => 
        {
            // Use whatever method you like to generate a name for each fragment.
            var newName = nameCreator(directory, name, i, ext);
            var writer = new StreamWriter(newName, false, Encoding.UTF8);
            fileNames.Add(newName);
            return writer;
        };

        using (var reader = new StreamReader(fileName, Encoding.UTF8))
        {
            // maxItems == 1: each fragment receives exactly one item of the split array.
            JsonExtensions.SplitJson(reader,splitPath, 1, createStream, Formatting.Indented);
        }

        return fileNames.ToArray();
    }
    
    /// <summary>
    /// Streaming split: reads JSON from <paramref name="textReader"/> and writes complete
    /// fragment documents to writers obtained from <paramref name="createStream"/>, with at
    /// most <paramref name="maxItems"/> items of the target array per fragment.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    /// <exception cref="ArgumentException">Empty path or non-positive item count.</exception>
    public static void SplitJson(TextReader textReader, IList<string> splitPath, long maxItems, Func<int, TextWriter> createStream, Formatting formatting)
    {
        if (splitPath == null || createStream == null || textReader == null)
            throw new ArgumentNullException();
        if (splitPath.Count < 1 || maxItems < 1)
            throw new ArgumentException();
        using (var reader = new JsonTextReader(textReader))
        {
            List<JsonWriter> writers = new ();
            List<ParentToken> parentTokens = new ();
            try
            {
                SplitJson(reader, splitPath, 0, maxItems, createStream, formatting, parentTokens, writers);
            }
            finally
            {
                // Make sure files are closed in the event of an exception.
                foreach (IDisposable writer in writers)
                    writer?.Dispose();
            }
        }
    }
    
    // One open container (object or array) on the path from the document root down to the
    // array being split, plus any properties seen before the split property ("prefix"
    // tokens). AddWriter() replays the stack of these into each new fragment so every
    // fragment is a complete JSON document with the same surrounding structure.
    struct ParentToken
    {
        public ParentToken(JsonToken tokenType, IList<JToken> prefixTokens = default) => (this.TokenType, this._prefixTokens) = (tokenType, prefixTokens);
        readonly IList<JToken> _prefixTokens;
        public JsonToken TokenType { get; }
        // Null-safe view: a ParentToken created without prefix tokens exposes an empty list.
        public IList<JToken> PrefixTokens => _prefixTokens ?? Array.Empty<JToken>();
    }
    
    // Opens the next fragment stream and replays the saved parent containers and prefix
    // properties into it, leaving the writer positioned where new content may be appended.
    static JsonWriter AddWriter(List<JsonWriter> writers, List<ParentToken> parentTokens, Func<int, TextWriter> createStream, Formatting formatting)
    {
        // AutoCompleteOnClose is disabled because open containers are closed manually,
        // across multiple writers, by the recursive SplitJson() below.
        var writer = new JsonTextWriter(createStream(writers.Count)) { Formatting = formatting, AutoCompleteOnClose = false };
        writers.Add(writer);
        foreach (var parent in parentTokens)
        {
            switch (parent.TokenType)
            {
                case JsonToken.StartObject:
                    writer.WriteStartObject();
                    break;
                case JsonToken.StartArray:
                    writer.WriteStartArray();
                    break;
                default:
                    throw new JsonException();
            }
            for (int i = 0; i < parent.PrefixTokens.Count; i++)
            {
                // A trailing JProperty with an Undefined value marks a property whose name
                // has been written but whose value is still being streamed: write only the
                // name so the value can follow.
                if (i == parent.PrefixTokens.Count - 1 && parent.PrefixTokens[i] is JProperty property && property.Value.Type == JTokenType.Undefined)
                    writer.WritePropertyName(property.Name);
                else
                    parent.PrefixTokens[i].WriteTo(writer);
            }
        }
        return writer;
    }
    
    // Returns the most recently added writer and its index, creating the first writer
    // on demand.
    static (JsonWriter, int) GetCurrentWriter(List<JsonWriter> writers, List<ParentToken> parentTokens, Func<int, TextWriter> createStream, Formatting formatting)
        => writers.Count == 0 ? (AddWriter(writers, parentTokens, createStream, formatting), 0) : (writers[writers.Count-1], writers.Count-1);
    
    // Core recursion. Copies the current token (object, array, or primitive) to the current
    // fragment writer(s), descending along splitPath; when the reader stands on the array at
    // the bottom of the path, its items are distributed across fragments (maxItems per file).
    // On return the reader is positioned ON the last token of the value that was consumed.
    static void SplitJson(JsonTextReader reader, IList<string> splitPath, int index, long maxItems, Func<int, TextWriter> createStream, Formatting formatting, List<ParentToken> parentTokens , List<JsonWriter> writers)
    {
        var startTokenType = reader.MoveToContentAndAssert().TokenType;
        var startReaderDepth = reader.Depth;

        // bottom == true once every path segment has been matched: this is the array
        // whose items get split maxItems-per-fragment.
        var bottom = index >= splitPath.Count;
        
        switch (startTokenType)
        {
            case JsonToken.StartObject:
                {
                    (var firstWriter, var firstWriterIndex) = GetCurrentWriter(writers, parentTokens, createStream, formatting);
                    // prefix == true while we have not yet seen the split property:
                    // such properties are buffered so they can be replayed into any
                    // additional fragment files created later.
                    bool prefix = true;
                    bool doRead = true;
                    firstWriter.WriteStartObject();
                    var parentToken = new ParentToken(JsonToken.StartObject, new List<JToken>());
                    while ((doRead ? reader.ReadToContentAndAssert() : reader.MoveToContentAndAssert()).TokenType != JsonToken.EndObject)
                    {
                        doRead = true;
                        var propertyName = (string)reader.AssertTokenType(JsonToken.PropertyName).Value;
                        if (propertyName == splitPath[index])
                        {
                            if (!prefix)
                                throw new JsonException(string.Format("Duplicated property name {0}", propertyName));
                            prefix = false;
                            
                            // Advance reader to value.
                            reader.ReadToContentAndAssert();
                            
                            // Add a token with the current property name but an undefined value.  This indicates an unclosed property.
                            firstWriter.WritePropertyName(propertyName);
                            parentToken.PrefixTokens.Add(new JProperty(propertyName, JValue.CreateUndefined()));
                            
                            parentTokens.Add(parentToken);
                            
                            // SplitJson() leaves the reader positioned ON the end of the token that was read, rather than after.
                            SplitJson(reader, splitPath, index + 1, maxItems, createStream, formatting, parentTokens, writers);
                            parentTokens.RemoveAt(parentTokens.Count-1);
                        }
                        else if (prefix)
                        {
                            // JProperty.Load() leaves the reader positioned AFTER the token that was read, rather than at the end.
                            var property = JProperty.Load(reader);
                            property.WriteTo(firstWriter);
                            parentToken.PrefixTokens.Add(property);
                            doRead = false;
                        }
                        else
                        {
                            // Postfix property (after the split property): it must appear in
                            // every fragment created while splitting, not just the first.
                            var property = JProperty.Load(reader);
                            for (int i = firstWriterIndex; i < writers.Count; i++)
                            {
                                property.WriteTo(writers[i]);
                            }
                            doRead = false;
                        }
                    }
                    // Close this object in every fragment it was opened in.
                    for (int i = firstWriterIndex; i < writers.Count; i++)
                    {
                        if (prefix)
                            // We never found the property
                            foreach (var property in parentToken.PrefixTokens)
                                property.WriteTo(writers[i]);
                        writers[i].WriteEndObject();
                    }
                }
                break;
            case JsonToken.StartArray: // Split the array.
                {
                    // Intermediate arrays along the path get one item per fragment;
                    // only the bottom array honors the caller's maxItems.
                    var maxItemsAtDepth = bottom ? maxItems : 1L;
                    (var writer, var firstWriterIndex) = GetCurrentWriter(writers, parentTokens, createStream, formatting);
                    writer.WriteStartArray();
                    long count = 0L;
                    while (reader.ReadToContentAndAssert().TokenType != JsonToken.EndArray)
                    {
                        if (reader.TokenType == JsonToken.Comment || reader.TokenType == JsonToken.None)
                            continue;
                        if (count >= maxItemsAtDepth)
                        {
                            // Current fragment is full: start a new file, replaying the
                            // saved parent structure into it first.
                            writer = AddWriter(writers, parentTokens, createStream, formatting);
                            writer.WriteStartArray();
                            count = 0L;
                        }
                        if (bottom)
                            // WriteToken() leaves the reader positioned ON the end of the token that was read, rather than after.
                            writer.WriteToken(reader);
                        else
                        {
                            parentTokens.Add(new ParentToken(JsonToken.StartArray));
                            // SplitJson() leaves the reader positioned ON the end of the token that was read, rather than after.
                            SplitJson(reader, splitPath, index, maxItems, createStream, formatting, parentTokens, writers);
                            parentTokens.RemoveAt(parentTokens.Count-1);
                        }
                        count++;
                    }
                    // Close the array in every fragment it was opened in.
                    for (int i = firstWriterIndex; i < writers.Count; i++)
                    {
                        writers[i].WriteEndArray();
                    }
                }
                break;
            default: // null, for instance
                {
                    // Primitive value where an object/array might have been: copy as-is.
                    (var writer, var _) = GetCurrentWriter(writers, parentTokens, createStream, formatting);
                    writer.WriteToken(reader);
                }
                break;
        }
    }
}

/// <summary>
/// Low-level <see cref="JsonReader"/> helpers used by the streaming splitter: each either
/// returns the reader (for fluent chaining) or throws on unexpected input.
/// </summary>
public static partial class JsonExtensions
{
    /// <summary>Throws unless the reader is currently positioned on <paramref name="tokenType"/>.</summary>
    /// <exception cref="JsonSerializationException">The current token has a different type.</exception>
    public static JsonReader AssertTokenType(this JsonReader reader, JsonToken tokenType) => 
        reader.TokenType == tokenType ? reader : throw new JsonSerializationException(string.Format("Unexpected token {0}, expected {1}", reader.TokenType, tokenType));
    
    /// <summary>Advances to the next token, then skips past comments; throws on end of stream.</summary>
    public static JsonReader ReadToContentAndAssert(this JsonReader reader) =>
        reader.ReadAndAssert().MoveToContentAndAssert();

    /// <summary>
    /// Ensures the reader is positioned on content: reads past the initial
    /// <see cref="JsonToken.None"/> and skips any comment tokens.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public static JsonReader MoveToContentAndAssert(this JsonReader reader)
    {
        if (reader == null)
            throw new ArgumentNullException(nameof(reader)); // include the parameter name (CA2208)
        if (reader.TokenType == JsonToken.None)       // Skip past beginning of stream.
            reader.ReadAndAssert();
        while (reader.TokenType == JsonToken.Comment) // Skip past comments.
            reader.ReadAndAssert();
        return reader;
    }

    /// <summary>Reads the next token, throwing if the stream ends unexpectedly.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    /// <exception cref="JsonReaderException">The underlying stream is exhausted.</exception>
    public static JsonReader ReadAndAssert(this JsonReader reader)
    {
        if (reader == null)
            throw new ArgumentNullException(nameof(reader)); // include the parameter name (CA2208)
        if (!reader.Read())
            throw new JsonReaderException("Unexpected end of JSON stream.");
        return reader;
    }
}

And then, to split on "Job.Steps[*].InstrumentData[*]", call it as follows:

// Split on "Job.Steps[*].InstrumentData[*]": one fragment per InstrumentData item.
var splitPath = new [] { "Job", "Steps", "InstrumentData" };
Func<string, string, int, string, string> nameCreator =
    (directory, name, i, ext) => Path.Combine(directory, Path.ChangeExtension(name + $"_fragment_{i}", ext));
var fileNames = JsonExtensions.SplitJsonFile(fileName, splitPath, nameCreator);

Alternatively, to split on "Job.Steps[*].InstrumentData[*].Measurements[*]", call as follows:

// Split on "Job.Steps[*].InstrumentData[*].Measurements[*]".
// (Fixed: the array initializer was missing its closing brace after "Measurements".)
var fileNames = JsonExtensions.SplitJsonFile(fileName, 
                                             new [] { "Job", "Steps", "InstrumentData", "Measurements" }, 
                                             (directory, name, i, ext) => Path.Combine(directory, Path.ChangeExtension(name + $"_fragment_{i}", ext)));

Demo fiddle here.

I also tested this enhanced version of JsonExtensions.SplitJson() using the JSON from Strategy for splitting a large JSON file to verify there was no regression; there was not. See fiddle #2 here.

dbc
  • 104,963
  • 20
  • 228
  • 340
0

This is a slightly ugly solution and absolutely does not pretend to be the best, but it allows splitting a JSON file of several GB with a similar structure into smaller files containing individual array members, in a line-by-line manner. It preserves the original redundant indentation and trailing commas, but these can be fixed afterwards if necessary.

// Line-oriented splitter: assumes the input JSON is pretty-printed with line breaks,
// and that '{'/'}'/'['/']' characters only occur structurally (not inside strings).
using var reader = File.OpenText("./input.json");

// Scan forward to the line that introduces the "InstrumentData" array.
string? line = reader.ReadLine();
while (line != null && !line.Contains("\"InstrumentData\""))
  line = reader.ReadLine();

// Key never found before end of file: nothing to split.
if (line == null)
  return;

// Read the "InstrumentData" array, writing each member object to its own file.
StreamWriter? writer = null;
var fileIndex = 0;
var bracketDepth = 0;  // nesting of [ ] relative to the InstrumentData array
var braceDepth = 0;    // nesting of { } within the current array member
try
{
  while ((line = reader.ReadLine()) != null)
  {
    // Opening delimiters are counted before the write check so that the line
    // starting a member object is included in the output.
    bracketDepth += line.Count(c => c == '[');
    braceDepth += line.Count(c => c == '{');

    // Inside a member object: copy the line into the current output file.
    if (braceDepth > 0)
    {
      writer ??= File.CreateText($"./output_{fileIndex++}.json");
      writer.WriteLine(line);
    }

    // Closing delimiters are counted after writing, so the line that closes the
    // member object is still written before the stream is flushed.
    bracketDepth -= line.Count(c => c == ']');
    braceDepth -= line.Count(c => c == '}');

    // Member object finished: close its file and start fresh on the next one.
    if (braceDepth == 0)
    {
      writer?.Dispose();
      writer = null;
    }

    // The array's own closing bracket drives the depth negative: we're done.
    if (bracketDepth < 0)
    {
      writer?.Dispose();
      return;
    }
  }
}
finally
{
  // Guarantee the last file is flushed/closed even if an exception escapes.
  writer?.Dispose();
}

Egiraht
  • 31
  • 1
  • 4
  • 3
    This solution assumes that the json file is properly formatted with line breaks in place. However, that is not the case, and it throws an out of memory exception at the 2nd line. – mkr231 Jan 09 '22 at 05:13
0

My solution here: https://dotnetfiddle.net/CufM4w

The idea is to process the document in two rounds using a relatively simple state automata.

  1. Extract the root and every step templates (prefix and suffix), but skip the InstrumentData entirely.

  2. On the contrary skip everything except InstrumentData and use the parts from #1 to build an output.

So the main code will look like (check the link above for the full source):

    // Pass 1: scan the whole document once so DataSplitter can capture the root and
    // per-step templates (prefix/suffix), skipping the InstrumentData arrays.
    // NOTE(review): DataSplitter and JsonText are defined in the linked fiddle, not here.
    var reader = new StringReader(JsonText);
    var splitter = DataSplitter.Create(reader);

    // Pass 2: re-read the same input; the callback receives one InstrumentData
    // fragment at a time, merged with the captured templates.
    reader = new StringReader(JsonText);
    splitter.Split(reader, (step, index, content) => {
        Console.WriteLine("=== step: {0}, index: {1} ===", step, index);
        Console.WriteLine(content);
     });

Hope it helped.

ryanovic
  • 114
  • 1
  • 4