It is possible to deserialize a single JSON payload from a list of strings representing chunked JSON by constructing a ReadOnlySequence<byte> from the list, then constructing a Utf8JsonReader from the sequence, and finally deserializing using the reader via JsonSerializer.Deserialize<TValue>(ref Utf8JsonReader, JsonSerializerOptions).
The following is a minimal implementation:
public static partial class JsonExtensions
{
    /// <summary>
    /// Deserializes a single JSON payload that has been split across multiple string chunks.
    /// </summary>
    /// <typeparam name="TValue">The target type to deserialize into.</typeparam>
    /// <param name="buffers">The JSON text, in order, split into arbitrary string chunks.</param>
    /// <param name="options">Optional serializer options; may be null for defaults.</param>
    /// <returns>The deserialized value.</returns>
    public static TValue Deserialize<TValue>(IEnumerable<string> buffers, JsonSerializerOptions options = null)
    {
        // BUG FIX: the original dropped `options` here, so caller-supplied serializer
        // settings were silently ignored for the string-chunk overload.
        return Deserialize<TValue>(ToByteArrayChunks(buffers), options);
    }

    /// <summary>
    /// Deserializes a single JSON payload that has been split across multiple UTF-8 byte-array chunks.
    /// </summary>
    /// <typeparam name="TValue">The target type to deserialize into.</typeparam>
    /// <param name="buffers">The UTF-8 encoded JSON, in order, split into arbitrary byte-array chunks.</param>
    /// <param name="options">Optional serializer options; may be null for defaults.</param>
    /// <returns>The deserialized value.</returns>
    public static TValue Deserialize<TValue>(IEnumerable<byte[]> buffers, JsonSerializerOptions options = null)
    {
        if (buffers == null)
            throw new ArgumentNullException(nameof(buffers));
        var sequence = ReadOnlySequenceFactory.Create(buffers);
        // Utf8JsonReader walks the multi-segment sequence directly; no single large
        // buffer is ever allocated.
        var reader = new Utf8JsonReader(sequence, options.GetReaderOptions());
        return JsonSerializer.Deserialize<TValue>(ref reader, options);
    }

    /// <summary>
    /// Maps the reader-relevant settings of a <see cref="JsonSerializerOptions"/> onto a
    /// <see cref="JsonReaderOptions"/>. Safe to invoke on a null receiver (extension-method
    /// call); returns default reader options in that case.
    /// </summary>
    public static JsonReaderOptions GetReaderOptions(this JsonSerializerOptions options)
    {
        return options == null
            ? new JsonReaderOptions()
            : new JsonReaderOptions
            {
                AllowTrailingCommas = options.AllowTrailingCommas,
                CommentHandling = options.ReadCommentHandling,
                MaxDepth = options.MaxDepth
            };
    }

    // UTF-8 without a BOM: emitting a BOM in the middle of the byte stream would
    // corrupt the JSON payload.
    static readonly Encoding encoding = new UTF8Encoding(false);

    // Encodes each string chunk to UTF-8 bytes. A single stateful Encoder (flush: false)
    // correctly handles surrogate pairs embedded in JSON string literals that are split
    // between chunks: the encoder buffers a trailing high surrogate and emits it together
    // with the low surrogate from the next chunk.
    static IEnumerable<byte[]> ToByteArrayChunks(IEnumerable<string> buffers)
    {
        var encoder = encoding.GetEncoder();
        foreach (var s in buffers)
        {
            ReadOnlySpan<char> charSpan = s;
            var count = encoder.GetByteCount(charSpan, false);
            var bytes = new byte[count];
            Span<byte> byteSpan = bytes;
            encoder.GetBytes(charSpan, byteSpan, false);
            yield return bytes;
        }
    }
}
public static class ReadOnlySequenceFactory
{
    /// <summary>
    /// Builds a <see cref="ReadOnlySequence{T}"/> that spans the given arrays in order,
    /// without copying their contents.
    /// </summary>
    public static ReadOnlySequence<T> Create<T>(IEnumerable<T[]> buffers)
    {
        return ReadOnlyMemorySegment<T>.Create(buffers.Select(b => new ReadOnlyMemory<T>(b)));
    }

    // ReadOnlySequenceSegment<T> is abstract and the BCL ships no public concrete
    // implementation, so we supply a minimal linked-list segment ourselves.
    // Modeled on https://github.com/dotnet/runtime/blob/master/src/libraries/System.Text.Json/tests/BufferFactory.cs
    // by https://github.com/ahsonkhan
    class ReadOnlyMemorySegment<T> : ReadOnlySequenceSegment<T>
    {
        public static ReadOnlySequence<T> Create(IEnumerable<ReadOnlyMemory<T>> buffers)
        {
            ReadOnlyMemorySegment<T> head = null;
            ReadOnlyMemorySegment<T> tail = null;

            foreach (var memory in buffers)
            {
                var segment = new ReadOnlyMemorySegment<T> { Memory = memory };
                if (head == null)
                {
                    head = segment;
                }
                else
                {
                    // RunningIndex is the offset of this segment from the start of
                    // the whole sequence; chain it off the previous segment.
                    segment.RunningIndex = tail.RunningIndex + tail.Memory.Length;
                    tail.Next = segment;
                }
                tail = segment;
            }

            // No buffers at all: fall back to a single empty segment so the
            // ReadOnlySequence constructor still gets valid endpoints.
            if (head == null)
            {
                head = tail = new ReadOnlyMemorySegment<T>();
            }

            return new ReadOnlySequence<T>(head, 0, tail, tail.Memory.Length);
        }
    }
}
Notes:
A ReadOnlySequence<T> is constructed from a linked list of ReadOnlySequenceSegment<T> objects — but this type is abstract, and .NET Core 3.1 does not appear to include a concrete public implementation. I modeled the implementation above on this one by Ahson Khan.
JsonSerializer
is designed to deserialize from UTF-8 encoded byte sequences rather than from strings or character arrays, so if you can make your database access layer return a list of UTF-8 byte arrays rather than strings, you will get better performance and avoid the step of encoding each string chunk into bytes.
If this isn't possible and your input is definitely a long list of smallish strings (e.g. roughly 2033 characters each, as in the original question), it might be worthwhile to investigate using memory or array pooling to allocate the necessary UTF-8 byte sequences.
While this approach avoids allocating a single, huge string
or byte []
, the entire JSON payload is nevertheless loaded into memory all at once as a sequence of chunks. Thus this is not a true streaming solution.
If you are interested in a true streaming solution and can access your JSON data directly as a Stream
, you might look at this answer to Parsing a JSON file with .NET core 3.0/System.text.Json by mtosh.
Demo fiddle here.