While this question was originally tagged c#-4.0, this can be done fairly easily in .NET 5 with the introduction of Encoding.CreateTranscodingStream:
Creates a Stream that serves to transcode data between an inner Encoding and an outer Encoding, similar to Convert(Encoding, Encoding, Byte[]).
The trick is to define an underlying UnicodeStream that directly accesses the bytes of the string, and then wrap that in the transcoding stream to present streamed content with the required encoding.
The following classes and extension method do the job:
/// <summary>
/// Extension methods that expose in-memory text as a readable <see cref="Stream"/> in a chosen encoding.
/// </summary>
public static partial class TextExtensions
{
    /// <summary>
    /// The UTF-16 encoding whose byte order matches the current platform:
    /// <see cref="Encoding.Unicode"/> on little-endian machines, <see cref="Encoding.BigEndianUnicode"/> otherwise.
    /// </summary>
    public static Encoding PlatformCompatibleUnicode
    {
        get
        {
            if (BitConverter.IsLittleEndian)
                return Encoding.Unicode;
            return Encoding.BigEndianUnicode;
        }
    }

    /// <summary>
    /// Returns true when <paramref name="encoding"/> reports the UTF-16 code page that matches the platform byte order.
    /// </summary>
    static bool IsPlatformCompatibleUnicode(this Encoding encoding)
    {
        // Code pages of UTF-16 little-endian and UTF-16 big-endian respectively.
        const int littleEndianCodePage = 1200;
        const int bigEndianCodePage = 1201;

        var nativeCodePage = BitConverter.IsLittleEndian ? littleEndianCodePage : bigEndianCodePage;
        return encoding.CodePage == nativeCodePage;
    }

    /// <summary>
    /// Wraps a string in a read-only stream that yields its content in the requested encoding.
    /// </summary>
    /// <param name="string">The text to stream.</param>
    /// <param name="encoding">The encoding of the returned stream; UTF-8 is used when null.</param>
    /// <returns>A stream over the encoded text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="string"/> is null.</exception>
    public static Stream AsStream(this string @string, Encoding encoding = default)
    {
        if (@string is null)
            throw new ArgumentNullException(nameof(@string));

        ReadOnlyMemory<char> chars = @string.AsMemory();
        return chars.AsStream(encoding);
    }

    /// <summary>
    /// Wraps a block of character memory in a read-only stream that yields its content in the requested encoding.
    /// </summary>
    /// <param name="charBuffer">The characters to stream.</param>
    /// <param name="encoding">The encoding of the returned stream; UTF-8 is used when null.</param>
    /// <returns>
    /// A <c>UnicodeStream</c> directly over the characters when the encoding is the platform's UTF-16,
    /// otherwise a transcoding stream layered on top of one.
    /// </returns>
    public static Stream AsStream(this ReadOnlyMemory<char> charBuffer, Encoding encoding = default)
    {
        encoding ??= Encoding.UTF8;

        // The character data already is platform-order UTF-16, so no transcoding layer is needed.
        if (encoding.IsPlatformCompatibleUnicode())
            return new UnicodeStream(charBuffer);

        // leaveOpen: false, so disposing the returned stream also disposes the inner UnicodeStream.
        var rawUtf16 = new UnicodeStream(charBuffer);
        return Encoding.CreateTranscodingStream(rawUtf16, PlatformCompatibleUnicode, encoding, leaveOpen: false);
    }
}
/// <summary>
/// A read-only, forward-only <see cref="Stream"/> that exposes the in-memory UTF-16 bytes of a block of
/// characters without copying them. Seeking, writing, <see cref="Length"/> and <see cref="Position"/> are not supported.
/// </summary>
sealed class UnicodeStream : Stream
{
    const int BytesPerChar = 2;
    // Largest char count whose byte length still fits in an int, so MemoryMarshal.AsBytes cannot overflow.
    const int MaxCharsPerSlice = int.MaxValue / BytesPerChar;
    // Sentinel stored in position once the stream has been disposed.
    const long DisposedPosition = -1;

    // By sealing UnicodeStream we avoid a lot of the complexity of MemoryStream.
    ReadOnlyMemory<char> charMemory;
    // Byte offset of the next byte to return. Kept as a long because the byte length is twice the
    // char length, which can exceed int.MaxValue for very large char buffers.
    long position = 0;
    Task<int> _cachedResultTask; // For async reads, avoid allocating a Task.FromResult<int>(nRead) every time we read.

    /// <summary>Creates a stream over the characters of <paramref name="string"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="string"/> is null.</exception>
    public UnicodeStream(string @string) : this((@string ?? throw new ArgumentNullException(nameof(@string))).AsMemory()) { }

    /// <summary>Creates a stream over <paramref name="charMemory"/>; the memory is wrapped, not copied.</summary>
    public UnicodeStream(ReadOnlyMemory<char> charMemory) => this.charMemory = charMemory;

    /// <summary>
    /// Copies up to <c>buffer.Length</c> of the remaining bytes into <paramref name="buffer"/>.
    /// A read may start or stop in the middle of a two-byte char.
    /// </summary>
    /// <returns>The number of bytes copied; 0 at the end of the data (or for an empty buffer).</returns>
    /// <exception cref="ObjectDisposedException">The stream has been disposed.</exception>
    public override int Read(Span<byte> buffer)
    {
        EnsureOpen();
        var charPosition = (int)(position / BytesPerChar);
        // 1 when the previous read stopped halfway through a char, otherwise 0.
        var slicePosition = (int)(position % BytesPerChar);
        // MemoryMarshal.AsBytes will throw on spans longer than int.MaxValue / 2 chars, so only slice what we need
        // (one extra char covers a read that starts mid-char) and never more than AsBytes can represent.
        var charCount = Math.Min(charMemory.Length - charPosition, Math.Min(MaxCharsPerSlice, 1 + buffer.Length / BytesPerChar));
        var byteSlice = MemoryMarshal.AsBytes(charMemory.Slice(charPosition, charCount).Span);
        var nRead = Math.Min(buffer.Length, byteSlice.Length - slicePosition);
        byteSlice.Slice(slicePosition, nRead).CopyTo(buffer);
        position += nRead;
        return nRead;
    }

    /// <inheritdoc/>
    public override int Read(byte[] buffer, int offset, int count)
    {
        ValidateBufferArgs(buffer, offset, count);
        return Read(buffer.AsSpan(offset, count));
    }

    /// <summary>Reads a single byte, or returns -1 at the end of the data.</summary>
    public override int ReadByte()
    {
        // Could be optimized.
        Span<byte> span = stackalloc byte[1];
        return Read(span) == 0 ? -1 : span[0];
    }

    /// <summary>
    /// Completes synchronously: returns a cancelled task if <paramref name="cancellationToken"/> is already
    /// cancelled, otherwise the result (or exception) of <see cref="Read(Span{byte})"/>.
    /// </summary>
    /// <exception cref="ObjectDisposedException">Thrown synchronously if the stream has been disposed.</exception>
    public override ValueTask<int> ReadAsync(Memory<byte> buffer, CancellationToken cancellationToken = default)
    {
        EnsureOpen();
        if (cancellationToken.IsCancellationRequested)
            return ValueTask.FromCanceled<int>(cancellationToken);
        try
        {
            return new ValueTask<int>(Read(buffer.Span));
        }
        catch (Exception exception)
        {
            return ValueTask.FromException<int>(exception);
        }
    }

    /// <summary>
    /// Array-based counterpart of <see cref="ReadAsync(Memory{byte}, CancellationToken)"/>. The most recent
    /// successful result task is cached and reused when the next read returns the same byte count.
    /// </summary>
    public override Task<int> ReadAsync(byte[] buffer, int offset, int count, CancellationToken cancellationToken)
    {
        ValidateBufferArgs(buffer, offset, count);
        // Flow the caller's token so an already-cancelled request yields a cancelled task instead of reading.
        var valueTask = ReadAsync(buffer.AsMemory(offset, count), cancellationToken);
        if (!valueTask.IsCompletedSuccessfully)
            return valueTask.AsTask();
        // Consume the ValueTask's result exactly once.
        var nRead = valueTask.Result;
        var lastResultTask = _cachedResultTask;
        return (lastResultTask != null && lastResultTask.Result == nRead) ? lastResultTask : (_cachedResultTask = Task.FromResult(nRead));
    }

    // Throws if Dispose has already run (Dispose stores the DisposedPosition sentinel in position).
    void EnsureOpen()
    {
        if (position == DisposedPosition)
            throw new ObjectDisposedException(GetType().Name);
    }

    // https://learn.microsoft.com/en-us/dotnet/api/system.io.stream.flush?view=net-5.0
    // In a class derived from Stream that doesn't support writing, Flush is typically implemented as an empty method to ensure full compatibility with other Stream types since it's valid to flush a read-only stream.
    public override void Flush() { }
    public override Task FlushAsync(CancellationToken cancellationToken) => cancellationToken.IsCancellationRequested ? Task.FromCanceled(cancellationToken) : Task.CompletedTask;

    // Readable until disposed; a closed stream reports false, matching the Stream.CanRead convention.
    public override bool CanRead => position != DisposedPosition;
    public override bool CanSeek => false;
    public override bool CanWrite => false;
    public override long Length => throw new NotSupportedException();
    public override long Position { get => throw new NotSupportedException(); set => throw new NotSupportedException(); }
    public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException();
    public override void SetLength(long value) => throw new NotSupportedException();
    public override void Write(byte[] buffer, int offset, int count) => throw new NotSupportedException();

    /// <summary>Releases the reference to the character memory and marks the stream as disposed.</summary>
    protected override void Dispose(bool disposing)
    {
        try
        {
            if (disposing)
            {
                _cachedResultTask = null;
                charMemory = default;
                position = DisposedPosition;
            }
        }
        finally
        {
            base.Dispose(disposing);
        }
    }

    // Standard (buffer, offset, count) validation; exceptions name the offending argument.
    static void ValidateBufferArgs(byte[] buffer, int offset, int count)
    {
        if (buffer == null)
            throw new ArgumentNullException(nameof(buffer));
        if (offset < 0)
            throw new ArgumentOutOfRangeException(nameof(offset));
        if (count < 0)
            throw new ArgumentOutOfRangeException(nameof(count));
        if (count > buffer.Length - offset)
            throw new ArgumentException("The sum of offset and count is larger than the buffer length.");
    }
}
Notes:
You can stream from either a string, a char[] array, or slices thereof by converting them to ReadOnlyMemory<char> buffers. This conversion simply wraps the underlying string or array memory without allocating anything.
Solutions that use Encoding.GetBytes() to encode chunks of a string are broken because they will not handle surrogate pairs that are split between chunks. To handle surrogate pairs correctly, Encoding.GetEncoder() must be called initially to obtain and save an Encoder. Later, Encoder.GetBytes(ReadOnlySpan<Char>, Span<Byte>, flush: false) can be used to encode in chunks and remember state between calls.
(Microsoft's TranscodingStream does this correctly.)
You will get the best performance by using Encoding.Unicode as (on almost all .NET platforms) this encoding is identical to the encoding of the String type itself.
When a platform-compatible Unicode encoding is supplied, no TranscodingStream is used and the returned Stream reads from the character data buffer directly.
To do:
- Test on big-endian platforms (which are rare).
- Test on strings longer than int.MaxValue / 2.
Demo fiddle including some basic tests here.