I'm currently sitting on a copy function that fills a destination byte array from a source byte array and replicates the source array as many times as needed until the destination array is filled (some call it MemCpyReplicate or similar). The target array is always a multiple of the length of the source array.
My first attempt was a simple copy via the Unsafe.CopyBlockUnaligned
intrinsic which simply emits a rep movsb
:
public static void CopyRepeat(byte* destination, byte* source, int byteCount, int count) {
while(count-- > 0) {
Unsafe.CopyBlockUnaligned(destination, source, (uint)byteCount);
destination += byteCount;
}
}
Since the results were not satisfactory, I now wanted to use SIMD, more precisely the Vector<T>
interface. But I don't know how to handle unaligned addresses and byte patterns smaller than the vector length.
This would be my ideal solution:
Source Array -> 10 Bytes, Vector -> 32 Bytes = 3 x byte pattern
The byte sequences are mostly in the range of 1 to 64 bytes. The number of repetitions ranges from 1 to 500. Is there a better solution or are there sample implementations for similar functions?
UPDATE:
I have built two vectorized variants from the original version. The first one repeats the pattern in the vector so that the vector contains n
patterns. If the pattern is too big for the vector, CopyBlock is used.
The second variant repeats the pattern until there are more than the vector size of bytes in the destination and then always copies vector sized blocks (and moves the source window) without using CopyBlock.
Source code of the vectorized variants
However, I now get weird results in runtime for pattern sizes between 2 and 32 (the vector size in my case). I suspect it is related to reading from the moving source window, since doubling the window halved the execution time. For sizes larger than the vector size, I get the expected results:
Method | byteCount | count | Mean | Error | StdDev |
---|---|---|---|---|---|
Repeat_CopyBlock | 3 | 16 | 19.38 ns | 0.002 ns | 0.002 ns |
Repeat_NoCopyBlock | 3 | 16 | 13.90 ns | 0.106 ns | 0.100 ns |
Repeat_CopyBlock | 3 | 128 | 25.00 ns | 0.005 ns | 0.005 ns |
Repeat_NoCopyBlock | 3 | 128 | 39.31 ns | 0.135 ns | 0.126 ns |
Repeat_CopyBlock | 12 | 16 | 10.64 ns | 0.037 ns | 0.031 ns |
Repeat_NoCopyBlock | 12 | 16 | 13.35 ns | 0.024 ns | 0.023 ns |
Repeat_CopyBlock | 12 | 128 | 25.56 ns | 0.020 ns | 0.019 ns |
Repeat_NoCopyBlock | 12 | 128 | 108.61 ns | 0.164 ns | 0.154 ns |
Repeat_CopyBlock | 16 | 16 | 68.74 ns | 0.010 ns | 0.009 ns |
Repeat_NoCopyBlock | 16 | 16 | 13.50 ns | 0.002 ns | 0.002 ns |
Repeat_CopyBlock | 16 | 128 | 81.41 ns | 0.024 ns | 0.022 ns |
Repeat_NoCopyBlock | 16 | 128 | 81.52 ns | 0.067 ns | 0.062 ns |
Repeat_CopyBlock | 48 | 16 | 48.84 ns | 0.045 ns | 0.042 ns |
Repeat_NoCopyBlock | 48 | 16 | 23.80 ns | 0.089 ns | 0.083 ns |
Repeat_CopyBlock | 48 | 128 | 364.76 ns | 0.053 ns | 0.045 ns |
Repeat_NoCopyBlock | 48 | 128 | 165.34 ns | 0.145 ns | 0.136 ns |
public static unsafe void Repeat_NoCopyBlock(byte* destination, byte* source, int byteCount, int count) {
if(byteCount == 1) {
Unsafe.InitBlockUnaligned(destination, *source, (uint)count);
return;
}
var absoluteByteCount = byteCount * count;
var dst = destination;
var offset = 0;
do
{
if(offset == absoluteByteCount) return;
offset += byteCount;
var src = source;
var remaining = byteCount;
while((remaining & -4) != 0) {
*((uint*)dst) = *((uint*)src);
dst += 4;
src += 4;
remaining -= 4;
}
if((remaining & 2) != 0) {
*((ushort*)dst) = *((ushort*)src);
dst += 2;
src += 2;
remaining -= 2;
}
if((remaining & 1) != 0)
*dst++ = *src;
} while((offset & (2 * -Vector<byte>.Count)) == 0); // & -Vector<byte>.Count was 2x slower.
var stopLoopAtOffset = absoluteByteCount - Vector<byte>.Count;
var from = destination;
// var buffer = offset;
while(offset <= stopLoopAtOffset) {
Unsafe.WriteUnaligned(dst, Unsafe.ReadUnaligned<Vector<byte>>(from));
offset += Vector<byte>.Count;
from += Vector<byte>.Count;
dst += Vector<byte>.Count;
}
var rep = (offset / byteCount) * byteCount; // Closest pattern end.
if(offset != absoluteByteCount) {
// next line is the replacement for (buffer is the offset from destination before the loop above):
// destination + buffer - Vector<byte>.Count
var repEnd = destination + rep - Vector<byte>.Count;
var dstEnd = destination + stopLoopAtOffset;
Unsafe.WriteUnaligned(dstEnd, Unsafe.ReadUnaligned<Vector<byte>>(repEnd));
}
}
public static unsafe void Repeat_CopyBlock(byte* destination, byte* source, int byteCount, int count) {
if(count == 0) return;
if(byteCount == 0) return;
if(byteCount == 1) {
Unsafe.InitBlockUnaligned(destination, *source, (uint)count);
return;
}
var numElements = Vector<byte>.Count / byteCount;
var numElementsByteCount = numElements * byteCount;
var i = 0;
var dst = destination;
do
{
var remaining = byteCount;
var src = source;
while(remaining >= 4) {
*((uint*)dst) = *((uint*)src);
dst += 4;
src += 4;
remaining -= 4;
}
if((remaining & 2) != 0) {
*((ushort*)dst) = *((ushort*)src);
dst += 2;
src += 2;
remaining -= 2;
}
if((remaining & 1) != 0)
*dst++ = *src;
++i; --count;
} while(count != 0 && i < numElements);
if(numElements > 0) { // Skip byteCounts larger than Vector<byte>.Count.
var src = Unsafe.ReadUnaligned<Vector<byte>>(destination);
while(count > numElements) {
Unsafe.WriteUnaligned(dst, src);
count -= numElements;
dst += numElementsByteCount;
}
}
while(count > 0) {
Unsafe.CopyBlockUnaligned(dst, destination, (uint)byteCount);
dst += byteCount;
--count;
}
}