After some thoughts and discussion in the comments, I think this is the most efficient version, at least when source and destination data is in RAM. It does not require AVX2, AVX1 is enough.
The main idea, modern CPUs can do twice as many load micro-ops compared to stores, and on many CPUs loading stuff into higher half of vectors with vinsertf128
has same cost as regular 16-byte load. Compared to your macro, this version no longer needs these relatively expensive (3 cycles of latency on most CPUs) vperm2f128
shuffles.
struct Matrix4x4
{
__m256d r0, r1, r2, r3;
};
inline void loadTransposed( Matrix4x4& mat, const double* rsi, size_t stride = 8 )
{
// Load top half of the matrix into low half of 4 registers
__m256d t0 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) ); // 00, 01
__m256d t1 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi + 2 ) ); // 02, 03
rsi += stride;
__m256d t2 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi ) ); // 10, 11
__m256d t3 = _mm256_castpd128_pd256( _mm_loadu_pd( rsi + 2 ) ); // 12, 13
rsi += stride;
// Load bottom half of the matrix into high half of these registers
t0 = _mm256_insertf128_pd( t0, _mm_loadu_pd( rsi ), 1 ); // 00, 01, 20, 21
t1 = _mm256_insertf128_pd( t1, _mm_loadu_pd( rsi + 2 ), 1 );// 02, 03, 22, 23
rsi += stride;
t2 = _mm256_insertf128_pd( t2, _mm_loadu_pd( rsi ), 1 ); // 10, 11, 30, 31
t3 = _mm256_insertf128_pd( t3, _mm_loadu_pd( rsi + 2 ), 1 );// 12, 13, 32, 33
// Transpose 2x2 blocks in registers.
// Due to the tricky way we loaded stuff, that's enough to transpose the complete 4x4 matrix.
mat.r0 = _mm256_unpacklo_pd( t0, t2 ); // 00, 10, 20, 30
mat.r1 = _mm256_unpackhi_pd( t0, t2 ); // 01, 11, 21, 31
mat.r2 = _mm256_unpacklo_pd( t1, t3 ); // 02, 12, 22, 32
mat.r3 = _mm256_unpackhi_pd( t1, t3 ); // 03, 13, 23, 33
}
inline void store( const Matrix4x4& mat, double* rdi, size_t stride = 8 )
{
_mm256_storeu_pd( rdi, mat.r0 );
_mm256_storeu_pd( rdi + stride, mat.r1 );
_mm256_storeu_pd( rdi + stride * 2, mat.r2 );
_mm256_storeu_pd( rdi + stride * 3, mat.r3 );
}
// Transpose 8x8 matrix of double values
void transpose8x8( double* rdi, const double* rsi )
{
Matrix4x4 block;
// Top-left corner
loadTransposed( block, rsi );
store( block, rdi );
#if 1
// Using another instance of the block to support in-place transpose
Matrix4x4 block2;
loadTransposed( block, rsi + 4 ); // top right block
loadTransposed( block2, rsi + 8 * 4 ); // bottom left block
store( block2, rdi + 4 );
store( block, rdi + 8 * 4 );
#else
// Flip the #if if you can guarantee ( rsi != rdi )
// Performance is about the same, but this version uses 4 less vector registers,
// slightly more efficient when some registers need to be backed up / restored.
assert( rsi != rdi );
loadTransposed( block, rsi + 4 );
store( block, rdi + 8 * 4 );
loadTransposed( block, rsi + 8 * 4 );
store( block, rdi + 4 );
#endif
// Bottom-right corner
loadTransposed( block, rsi + 8 * 4 + 4 );
store( block, rdi + 8 * 4 + 4 );
}
For completeness, here’s a version which uses the code very similar to your macro, does twice as few loads, same count of stores, and more shuffles. Have not benchmarked but I would expect it to be slightly slower.
struct Matrix4x4
{
__m256d r0, r1, r2, r3;
};
inline void load( Matrix4x4& mat, const double* rsi, size_t stride = 8 )
{
mat.r0 = _mm256_loadu_pd( rsi );
mat.r1 = _mm256_loadu_pd( rsi + stride );
mat.r2 = _mm256_loadu_pd( rsi + stride * 2 );
mat.r3 = _mm256_loadu_pd( rsi + stride * 3 );
}
inline void store( const Matrix4x4& mat, double* rdi, size_t stride = 8 )
{
_mm256_storeu_pd( rdi, mat.r0 );
_mm256_storeu_pd( rdi + stride, mat.r1 );
_mm256_storeu_pd( rdi + stride * 2, mat.r2 );
_mm256_storeu_pd( rdi + stride * 3, mat.r3 );
}
inline void transpose( Matrix4x4& m4 )
{
// These unpack instructions transpose lanes within 2x2 blocks of the matrix
const __m256d t0 = _mm256_unpacklo_pd( m4.r0, m4.r1 );
const __m256d t1 = _mm256_unpacklo_pd( m4.r2, m4.r3 );
const __m256d t2 = _mm256_unpackhi_pd( m4.r0, m4.r1 );
const __m256d t3 = _mm256_unpackhi_pd( m4.r2, m4.r3 );
// Produce the transposed matrix by combining these blocks
m4.r0 = _mm256_permute2f128_pd( t0, t1, 0x20 );
m4.r1 = _mm256_permute2f128_pd( t2, t3, 0x20 );
m4.r2 = _mm256_permute2f128_pd( t0, t1, 0x31 );
m4.r3 = _mm256_permute2f128_pd( t2, t3, 0x31 );
}
// Transpose 8x8 matrix with double values
void transpose8x8( double* rdi, const double* rsi )
{
Matrix4x4 block;
// Top-left corner
load( block, rsi );
transpose( block );
store( block, rdi );
// Using another instance of the block to support in-place transpose, with very small overhead
Matrix4x4 block2;
load( block, rsi + 4 ); // top right block
load( block2, rsi + 8 * 4 ); // bottom left block
transpose( block2 );
store( block2, rdi + 4 );
transpose( block );
store( block, rdi + 8 * 4 );
// Bottom-right corner
load( block, rsi + 8 * 4 + 4 );
transpose( block );
store( block, rdi + 8 * 4 + 4 );
}