Here’s a function that transposes a dense 8×8 block of the image. Call it in a loop over the blocks, writing each transposed block to the mirrored location of the output image. Note that unless your image size is a multiple of 8, you’re going to need another version of that function to handle the remainder areas; see the sketch after the code.
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

// Load 16 bytes from memory into the lower half of an AVX vector;
// only the first 12 bytes ( 4 RGB24 pixels ) are meaningful
inline __m256i load12Low( const uint8_t* rsi )
{
return _mm256_castsi128_si256( _mm_loadu_si128( ( const __m128i* ) rsi ) );
}
// Load 16 bytes from memory at the offset ( rsi - 4 ) into the upper half of the AVX vector;
// the 12 useful bytes land in positions [ 4 .. 15 ] of that half, so the load never reads
// past ( rsi + 12 ), which matters for the last row of the image
inline __m256i load12High( __m256i low, const uint8_t* rsi )
{
return _mm256_inserti128_si256( low, _mm_loadu_si128( ( const __m128i* ) ( rsi - 4 ) ), 1 );
}
// Store bytes [ 0 .. 11 ] and [ 16 .. 27 ] of the AVX vector;
// the overlapping stores write exactly 24 bytes at rdi
inline void store24( __m256i vec, uint8_t* rdi )
{
_mm_storeu_si128( ( __m128i* )rdi, _mm256_castsi256_si128( vec ) );
__m128i high = _mm256_extracti128_si256( vec, 1 );
_mm_storeu_si64( rdi + 12, high );
const int last4 = _mm_extract_epi32( high, 2 );
memcpy( rdi + 20, &last4, 4 );	// Unaligned 4-byte store without type punning
}
// Interleave 32-bit elements of two vectors, separately within each 16-byte lane
inline void unpackLowHigh32( __m256i& r0, __m256i& r1, __m256i a, __m256i b )
{
r0 = _mm256_unpacklo_epi32( a, b );
r1 = _mm256_unpackhi_epi32( a, b );
}
// Interleave 64-bit elements of two vectors, separately within each 16-byte lane
inline void unpackLowHigh64( __m256i& r0, __m256i& r1, __m256i a, __m256i b )
{
r0 = _mm256_unpacklo_epi64( a, b );
r1 = _mm256_unpackhi_epi64( a, b );
}
// Transpose a dense 8x8 block of RGB24 pixels; both strides are in bytes
void transpose8x8( uint8_t* rdi, size_t destStride, const uint8_t* rsi, size_t sourceStride )
{
// Load the top half of the block ( 4 rows ) into the lower 16-byte lanes of 8 vectors
__m256i arr[ 8 ];
arr[ 0 ] = load12Low( rsi );
arr[ 1 ] = load12Low( rsi + 12 );
rsi += sourceStride;
arr[ 2 ] = load12Low( rsi );
arr[ 3 ] = load12Low( rsi + 12 );
rsi += sourceStride;
arr[ 4 ] = load12Low( rsi );
arr[ 5 ] = load12Low( rsi + 12 );
rsi += sourceStride;
arr[ 6 ] = load12Low( rsi );
arr[ 7 ] = load12Low( rsi + 12 );
rsi += sourceStride;
// Load the bottom half of the block into the upper 16-byte lanes of these vectors
arr[ 0 ] = load12High( arr[ 0 ], rsi );
arr[ 1 ] = load12High( arr[ 1 ], rsi + 12 );
rsi += sourceStride;
arr[ 2 ] = load12High( arr[ 2 ], rsi );
arr[ 3 ] = load12High( arr[ 3 ], rsi + 12 );
rsi += sourceStride;
arr[ 4 ] = load12High( arr[ 4 ], rsi );
arr[ 5 ] = load12High( arr[ 5 ], rsi + 12 );
rsi += sourceStride;
arr[ 6 ] = load12High( arr[ 6 ], rsi );
arr[ 7 ] = load12High( arr[ 7 ], rsi + 12 );
// Expand 3-byte pixels into 4-byte ones; the shuffle indices for the upper lane
// start at 4 because load12High loaded that half 4 bytes early
__m256i perm = _mm256_setr_epi8(
0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1,
4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1, 13, 14, 15, -1 );
arr[ 0 ] = _mm256_shuffle_epi8( arr[ 0 ], perm );
arr[ 1 ] = _mm256_shuffle_epi8( arr[ 1 ], perm );
arr[ 2 ] = _mm256_shuffle_epi8( arr[ 2 ], perm );
arr[ 3 ] = _mm256_shuffle_epi8( arr[ 3 ], perm );
arr[ 4 ] = _mm256_shuffle_epi8( arr[ 4 ], perm );
arr[ 5 ] = _mm256_shuffle_epi8( arr[ 5 ], perm );
arr[ 6 ] = _mm256_shuffle_epi8( arr[ 6 ], perm );
arr[ 7 ] = _mm256_shuffle_epi8( arr[ 7 ], perm );
// The following code requires compiler optimizations to reuse registers:
// there are only 16 vector registers underneath the compiler.
// Fortunately, compilers seem to do a decent job for this code.
__m256i tmp[ 8 ];
// Transpose 2x2 blocks
unpackLowHigh32( tmp[ 0 ], tmp[ 1 ], arr[ 0 ], arr[ 2 ] );
unpackLowHigh32( tmp[ 2 ], tmp[ 3 ], arr[ 1 ], arr[ 3 ] );
unpackLowHigh32( tmp[ 4 ], tmp[ 5 ], arr[ 4 ], arr[ 6 ] );
unpackLowHigh32( tmp[ 6 ], tmp[ 7 ], arr[ 5 ], arr[ 7 ] );
// Transpose 4x4 blocks
unpackLowHigh64( arr[ 0 ], arr[ 1 ], tmp[ 0 ], tmp[ 4 ] );
unpackLowHigh64( arr[ 2 ], arr[ 3 ], tmp[ 1 ], tmp[ 5 ] );
unpackLowHigh64( arr[ 4 ], arr[ 5 ], tmp[ 2 ], tmp[ 6 ] );
unpackLowHigh64( arr[ 6 ], arr[ 7 ], tmp[ 3 ], tmp[ 7 ] );
// Compact the pixels back to 3 bytes each; the shuffle operates within each
// 16-byte lane, and store24 merges the two lanes on the way to memory
const __m128i perm16 = _mm_setr_epi8( 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1 );
perm = _mm256_broadcastsi128_si256( perm16 );
arr[ 0 ] = _mm256_shuffle_epi8( arr[ 0 ], perm );
arr[ 1 ] = _mm256_shuffle_epi8( arr[ 1 ], perm );
arr[ 2 ] = _mm256_shuffle_epi8( arr[ 2 ], perm );
arr[ 3 ] = _mm256_shuffle_epi8( arr[ 3 ], perm );
arr[ 4 ] = _mm256_shuffle_epi8( arr[ 4 ], perm );
arr[ 5 ] = _mm256_shuffle_epi8( arr[ 5 ], perm );
arr[ 6 ] = _mm256_shuffle_epi8( arr[ 6 ], perm );
arr[ 7 ] = _mm256_shuffle_epi8( arr[ 7 ], perm );
// Store these pixels back
store24( arr[ 0 ], rdi );
rdi += destStride;
store24( arr[ 1 ], rdi );
rdi += destStride;
store24( arr[ 2 ], rdi );
rdi += destStride;
store24( arr[ 3 ], rdi );
rdi += destStride;
store24( arr[ 4 ], rdi );
rdi += destStride;
store24( arr[ 5 ], rdi );
rdi += destStride;
store24( arr[ 6 ], rdi );
rdi += destStride;
store24( arr[ 7 ], rdi );
}
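For completeness, here’s what the outer loop might look like. This is a minimal, untested sketch: the transposeRgb24 and copyPixel names are made up for the example, both strides are assumed to be in bytes, and the remainder areas are handled with plain scalar code instead of a second vectorized version.
// Copy a single RGB24 pixel; a hypothetical scalar helper for the remainder areas
inline void copyPixel( uint8_t* rdi, const uint8_t* rsi )
{
rdi[ 0 ] = rsi[ 0 ];
rdi[ 1 ] = rsi[ 1 ];
rdi[ 2 ] = rsi[ 2 ];
}
// Transpose a complete RGB24 image, 8x8 pixels at a time; a hypothetical example driver
void transposeRgb24( uint8_t* rdi, size_t destStride, const uint8_t* rsi, size_t sourceStride, size_t width, size_t height )
{
// Round the image size down to a multiple of the block size
const size_t wb = width & ~(size_t)7;
const size_t hb = height & ~(size_t)7;
// Transpose complete 8x8 blocks with the vectorized function;
// the source block at ( x, y ) lands at ( y, x ) in the output image
for( size_t y = 0; y < hb; y += 8 )
for( size_t x = 0; x < wb; x += 8 )
transpose8x8( rdi + x * destStride + y * 3, destStride, rsi + y * sourceStride + x * 3, sourceStride );
// Scalar remainder: the right edge of the first hb rows, then the bottom rows
for( size_t y = 0; y < height; y++ )
for( size_t x = ( y < hb ) ? wb : 0; x < width; x++ )
copyPixel( rdi + x * destStride + y * 3, rsi + y * sourceStride + x * 3 );
}
The scalar remainder touches at most 7 rows and 7 columns of the image, so it rarely matters for performance; a vectorized version of the remainder handling is possible but probably not worth the complexity.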
I’d like to add that modern software and hardware tend to avoid dealing with RGB24 images in memory. For instance, Windows graphics dropped support for the format in Vista, with Direct3D 10. Hardware and drivers only support texture formats with 8/16/32/64/128 bits per pixel, despite the 33% VRAM overhead that results from storing uncompressed RGB24 bitmaps as 32 bits per pixel.