The link you mentioned is using non-temporal stores. I have discussed this several times before, for example here and here. I would suggest you read those before proceeding further.
But if you really want to reproduce the inline assembly code in the link you mentioned, here is how to do it: use intrinsics instead.
The fact that you cannot compile that code with GCC is exactly one of the reasons intrinsics were created. Inline assembly has to be written differently for 32-bit and 64-bit code and typically has different syntax for each compiler. Intrinsics solve all these issues.
The following code should compile with GCC, Clang, ICC, and MSVC in both 32-bit and 64-bit mode.
#include "xmmintrin.h"
void X_aligned_memcpy_sse2(char* dest, const char* src, const unsigned long size)
{
for(int i=size/128; i>0; i--) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
_mm_prefetch(src + 128, _MM_HINT_NTA);
_mm_prefetch(src + 160, _MM_HINT_NTA);
_mm_prefetch(src + 194, _MM_HINT_NTA);
_mm_prefetch(src + 224, _MM_HINT_NTA);
xmm0 = _mm_load_si128((__m128i*)&src[ 0]);
xmm1 = _mm_load_si128((__m128i*)&src[ 16]);
xmm2 = _mm_load_si128((__m128i*)&src[ 32]);
xmm3 = _mm_load_si128((__m128i*)&src[ 48]);
xmm4 = _mm_load_si128((__m128i*)&src[ 64]);
xmm5 = _mm_load_si128((__m128i*)&src[ 80]);
xmm6 = _mm_load_si128((__m128i*)&src[ 96]);
xmm7 = _mm_load_si128((__m128i*)&src[ 112]);
_mm_stream_si128((__m128i*)&dest[ 0], xmm0);
_mm_stream_si128((__m128i*)&dest[ 16], xmm1);
_mm_stream_si128((__m128i*)&dest[ 32], xmm2);
_mm_stream_si128((__m128i*)&dest[ 48], xmm3);
_mm_stream_si128((__m128i*)&dest[ 64], xmm4);
_mm_stream_si128((__m128i*)&dest[ 80], xmm5);
_mm_stream_si128((__m128i*)&dest[ 96], xmm6);
_mm_stream_si128((__m128i*)&dest[ 112], xmm7);
src += 128;
dest += 128;
}
}
Note that src and dest need to be 16-byte aligned and that size needs to be a multiple of 128.
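If it helps, here is a minimal call site (just a sketch: I use _mm_malloc/_mm_free as one way to get 16-byte-aligned buffers; any aligned allocator works, and on MSVC these functions come from malloc.h):

#include <emmintrin.h>  // also pulls in _mm_malloc/_mm_free on GCC/Clang
#include <string.h>

void X_aligned_memcpy_sse2(char* dest, const char* src, const unsigned long size);

int main(void)
{
    const unsigned long size = 1024;       // multiple of 128
    char *src  = _mm_malloc(size, 16);     // 16-byte aligned source
    char *dest = _mm_malloc(size, 16);     // 16-byte aligned destination
    memset(src, 42, size);                 // fill source with something
    X_aligned_memcpy_sse2(dest, src, size);
    _mm_free(src);
    _mm_free(dest);
    return 0;
}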
I don't, however, advise using this code. In the cases where non-temporal stores are useful, loop unrolling is pointless and explicit prefetching is rarely ever helpful. You can simply do

#include <xmmintrin.h>  // _mm_load_ps, _mm_stream_ps

void copy(char *x, char *y, int n)
{
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n/16; i++) {
        // aligned load followed by a non-temporal store, 16 bytes per iteration
        _mm_stream_ps((float*)&y[16*i], _mm_load_ps((float*)&x[16*i]));
    }
}
More details as to why can be found here.
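In case you want to try that version, a small driver could look like this (again only a sketch: the buffers still have to be 16-byte aligned, n a multiple of 16, and OpenMP has to be enabled, e.g. -O3 -fopenmp with GCC):

#include <xmmintrin.h>  // _mm_malloc/_mm_free for aligned buffers

void copy(char *x, char *y, int n);

int main(void)
{
    int n = 1 << 20;                 // 1 MiB, a multiple of 16
    char *x = _mm_malloc(n, 16);     // source
    char *y = _mm_malloc(n, 16);     // destination
    for (int i = 0; i < n; i++) x[i] = (char)i;
    copy(x, y, n);
    _mm_free(x);
    _mm_free(y);
    return 0;
}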
Here is the assembly from the X_aligned_memcpy_sse2 function using intrinsics, compiled with GCC -O3 -S -masm=intel. Notice that it's essentially the same as here.
shr rdx, 7
test edx, edx
mov eax, edx
jle .L1
.L5:
sub rsi, -128
movdqa xmm6, XMMWORD PTR [rsi-112]
prefetchnta [rsi]
prefetchnta [rsi+32]
prefetchnta [rsi+66]
movdqa xmm5, XMMWORD PTR [rsi-96]
prefetchnta [rsi+96]
sub rdi, -128
movdqa xmm4, XMMWORD PTR [rsi-80]
movdqa xmm3, XMMWORD PTR [rsi-64]
movdqa xmm2, XMMWORD PTR [rsi-48]
movdqa xmm1, XMMWORD PTR [rsi-32]
movdqa xmm0, XMMWORD PTR [rsi-16]
movdqa xmm7, XMMWORD PTR [rsi-128]
movntdq XMMWORD PTR [rdi-112], xmm6
movntdq XMMWORD PTR [rdi-96], xmm5
movntdq XMMWORD PTR [rdi-80], xmm4
movntdq XMMWORD PTR [rdi-64], xmm3
movntdq XMMWORD PTR [rdi-48], xmm2
movntdq XMMWORD PTR [rdi-128], xmm7
movntdq XMMWORD PTR [rdi-32], xmm1
movntdq XMMWORD PTR [rdi-16], xmm0
sub eax, 1
jne .L5
.L1:
rep ret