#include <stdio.h>

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <random>
using namespace std;
// Global sink for the driver: main() stores one byte of the destination
// buffer here after each copy. It is volatile, so every such store is an
// observable side effect that the compiler must keep.
volatile int res = 0;
// Copies the four bytes src[0..3] into dst[0..3], one byte per iteration.
// Both pointers are declared __restrict__: the caller promises that the
// source and destination ranges do not overlap.
void copy(char* __restrict__ dst, char* __restrict__ src) {
    for (int i = 0; i < 4; ++i) {
        dst[i] = src[i];
    }
}
// Copies the four bytes src[offset .. offset+3] into dst[0..3].
// The offset is applied to the source only; the destination is always
// written from its first byte. Both pointers are declared __restrict__,
// so the caller promises the two ranges do not overlap.
void copyOffset(char* __restrict__ dst, char* __restrict__ src, size_t offset) {
    for (size_t i = 0; i < 4; ++i) {
        dst[i] = src[offset + i];
    }
}
// Copies sizeof(int) bytes from src to dst as a single fixed-size block.
//
// This goes through std::memcpy rather than casting the char pointers to
// int* and dereferencing them. The cast form is undefined behaviour: it
// accesses char storage through an int lvalue, and it requires both
// addresses to be suitably aligned for int, which callers of this function
// do not guarantee. A fixed-size memcpy has no alignment requirement.
//
// dst and src must each refer to at least sizeof(int) accessible bytes and
// must not overlap (a std::memcpy precondition).
void copyAsInt(char *dst, char *src) {
    std::memcpy(dst, src, sizeof(int));
}
//----
// Copies the sixteen bytes src[0..15] into dst[0..15], one byte per
// iteration. Both pointers are declared __restrict__: the caller promises
// that the source and destination ranges do not overlap.
void copy16(char* __restrict__ dst, char* __restrict__ src) {
    for (int i = 0; i < 16; ++i) {
        dst[i] = src[i];
    }
}
// Copies the sixteen bytes src[offset .. offset+15] into dst[0..15].
// The offset is applied to the source only; the destination is always
// written from its first byte. Both pointers are declared __restrict__,
// so the caller promises the two ranges do not overlap.
void copyOffset16(char* __restrict__ dst, char* __restrict__ src, size_t offset) {
    for (size_t i = 0; i < 16; ++i) {
        dst[i] = src[offset + i];
    }
}
int main() {
char *a = new char[1001], *b = new char[16];
//--- which pair of statements below is unsafe or not equal each other?
copyOffset(b, a, 20);
res = b[rand() % 4]; // use b[] for something to prevent optimization
copy(b, &a[20]);
res = b[rand() % 4];
//--- non 4 bytes aligned
copyOffset(b, a, 18);
res = b[rand() % 4];
copy(b, &a[18]);
res = b[rand() % 4];
//---
copyOffset16(b, a, 26);
res = b[rand() % 16];
copy(b, &a[26]);
res = b[rand() % 16];
return 1;
}
I'm trying to copy 4 bytes (both the source and the destination are guaranteed to be allocated). However, the source address might not be 4-byte aligned. To copy 4 bytes, I expect the compiler to emit a single DWORD move instruction, as in copyAsInt(). I'm compiling with the -O3 -mavx flags and using Godbolt with gcc 11.2 to inspect the generated assembly.
The function copy() is translated to the same code as copyAsInt(), as expected. However, for some reason, the function copyOffset() is translated into code that copies each byte separately.
copy(char*, char*):
mov eax, DWORD PTR [rsi]
mov DWORD PTR [rdi], eax
ret
copyOffset(char*, char*, unsigned long):
movzx eax, BYTE PTR [rsi+rdx]
mov BYTE PTR [rdi], al
movzx eax, BYTE PTR [rsi+1+rdx]
mov BYTE PTR [rdi+1], al
movzx eax, BYTE PTR [rsi+2+rdx]
mov BYTE PTR [rdi+2], al
movzx eax, BYTE PTR [rsi+3+rdx]
mov BYTE PTR [rdi+3], al
ret
Meanwhile, the functions copy16() and copyOffset16() are both vectorized, as expected.
copy16(char*, char*):
vmovdqu xmm0, XMMWORD PTR [rsi]
vmovdqu XMMWORD PTR [rdi], xmm0
ret
copyOffset16(char*, char*, unsigned long):
vmovdqu xmm0, XMMWORD PTR [rsi+rdx]
vmovdqu XMMWORD PTR [rdi], xmm0
ret
So why isn't copyOffset() optimized by the compiler to use a single mov DWORD? Also, is there any pair of statements in main() that is unsafe or might behave unexpectedly?
Edit: switching to x86-64 gcc (trunk) causes gcc to emit the expected instruction, so I guess this behavior is just due to a compiler heuristic.