Recently I implemented a function in AVX2 assembly, but after finishing it I found that there is no performance improvement: the original pure C code takes about three hundred CPU cycles, and so does the AVX2 version. Why is this happening, and is there any room left to optimize my AVX2 implementation?

c code:
/* N, L, K and ETA are compile-time parameters from the surrounding
   codebase (the assembly below implies N = 256, L = K = 4, ETA = 2) */
typedef struct {
  int32_t coeffs[N] __attribute__((aligned(32)));
} poly;

typedef struct {
  poly vec[L];
} polyvecl;

typedef struct {
  poly vec[K];
} polyveck;

void prepare_s1_s2_table(uint64_t s_table[2*N], polyvecl *s1, polyveck *s2)
{
  uint32_t k, j;
  uint64_t temp;
  uint64_t mask_s = 0x0404040404040404;

  for(k = 0; k < N; k++) {
    /* pack the L coefficients of s1 at index k, one byte each */
    for(j = 0; j < L; j++) {
      temp = (uint64_t)(ETA + s1->vec[j].coeffs[k]);
      s_table[k+N] = (s_table[k+N] << 8) | temp;
    }
    /* then the K coefficients of s2 at index k */
    for(j = 0; j < K; j++) {
      temp = (uint64_t)(ETA + s2->vec[j].coeffs[k]);
      s_table[k+N] = (s_table[k+N] << 8) | temp;
    }
    /* per byte: (2*ETA) - (ETA + s) = ETA - s */
    s_table[k] = mask_s - s_table[k+N];
  }
}
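To make the packing concrete, here is a standalone sketch of what one iteration of the k loop produces. The parameter values (L = K = 4, ETA = 2, hence the 0x04 mask bytes) are assumptions read off the assembly below:

/* Illustration only: packs one column k under the assumed parameters
   L = K = 4, ETA = 2 (short secrets in [-ETA, ETA]). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t s1[4] = {-2, -1, 0, 1};   /* example s1->vec[j].coeffs[k] */
    int32_t s2[4] = { 2, -2, 1, 0};   /* example s2->vec[j].coeffs[k] */
    const uint64_t mask_s = 0x0404040404040404;
    uint64_t hi = 0;                  /* becomes s_table[k+N] */

    for (int j = 0; j < 4; j++)       /* s1 lands in the four high bytes */
        hi = (hi << 8) | (uint64_t)(2 + s1[j]);
    for (int j = 0; j < 4; j++)       /* s2 lands in the four low bytes */
        hi = (hi << 8) | (uint64_t)(2 + s2[j]);

    /* per byte: 0x04 - (ETA + s) = ETA - s, and no borrow can occur
       because every packed byte is at most 2*ETA = 4 */
    uint64_t lo = mask_s - hi;        /* becomes s_table[k] */

    printf("s_table[k+N] = %016llx\n", (unsigned long long)hi);
    printf("s_table[k]   = %016llx\n", (unsigned long long)lo);
    return 0;
}

Because L + K = 8, eight shift-or steps push any initial contents of s_table[k+N] out of the 64-bit word, which is why the table does not need to be zeroed first.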
avx2 code (an intrinsics sketch of the macro and the cycle-measurement harness follow the listing):

// preprocessor macro for platform-dependent symbol-name mangling
#if defined(__WIN32__) || defined(__APPLE__)
#define cdecl(s) _##s
#else
#define cdecl(s) s
#endif
.macro prepares1s2 off,off2
# load coeffs
vpmovsxdq (\off)(%rsi),%ymm0
vpmovsxdq (1024+\off)(%rsi),%ymm1
vpmovsxdq (2048+\off)(%rsi),%ymm2
vpmovsxdq (3072+\off)(%rsi),%ymm3
vpmovsxdq (\off)(%rdx),%ymm4
vpmovsxdq (1024+\off)(%rdx),%ymm5
vpmovsxdq (2048+\off)(%rdx),%ymm6
vpmovsxdq (3072+\off)(%rdx),%ymm7
# add eta s1
vpaddq %ymm0,%ymm15,%ymm8
vpaddq %ymm1,%ymm15,%ymm9
vpaddq %ymm2,%ymm15,%ymm10
vpaddq %ymm3,%ymm15,%ymm11
# pack s1 for s_table[i+N]
vpsllq $8,%ymm8,%ymm8
vpor %ymm8,%ymm9,%ymm8
vpsllq $8,%ymm8,%ymm8
vpor %ymm8,%ymm10,%ymm8
vpsllq $8,%ymm8,%ymm8
vpor %ymm8,%ymm11,%ymm8
vpsllq $8,%ymm8,%ymm8
# add eta s2
vpaddq %ymm4,%ymm15,%ymm9
vpaddq %ymm5,%ymm15,%ymm10
vpaddq %ymm6,%ymm15,%ymm11
vpaddq %ymm7,%ymm15,%ymm12
# pack s2 for s_table[i+N]
vpor %ymm8,%ymm9,%ymm8
vpsllq $8,%ymm8,%ymm8
vpor %ymm8,%ymm10,%ymm8
vpsllq $8,%ymm8,%ymm8
vpor %ymm8,%ymm11,%ymm8
vpsllq $8,%ymm8,%ymm8
vpor %ymm8,%ymm12,%ymm8
# compute s_table[i]: mask - (ETA+s) = ETA - s in every byte
vpsubq %ymm8,%ymm14,%ymm0
# store
vmovdqa %ymm0,(\off2)(%rdi)
vmovdqa %ymm8,(2048+\off2)(%rdi)
.endm
.p2align 5 # align the function entry on a 32-byte (2^5) boundary
.global cdecl(prepare_s1s2_table_avx)
cdecl(prepare_s1s2_table_avx):
vpbroadcastq _4xeta(%rip),%ymm15
vpbroadcastq _4xmasks(%rip),%ymm14
prepares1s2 0,0
prepares1s2 16,32
prepares1s2 32,64
prepares1s2 48,96
prepares1s2 64,128
prepares1s2 80,160
prepares1s2 96,192
prepares1s2 112,224
prepares1s2 128,256
prepares1s2 144,288
prepares1s2 160,320
prepares1s2 176,352
prepares1s2 192,384
prepares1s2 208,416
prepares1s2 224,448
prepares1s2 240,480
prepares1s2 256,512
prepares1s2 272,544
prepares1s2 288,576
prepares1s2 304,608
prepares1s2 320,640
prepares1s2 336,672
prepares1s2 352,704
prepares1s2 368,736
prepares1s2 384,768
prepares1s2 400,800
prepares1s2 416,832
prepares1s2 432,864
prepares1s2 448,896
prepares1s2 464,928
prepares1s2 480,960
prepares1s2 496,992
prepares1s2 512,1024
prepares1s2 528,1056
prepares1s2 544,1088
prepares1s2 560,1120
prepares1s2 576,1152
prepares1s2 592,1184
prepares1s2 608,1216
prepares1s2 624,1248
prepares1s2 640,1280
prepares1s2 656,1312
prepares1s2 672,1344
prepares1s2 688,1376
prepares1s2 704,1408
prepares1s2 720,1440
prepares1s2 736,1472
prepares1s2 752,1504
prepares1s2 768,1536
prepares1s2 784,1568
prepares1s2 800,1600
prepares1s2 816,1632
prepares1s2 832,1664
prepares1s2 848,1696
prepares1s2 864,1728
prepares1s2 880,1760
prepares1s2 896,1792
prepares1s2 912,1824
prepares1s2 928,1856
prepares1s2 944,1888
prepares1s2 960,1920
prepares1s2 976,1952
prepares1s2 992,1984
prepares1s2 1008,2016
ret
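For readers who find intrinsics easier to follow, one invocation of the prepares1s2 macro corresponds roughly to the sketch below. This is a hand translation for illustration (the helper name prepares1s2_block and the flat pointer arguments are mine, not part of the real code); it assumes N = 256, so a poly is 256 int32 coefficients, matching the 1024-byte strides above:

#include <immintrin.h>
#include <stdint.h>

/* One macro invocation: processes 4 consecutive k values.
   s1c/s2c point at s1->vec[0].coeffs[k] and s2->vec[0].coeffs[k];
   out points at &s_table[k]; eta4/mask4 are the broadcast constants
   loaded from _4xeta/_4xmasks. */
static inline void prepares1s2_block(uint64_t *out,
                                     const int32_t *s1c,
                                     const int32_t *s2c,
                                     __m256i eta4, __m256i mask4)
{
    __m256i packed = _mm256_setzero_si256();

    for (int j = 0; j < 4; j++) {     /* vpmovsxdq + vpaddq + vpsllq/vpor */
        __m256i c = _mm256_cvtepi32_epi64(
            _mm_load_si128((const __m128i *)(s1c + 256*j)));
        packed = _mm256_or_si256(_mm256_slli_epi64(packed, 8),
                                 _mm256_add_epi64(c, eta4));
    }
    for (int j = 0; j < 4; j++) {
        __m256i c = _mm256_cvtepi32_epi64(
            _mm_load_si128((const __m128i *)(s2c + 256*j)));
        packed = _mm256_or_si256(_mm256_slli_epi64(packed, 8),
                                 _mm256_add_epi64(c, eta4));
    }
    /* vpsubq + the two vmovdqa stores */
    _mm256_store_si256((__m256i *)out,
                       _mm256_sub_epi64(mask4, packed));  /* s_table[k]   */
    _mm256_store_si256((__m256i *)(out + 256), packed);   /* s_table[k+N] */
}

The assembly simply unrolls this 64 times to cover all 256 coefficient indices, four per invocation.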
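For reference, the cycle counts quoted at the top come from an rdtsc-style harness along the following lines (a simplified sketch: the run count, the all-zero dummy inputs, and the lack of serializing instructions are illustrative, not the exact setup):

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>   /* __rdtsc */

/* Linux symbol name; on __WIN32__/__APPLE__ the cdecl macro adds a
   leading underscore. */
extern void prepare_s1s2_table_avx(uint64_t *s_table,
                                   const int32_t *s1, const int32_t *s2);

#define NRUNS 1000

int main(void)
{
    static uint64_t s_table[512] __attribute__((aligned(32)));
    static int32_t s1[4*256] __attribute__((aligned(32)));  /* polyvecl */
    static int32_t s2[4*256] __attribute__((aligned(32)));  /* polyveck */

    uint64_t start = __rdtsc();
    for (int i = 0; i < NRUNS; i++)
        prepare_s1s2_table_avx(s_table, s1, s2);
    uint64_t stop = __rdtsc();

    printf("avg cycles per call: %llu\n",
           (unsigned long long)((stop - start) / NRUNS));
    return 0;
}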