I have vectorized the inner loop of matrix addition using AVX2 intrinsics, and I also have the latency table from here. I expected a speedup of about a factor of 5, because roughly 4 cycles of latency are paid in each of 1024 scalar iterations versus about 6 cycles in each of 128 AVX2 iterations, yet the measured speedup is only a factor of 3. So the question is: what else is going on here that I don't see? I'm using gcc, coding in C with intrinsics; the CPU is a Skylake i7-6700HQ.
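Spelling that estimate out explicitly (my own back-of-the-envelope, using the per-instruction numbers annotated in the listings below): scalar ≈ 1024 iterations × 4 cycles = 4096 cycles, AVX2 ≈ 128 iterations × 6 cycles = 768 cycles, and 4096 / 768 ≈ 5.3, which is where the expected factor of ~5 comes from.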
Here are the C source and the assembly output of the inner loops.
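(Something along the lines of gcc -O2 -mavx2 -S add.c reproduces listings like the ones below; the file name and exact flags here are illustrative, not copied from my build.)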
global data:
int __attribute__((aligned(32))) a[MAX1][MAX2];
int __attribute__((aligned(32))) b[MAX1][MAX2];
int __attribute__((aligned(32))) c_result[MAX1][MAX2];
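(MAX1 and MAX2 are compile-time constants; given the cmpq $4096 in the loops below, a row is 4096 bytes = 1024 ints, so MAX2 is presumably 1024.)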
sequential:
for (i = 0; i < MAX1; i++)
    for (j = 0; j < MAX2; j++)
        c_result[i][j] = a[i][j] + b[i][j];
.L16:
movl (%r9,%rax), %edx            // latency: 2, throughput: 0.5, execution units: 4 ALU
addl (%r8,%rax), %edx            // latency: don't know, throughput: 0.5, execution units: 4 ALU
movl %edx, c_result(%rcx,%rax)   // latency: 2, throughput: 1, execution units: 4 ALU
addq $4, %rax
cmpq $4096, %rax
jne .L16
AVX2:
for (i = 0; i < MAX1; i++) {
    for (j = 0; j < MAX2; j += 8) {
        __m256i a0_i = _mm256_add_epi32(_mm256_load_si256((__m256i *)&a[i][j]),
                                        _mm256_load_si256((__m256i *)&b[i][j]));
        _mm256_store_si256((__m256i *)&c_result[i][j], a0_i);
    }
}
.L22:
vmovdqa (%rcx,%rax), %ymm0           // latency: 3, throughput: 0.5, execution units: 4 ALU
vpaddd (%r8,%rax), %ymm0, %ymm0      // latency: don't know, throughput: 0.5, execution units: 3 vector ALU
vmovdqa %ymm0, c_result(%rdx,%rax)   // latency: 3, throughput: 1, execution units: 4 ALU
addq $32, %rax
cmpq $4096, %rax
jne .L22
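For anyone who wants to reproduce the measurement, here is a minimal self-contained harness mirroring the two loops above. It is a sketch, not my exact benchmark: MAX1 = MAX2 = 1024 are assumptions (inferred from cmpq $4096, i.e. 1024 ints per row), the timing uses clock_gettime, and newer gcc versions may auto-vectorize the scalar loop at -O2, in which case -fno-tree-vectorize keeps the comparison honest.

#include <immintrin.h>
#include <stdio.h>
#include <time.h>

#define MAX1 1024   /* assumed; not given in the original snippets     */
#define MAX2 1024   /* matches cmpq $4096: 1024 ints * 4 bytes per row */

int __attribute__((aligned(32))) a[MAX1][MAX2];
int __attribute__((aligned(32))) b[MAX1][MAX2];
int __attribute__((aligned(32))) c_result[MAX1][MAX2];

static double now_sec(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void)
{
    int i, j;

    /* touch all pages first so neither timed loop pays page-fault cost */
    for (i = 0; i < MAX1; i++)
        for (j = 0; j < MAX2; j++) {
            a[i][j] = i + j;
            b[i][j] = i - j;
        }

    /* scalar version */
    double t0 = now_sec();
    for (i = 0; i < MAX1; i++)
        for (j = 0; j < MAX2; j++)
            c_result[i][j] = a[i][j] + b[i][j];
    double t_seq = now_sec() - t0;

    /* AVX2 version: 8 ints per vector, so j advances by 8 */
    t0 = now_sec();
    for (i = 0; i < MAX1; i++)
        for (j = 0; j < MAX2; j += 8) {
            __m256i a0_i = _mm256_add_epi32(
                _mm256_load_si256((__m256i *)&a[i][j]),
                _mm256_load_si256((__m256i *)&b[i][j]));
            _mm256_store_si256((__m256i *)&c_result[i][j], a0_i);
        }
    double t_avx = now_sec() - t0;

    printf("sequential: %f s, AVX2: %f s, speedup: %.2fx\n",
           t_seq, t_avx, t_seq / t_avx);
    return 0;
}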