I am trying to learn Intel intrinsics so that I can use vector instructions. Here is the code I have written using the vector types and intrinsics declared in immintrin.h
.
Compilation is done using g++ vadd.cpp -O3 -o vadd -std=c++17 -Wall -march=broadwell -fno-tree-vectorize -fopt-info-vec-all=vec.rpt
.
Results are
Vectorized time -----------> 327888 and sum 511116419
Non Vectorized time -------> 553156 and sum 511116419
There is nothing in the vectorization report. I cannot find what I am doing wrong, but the results are the same every time: the vectorized sum takes more time than the non-vectorized sum.
One reason I came up with is that a lot of vector instructions are generated just to load the data into the vectors. Is there any way to reduce them, or any other way to optimize this?
Note: TimeStamp comes from a header file used for measuring time differences in nanoseconds; it is not included in the code shown here.
/**
 * Sums data[i] .. data[iteration - 1] with AVX2 integer adds.
 *
 * @param data  array holding at least `iteration` ints; no alignment is
 *              required because unaligned loads are used.
 * @param i     index of the first element to include. The previous version
 *              shadowed this parameter with its loop counter and always
 *              started at 0; it is now honoured, matching callNonVector.
 * @return      the sum of the selected elements (32-bit lane arithmetic);
 *              0 when i >= iteration.
 *
 * Any element count is handled: a 16-wide main loop, one optional 8-wide
 * step, and a scalar loop for the last (< 8) elements. The previous version
 * read past the end of the range unless the count was a multiple of 16.
 */
int callVector(int data[], int i) {
    const int end = iteration;

    // Two independent accumulators, as in the original, so the two adds in
    // each pass do not depend on one another.
    __m256i acc0 = _mm256_setzero_si256();
    __m256i acc1 = _mm256_setzero_si256();

    int pos = i;

    // Main loop: two 8-int vectors (16 ints) per pass.
    for (; pos <= end - 16; pos += 16) {
        const __m256i lo = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(data + pos));
        const __m256i hi = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(data + pos + 8));
        acc0 = _mm256_add_epi32(acc0, lo);
        acc1 = _mm256_add_epi32(acc1, hi);
    }

    // At most one more full vector of 8 ints fits before the scalar tail.
    if (pos <= end - 8) {
        const __m256i lo = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(data + pos));
        acc0 = _mm256_add_epi32(acc0, lo);
        pos += 8;
    }

    // Horizontal reduction: 2 x 8 lanes -> 8 lanes -> 4 lanes -> scalar.
    const __m256i acc = _mm256_add_epi32(acc0, acc1);
    const __m128i upper = _mm256_extracti128_si256(acc, 1);
    const __m128i quad = _mm_add_epi32(upper, _mm256_castsi256_si128(acc));
    int lanes[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), quad);
    int total = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    // Scalar tail: fewer than 8 elements remain.
    for (; pos < end; ++pos) {
        total += data[pos];
    }
    return total;
}
/**
 * Scalar reference implementation: adds data[start] .. data[iteration - 1]
 * one element at a time.
 *
 * @param data   array to read from.
 * @param start  index of the first element to include.
 * @return       the accumulated sum; 0 when start >= iteration.
 */
int callNonVector(int data[], int start) {
    int total = 0;
    int pos = start;
    while (pos < iteration) {
        total += data[pos];
        ++pos;
    }
    return total;
}
int main () {
int idx[1000000] __attribute__((aligned(64)));
for (int i=0; i<iteration; i++) {
idx[i] = rand() % 1024;
}
int sum = 0;
TimeStamp start{};
sum = callVector(idx, 0);
TimeStamp::IntegralType diff = TimeStamp{} - start;
cout<<"Vectorized time -----------> "<<diff<<" and sum "<<sum<<"\n";
TimeStamp start2{};
sum = callNonVector(idx, 0);
TimeStamp::IntegralType diff2 = TimeStamp{} - start2;
cout<<"Non Vectorized time -------> "<<diff2<<" and sum "<<sum<<"\n";
return 0;
}
Here is the assembly code generated for the above code.
0000000000000d60 <_Z10callVectorPii>:
d60: 8b 05 fa 14 20 00 mov 0x2014fa(%rip),%eax # 202260 <iteration>
d66: 85 c0 test %eax,%eax
d68: 7e 66 jle dd0 <_Z10callVectorPii+0x70>
d6a: ff c8 dec %eax
d6c: c5 e1 ef db vpxor %xmm3,%xmm3,%xmm3
d70: c1 e8 04 shr $0x4,%eax
d73: c5 fd 6f d3 vmovdqa %ymm3,%ymm2
d77: 48 c1 e0 06 shl $0x6,%rax
d7b: 48 01 f8 add %rdi,%rax
d7e: 66 90 xchg %ax,%ax
d80: c5 ed fe 07 vpaddd (%rdi),%ymm2,%ymm0
d84: c5 e5 fe 4f 20 vpaddd 0x20(%rdi),%ymm3,%ymm1
d89: 48 89 fa mov %rdi,%rdx
d8c: 48 83 c7 40 add $0x40,%rdi
d90: c5 fd 6f d0 vmovdqa %ymm0,%ymm2
d94: c5 fd 6f d9 vmovdqa %ymm1,%ymm3
d98: 48 39 d0 cmp %rdx,%rax
d9b: 75 e3 jne d80 <_Z10callVectorPii+0x20>
d9d: c4 e2 7d 02 c1 vphaddd %ymm1,%ymm0,%ymm0
da2: c4 e3 7d 39 c1 01 vextracti128 $0x1,%ymm0,%xmm1
da8: c5 f9 fe c1 vpaddd %xmm1,%xmm0,%xmm0
dac: c4 e3 79 16 c2 02 vpextrd $0x2,%xmm0,%edx
db2: c5 f9 7e c0 vmovd %xmm0,%eax
db6: 01 d0 add %edx,%eax
db8: c4 e3 79 16 c2 01 vpextrd $0x1,%xmm0,%edx
dbe: 01 c2 add %eax,%edx
dc0: c4 e3 79 16 c0 03 vpextrd $0x3,%xmm0,%eax
dc6: 01 d0 add %edx,%eax
dc8: c5 f8 77 vzeroupper
dcb: c3 retq
dcc: 0f 1f 40 00 nopl 0x0(%rax)
dd0: c5 f9 ef c0 vpxor %xmm0,%xmm0,%xmm0
dd4: c5 fd 6f c8 vmovdqa %ymm0,%ymm1
dd8: eb c3 jmp d9d <_Z10callVectorPii+0x3d>
dda: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000000e50 <_Z13callNonVectorPii>:
e50: 8b 15 0a 14 20 00 mov 0x20140a(%rip),%edx # 202260 <iteration>
e56: 39 d6 cmp %edx,%esi
e58: 7d 26 jge e80 <_Z13callNonVectorPii+0x30>
e5a: ff ca dec %edx
e5c: 48 63 ce movslq %esi,%rcx
e5f: 45 31 c0 xor %r8d,%r8d
e62: 29 f2 sub %esi,%edx
e64: 48 8d 04 8f lea (%rdi,%rcx,4),%rax
e68: 48 01 ca add %rcx,%rdx
e6b: 48 8d 54 97 04 lea 0x4(%rdi,%rdx,4),%rdx
e70: 44 03 00 add (%rax),%r8d
e73: 48 83 c0 04 add $0x4,%rax
e77: 48 39 d0 cmp %rdx,%rax
e7a: 75 f4 jne e70 <_Z13callNonVectorPii+0x20>
e7c: 44 89 c0 mov %r8d,%eax
e7f: c3 retq
e80: 45 31 c0 xor %r8d,%r8d
e83: 44 89 c0 mov %r8d,%eax
e86: c3 retq
e87: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
e8e: 00 00