I am trying to write a simple code using SSE and SSE3 to calculate the sum of all the elements of an array. The difference is that in one of the codes I do the sum "vertically" using PADDD and in the other I do the sum horizontally, using HADPPS. Since the only value I am interested in is the total sum, the way I do the sum should not matter. However, the horizontal addition is outputting the wrong results. Any idea why?
This is the code for the regular add:
int sumelems_sse(int *a, int size)
{
int tmp[4];
tmp[0] = 0;
tmp[1] = 0;
tmp[2] = 0;
tmp[3] = 0;
int total;
__asm__ volatile (
"\n\t movdqa %0,%%xmm0 \t#" // moves tmp[0] to xmm0
: /* no output */
: "m" (tmp[0]) //%0
);
for (int i=0;i<size;i+=4) {
__asm__ volatile
( // instruction comment
"\n\t movdqa %0,%%xmm1 \t#" // moves a[i] to xmm1
"\n\t paddd %%xmm1,%%xmm0 \t#" // xmm0 = xmm0+xmm1 in 4 blocks of 32 bits
: /* no output */
: "m" (a[i]) // %0
);
}
__asm__ volatile(
"\n\t movdqa %%xmm0,%0 \t#" // moves xmm0 to tmp[0]
: "=m" (tmp[0])
);
total = tmp[0] + tmp[1] + tmp[2] + tmp[3];
return total;
}
And this is the code for the horizontal add:
int sumelems_sse3(int *a, int size)
{
int tmp[4];
tmp[0] = 0;
tmp[1] = 0;
tmp[2] = 0;
tmp[3] = 0;
int total;
__asm__ volatile (
"\n\t movdqa %0,%%xmm0 \t#" // moves tmp[0] to xmm0
: /* no output */
: "m" (tmp[0]) //%0
);
for (int i=0;i<size;i+=4) {
__asm__ volatile
( // instruction comment
"\n\t movdqa %0,%%xmm1 \t#" // moves a[i] to xmm1
"\n\t haddps %%xmm1,%%xmm0 \t#" // xmm0 = xmm0+xmm2 in 4 blocks of 32 bits
: /* no output */
: "m" (a[i]) // %0
);
}
__asm__ volatile(
"\n\t movdqa %%xmm0,%0 \t#" // moves xmm0 to tmp[0]
: "=m" (tmp[0])
);
total = tmp[0] + tmp[1] + tmp[2] + tmp[3];
return total;
}
I think only the adding instruction should change, or not?