With the addsd
change suggested in my comments above, --> addsd xmm0,xmm3
, this can be coded to use the full width registers and the performance is twice as fast.
Loosely:
For the initial value of ones
, it needs to be:
double ones[2] = { 1.0, x }
And we need to replace x
with x2
:
double x2[2] = { x * x, x * x }
If there is an odd number of coefficients, pad it with a zero to produce an even number of them.
And, changing the pointer increment to 16.
Here are the test results I got. I did a number of trials and took the ones that had the best time and elongated the time by doing 100 iterations. std is the C version, dbl is your version, and qed is the "wide" version:
R=1463870188
C=100
T=100
I=100
x: 3.467957099973322e+00 3.467957099973322e+00
one: 1.000000000000000e+00 3.467957099973322e+00
x2: 1.202672644725538e+01 1.202672644725538e+01
std: 2.803772098439484e+56 (ELAP: 0.000019312)
dbl: 2.803772098439484e+56 (ELAP: 0.000019312)
qed: 2.803772098439492e+56 (ELAP: 0.000009060)
rtn_loop: 2.179378907910304e+55 2.585834207648461e+56
rtn_shuf: 2.585834207648461e+56 2.179378907910304e+55
rtn_add: 2.803772098439492e+56 2.585834207648461e+56
This was done on an i7 920 @ 2.67 GHz.
I think if you take the elapsed numbers and convert them, you'll see that your version is faster than you think.
I apologize, in advance, for switching to AT&T syntax as I had difficulty getting the assembler to work the other way. Again, sorry. Also, I'm using linux, so I used the rdi rsi
registers to pass the coefficient pointers. If you're on windows, the ABI is different and you'll have to adjust for that.
I did a C version and diassembled it. It was virtually identical to your code except that it rearranged the non-xmm instructions a bit, which I've added below.
I believe I posted all the files, so you could conceivably run this on your system if you wished.
Here's the original code:
# xmmloop/dbl.s -- implement using single double
.globl dbl
# dbl -- compute result using single double
#
# arguments:
# rdi -- pointer to coeff vector
# rsi -- pointer to coeff vector end
dbl:
movsd x(%rip),%xmm2 # get x value
movsd one(%rip),%xmm1 # get ones
xorps %xmm0,%xmm0 # sum = 0
dbl_loop:
movsd (%rdi),%xmm3 # c[i]
add $8,%rdi # increment to next vector element
cmp %rsi,%rdi # done yet?
mulsd %xmm1,%xmm3 # c[i]*x^i
mulsd %xmm2,%xmm1 # x^(i+1)
addsd %xmm3,%xmm0 # sum += c[i]*x^i
jb dbl_loop # no, loop
retq
Here's the code changed to use the movapd
et. al:
# xmmloop/qed.s -- implement using single double
.globl qed
# qed -- compute result using single double
#
# arguments:
# rdi -- pointer to coeff vector
# rsi -- pointer to coeff vector end
qed:
movapd x2(%rip),%xmm2 # get x^2 value
movapd one(%rip),%xmm1 # get [1,x]
xorpd %xmm4,%xmm4 # sum = 0
qed_loop:
movapd (%rdi),%xmm3 # c[i]
add $16,%rdi # increment to next coefficient
cmp %rsi,%rdi # done yet?
mulpd %xmm1,%xmm3 # c[i]*x^i
mulpd %xmm2,%xmm1 # x^(i+2)
addpd %xmm3,%xmm4 # sum += c[i]*x^i
jb qed_loop # no, loop
movapd %xmm4,rtn_loop(%rip) # save intermediate DEBUG
movapd %xmm4,%xmm0 # get lower sum
shufpd $1,%xmm4,%xmm4 # get upper value into lower half
movapd %xmm4,rtn_shuf(%rip) # save intermediate DEBUG
addsd %xmm4,%xmm0 # add upper sum to lower
movapd %xmm0,rtn_add(%rip) # save intermediate DEBUG
retq
Here's a C version of the code:
// xmmloop/std -- compute result using C code
#include <xmmloop.h>
// std -- compute result using C
double
std(const double *cur,const double *ep)
{
double xt;
double xn;
double ci;
double sum;
xt = x[0];
xn = one[0];
sum = 0;
for (; cur < ep; ++cur) {
ci = *cur; // get c[i]
ci *= xn; // c[i]*x^i
xn *= xt; // x^(i+1)
sum += ci; // sum += c[i]*x^i
}
return sum;
}
Here's the test program I used:
// xmmloop/xmmloop -- test program
#define _XMMLOOP_GLO_
#include <xmmloop.h>
// tvget -- get high precision time
double
tvget(void)
{
struct timespec ts;
double sec;
clock_gettime(CLOCK_REALTIME,&ts);
sec = ts.tv_nsec;
sec /= 1e9;
sec += ts.tv_sec;
return sec;
}
// timeit -- get best time
void
timeit(fnc_p proc,double *cofptr,double *cofend,const char *tag)
{
double tvbest;
double tvbeg;
double tvdif;
double sum;
sum = 0;
tvbest = 1e9;
for (int trycnt = 1; trycnt <= opt_T; ++trycnt) {
tvbeg = tvget();
for (int iter = 1; iter <= opt_I; ++iter)
sum = proc(cofptr,cofend);
tvdif = tvget();
tvdif -= tvbeg;
if (tvdif < tvbest)
tvbest = tvdif;
}
printf("%s: %.15e (ELAP: %.9f)\n",tag,sum,tvbest);
}
// main -- main program
int
main(int argc,char **argv)
{
char *cp;
double *cofptr;
double *cofend;
double *cur;
double val;
long rseed;
int cnt;
--argc;
++argv;
rseed = 0;
cnt = 0;
for (; argc > 0; --argc, ++argv) {
cp = *argv;
if (*cp != '-')
break;
switch (cp[1]) {
case 'C':
cp += 2;
cnt = strtol(cp,&cp,10);
break;
case 'R':
cp += 2;
rseed = strtol(cp,&cp,10);
break;
case 'T':
cp += 2;
opt_T = (*cp != 0) ? strtol(cp,&cp,10) : 1;
break;
case 'I':
cp += 2;
opt_I = (*cp != 0) ? strtol(cp,&cp,10) : 1;
break;
}
}
if (rseed == 0)
rseed = time(NULL);
srand48(rseed);
printf("R=%ld\n",rseed);
if (cnt == 0)
cnt = 100;
if (cnt & 1)
++cnt;
printf("C=%d\n",cnt);
if (opt_T == 0)
opt_T = 100;
printf("T=%d\n",opt_T);
if (opt_I == 0)
opt_I = 100;
printf("I=%d\n",opt_I);
cofptr = malloc(sizeof(double) * cnt);
cofend = &cofptr[cnt];
val = drand48();
for (; val < 3; val += 1.0);
x[0] = val;
x[1] = val;
DMP(x);
one[0] = 1.0;
one[1] = val;
DMP(one);
val *= val;
x2[0] = val;
x2[1] = val;
DMP(x2);
for (cur = cofptr; cur < cofend; ++cur) {
val = drand48();
val *= 1e3;
*cur = val;
}
timeit(std,cofptr,cofend,"std");
timeit(dbl,cofptr,cofend,"dbl");
timeit(qed,cofptr,cofend,"qed");
DMP(rtn_loop);
DMP(rtn_shuf);
DMP(rtn_add);
return 0;
}
And the header file:
// xmmloop/xmmloop.h -- common control
#ifndef _xmmloop_xmmloop_h_
#define _xmmloop_xmmloop_h_
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef _XMMLOOP_GLO_
#define EXTRN_XMMLOOP /**/
#else
#define EXTRN_XMMLOOP extern
#endif
#define XMMALIGN __attribute__((aligned(16)))
EXTRN_XMMLOOP int opt_T;
EXTRN_XMMLOOP int opt_I;
EXTRN_XMMLOOP double x[2] XMMALIGN;
EXTRN_XMMLOOP double x2[2] XMMALIGN;
EXTRN_XMMLOOP double one[2] XMMALIGN;
EXTRN_XMMLOOP double rtn_loop[2] XMMALIGN;
EXTRN_XMMLOOP double rtn_shuf[2] XMMALIGN;
EXTRN_XMMLOOP double rtn_add[2] XMMALIGN;
#define DMP(_sym) \
printf(#_sym ": %.15e %.15e\n",_sym[0],_sym[1]);
typedef double (*fnc_p)(const double *cofptr,const double *cofend);
double std(const double *cofptr,const double *cofend);
double dbl(const double *cofptr,const double *cofend);
double qed(const double *cofptr,const double *cofend);
#endif