I am trying to do loop unrolling using parallel accumulators but I am confused by the dependencies and calculations. The result and result2 should ideally run independent of each other in order to make use of CPU microarchitecture pipelined design, so that they can be executed in parallel. Meaning the following code should not be used:
for (i = degree - 1; i >= 1; i-=2)
{
result = a[i] + x * result;
result2 = a[i-1] + x * result; //same dependency
}
Original function:
double polyh(double a[], double x, long degree)
{
long i;
double result = a[degree];
for (i = degree - 1; i >= 0; i--)
{
result = a[i] + x * result;
}
return result;
}
INTO:
double poly_opt(double a[], double x, long degree)
{
long i;
double result = a[degree];
double result_2 = 0;
double result_array[2] = {result, result_2};
double xpwr_1 = x; // 1
double xpwr_2 = x * x; // 2
double xpwr_array[2] = {xpwr_1, xpwr_2};
for (i = degree - 1; i >= 1; i -= 2)
{
result = a[i] + xpwr_1 * result;
result_2 = a[i - 1] + xpwr_2 * result_2;
xpwr_1 = xpwr_1 * x * x;
xpwr_2 = xpwr_2 * x * x;
}
// leftover when input not multiple of loop unrolling factor
for (; i >= 0; --i)
{
result = a[i] + xpwr_1 * result;
xpwr_1 = x * xpwr_1;
}
return result * result_2;
}
I am trying to introduce a 2nd variable result2 to do a loop unrolling of 2 but I was unable to get the result as the code above does not work.
This was an attempt without using parallel accumulators and loop unrolling factor of 8 which works slightly but I would like introduce new variables in hopes of speeding it up:
double poly_opt(double a[], double x, long degree)
{
long i;
double result = a[degree];
for (i = degree - 1; i >= 8; i -= 9)
{
result = a[i - 8] + (a[i - 7] + (a[i - 6] + (a[i - 5] + (a[i - 4] + (a[i - 3] + ((a[i - 2] + (a[i - 1] + (a[i] + x * result) * x) * x) * x)) * x) * x) * x) * x) * x;
}
// leftover when input not multiple of loop unrolling factor
for (; i >= 0; --i)
{
result = a[i] + x * result;
}
return result;
}
I am testing with these data sets:
double a[] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0};
double x = 0.5;
long degree = 359;
int main()
{
std::cout << polyh(a, x, degree) << std::endl;
return 0;
}
The correct output should be: 3.1428571429