I programmed a function that rotates four 3d points around it own arbitrary 3d axis using SIMD. It works but the performance is terrible! Only 15% better than calling a function four times that rotates only one point at a time. I was hoping for 200%-300% improvement. One issue is, the compiler doesn't recognize the SIMD trig functions. Even if I was able to use them I don't know if my CPU's supports them (Ryzen 3 1200 and Ryzen 7 3700).
I'm hoping to use SIMD to calculate elliptical orbits since there's a lot of trig involved.
I've been searching online but really can't find anything helpful. Here's the code.
vec12d MathFunction::Rotate_SIMD(const vec4d& angle, const vec12d& axis, const vec12d& point)
{
vec4d get_cosine = vec4d(cos(angle.v[0]), cos(angle.v[1]), cos(angle.v[2]), cos(angle.v[3]));
vec4d get_sine = vec4d(sin(angle.v[0]), sin(angle.v[1]), sin(angle.v[2]), sin(angle.v[3]));
__m256d cosTheta = _mm256_loadu_pd(get_cosine.v);
__m256d sinTheta = _mm256_loadu_pd(get_sine.v);
__m256d one = _mm256_set1_pd(1.0);
__m256d minus_cosTheta = _mm256_sub_pd(one, cosTheta);
__m256d Xaxis = _mm256_loadu_pd(axis.v);
__m256d Yaxis = _mm256_loadu_pd(&axis.v[4]);
__m256d Zaxis = _mm256_loadu_pd(&axis.v[8]);
__m256d Xpoint = _mm256_loadu_pd(point.v);
__m256d Ypoint = _mm256_loadu_pd(&point.v[4]);
__m256d Zpoint = _mm256_loadu_pd(&point.v[8]);
// Get new Xaxis points
__m256d Xnew = _mm256_mul_pd(Xaxis, Xaxis);
Xnew = _mm256_mul_pd(Xnew, minus_cosTheta);
Xnew = _mm256_add_pd(Xnew, cosTheta);
Xnew = _mm256_mul_pd(Xnew, Xpoint);
__m256d temp = _mm256_mul_pd(minus_cosTheta, Xaxis);
temp = _mm256_mul_pd(temp, Yaxis);
__m256d temp2 = _mm256_mul_pd(Zaxis, sinTheta);
temp = _mm256_sub_pd(temp, temp2);
temp = _mm256_mul_pd(temp, Ypoint);
Xnew = _mm256_add_pd(Xnew, temp);
temp = _mm256_mul_pd(minus_cosTheta, Xaxis);
temp = _mm256_mul_pd(temp, Zaxis);
temp2 = _mm256_mul_pd(Yaxis, sinTheta);
temp = _mm256_add_pd(temp, temp2);
temp = _mm256_mul_pd(temp, Zpoint);
Xnew = _mm256_add_pd(Xnew, temp);
// Get new Yaxis points
__m256d Ynew = _mm256_mul_pd(minus_cosTheta, Xaxis);
Ynew = _mm256_mul_pd(Ynew, Yaxis);
temp2 = _mm256_mul_pd(Zaxis, sinTheta);
Ynew = _mm256_add_pd(Ynew, temp2);
Ynew = _mm256_mul_pd(Ynew, Xpoint);
temp = _mm256_mul_pd(Yaxis, Yaxis);
temp = _mm256_mul_pd(temp, minus_cosTheta);
temp = _mm256_add_pd(temp, cosTheta);
temp = _mm256_mul_pd(temp, Ypoint);
Ynew = _mm256_add_pd(Ynew, temp);
temp = _mm256_mul_pd(minus_cosTheta, Yaxis);
temp = _mm256_mul_pd(temp, Zaxis);
temp2 = _mm256_mul_pd(Xaxis, sinTheta);
temp = _mm256_sub_pd(temp, temp2);
temp = _mm256_mul_pd(temp, Zpoint);
Ynew = _mm256_add_pd(Ynew, temp);
// Get new Zaxis points
__m256d Znew = _mm256_mul_pd(minus_cosTheta, Xaxis);
Znew = _mm256_mul_pd(Znew, Zaxis);
temp2 = _mm256_mul_pd(Xaxis, sinTheta);
Znew = _mm256_sub_pd(Ynew, temp2);
Znew = _mm256_mul_pd(Ynew, Xpoint);
temp = _mm256_mul_pd(minus_cosTheta, Yaxis);
temp = _mm256_mul_pd(temp, Zaxis);
temp2 = _mm256_mul_pd(Xaxis, sinTheta);
temp = _mm256_add_pd(temp, temp2);
temp = _mm256_mul_pd(temp, Ypoint);
Znew = _mm256_add_pd(Znew, temp);
temp = _mm256_mul_pd(minus_cosTheta, Zaxis);
temp = _mm256_mul_pd(temp, Zaxis);
temp = _mm256_add_pd(temp, cosTheta);
temp = _mm256_mul_pd(temp, Zpoint);
Znew = _mm256_add_pd(Znew, temp);
vec12d Npoint __attribute__ ((aligned(32))); // Four points rotated goes here.
_mm256_store_pd(Npoint.v, Xnew);
_mm256_store_pd(&Npoint.v[4], Ynew);
_mm256_store_pd(&Npoint.v[8], Znew);
return Npoint;
}