I am trying to use SIMD (x86 `immintrin.h`) to speed up my math. The code looks like this:
#include <float.h>
#include <immintrin.h>
#include <cmath>
// 2-D point with double coordinates.
// The class is 16-byte aligned and x_ is declared immediately before y_, so
// CrossSIMD can fetch both coordinates with a single aligned 128-bit load.
class alignas(16) Point2 {
 public:
  Point2() = default;
  Point2(double xx, double yy) : x_(xx), y_(yy) {}

  // SIMD
  // Returns the 2-D cross product x_ * other.y_ - y_ * other.x_, computed
  // with SSE2 intrinsics. Produces the same value as Cross().
  double CrossSIMD(Point2 other) const {
    // Low lane = x_, high lane = y_ (aligned load starting at x_).
    const __m128d self = _mm_load_pd(&x_);
    // _mm_set_pd takes its arguments HIGH lane first, so this yields
    // low lane = other.y_, high lane = other.x_.
    const __m128d swapped = _mm_set_pd(other.x_, other.y_);
    // Low lane = x_ * other.y_, high lane = y_ * other.x_.
    const __m128d products = _mm_mul_pd(self, swapped);
    // Bring the high lane down and subtract in registers; no store to a
    // temporary array (and therefore no alignment requirement on one).
    const __m128d high = _mm_unpackhi_pd(products, products);
    return _mm_cvtsd_f64(_mm_sub_sd(products, high));
  }

  // None SIMD
  // Returns the 2-D cross product x_ * other.y_ - y_ * other.x_ using plain
  // scalar arithmetic.
  double Cross(Point2 other) const {
    return x_ * other.y_ - y_ * other.x_;
  }

 private:
  // Keep x_ directly before y_: CrossSIMD loads both starting at &x_.
  double x_ = 0.;
  double y_ = 0.;
};
int main() {
double sum_cross = 0.;
{
Timer test_1("Cross");
for(int i = 0; i < kLoop; ++i) {
int index_1x = i & 1023;
int index_1y = (i + 1) & 1023;
int index_2x = (i + 2) & 1023;
int index_2y = (i + 3) & 1023;
sum_cross += Point2(array[index_1x], array[index_1y]).Cross(Point2(array[index_2x], array[index_2y]));
}
}
std::cout << sum_cross << std::endl;
double sum_simd = 0.;
{
Timer test_1("SIMD");
for(int i = 0; i < kLoop; ++i) {
int index_1x = i & 1023;
int index_1y = (i + 1) & 1023;
int index_2x = (i + 2) & 1023;
int index_2y = (i + 3) & 1023;
sum_simd += Point2(array[index_1x], array[index_1y]).CrossSIMD(Point2(array[index_2x], array[index_2y]));
}
}
std::cout << sum_simd << std::endl;
std::cout << sum_simd - sum_cross << std::endl;
return 0;
}
Compiled with GCC 7.5.0 on Linux with the following options:
g++ -o cross_simd cross_simd.cpp -O3 -march=native
However, profiling shows that the SIMD version (`CrossSIMD`) is about 3 times slower than the scalar `Cross` function.
I also tried reading the compiler's assembly output (`.s` files), but I still can't find the reason for the slowdown.