I'm trying to compare different methods for calculating dot products using SSE intrinsics. Since each method is only a few cycles long, I have to run the instructions trillions of times for the run to take more than a tiny fraction of a second. The only problem is that `gcc` with the `-O3` flag is "optimizing" my `main` function into an infinite loop.
My code is
#include <immintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <inttypes.h>
#define NORMAL 0
/*
 * Four packed floats: the x, y, z components of a 3-D vector plus a fourth
 * component w, giving a 16-byte payload that fills one 128-bit SSE register.
 */
typedef struct _Vec3 {
    float x, y, z;
    float w;
} Vec3;
/*
 * Dot product of the x, y and z lanes (lanes 0-2) of two packed-float vectors.
 *
 * The _mm_dp_ps immediate is two nibbles: bits 7:4 select which lanes are
 * multiplied and summed, bits 3:0 select which result lanes receive the sum
 * (unselected result lanes are set to zero). A mask whose high nibble is zero,
 * such as 0x0F, selects no input lanes and therefore always yields 0.0f.
 * 0x7F sums lanes 0-2, ignores lane 3 (w), and writes the sum to all four
 * result lanes.
 *
 * The target attribute (GCC/Clang) enables SSE4.1 for this one function, so it
 * builds even when the file is compiled without -msse4.1 or -march=native.
 *
 * Returns a vector with the dot product replicated in every lane.
 */
__attribute__((target("sse4.1")))
__m128 singleDot(__m128 a, __m128 b) {
    return _mm_dp_ps(a, b, 0x7F);
}
int main(int argc, char** argv) {
for (uint16_t j = 0; j < (1L << 16); j++) {
for (uint64_t i = 0; i < (1L << 62); i++) {
Vec3 a = {i, i + 0.5, i + 1, 0.0};
Vec3 b = {i, i - 0.5, i - 1, 0.0};
#if NORMAL
float ans = normalDot(a, b); // naive implementation
#else
// float _c[4] = {a.x, a.y, a.z, 0.0};
// float _d[4] = {b.x, b.y, b.z, 0.0};
__m128 c = _mm_load_ps((float*)&a);
__m128 d = _mm_load_ps((float*)&b);
__m128 ans = singleDot(c, d);
#endif
}
}
}
But when I compile with `gcc -std=c11 -march=native -O3 main.c` and run `objdump -d`, it turns `main` into
0000000000400400 <main>:
400400: eb fe jmp 400400 <main>
Is there an alternative way to time the different approaches?