I'm trying to measure the performance of a function.
double microbenchmark_get_sqrt_latency()
{
myInt64 start, end;
list<double> cyclesList;
int num_runs = 40;
double cycles = 0.;
double multiplier = 1.;
double x = 500;
// Repeat the measurement 1000 times
for (size_t i = 0; i < 1000; i++)
{
// Measuring...
start = start_tsc();
for (size_t j = 0; j < num_runs; ++j)
{
sqrtsd(x);
}
// Maybe this instruction is called before the loop ends? somehow?
end = stop_tsc(start);
// Doesn't return the correct number of cycles because
cycles = ((double)end) / num_runs;
cyclesList.push_back(cycles);
}
cyclesList.sort();
auto it = cyclesList.begin();
std::advance(it, cyclesList.size() / 2);
return *it;
}
The problem is that the variable `end`, which represents the number of cycles elapsed since the first `rdtsc` instruction, is always equal to 22-24, even when `num_runs` varies up to 10000. I have no explanation for this, except that maybe the closing `rdtsc` instruction is moved to right after the first iteration of the for loop.
The compiler flags that I'm using are: `-O3 -fno-tree-vectorize -march=skylake -std=c++17`
Here's the implementation of `start_tsc()` and `stop_tsc()`:
// Executes RDTSC and stores the EAX output in (counter).int32.lo and the
// EDX output in (counter).int32.hi.
#define RDTSC(counter)   \
    ASM VOLATILE("rdtsc" \
                 : "=a"((counter).int32.lo), "=d"((counter).int32.hi))
// Executes CPUID with EAX = 0. The instruction's results are discarded; the
// callers in this file use it only as a fence around RDTSC.
//
// CPUID overwrites EAX as well as EBX/ECX/EDX. EAX is therefore passed as a
// read-write operand on a scratch variable: an input-only operand must not be
// modified by the asm, otherwise the compiler may assume EAX still holds 0.
#define CPUID()                           \
    do                                    \
    {                                     \
        unsigned int cpuid_eax_ = 0;      \
        ASM VOLATILE("cpuid"              \
                     : "+a"(cpuid_eax_)   \
                     :                    \
                     : "bx", "cx", "dx"); \
    } while (0)
unsigned long long start_tsc(void)
{
tsc_counter start;
CPUID();
RDTSC(start);
return COUNTER_VAL(start);
}
/// Takes the closing timestamp of a measurement and returns the elapsed count.
///
/// @param start Value previously returned by start_tsc().
/// @return COUNTER_VAL of the closing reading minus `start`.
unsigned long long stop_tsc(unsigned long long start)
{
    tsc_counter end;
    // RDTSC is not a serializing instruction: without a fence the CPU may
    // read the counter before the instructions being measured have finished.
    // LFENCE immediately before RDTSC makes the read wait until all earlier
    // instructions have completed.
    __asm__ __volatile__("lfence" : : : "memory");
    RDTSC(end);
    // Keeps later instructions from starting before the counter was read.
    CPUID();
    return COUNTER_VAL(end) - start;
}
What is wrong with the code? I expect the `end` variable to be proportional to `num_runs`, but it is not. Any ideas?