I'm trying to microbenchmark the following C++ code, compiled with the -O3
g++ compiler option for maximum performance:
long long x = 0;
int iterations = 1000000;
int load = 1000;
for(int i = 0; i < iterations; i++) {
long start = get_nano_ts(&ts);
for(int j = 0; j < load; j++) {
long p = (i % 8) * (i % 16);
if (i % 2 == 0) {
x += p;
} else {
x -= p;
}
asm(""); // so that the loop is not removed by -O3
}
long end = get_nano_ts(&ts);
int res = end - start - nanoTimeCost;
if (res <= 0) res = 1;
// (...) removed for clarity
}
cout << "Value computed: " << x << endl;
As you can see I'm using the asm("")
instruction to prevent the compiler from turning my loop calculation into something else. How do I know it is turning my loop calculation into something else? That's because without the asm("")
line, the value is calculated immediately and the program exits immediately, no matter how large I make the load
variable.
So my question is: How can I use the -O3
compiler option and still prevent it from turning my loop into something else?
From this SO question, I got that I have to use asm("")
for that. But then my question becomes: Wouldn't asm("")
mess with my timing (add overhead) and/or prevent the compiler from doing other valid/good optimizations like inlining?
Another solution for me would be to come up with a for/loop code that cannot be translated to a straight mathematical formula by the compiler, in other words, some kind of mathematical computation that really requires a loop to be executed. Any suggestions?
Below the full C++ code I'm using:
#include <iostream>
#include <string>
#include <random>
#include <cmath>
#include <algorithm>
#include <limits>
#include <sys/time.h>
#include <map>
#include <sched.h>
#include <sstream>
#include <iomanip>
using namespace std;
// TO COMPILE: g++ TestJitter.cpp -o TestJitter -std=c++11 -O3
// TO EXECUTE: ./TestJitter 10000000 1000000 1000 1
static const bool MORE_PERCS = true;
static const bool INCLUDE_WORST_PERCS = true;
static const bool INCLUDE_TOTALS = true;
static const bool INCLUDE_RATIOS = false;
static const bool INCLUDE_STDEV = true;
static const bool EXCLUDE_NANO_TS_COST = true;
long get_nano_ts(timespec* ts) {
clock_gettime(CLOCK_MONOTONIC, ts);
return ts->tv_sec * 1000000000 + ts->tv_nsec;
}
static const long NANO_COST_ITERATIONS = 10000000;
static long calc_nano_ts_cost() {
struct timespec ts;
long start = get_nano_ts(&ts);
long finish = start;
for (long i = 0; i < NANO_COST_ITERATIONS; i++) {
finish = get_nano_ts(&ts);
}
finish = get_nano_ts(&ts);
return (finish - start) / NANO_COST_ITERATIONS;
}
struct mi {
long value;
};
void add_perc(stringstream& ss, int size, double perc, map<int, mi*>* map) {
if (map->empty()) return;
int max = -1;
int minBottom = -1;
long x = round(perc * size);
long i = 0;
long iBottom = 0;
long sum = 0;
long sumBottom = 0;
bool trueForTopFalseForBottom = true;
bool flag = false;
const int arraySize = 1024 * 1024 * 10;
int* tempData = new int[arraySize];
double stdevTop = -1;
for(auto iter = map->begin(); iter != map->end(); iter++) {
if (flag) break;
int time = iter->first;
long count = (iter->second)->value;
for(int a = 0; a < count; a++) {
if (trueForTopFalseForBottom) {
tempData[i] = time;
i++;
sum += time;
if (i == x) {
max = time;
if (INCLUDE_STDEV) {
double avg = (double) sum / (double) i;
double temp = 0;
for(int b = 0; b < i; b++) {
int t = tempData[b];
temp += (avg - t) * (avg - t);
}
stdevTop = sqrt(((double) temp / (double) i));
}
if (INCLUDE_WORST_PERCS) {
trueForTopFalseForBottom = false;
} else {
flag = true;
break;
}
}
} else {
tempData[iBottom] = time;
iBottom++;
sumBottom += time;
if (minBottom == -1) {
minBottom = time;
}
}
}
}
ss << " | " << fixed << setprecision(5) << (perc * 100) << "%";
if (INCLUDE_TOTALS) ss << " (" << i << ")";
ss << " = [avg: " << (sum / i);
if (INCLUDE_STDEV) ss << ", stdev: " << fixed << setprecision(2) << stdevTop;
ss << ", max: " << max << "]";
if (INCLUDE_WORST_PERCS) {
ss << " - " << fixed << setprecision(5) << ((1 - perc) * 100) << "%";
if (INCLUDE_TOTALS) ss << " (" << (iBottom > 0 ? iBottom : 0) << ")";
ss << " = [avg: " << (iBottom > 0 ? (sumBottom / iBottom) : -1);
if (INCLUDE_STDEV) {
ss << ", stdev: ";
if (iBottom <= 0) {
ss << "?";
} else {
double avgBottom = (sumBottom / iBottom);
double temp = 0;
for(int b = 0; b < iBottom; b++) {
long t = tempData[b];
temp += (avgBottom - t) * (avgBottom - t);
}
double stdevBottom = sqrt((double) temp / (double) iBottom);
ss << fixed << setprecision(2) << stdevBottom;
}
}
ss << ", min: " << (minBottom != -1 ? minBottom : -1) << "]";
if (INCLUDE_RATIOS) {
ss << " R: ";
ss << fixed << setprecision(2) << (iBottom > 0 ? (((sumBottom / iBottom) / (double) (sum / i)) - 1) * 100 : -1);
ss << "%";
}
}
delete[] tempData;
}
int main(int argc, char* argv[]) {
int iterations = stoi(argv[1]);
int warmup = stoi(argv[2]);
int load = stoi(argv[3]);
int proc = stoi(argv[4]);
cpu_set_t my_set;
CPU_ZERO(&my_set);
CPU_SET(proc, &my_set);
sched_setaffinity(0, sizeof(cpu_set_t), &my_set);
long nanoTimeCost = EXCLUDE_NANO_TS_COST ? calc_nano_ts_cost() : 0;
struct timespec ts;
long long x = 0;
long long totalTime = 0;
int minTime = numeric_limits<int>::max();
int maxTime = numeric_limits<int>::min();
map<int, mi*>* results = new map<int, mi*>();
for(int i = 0; i < iterations; i++) {
long start = get_nano_ts(&ts);
for(int j = 0; j < load; j++) {
long p = (i % 8) * (i % 16);
if (i % 2 == 0) {
x += p;
} else {
x -= p;
}
asm(""); // so that the loop is not removed by -O3
}
long end = get_nano_ts(&ts);
int res = end - start - nanoTimeCost;
if (res <= 0) res = 1;
if (i >= warmup) {
totalTime += res;
minTime = min(minTime, res);
maxTime = max(maxTime, res);
auto iter = results->find(res);
if (iter != results->end()) {
(iter->second)->value = (iter->second)->value + 1;
} else {
mi* elem = new mi();
elem->value = 1;
(*results)[res] = elem;
}
}
}
int count = iterations - warmup;
double avg = totalTime / count;
cout << "Value computed: " << x << endl;
cout << "Nano timestamp cost: " << nanoTimeCost << endl;
stringstream ss;
ss << "Iterations: " << count << " | Avg Time: " << avg;
if (INCLUDE_STDEV) {
long temp = 0;
long x = 0;
for(auto iter = results->begin(); iter != results->end(); iter++) {
int time = iter->first;
long count = (iter->second)->value;
for(int a = 0; a < count; a++) {
temp += (avg - time) * (avg - time);
x++;
}
}
double stdev = sqrt( temp / x );
ss << " | Stdev: " << fixed << setprecision(2) << stdev;
}
if (count > 0) {
ss << " | Min Time: " << minTime << " | Max Time: " << maxTime;
}
add_perc(ss, count, 0.75, results);
add_perc(ss, count, 0.90, results);
add_perc(ss, count, 0.99, results);
add_perc(ss, count, 0.999, results);
add_perc(ss, count, 0.9999, results);
add_perc(ss, count, 0.99999, results);
if (MORE_PERCS) {
add_perc(ss, count, 0.999999, results);
add_perc(ss, count, 0.9999999, results);
}
cout << ss.str() << endl << endl;
delete results;
return 0;
}