0

I'm trying to microbenchmark the following C++ code, compiled with the -O3 g++ compiler option for maximum performance:

long long x = 0;
int iterations = 1000000;
int load = 1000;

for(int i = 0; i < iterations; i++) {
 
    long start = get_nano_ts(&ts);
 
    for(int j = 0; j < load; j++) {
        long p = (i % 8) * (i % 16);
        if (i % 2 == 0) {
            x += p;
        } else {
            x -= p;
        }
        asm(""); // so that the loop is not removed by -O3
    }
     
    long end = get_nano_ts(&ts);

    int res = end - start - nanoTimeCost;
     
    if (res <= 0) res = 1;
     
    // (...) removed for clarity
}

cout << "Value computed: " << x << endl;

As you can see I'm using the asm("") instruction to prevent the compiler from turning my loop calculation into something else. How do I know it is turning my loop calculation into something else? That's because without the asm("") line, the value is calculated immediately and the program exits immediately, no matter how large I make the load variable.

So my question is: How can I use the -O3 compiler option and still prevent it from turning my loop into something else?

From this SO question, I got that I have to use asm("") for that. But then my question becomes: Wouldn't asm("") mess with my timing (add overhead) and/or prevent the compiler from doing other valid/good optimizations like inlining?

Another solution for me would be to come up with a for/loop code that cannot be translated to a straight mathematical formula by the compiler, in other words, some kind of mathematical computation that really requires a loop to be executed. Any suggestions?

Below the full C++ code I'm using:

#include <iostream>
#include <string>
#include <random>
#include <cmath>
#include <algorithm>
#include <limits>
#include <sys/time.h>
#include <map>
#include <sched.h>
#include <sstream>
#include <iomanip>
 
using namespace std;
 
// TO COMPILE: g++ TestJitter.cpp -o TestJitter -std=c++11 -O3
// TO EXECUTE: ./TestJitter 10000000 1000000 1000 1
 
static const bool MORE_PERCS = true;
static const bool INCLUDE_WORST_PERCS = true;
static const bool INCLUDE_TOTALS = true;
static const bool INCLUDE_RATIOS = false;
static const bool INCLUDE_STDEV = true;
 
static const bool EXCLUDE_NANO_TS_COST = true;
 
long get_nano_ts(timespec* ts) {
    clock_gettime(CLOCK_MONOTONIC, ts);
    return ts->tv_sec * 1000000000 + ts->tv_nsec;
}
 
static const long NANO_COST_ITERATIONS = 10000000;
 
static long calc_nano_ts_cost() {
 
    struct timespec ts;
 
    long start = get_nano_ts(&ts);
 
    long finish = start;
 
    for (long i = 0; i < NANO_COST_ITERATIONS; i++) {
        finish = get_nano_ts(&ts);
    }
 
    finish = get_nano_ts(&ts);
         
    return (finish - start) / NANO_COST_ITERATIONS;
}
 
struct mi {
   long value;
};
 
void add_perc(stringstream& ss, int size, double perc, map<int, mi*>* map) {
 
    if (map->empty()) return;
     
    int max = -1;
    int minBottom = -1;
     
    long x = round(perc * size);
    long i = 0;
    long iBottom = 0;
     
    long sum = 0;
    long sumBottom = 0;
     
    bool trueForTopFalseForBottom = true;
    bool flag = false;
     
    const int arraySize = 1024 * 1024 * 10;
    int* tempData = new int[arraySize];
    double stdevTop = -1;
     
    for(auto iter = map->begin(); iter != map->end(); iter++) {
     
        if (flag) break;
     
        int time = iter->first;
        long count = (iter->second)->value;
         
        for(int a = 0; a < count; a++) {
         
            if (trueForTopFalseForBottom) {
         
                tempData[i] = time;
         
                i++;
                sum += time;
                 
                if (i == x) {
                     
                    max = time;
                     
                    if (INCLUDE_STDEV) {
                             
                        double avg = (double) sum / (double) i;
                        double temp = 0;
                         
                        for(int b = 0; b < i; b++) {
                            int t = tempData[b];
                            temp += (avg - t) * (avg - t);
                        }
                         
                        stdevTop = sqrt(((double) temp / (double) i));
                    }
                 
                    if (INCLUDE_WORST_PERCS) {
                        trueForTopFalseForBottom = false;  
                    } else {
                        flag = true;
                        break;
                    }
                }
                 
            } else {
             
                tempData[iBottom] = time;
             
                iBottom++;
                sumBottom += time;
                if (minBottom == -1) {
                    minBottom = time;
                }
            }
        }
    }
     
    ss << " | " << fixed << setprecision(5) << (perc * 100) << "%";
    if (INCLUDE_TOTALS) ss << " (" << i << ")";
    ss << " = [avg: " << (sum / i);
    if (INCLUDE_STDEV) ss << ", stdev: " << fixed << setprecision(2) << stdevTop;
    ss << ", max: " << max << "]";
    if (INCLUDE_WORST_PERCS) {
        ss << " - " << fixed << setprecision(5) << ((1 - perc) * 100) << "%";
        if (INCLUDE_TOTALS) ss << " (" << (iBottom > 0 ? iBottom : 0) << ")";
        ss << " = [avg: " << (iBottom > 0 ? (sumBottom / iBottom) : -1);
         
        if (INCLUDE_STDEV) {
         
            ss << ", stdev: ";
         
            if (iBottom <= 0) {
                ss << "?";
            } else {
             
                double avgBottom = (sumBottom / iBottom);
                 
                double temp = 0;
                 
                for(int b = 0; b < iBottom; b++) {
                    long t = tempData[b];
                    temp += (avgBottom - t) * (avgBottom - t);
                }
                 
                double stdevBottom = sqrt((double) temp / (double) iBottom);
             
                ss << fixed << setprecision(2) << stdevBottom;
            }
         
        }
         
        ss << ", min: " << (minBottom != -1 ? minBottom : -1) << "]";
        if (INCLUDE_RATIOS) {
            ss << " R: ";
            ss << fixed << setprecision(2) << (iBottom > 0 ? (((sumBottom / iBottom) / (double) (sum / i)) - 1) * 100 : -1);
            ss << "%";
        }
    }
     
    delete[] tempData;
}
 
int main(int argc, char* argv[]) {
     
    int iterations = stoi(argv[1]);
    int warmup = stoi(argv[2]);
    int load = stoi(argv[3]);
    int proc = stoi(argv[4]);
     
    cpu_set_t my_set;
    CPU_ZERO(&my_set);
    CPU_SET(proc, &my_set);
    sched_setaffinity(0, sizeof(cpu_set_t), &my_set);
     
    long nanoTimeCost = EXCLUDE_NANO_TS_COST ? calc_nano_ts_cost() : 0;
     
    struct timespec ts;
     
    long long x = 0;
    long long totalTime = 0;
    int minTime = numeric_limits<int>::max();
    int maxTime = numeric_limits<int>::min();
     
    map<int, mi*>* results = new map<int, mi*>();
     
    for(int i = 0; i < iterations; i++) {
     
        long start = get_nano_ts(&ts);
     
        for(int j = 0; j < load; j++) {
            long p = (i % 8) * (i % 16);
            if (i % 2 == 0) {
                x += p;
            } else {
                x -= p;
            }
            asm(""); // so that the loop is not removed by -O3
        }
         
        long end = get_nano_ts(&ts);
 
        int res = end - start - nanoTimeCost;
         
        if (res <= 0) res = 1;
         
        if (i >= warmup) {
            totalTime += res;
            minTime = min(minTime, res);
            maxTime = max(maxTime, res);
             
            auto iter = results->find(res);
             
            if (iter != results->end()) {
             
                (iter->second)->value = (iter->second)->value + 1;
                 
            } else {
             
                mi* elem = new mi();
                elem->value = 1;
                (*results)[res] = elem;
            }      
        }
    }
     
    int count = iterations - warmup;
     
    double avg = totalTime / count;
     
    cout << "Value computed: " << x << endl;
    cout << "Nano timestamp cost: " << nanoTimeCost << endl;
     
    stringstream ss;
     
    ss << "Iterations: " << count << " | Avg Time: " << avg;
 
 
    if (INCLUDE_STDEV) {
     
        long temp = 0;
        long x = 0;
     
        for(auto iter = results->begin(); iter != results->end(); iter++) {
     
            int time = iter->first;
            long count = (iter->second)->value;
             
            for(int a = 0; a < count; a++) {
                temp += (avg - time) * (avg - time);
                x++;
            }
        }
         
        double stdev = sqrt( temp / x );
         
        ss << " | Stdev: " << fixed << setprecision(2) << stdev;
    }
     
    if (count > 0) {
        ss << " | Min Time: " << minTime << " | Max Time: " << maxTime;
    }
     
    add_perc(ss, count, 0.75, results);
    add_perc(ss, count, 0.90, results);
    add_perc(ss, count, 0.99, results);
    add_perc(ss, count, 0.999, results);
    add_perc(ss, count, 0.9999, results);
    add_perc(ss, count, 0.99999, results);
     
    if (MORE_PERCS) {
        add_perc(ss, count, 0.999999, results);
        add_perc(ss, count, 0.9999999, results);
    }
 
    cout << ss.str() << endl << endl;
         
    delete results;
     
    return 0;
}
Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
  • 4
    Not sure what is the point of benchmarking optimized code, while preventing some optimizations. – Eugene Sh. Jun 23 '22 at 16:01
  • I get your point, but I just want my loop to be there. I don't want my loop to be removed. I have my reasons for wanting that which I don't want to bother you with the explanation. – Phillip Borge Jun 23 '22 at 16:03
  • But the whole timed code is the loop. The compiler have found a way to perform it in a smarter way, but you don't want to let it to. What exactly are you trying to measure then? You should state your reasons, because it smells like an XY-problem. – Eugene Sh. Jun 23 '22 at 16:06
  • Not as posed, it isn't. – sweenish Jun 23 '22 at 16:06
  • Ok, I want something that takes time to be calculated. That's the reason I naively used a loop. But the compiler turned my costly operation into something cheap. I'm actually (as the TestJitter.cpp name implies) trying to measure Jitter, not performance. – Phillip Borge Jun 23 '22 at 16:09
  • 2
    Then you should come up with a loop which won't get optimized. For example one manipulating randomly generated data (and make sure that the calculation result is used somewhere, and not just discarded). – Eugene Sh. Jun 23 '22 at 16:11
  • Thanks, @EugeneSh. Do you think this can be accomplished (keeping the loop) without random numbers? Any kind of mathematical calculation that would prevent the loop from being removed? – Phillip Borge Jun 23 '22 at 16:15
  • 1
    Well, for starters it should not be based on a compile-time constant as your `load` currently. Try changing just this (make it either input or random value, or even an `extern` variable) and see if the loop is compiled. – Eugene Sh. Jun 23 '22 at 16:19
  • You can try making the global variables `volatile` which could confuse the compiler into not being able to optimize loops relying on the volatile values. – Dmytro Jun 23 '22 at 16:28
  • 1
    `load` is currently external. It is a command-line argument to the program (check the full source code, you can even execute it if you want). What the compiler is doing is mathematically eliminating the loop to a O(1) expression. – Phillip Borge Jun 23 '22 at 16:28
  • @Dmitry Does `asm("")` introduce any overhead? – Phillip Borge Jun 23 '22 at 17:08
  • If you want to do such microoptimizations, 1) measure 2) learn how to read assembly. – xiver77 Jun 23 '22 at 17:37
  • `asm("")` by definition doesn't emit any instructions. It only introduces "overhead" to the extent that it prevents other optimizations from being done. But in this case your entire goal is to prevent optimizations from being done. Maybe your question is will it inhibit any optimizations other than "the ones I want to avoid", but you don't seem to have precisely defined which optimizations those are. – Nate Eldredge Jun 24 '22 at 05:59

0 Answers0