I needed a way of initializing a scalar value given either a single float, or three floating point values (corresponding to RGB). So I just threw together a very simple struct:
/// Thin wrapper around a single float.
///
/// It can be built from nothing (value 0), from one float, or from three
/// floats (red, green, blue), in which case the stored value is the
/// arithmetic mean of the three.
struct Mono {
    float value;

    /// Default construction stores 0.
    Mono() : value(0) {}

    /// Stores the given float unchanged. The constructor is not marked
    /// explicit, so a float converts to Mono implicitly.
    Mono(float value) : value(value) {}

    /// Stores the mean of the three channel values.
    Mono(float red, float green, float blue)
        : value((red + green + blue) / 3) {}
};
// Multiplication operator overloads:
/// Multiplies two Mono values: the result wraps the product of the two
/// stored floats.
Mono operator*(Mono const& lhs, Mono const& rhs) {
    const float product = lhs.value * rhs.value;
    return Mono(product);
}
/// Multiplies a plain float (left operand) by a Mono (right operand).
///
/// @param lhs  Scalar factor. Taken by value: a float fits in a register, so
///             passing it by reference only adds an indirection.
/// @param rhs  Mono whose stored value is scaled.
/// @return     A Mono wrapping lhs * rhs.value.
Mono operator*(float lhs, Mono const& rhs) {
    return Mono(lhs * rhs.value);
}
/// Multiplies a Mono (left operand) by a plain float (right operand).
///
/// @param lhs  Mono whose stored value is scaled.
/// @param rhs  Scalar factor. Taken by value: a float fits in a register, so
///             passing it by reference only adds an indirection.
/// @return     A Mono wrapping lhs.value * rhs.
Mono operator*(Mono const& lhs, float rhs) {
    return Mono(lhs.value * rhs);
}
This worked as expected, but then I wanted to benchmark it to see whether this wrapper impacts performance at all. So I wrote the following benchmark, in which I simply multiply a float by the struct 100,000,000 times and multiply a float by a float 100,000,000 times:
#include <vector>
#include <chrono>
#include <iostream>
using namespace std::chrono;
int main() {
size_t N = 100000000;
std::vector<float> inputs(N);
std::vector<Mono> outputs_c(N);
std::vector<float> outputs_f(N);
Mono color(3.24);
float color_f = 3.24;
for (size_t i = 0; i < N; i++){
inputs[i] = i;
};
auto start_c = high_resolution_clock::now();
for (size_t i = 0; i < N; i++){
outputs_c[i] = color*inputs[i];
}
auto stop_c = high_resolution_clock::now();
auto duration_c = duration_cast<microseconds>(stop_c - start_c);
std::cout << "Mono*float duration: " << duration_c.count() << "\n";
auto start_f = high_resolution_clock::now();
for (size_t i = 0; i < N; i++){
outputs_f[i] = color_f*inputs[i];
}
auto stop_f = high_resolution_clock::now();
auto duration_f = duration_cast<microseconds>(stop_f - start_f);
std::cout << "float*float duration: " << duration_f.count() << "\n";
return 0;
}
When I compile it without any optimizations (`g++ test.cpp`), it prints the following times (in microseconds) very reliably:
Mono*float duration: 841122
float*float duration: 656197
So `Mono*float` is clearly slower in that case. But if I then turn on optimizations (`g++ test.cpp -O3`), it prints the following times (in microseconds) very reliably:
Mono*float duration: 75494
float*float duration: 86176
I'm assuming that something is getting optimized weirdly here and it is NOT actually faster to wrap a float in a struct like this... but I'm struggling to see what is going wrong with my test.