I am trying out some bitfield operations and benchmarking them following the info in this post. The code I am using is essentially the same and is shown below.
I've compiled the code with
❯ g++ bench.cpp -std=c++20 -march=native -O3 -o g++bench.out
❯ clang++ bench.cpp -std=c++20 -march=native -O3 -o clang++bench.out
Results:
❯ ./g++bench.out
operations on struct in memory
bitfields: 0.00443397
570425344
separate ints: 0.00320708
570425344
explicit and/or/shift: 0.0721971
570425344
operations on struct larger than memory
bitfields: 0.202714
570425344
separate ints: 0.127191
570425344
explicit and/or/shift: 0.102186
570425344
❯ ./clang++bench.out
operations on struct in memory
bitfields: 0.00304556
570425344
separate ints: 0.00291514
570425344
explicit and/or/shift: 0.00276303
570425344
operations on struct larger than memory
bitfields: 0.00350051
570425344
separate ints: 0.116294
570425344
explicit and/or/shift: 0.0909704
570425344
What mainly strikes me is that the clang code for the bitfields in the large vector is almost 30 times faster than the clang version using separate ints or explicit and/or/shift and 58 times faster than the g++ compiled version for bitfields.
Since the benchmarks for operations on a struct in memory all run in about the same time, I suspect there is no special optimization of the operations themselves, and that clang is instead doing some clever memory fetching or loop unrolling.
Can anyone explain why the clang bitfield code is so fast in this case (or whether there is simply a bug in the benchmark)?
Also I would like to know if the benchmark code can be adapted so g++ can get the same speedup.
#include <time.h>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <vector>
/// Variant A: four fields packed as compiler-managed bit-fields
/// (1 + 5 + 2 + 8 bits inside one `unsigned` storage unit).
///
/// Setters store `n` into the bit-field; a value wider than the field is
/// truncated to the field width by the bit-field assignment itself.
/// Getters are const so they can be called through a const reference,
/// matching the accessors of the shift/mask variant in this file.
struct A
{
    void a(unsigned n) { a_ = n; }
    void b(unsigned n) { b_ = n; }
    void c(unsigned n) { c_ = n; }
    void d(unsigned n) { d_ = n; }

    unsigned a() const { return a_; }
    unsigned b() const { return b_; }
    unsigned c() const { return c_; }
    unsigned d() const { return d_; }

    // Declaration order fixes the field order; widths are in bits.
    unsigned a_ : 1;
    unsigned b_ : 5;
    unsigned c_ : 2;
    unsigned d_ : 8;
};
/// Variant B: the same four logical fields, each stored in its own full
/// `unsigned`. No packing and no truncation: a setter stores `n` unchanged.
/// Getters are const so they can be called through a const reference,
/// matching the accessors of the shift/mask variant in this file.
struct B
{
    void a(unsigned n) { a_ = n; }
    void b(unsigned n) { b_ = n; }
    void c(unsigned n) { c_ = n; }
    void d(unsigned n) { d_ = n; }

    unsigned a() const { return a_; }
    unsigned b() const { return b_; }
    unsigned c() const { return c_; }
    unsigned d() const { return d_; }

    unsigned a_;
    unsigned b_;
    unsigned c_;
    unsigned d_;
};
/// Variant C: the same four fields packed by hand into one `unsigned`
/// using explicit mask/shift arithmetic.
///
/// Layout of x_ (bit 0 = least significant):
///   a: bit 0        (mask 0x0001)
///   b: bits 1..5    (mask 0x003E)
///   c: bits 6..7    (mask 0x00C0)
///   d: bits 8..15   (mask 0xFF00)
///
/// Each setter clears its own field and ORs in the new value masked to the
/// field width, so an out-of-range `n` is truncated instead of spilling into
/// neighbouring fields (the same observable behaviour as a bit-field store).
struct C
{
    void a(unsigned n) { x_ = (x_ & ~kMaskA) | (n & kMaskA); }
    void b(unsigned n) { x_ = (x_ & ~kMaskB) | ((n << kShiftB) & kMaskB); }
    void c(unsigned n) { x_ = (x_ & ~kMaskC) | ((n << kShiftC) & kMaskC); }
    void d(unsigned n) { x_ = (x_ & ~kMaskD) | ((n << kShiftD) & kMaskD); }

    unsigned a() const { return x_ & kMaskA; }
    unsigned b() const { return (x_ & kMaskB) >> kShiftB; }
    unsigned c() const { return (x_ & kMaskC) >> kShiftC; }
    unsigned d() const { return (x_ & kMaskD) >> kShiftD; }

    static constexpr unsigned kMaskA = 0x0001u;
    static constexpr unsigned kMaskB = 0x003Eu;
    static constexpr unsigned kMaskC = 0x00C0u;
    static constexpr unsigned kMaskD = 0xFF00u;
    static constexpr unsigned kShiftB = 1u;
    static constexpr unsigned kShiftC = 6u;
    static constexpr unsigned kShiftD = 8u;

    // Zero by default: every setter reads x_ before writing it, so a
    // default-constructed object must not start with an indeterminate value.
    unsigned x_ = 0;
};
/// Minimal stopwatch: records a start point on construction and reports the
/// time elapsed since then.
///
/// Uses std::chrono::steady_clock, which is monotonic, so the measured
/// interval is not affected by adjustments to the system wall clock.
struct Timer
{
    Timer() : start_tp(std::chrono::steady_clock::now()) {}

    /// @return seconds elapsed since construction, as a double.
    double elapsed() const {
        const std::chrono::duration<double> delta =
            std::chrono::steady_clock::now() - start_tp;
        return delta.count();
    }

private:
    std::chrono::steady_clock::time_point start_tp;
};
/// Benchmark on a single local object of type T.
///
/// Runs 32 Mi iterations; each iteration stores the low bits of the loop
/// counter into the four fields through the setters, reads them back through
/// the getters, and adds the four values to a running sum. The elapsed time
/// in seconds is printed to std::cout on its own line.
///
/// @tparam T  type providing a/b/c/d setters taking unsigned and matching
///            getters returning unsigned.
/// @return    the accumulated sum, reduced modulo 2^N for N-bit unsigned.
///
/// NOTE(review): the loop has no side effect other than the sum, so an
/// optimizer may simplify it heavily; the printed time may not reflect a
/// per-iteration cost -- confirm by inspecting the generated code.
template <typename T>
unsigned f()
{
    constexpr int kIterations = 1024 * 1024 * 32;

    // Unsigned accumulator: the sum exceeds INT_MAX, and the getters already
    // return unsigned, so this avoids a signed/unsigned round-trip on every
    // addition while yielding the same returned value.
    unsigned n = 0;
    Timer timer;
    // Value-initialize: setters of a shift/mask implementation read the
    // current contents before writing, so the object must start defined.
    T t{};
    for (int i = 0; i < kIterations; ++i)
    {
        t.a(i & 0x01);
        t.b(i & 0x1F);
        t.c(i & 0x03);
        t.d(i & 0xFF);
        n += t.a() + t.b() + t.c() + t.d();
    }
    std::cout << timer.elapsed() << '\n';
    return n;
}
/// Benchmark on a large array of T.
///
/// Allocates a vector of 16 Mi value-initialized elements, then runs 32 Mi
/// iterations that walk the vector sequentially (wrapping to index 0 at the
/// end). Each iteration stores the low bits of the loop counter into the
/// current element's four fields, reads them back, and adds them to a
/// running sum. The elapsed time in seconds is printed to std::cout on its
/// own line.
///
/// The timer is started only after the vector has been constructed, so the
/// reported time covers the field operations and not the allocation and
/// zero-fill of the buffer.
///
/// @tparam T  type providing a/b/c/d setters taking unsigned and matching
///            getters returning unsigned.
/// @return    the accumulated sum, reduced modulo 2^N for N-bit unsigned.
template <typename T>
unsigned g()
{
    constexpr std::size_t kElements = 1024 * 1024 * 16;
    constexpr std::size_t kIterations = 1024 * 1024 * 32;

    // Unsigned accumulator: the sum exceeds INT_MAX and the getters return
    // unsigned, so keep the whole computation in unsigned arithmetic.
    unsigned n = 0;
    std::vector<T> ts(kElements);

    Timer timer;
    std::size_t idx = 0;
    for (std::size_t i = 0; i < kIterations; ++i)
    {
        // Only the low 8 bits of the counter are ever used below.
        const unsigned bits = static_cast<unsigned>(i);
        T& t = ts[idx];
        t.a(bits & 0x01);
        t.b(bits & 0x1F);
        t.c(bits & 0x03);
        t.d(bits & 0xFF);
        n += t.a() + t.b() + t.c() + t.d();
        ++idx;
        if (idx >= ts.size()) {
            idx = 0;  // wrap around and start a second pass over the vector
        }
    }
    std::cout << timer.elapsed() << '\n';
    return n;
}
/// Runs both benchmark families for the three struct variants.
///
/// For each case the label is written first, then the benchmark runs (and
/// prints its own elapsed time plus a newline), and finally the returned
/// checksum is written on the following line.
int main()
{
    std::cout << "operations on struct in memory" << std::endl;

    std::cout << "bitfields: ";
    const unsigned local_bitfields = f<A>();
    std::cout << local_bitfields << '\n';

    std::cout << "separate ints: ";
    const unsigned local_ints = f<B>();
    std::cout << local_ints << '\n';

    std::cout << "explicit and/or/shift: ";
    const unsigned local_masks = f<C>();
    std::cout << local_masks << '\n';

    std::cout << std::endl;

    std::cout << "operations on struct larger than memory" << std::endl;

    std::cout << "bitfields: ";
    const unsigned array_bitfields = g<A>();
    std::cout << array_bitfields << '\n';

    std::cout << "separate ints: ";
    const unsigned array_ints = g<B>();
    std::cout << array_ints << '\n';

    std::cout << "explicit and/or/shift: ";
    const unsigned array_masks = g<C>();
    std::cout << array_masks << '\n';

    std::cout << std::endl;
    return 0;
}