FWIW, and looking only at the relative performance question - a bodgy benchmark:
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <vector>
// Benchmark variant 1: the four fields are unsigned bit-fields declared
// 1, 5, 2 and 8 bits wide (16 bits in total).
// Each field x has a setter x(n) and a getter x() of the same name.
// The members are volatile; the accompanying text relies on this to make the
// memory reads/writes show up in the measurement.
struct A
{
    // Setters. A value wider than the field is reduced modulo 2^width by the
    // conversion to the unsigned bit-field type.
    void a(unsigned n) { a_ = n; }
    void b(unsigned n) { b_ = n; }
    void c(unsigned n) { c_ = n; }
    void d(unsigned n) { d_ = n; }

    // Getters. They only read, so they are const (as in struct C) and can be
    // called through a const reference.
    unsigned a() const { return a_; }
    unsigned b() const { return b_; }
    unsigned c() const { return c_; }
    unsigned d() const { return d_; }

    volatile unsigned a_:1,
                      b_:5,
                      c_:2,
                      d_:8;
};
// Benchmark variant 2: each of the four fields is stored in its own full
// unsigned int, so setters and getters are plain stores and loads.
// Each field x has a setter x(n) and a getter x() of the same name.
// The members are volatile; the accompanying text relies on this to make the
// memory reads/writes show up in the measurement.
struct B
{
    // Setters: store n unchanged (no truncation, unlike the bit-field variant).
    void a(unsigned n) { a_ = n; }
    void b(unsigned n) { b_ = n; }
    void c(unsigned n) { c_ = n; }
    void d(unsigned n) { d_ = n; }

    // Getters. They only read, so they are const (as in struct C) and can be
    // called through a const reference.
    unsigned a() const { return a_; }
    unsigned b() const { return b_; }
    unsigned c() const { return c_; }
    unsigned d() const { return d_; }

    volatile unsigned a_, b_, c_, d_;
};
// Benchmark variant 3: the four fields are packed by hand into one unsigned
// word using explicit and/or/shift operations.
//
// Layout of x_ (derived from the masks below):
//   a: bit 0        (mask 0x0001)
//   b: bits 1..5    (mask 0x003E)
//   c: bits 6..7    (mask 0x00C0)
//   d: bits 8..15   (mask 0xFF00)
//
// x_ is volatile; the accompanying text relies on this to make the memory
// reads/writes show up in the measurement.
struct C
{
    // Start from a defined value: every setter reads x_ before writing it, so
    // the first call would otherwise read an uninitialised object.
    C() : x_(0) {}

    // Setters: clear the field, then OR in the new value. The value is masked
    // to the field width first so an out-of-range n cannot overwrite a
    // neighbouring field (this mirrors the truncation a bit-field performs).
    // The clear and the set are deliberately kept as two separate
    // read-modify-write accesses to the volatile word.
    void a(unsigned n) { x_ &= ~0x01u;   x_ |= (n & 0x01u); }
    void b(unsigned n) { x_ &= ~0x3Eu;   x_ |= (n & 0x1Fu) << 1; }
    void c(unsigned n) { x_ &= ~0xC0u;   x_ |= (n & 0x03u) << 6; }
    void d(unsigned n) { x_ &= ~0xFF00u; x_ |= (n & 0xFFu) << 8; }

    // Getters: isolate the field with its mask and shift it down to bit 0.
    unsigned a() const { return x_ & 0x01u; }
    unsigned b() const { return (x_ & 0x3Eu) >> 1; }
    unsigned c() const { return (x_ & 0xC0u) >> 6; }
    unsigned d() const { return (x_ & 0xFF00u) >> 8; }

    volatile unsigned x_;
};
// Simple stopwatch: captures a start time on construction; elapsed() returns
// the number of seconds (as a double) between construction and the call.
struct Timer
{
    Timer() { get(&start_tp); }

    // Seconds since construction. The whole-second and nanosecond parts are
    // differenced separately and then combined.
    double elapsed() const {
        struct timespec end_tp;
        get(&end_tp);
        return (end_tp.tv_sec - start_tp.tv_sec) +
               1E-9 * (end_tp.tv_nsec - start_tp.tv_nsec);
    }

private:
    // Reads the current time into *p_tp; on failure prints a message to
    // std::cerr and terminates the program with EXIT_FAILURE.
    static void get(struct timespec* p_tp) {
        // CLOCK_MONOTONIC rather than CLOCK_REALTIME: the realtime clock can
        // be stepped (NTP, manual changes) while a measurement is running,
        // which would corrupt the interval.
        if (clock_gettime(CLOCK_MONOTONIC, p_tp) != 0)
        {
            std::cerr << "clock_gettime() error\n";
            exit(EXIT_FAILURE);
        }
    }

    struct timespec start_tp;
};
// Runs the benchmark loop for one field-storage strategy T (A, B or C).
//
// A single T object is written and read back `iterations` times: each pass
// sets all four fields from the low bits of the loop counter, then adds the
// four values read back into a running sum.
//
// Prints the elapsed time in seconds (followed by a newline) to std::cout and
// returns the sum, so the caller can print it and compare it across variants.
//
// iterations: number of loop passes; defaults to the original fixed count of
//             10000000.
template <typename T>
unsigned f(unsigned iterations = 10000000)
{
    // Unsigned to match the getter results and the return type; the sum is
    // accumulated in unsigned arithmetic throughout.
    unsigned n = 0;
    Timer timer;
    T t;
    for (unsigned i = 0; i < iterations; ++i)
    {
        // Each mask matches the width of the field being set (1, 5, 2, 8 bits).
        t.a(i & 0x01);
        t.b(i & 0x1F);
        t.c(i & 0x03);
        t.d(i & 0xFF);
        n += t.a() + t.b() + t.c() + t.d();
    }
    std::cout << timer.elapsed() << '\n';
    return n;
}
// Runs the benchmark once per storage strategy. For each one the label is
// printed first, then f<T>() runs (printing its elapsed time), then the sum
// it returned is printed.
int main()
{
    // The label, the call and the result are written as separate statements:
    // in a single `cout << label << f<T>()` chain, compilers before C++17 may
    // call f<T>() (which prints the time) before the label is written.
    std::cout << "bitfields: ";
    const unsigned bitfield_sum = f<A>();
    std::cout << bitfield_sum << '\n';

    std::cout << "separate ints: ";
    const unsigned separate_sum = f<B>();
    std::cout << separate_sum << '\n';

    std::cout << "explicit and/or/shift: ";
    const unsigned masked_sum = f<C>();
    std::cout << masked_sum << '\n';
}
Output on my test machine (numbers vary by ~20% run to run):
bitfields: 0.140586
1449991808
separate ints: 0.039374
1449991808
explicit and/or/shift: 0.252723
1449991808
This suggests that with g++ -O3 on a pretty recent Athlon, bitfields are a few times slower than separate ints, and this particular and/or/bitshift implementation is roughly twice as slow again. If anything the real gaps are larger: every measurement also includes other costs, such as the memory reads/writes forced by the volatile members above and the loop overhead, so the differences are understated in the results.
If you're dealing in hundreds of megabytes of structs that can be mainly bitfields or mainly distinct ints, the caching issues may become dominant - so benchmark in your system.
update from 2021 with an AMD Ryzen 9 3900X and -O2 -march=native:
bitfields: 0.0224893
1449991808
separate ints: 0.0288447
1449991808
explicit and/or/shift: 0.0190325
1449991808
Here we see everything has changed massively, the main implication being - benchmark with the systems you care about.
UPDATE: user2188211 attempted an edit which was rejected but usefully illustrated how bitfields become faster as the amount of data increases: "when iterating over a vector of a few million elements in [a modified version of] the above code, such that the variables do not reside in cache or registers, the bitfield code may be the fastest."
// Variant of the benchmark loop that walks a large vector of T instead of
// reusing a single object.
//
// Each pass sets all four fields of the current element from the low bits of
// the loop counter, adds the four values read back into a running sum, and
// moves on to the next element (wrapping to the start at the end of the vector).
//
// Prints the elapsed time in seconds (followed by a newline) to std::cout and
// returns the sum.
//
// iterations: number of loop passes; defaults to the original fixed count of
//             10000000.
template <typename T>
unsigned f(unsigned iterations = 10000000)
{
    // Unsigned to match the getter results and the return type.
    unsigned n = 0;

    // Build the vector BEFORE starting the timer: allocating and
    // value-initialising 16M elements costs time proportional to sizeof(T),
    // which would otherwise be charged to the field accesses being measured.
    std::vector<T> ts(1024 * 1024 * 16);
    Timer timer;

    size_t idx = 0;
    for (unsigned i = 0; i < iterations; ++i)
    {
        T& t = ts[idx];
        // Each mask matches the width of the field being set (1, 5, 2, 8 bits).
        t.a(i & 0x01);
        t.b(i & 0x1F);
        t.c(i & 0x03);
        t.d(i & 0xFF);
        n += t.a() + t.b() + t.c() + t.d();

        // Advance to the next element, wrapping around at the end.
        ++idx;
        if (idx >= ts.size()) {
            idx = 0;
        }
    }
    std::cout << timer.elapsed() << '\n';
    return n;
}
Results from an example run (g++ -O3, Core2Duo):
0.19016
bitfields: 1449991808
0.342756
separate ints: 1449991808
0.215243
explicit and/or/shift: 1449991808
Of course, timing's all relative and which way you implement these fields may not matter at all in the context of your system.