Comparing several algorithms leads to this ranking:
Having an inner loop of 1 or 10 in the test below:
- Utilizing a built in bit scan function.
- Filling least significant bits with or and shift (The function of
@Egor Skriptunoff).
- Involving a lookup table.
- Scanning the most significant bit (The second
function of @Tomas).
InnerLoops = 10:
Timing 1: 0.101284
Timing 2: 0.108845
Timing 3: 0.102526
Timing 4: 0.191911
An inner loop of 100 or greater:
- Utilizing a built in bit scan function.
- Involving a lookup table.
- Filling least significant bits with or and shift (The function of
@Egor Skriptunoff).
- Scanning the most significant bit (The second
function of @Tomas).
InnerLoops = 100:
Timing 1: 0.441786
Timing 2: 0.507651
Timing 3: 0.548328
Timing 4: 0.593668
The test:
#include <algorithm>
#include <chrono>
#include <limits>
#include <iostream>
#include <iomanip>
// Functions
// =========
inline unsigned function1(unsigned a, unsigned b)
{
a ^= b;
if(a) {
int n = __builtin_clz (a);
a = (~0u) >> n;
}
return ~a & b;
}
typedef std::uint8_t byte;
static byte msb_table[256] = {
0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
};
inline unsigned function2(unsigned a, unsigned b)
{
a ^= b;
if(a) {
unsigned n = 0;
if(a >> 24) n = msb_table[byte(a >> 24)] + 24;
else if(a >> 16) n = msb_table[byte(a >> 16)] + 16;
else if(a >> 8) n = msb_table[byte(a >> 8)] + 8;
else n = msb_table[byte(a)];
a = (~0u) >> (32-n);
}
return ~a & b;
}
inline unsigned function3(unsigned a, unsigned b)
{
unsigned d = a ^ b;
d = d | (d >> 1);
d = d | (d >> 2);
d = d | (d >> 4);
d = d | (d >> 8);
d = d | (d >> 16);
return a & (~d);;
}
inline unsigned function4(unsigned a, unsigned b)
{
const unsigned maxbit = 1u << (std::numeric_limits<unsigned>::digits - 1);
unsigned msb = maxbit;
a ^= b;
while( ! (a & msb))
msb >>= 1;
if(msb == maxbit) return 0;
else {
msb <<= 1;
msb -= 1;
return ~msb & b;
}
}
// Test
// ====
inline double duration(
std::chrono::system_clock::time_point start,
std::chrono::system_clock::time_point end)
{
return double((end - start).count())
/ std::chrono::system_clock::period::den;
}
int main() {
typedef unsigned (*Function)(unsigned , unsigned);
Function fn[] = {
function1,
function2,
function3,
function4,
};
const unsigned N = sizeof(fn) / sizeof(fn[0]);
std::chrono::system_clock::duration timing[N] = {};
const unsigned OuterLoops = 1000000;
const unsigned InnerLoops = 100;
const unsigned Samples = OuterLoops * InnerLoops;
unsigned* A = new unsigned[Samples];
unsigned* B = new unsigned[Samples];
for(unsigned i = 0; i < Samples; ++i) {
A[i] = std::rand();
B[i] = std::rand();
}
unsigned F[N];
for(unsigned f = 0; f < N; ++f) F[f] = f;
unsigned result[N];
for(unsigned i = 0; i < OuterLoops; ++i) {
std::random_shuffle(F, F + N);
for(unsigned f = 0; f < N; ++f) {
unsigned g = F[f];
auto start = std::chrono::system_clock::now();
for(unsigned j = 0; j < InnerLoops; ++j) {
unsigned index = i + j;
unsigned a = A[index];
unsigned b = B[index];
result[g] = fn[g](a, b);
}
auto end = std::chrono::system_clock::now();
timing[g] += (end-start);
}
for(unsigned f = 1; f < N; ++f) {
if(result[0] != result[f]) {
std::cerr << "Different Results\n" << std::hex;
for(unsigned g = 0; g < N; ++g)
std::cout << "Result " << g+1 << ": " << result[g] << '\n';
exit(-1);
}
}
}
for(unsigned i = 0; i < N; ++i) {
std::cout
<< "Timing " << i+1 << ": "
<< double(timing[i].count()) / std::chrono::system_clock::period::den
<< "\n";
}
}
Compiler:
g++ 4.7.2
Hardware:
Intel® Core™ i3-2310M CPU @ 2.10GHz × 4 7.7 GiB