I would use LUTs. Here is a small C++ 32-bit example (DWORD = a 32-bit unsigned integer such as uint32_t):
//---------------------------------------------------------------------------
// select fastest operation
#define BitReverse BitReverse1
#define AddReverse AddReverseKS
//---------------------------------------------------------------------------
DWORD BitReverseSlow(DWORD x) // Slow 1bit reversing
{
DWORD m0,m1,y;
for (y=0,m0=1,m1=0x80000000;m0<m1;m0<<=1,m1>>=1)
{
if (DWORD(m0&x)) y|=m1;
if (DWORD(m1&x)) y|=m0;
}
return y;
}
//---------------------------------------------------------------------------
DWORD BitReverse1(DWORD x) // fast shift/mask reversing
{
x=((x<<16)&0xFFFF0000)|((x>>16)&0x0000FFFF); // swap 16-bit halves
x=((x<< 8)&0xFF00FF00)|((x>> 8)&0x00FF00FF); // swap bytes in halves
x=((x<< 4)&0xF0F0F0F0)|((x>> 4)&0x0F0F0F0F); // swap nibbles in bytes
x=((x<< 2)&0xCCCCCCCC)|((x>> 2)&0x33333333); // swap bit pairs in nibbles
x=((x<< 1)&0xAAAAAAAA)|((x>> 1)&0x55555555); // swap adjacent bits
return x;
}
//---------------------------------------------------------------------------
DWORD BitReverse8(DWORD x) // fast 8bit LUT reversing
{
DWORD y;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
static const BYTE BitReverse8_LUT[256]= // BitReverse8_LUT[x]=BitReverse(x) in 8bits
{
0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF
};
py[0]=BitReverse8_LUT[px[3]];
py[1]=BitReverse8_LUT[px[2]];
py[2]=BitReverse8_LUT[px[1]];
py[3]=BitReverse8_LUT[px[0]];
return y;
}
//---------------------------------------------------------------------------
DWORD AddReverseSlow(DWORD x,DWORD y) // reverse operands, add, reverse result
{
return BitReverse(BitReverse(x)+BitReverse(y));
}
//---------------------------------------------------------------------------
DWORD AddReverse4(DWORD x,DWORD y) // 4bit LUT reversed addition
{
int i;
DWORD z,cy0,cy,a;
static const BYTE AddReverse4_LUT[16][16]= // AddReverse4_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 4bits
{
{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 },
{ 2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29 },
{ 4, 6, 2, 1, 12, 14, 10, 9, 20, 22, 18, 17, 28, 30, 26, 25 },
{ 6, 5, 1, 3, 14, 13, 9, 11, 22, 21, 17, 19, 30, 29, 25, 27 },
{ 8, 10, 12, 14, 4, 6, 2, 1, 24, 26, 28, 30, 20, 22, 18, 17 },
{ 10, 9, 14, 13, 6, 5, 1, 3, 26, 25, 30, 29, 22, 21, 17, 19 },
{ 12, 14, 10, 9, 2, 1, 6, 5, 28, 30, 26, 25, 18, 17, 22, 21 },
{ 14, 13, 9, 11, 1, 3, 5, 7, 30, 29, 25, 27, 17, 19, 21, 23 },
{ 16, 18, 20, 22, 24, 26, 28, 30, 8, 10, 12, 14, 4, 6, 2, 1 },
{ 18, 17, 22, 21, 26, 25, 30, 29, 10, 9, 14, 13, 6, 5, 1, 3 },
{ 20, 22, 18, 17, 28, 30, 26, 25, 12, 14, 10, 9, 2, 1, 6, 5 },
{ 22, 21, 17, 19, 30, 29, 25, 27, 14, 13, 9, 11, 1, 3, 5, 7 },
{ 24, 26, 28, 30, 20, 22, 18, 17, 4, 6, 2, 1, 12, 14, 10, 9 },
{ 26, 25, 30, 29, 22, 21, 17, 19, 6, 5, 1, 3, 14, 13, 9, 11 },
{ 28, 30, 26, 25, 18, 17, 22, 21, 2, 1, 6, 5, 10, 9, 14, 13 },
{ 30, 29, 25, 27, 17, 19, 21, 23, 1, 3, 5, 7, 9, 11, 13, 15 }
};
for (cy0=0,z=0,i=28;i>=0;i-=4,cy0=cy)
{
a=AddReverse4_LUT[(x>>i)&15][(y>>i)&15]; cy=a&1; a>>=1; // add
if (cy0){ a=AddReverse4_LUT[8][a]; cy|=a&1; a>>=1; } // adc (8 = bit-reversed 1 in 4 bits)
z|=a<<i;
}
return z;
}
//---------------------------------------------------------------------------
WORD AddReverse8_LUT[256][256]; // LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1)) in 8bits
DWORD AddReverse8(DWORD x,DWORD y) // 8bit LUT reversed addition
{
int i;
DWORD z,cy0,cy,a;
BYTE *px=(BYTE*)&x;
BYTE *py=(BYTE*)&y;
BYTE *pz=(BYTE*)&z;
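// note: processing bytes from px[3] down to px[0] assumes a little-endian
// target; on a big-endian machine the loop would have to run i=0..3 instead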
for (cy0=0,z=0,i=3;i>=0;i--,cy0=cy)
{
a=AddReverse8_LUT[px[i]][py[i]]; cy=a&1; a>>=1; // add
if (cy0){ a=AddReverse8_LUT[128][a]; cy|=a&1; a>>=1; } // adc (128 = bit-reversed 1 in 8 bits)
pz[i]=a;
}
return z;
}
//---------------------------------------------------------------------------
DWORD AddReverseKS(DWORD x,DWORD y) // https://en.wikipedia.org/wiki/Kogge–Stone_adder
{ // Adder from harold's answer ported to 32bit
DWORD p=x^y; // carry propagate
DWORD g=x&y; // carry generate
g|=p&(g>> 1); p&=p>> 1; // parallel-prefix stages: each stage doubles
g|=p&(g>> 2); p&=p>> 2; // the distance over which generate/propagate
g|=p&(g>> 4); p&=p>> 4; // info is merged (log2(32)=5 stages)
g|=p&(g>> 8); p&=p>> 8;
g|=p&(g>>16); // shifts go right because carry moves toward bit 0 here
return x^y^(g>>1); // sum = propagate XOR incoming carry
}
//---------------------------------------------------------------------------
The AddReverse8_LUT table is initialized like this:
for (DWORD x=0;x<256;x++)
for (DWORD y=0;y<256;y++)
AddReverse8_LUT[x][y]=BitReverse(BitReverse(x<<1)+BitReverse(y<<1));
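Once the table is filled you can sanity-check all the variants against each other. For example with a quick brute-force loop like this (my sketch, not part of the original benchmark; it assumes the functions above are compiled together and the LUT is already initialized):
#include <cstdio>
#include <cstdlib>
void TestAddReverse() // compare all variants against the slow reference
{
for (int i=0;i<1000000;i++)
 {
 DWORD x=(DWORD(rand())<<17)^(DWORD(rand())<<9)^DWORD(rand()); // ~32 random bits
 DWORD y=(DWORD(rand())<<17)^(DWORD(rand())<<9)^DWORD(rand());
 DWORD z=AddReverseSlow(x,y); // reference result
 if ((AddReverse4(x,y)!=z)||(AddReverse8(x,y)!=z)||(AddReverseKS(x,y)!=z))
  { printf("mismatch for %08X + %08X\n",(unsigned)x,(unsigned)y); return; }
 }
printf("all AddReverse variants agree\n");
}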
Here are the measured times on my setup:
8M x BitReverseSlow [ 506.585 ms]
8M x BitReverse1 [ 59.769 ms]
8M x BitReverse8 [ 72.355 ms]
16M x AddReverseSlow [ 611.677 ms]
16M x AddReverse4 [ 491.546 ms]
16M x AddReverse8 [ 347.293 ms]
16M x AddReverseKS [ 142.149 ms]
As you can see, the bigger the LUT, the better the speed, at the cost of space. On top of that, if the resolution is 8/16/32 bits you can use BYTE access through pointers instead of slow bit shifting/masking, speeding things up even more.
Just to be sure: the addition LUT[][] tables are encoded so that the sum is shifted left by 1 bit and the freed-up LSB holds the carry flag for the next nibble/byte ... In the additions, when I do the adc (incrementing by the carry flag), the LUT operand representing 1 is bit-reversed at the LUT's bit resolution, so:
1000bin     =   8dec // 4 bits
10000000bin = 128dec // 8 bits
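For example (my illustration, decoded from the 4-bit table above): AddReverse4_LUT[15][1] = 29 = 11101bin, which splits as:
cy  = 29 &  1 =  1 // carry out, fed into the next nibble's adc
sum = 29 >> 1 = 14 // 1110bin = bit-reversed sum of the nibble
This checks out: rev4(15) = 15, rev4(1) = 8, 15 + 8 = 23, so the carry is 1 and the low nibble is 0111bin = 7, and rev4(7) = 1110bin = 14.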
To port this to 64 bits, just change DWORD to a 64-bit datatype and adjust the bit masks and iteration counts to match 64 bits ...
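For example, a 64-bit BitReverse1 and AddReverseKS might look like this (my untested sketch; QWORD here is an assumed typedef for a 64-bit unsigned integer such as uint64_t):
typedef unsigned long long QWORD; // assumption: 64-bit unsigned integer
//---------------------------------------------------------------------------
QWORD BitReverse1_64(QWORD x)
{
x=( x<<32                       )|( x>>32                       ); // swap 32-bit halves
x=((x<<16)&0xFFFF0000FFFF0000ULL)|((x>>16)&0x0000FFFF0000FFFFULL);
x=((x<< 8)&0xFF00FF00FF00FF00ULL)|((x>> 8)&0x00FF00FF00FF00FFULL);
x=((x<< 4)&0xF0F0F0F0F0F0F0F0ULL)|((x>> 4)&0x0F0F0F0F0F0F0F0FULL);
x=((x<< 2)&0xCCCCCCCCCCCCCCCCULL)|((x>> 2)&0x3333333333333333ULL);
x=((x<< 1)&0xAAAAAAAAAAAAAAAAULL)|((x>> 1)&0x5555555555555555ULL);
return x;
}
//---------------------------------------------------------------------------
QWORD AddReverseKS_64(QWORD x,QWORD y)
{
QWORD p=x^y;
QWORD g=x&y;
g|=p&(g>> 1); p&=p>> 1;
g|=p&(g>> 2); p&=p>> 2;
g|=p&(g>> 4); p&=p>> 4;
g|=p&(g>> 8); p&=p>> 8;
g|=p&(g>>16); p&=p>>16;
g|=p&(g>>32); // one extra prefix stage for 64 bits (log2(64)=6 stages)
return x^y^(g>>1);
}
//---------------------------------------------------------------------------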
Anyway, this reminds me of an old answer of mine to a related question. However, it looks like the non-LUT versions of BitReverse are faster. After some digging, it seems that LUT usage with temp variables forces my compiler to compile my functions differently, with a great speed decrease (more than twice as slow). Once all the functions are forced into the same calling style, the LUT versions are slightly faster:
8M x BitReverseSlow [ 537.534 ms]
8M x BitReverse1 [ 99.804 ms] <- slowed down by changing the call style to match BitReverse8
8M x BitReverse8 [ 80.725 ms]
So I suspect that for reasonably simple functions my compiler uses an assembly-style calling convention (not using the heap/stack for operands and the return value).
So the result is that using BitReverse1 is faster. Also, harold's answer beats mine by far for the same reasons, plus it needs fewer operations ...
However, if you were to hardwire this (FPGA, on-die, or discrete gates), I suspect the LUT versions of BitReverse would be faster (although you can hardwire the bit reversal directly, which is unbeatable).