The obvious thing to do is to use a table. I was sceptical that such a scheme would be quicker, but curious...
...so, to create a baseline I tweaked your function (rendering it in C):
#include <stddef.h>             /* size_t                              */
#include <stdint.h>             /* uint8_t, used further below         */

typedef unsigned int uint ;     /* or the system uint, where available */

static uint
GetAlignedSize(size_t size, size_t* rsize)
{
  size_t sx = size - 1 ;

  if (size <= 0x00800)          /*   8 byte units: "index"   0..255    */
    {
      *rsize = (sx | 0x007) + 1 ;
      return (sx >> 3) + ((256 - 64) * 0) ;
    } ;

  if (size <= 0x02000)          /*  32 byte units: "index" 256..447    */
    {
      *rsize = (sx | 0x01F) + 1 ;
      return (sx >> 5) + ((256 - 64) * 1) ;
    } ;

  if (size <= 0x08000)          /* 128 byte units: "index" 448..639    */
    {
      *rsize = (sx | 0x07F) + 1 ;
      return (sx >> 7) + ((256 - 64) * 2) ;
    } ;

  if (size <= 0x20000)          /* 512 byte units: "index" 640..831    */
    {
      *rsize = (sx | 0x1FF) + 1 ;
      return (sx >> 9) + ((256 - 64) * 3) ;
    } ;

  *rsize = 0 ;                  /* out-size: "index" 832               */
  return 64 + ((256 - 64) * 4) ;
} ;
Note that this handles sizes up to and including 0x800 in 8 byte units, and so on, and returns "index" 0..831 for all the known sizes, and 832 for the out-size (rather than 1..832 and 0). For example, a request of 100 bytes is rounded up to 104 and gets index 12. In passing, I wonder whether it is actually useful to return the rounded size at all? To free the block you need the "index", and if you do need the rounded size it could be looked up from that. [Full disclosure: this also assumes that the incoming request size is not zero... which makes a tiny improvement in the timing!]
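To illustrate the "looked up" idea: indices 0..255 use 8 byte units and each subsequent band of 192 indices quadruples the unit, so the rounded size can be recovered from the "index" alone. A sketch only, assuming the index layout above -- index_to_rounded_size is my name for a hypothetical helper, not anything in your code:

static size_t
index_to_rounded_size(uint index)
{
  uint band ;

  if (index >= 64 + ((256 - 64) * 4))
    return 0 ;                  /* out-size: no rounded size           */

  /* indices 0..255 are band 0, 256..447 band 1, 448..639 band 2 and
   * 640..831 band 3 -- band b uses (8 << (2 * b)) byte units.
   */
  band = (index < 64) ? 0 : (index - 64) / (256 - 64) ;

  return ((size_t)(index - ((256 - 64) * band)) + 1) << (3 + (2 * band)) ;
} ;

So index 12 gives (12 + 1) << 3 == 104, and index 831 gives 0x20000, as expected.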
Anyway, the most general approach is to table-drive the whole thing:
static uint
get_aligned_size_1(size_t size, size_t* rsize)
{
  /* One entry per 2KiB step of (size - 1): the rounding mask in the top
   * 16 bits, plus the "index" base and the shift in the bottom 16 bits
   * (the base is a multiple of 16, so the shift sits in the bottom 4).
   */
  static const uint tb[0x40] =
    {
      /* 0x00 */ (0x007 << 16) + ((256 - 64) * 0) + 3,
      /* 0x01 */ (0x01F << 16) + ((256 - 64) * 1) + 5,
      /* 0x02 */ (0x01F << 16) + ((256 - 64) * 1) + 5,
      /* 0x03 */ (0x01F << 16) + ((256 - 64) * 1) + 5,
      /* 0x04 */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x05 */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x06 */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x07 */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x08 */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x09 */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x0A */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x0B */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x0C */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x0D */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x0E */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x0F */ (0x07F << 16) + ((256 - 64) * 2) + 7,
      /* 0x10 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x11 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x12 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x13 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x14 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x15 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x16 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x17 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x18 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x19 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x1A */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x1B */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x1C */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x1D */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x1E */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x1F */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x20 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x21 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x22 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x23 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x24 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x25 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x26 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x27 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x28 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x29 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x2A */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x2B */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x2C */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x2D */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x2E */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x2F */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x30 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x31 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x32 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x33 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x34 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x35 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x36 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x37 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x38 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x39 */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x3A */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x3B */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x3C */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x3D */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x3E */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
      /* 0x3F */ (0x1FF << 16) + ((256 - 64) * 3) + 9,
    } ;

  size_t sx = size - 1 ;

  if (size <= 0x20000)
    {
      uint tx ;

      tx = tb[sx >> 11] ;

      *rsize = (sx | (tx >> 16)) + 1 ;
      return (sx >> (tx & 0xF)) + (tx & 0xFFF0) ;
    } ;

  *rsize = 0 ;
  return 64 + ((256 - 64) * 4) ;
} ;
where the mask to round the size up, the "index" base and the shift required to create the "index" are all in the table... packed manually into a uint.
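Since the packed entries follow a simple pattern, the table need not be written out long-hand at all: it could be generated by a little helper (or by a throw-away program that prints the initialiser). A sketch, assuming the same four bands as above -- build_packed_table is my own name, not part of your code:

static void
build_packed_table(uint tb[0x40])
{
  for (uint i = 0 ; i < 0x40 ; ++i)
    {
      /* i is (size - 1) >> 11: 0 is the 8 byte band, 1..3 the 32 byte
       * band, 4..15 the 128 byte band and 16..63 the 512 byte band.
       */
      uint band  = (i == 0) ? 0 : (i < 4) ? 1 : (i < 16) ? 2 : 3 ;
      uint shift = 3 + (2 * band) ;            /* 3, 5, 7 or 9          */
      uint mask  = (1u << shift) - 1 ;         /* 0x007 ... 0x1FF       */
      uint base  = (256 - 64) * band ;         /* 0, 192, 384 or 576    */

      tb[i] = (mask << 16) + base + shift ;
    } ;
} ;

Which keeps the band layout in one place if the quanta ever change.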
I timed the two functions over a random selection of request sizes: 80% 1..0x800, 15% 0x801..0x2000, 4% 0x2001..0x8000, 0.9% 0x8001..0x20000 and 0.1% out-size:
Setup: 15.610 secs: user 15.580 system 0.000 -- 500 million
Branches: 1.910 secs: user 1.910 system 0.000
Table 1: 1.840 secs: user 1.830 system 0.000
The setup loop is:
srand(314159) ;

for (int i = 0 ; i < trial_count ; ++i)
  {
    int r ;
    size_t mx, mn ;

    r = rand() % 1000 ;

    if      (r < 800)           /* 80%   -- 8 byte units      */
      {
        mn = 1 ;
        mx = 0x00800 ;
      }
    else if (r < 950)           /* 15%   -- 32 byte units     */
      {
        mn = 0x00801 ;
        mx = 0x02000 ;
      }
    else if (r < 990)           /* 4%    -- 128 byte units    */
      {
        mn = 0x02001 ;
        mx = 0x08000 ;
      }
    else if (r < 999)           /* 0.9%  -- 512 byte units    */
      {
        mn = 0x08001 ;
        mx = 0x20000 ;
      }
    else                        /* 0.1%  -- out-size          */
      {
        mn = 0x20001 ;
        mx = 0x80000 ;
      } ;

    test_values[i] = (rand() % (mx - mn + 1)) + mn ;
  } ;
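The timing loop itself is not shown; something along these lines will do -- the clock() wrapper, the function-pointer argument and the sum accumulator (a crude guard against the calls being optimised away) are my reconstruction, not the exact harness, and it assumes test_values is an array of size_t:

#include <stdio.h>
#include <time.h>

static void
time_function(const char* name, uint (*fn)(size_t, size_t*),
                                const size_t* test_values, int trial_count)
{
  size_t  rsize ;
  uint    sum   = 0 ;
  clock_t start = clock() ;

  for (int i = 0 ; i < trial_count ; ++i)
    sum += fn(test_values[i], &rsize) ;

  clock_t end = clock() ;

  printf("%s: %.3f secs (check %u)\n", name,
                     (double)(end - start) / CLOCKS_PER_SEC, (unsigned)sum) ;
} ;

called as, say, time_function("Branches", GetAlignedSize, test_values, trial_count).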
Observe very carefully the time taken to set up 500 million test values. The table version runs ever so slightly faster (gcc 4.8, -O3)... but this is a 4% improvement on a trivial function! Mind you, the table-driven version is more flexible, and more quanta can be added without changing the code or the run time.
FWIW, the mask can (obviously) be constructed from the shift, so:
static uint
get_aligned_size_5(size_t size, size_t* rsize)
{
  /* shift for each 2KiB step of (size - 1)                           */
  static const uint8_t ts[0x40] =
    {
      /*   0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F             */
      /* 0x00        */ 3,
      /* 0x01..0x03  */ 5, 5, 5,
      /* 0x04..0x0F  */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      /* 0x10..0x1F  */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
      /* 0x20..0x2F  */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
      /* 0x30..0x3F  */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    } ;

  /* "index" base for each shift                                      */
  static const uint tb[16] =
    {
      /*  0 */ 0,
      /*  1 */ 0,
      /*  2 */ 0,
      /*  3 */ ((256 - 64) / 2) * (3 - 3),
      /*  4 */ 0,
      /*  5 */ ((256 - 64) / 2) * (5 - 3),
      /*  6 */ 0,
      /*  7 */ ((256 - 64) / 2) * (7 - 3),
      /*  8 */ 0,
      /*  9 */ ((256 - 64) / 2) * (9 - 3),
      /* 10 */ 0,
      /* 11 */ 0,
      /* 12 */ 0,
      /* 13 */ 0,
      /* 14 */ 0,
      /* 15 */ 0,
    } ;

  size_t sx = size - 1 ;

  if (size <= 0x20000)
    {
      uint8_t s ;

      s = ts[sx >> 11] ;

      *rsize = (sx | (((size_t)1 << s) - 1)) + 1 ;
      return (sx >> s) + tb[s] ;
    } ;

  *rsize = 0 ;
  return 64 + ((256 - 64) * 4) ;
} ;
This turned out to be the fastest of the variations I tried:
Setup: 15.610 secs: user 15.580 system 0.000 -- 500 million
Branches: 1.910 secs: user 1.910 system 0.000
Table 1: 1.840 secs: user 1.830 system 0.000
Table 5: 1.750 secs: user 1.750 system 0.000
at a whole (read it and sigh) 8% faster :-(.
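(For anyone repeating the exercise: it is cheap to check exhaustively that the table variants agree with the branch version for every in-range size plus one out-size value. This little checker is my addition, not part of the timed code:)

#include <assert.h>

static void
check_variants(void)
{
  for (size_t size = 1 ; size <= 0x20001 ; ++size)
    {
      size_t r0, r1, r5 ;
      uint   i0, i1, i5 ;

      i0 = GetAlignedSize(size, &r0) ;
      i1 = get_aligned_size_1(size, &r1) ;
      i5 = get_aligned_size_5(size, &r5) ;

      assert((i0 == i1) && (i0 == i5)) ;        /* same "index"        */
      assert((r0 == r1) && (r0 == r5)) ;        /* same rounded size   */
    } ;
} ;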
Ah well, I was bored.