On modern processors with high-throughput double-precision floating-point support the fastest way to compute the integer square root ⌊√x⌋ for argument x
≤ 253 is to compute it as (uint32_t)sqrt((double)x)
. For a 32-bit integer square root suitable for processors with no FPU or slow double-precision support, see this answer of mine.
The square root of a 64-bit unsigned integer can be computed efficiently by first computing the reciprocal square root, 1/√x or rsqrt
(x), using a low-accuracy table lookup and following this with multiple Newton-Raphson iterations in fixed-point arithmetic. The full-precision reciprocal sqaure root is then multiplied by the original argument to yield the square root. The general Newton-Raphson iteration for the reciprocal square root of a
is rn+1 = rn + rn * (1 - a * rn2) / 2. This can be transformed algebraically into various convenient arrangements.
The exemplary C99 code below demonstrates the details of the algorithm outlined above. An eight-bit approximation to the reciprocal square root is retrieved from a 96-entry lookup table using the seven most significant bits of the normalized argument as the index. Normalization requires the count of leading zeros, which is a built-in hardware instruction on many processor architectures, but can also be emulated reasonably efficiently through either a single-precision floating-point computation or integer computation.
To potentially allow the use of small-operand multiplication, the initial reciprocal square root approximation r0 is refined using the following variant of the Newton-Raphson iteration: r1 = (3 * r0 - a * r03) / 2. A second iteration r2 = (r1 * (3 - r1 ( (r1 * a))) / 2 is used to refine this further. The third iteration is combined with the back multiply by a
to yield the final square root approximation: s2 = a * r2, s3 = s2 + (r2 * (a - s2 * s2)) / 2.
As a last step, the final normalized square root approximation must be denormalized. The number of bit positions to shift right is half the number of bit positions shifted left during normalization. The result is an underestimate and can be up to 2 smaller than the true result. The correct result ⌊√a
⌋, can be determined by examining the magnitude of the remainder.
To achieve good performance on many 32-bit platforms, the first two Newton-Raphson iterations should be performed in 32-bit arithmetic as only limited precision is required at that state. The computation can be sped up incrementally by using a larger table with 96 entries of 32 bits, where the least significant ten bits of each entry store 3 * r0 and the most significant bits store r03 rounded to 22 bits, which introduces negligible error.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#if defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>
#endif // defined(_MSC_VER) && defined(_WIN64)
#define CLZ_BUILTIN (1) // use compiler's built-in count-leading-zeros
#define CLZ_FPU (2) // emulate count-leading-zeros via FPU
#define CLZ_CPU (3) // emulate count-leading-zeros via CPU
#define LARGE_TABLE (1)
#define CLZ_IMPL (CLZ_CPU)
#define GEN_TAB (1)
uint32_t umul32_hi (uint32_t a, uint32_t b);
uint64_t umul32_wide (uint32_t a, uint32_t b);
int clz64 (uint64_t a);
#if LARGE_TABLE
uint32_t rsqrt_tab [96] =
{
0xfa0bfafa, 0xee6b2aee, 0xe5f02ae5, 0xdaf26ed9, 0xd2f002d0, 0xc890c2c4,
0xc1037abb, 0xb9a75ab2, 0xb4da42ac, 0xadcea2a3, 0xa6f27a9a, 0xa279c294,
0x9beb4a8b, 0x97a5ca85, 0x9163427c, 0x8d4fca76, 0x89500270, 0x8563ba6a,
0x818ac264, 0x7dc4ea5e, 0x7a120258, 0x7671da52, 0x72e4424c, 0x6f690a46,
0x6db24243, 0x6a52423d, 0x67042637, 0x6563c234, 0x62302a2e, 0x609cea2b,
0x5d836a25, 0x5bfd1a22, 0x58fd421c, 0x5783ae19, 0x560e4a16, 0x53300210,
0x51c7120d, 0x50623a0a, 0x4da4c204, 0x4c4c1601, 0x4af769fe, 0x49a6b9fb,
0x485a01f8, 0x471139f5, 0x45cc59f2, 0x448b5def, 0x4214fde9, 0x40df89e6,
0x3fade1e3, 0x3e8001e0, 0x3d55e1dd, 0x3c2f79da, 0x3c2f79da, 0x3b0cc5d7,
0x39edc1d4, 0x38d265d1, 0x37baa9ce, 0x36a689cb, 0x359601c8, 0x348909c5,
0x348909c5, 0x337f99c2, 0x3279adbf, 0x317741bc, 0x30784db9, 0x30784db9,
0x2f7cc9b6, 0x2e84b1b3, 0x2d9001b0, 0x2d9001b0, 0x2c9eb1ad, 0x2bb0b9aa,
0x2bb0b9aa, 0x2ac615a7, 0x29dec1a4, 0x29dec1a4, 0x28fab5a1, 0x2819e99e,
0x2819e99e, 0x273c599b, 0x273c599b, 0x26620198, 0x258ad995, 0x258ad995,
0x24b6d992, 0x24b6d992, 0x23e5fd8f, 0x2318418c, 0x2318418c, 0x224d9d89,
0x224d9d89, 0x21860986, 0x21860986, 0x20c18183, 0x20c18183, 0x20000180,
};
#else // LARGE_TABLE
uint8_t rsqrt_tab [96] =
{
0xfe, 0xfa, 0xf7, 0xf3, 0xf0, 0xec, 0xe9, 0xe6, 0xe4, 0xe1, 0xde, 0xdc,
0xd9, 0xd7, 0xd4, 0xd2, 0xd0, 0xce, 0xcc, 0xca, 0xc8, 0xc6, 0xc4, 0xc2,
0xc1, 0xbf, 0xbd, 0xbc, 0xba, 0xb9, 0xb7, 0xb6, 0xb4, 0xb3, 0xb2, 0xb0,
0xaf, 0xae, 0xac, 0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa3, 0xa2,
0xa1, 0xa0, 0x9f, 0x9e, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97,
0x97, 0x96, 0x95, 0x94, 0x93, 0x93, 0x92, 0x91, 0x90, 0x90, 0x8f, 0x8e,
0x8e, 0x8d, 0x8c, 0x8c, 0x8b, 0x8a, 0x8a, 0x89, 0x89, 0x88, 0x87, 0x87,
0x86, 0x86, 0x85, 0x84, 0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80,
};
#endif //LARGE_TABLE
uint32_t my_isqrt64 (uint64_t a)
{
uint64_t rem, arg = a;
uint32_t b, r, s, t, scal;
/* Handle zero as special case */
if (a == 0ULL) return 0u;
/* Normalize argument */
scal = clz64 (a) & ~1;
a = a << scal;
b = a >> 32;
/* Generate initial approximation to 1/sqrt(a) = rsqrt(a) */
t = rsqrt_tab [(b >> 25) - 32];
/* Perform first NR iteration for rsqrt */
#if LARGE_TABLE
r = (t << 22) - umul32_hi (b, t);
#else // LARGE_TABLE
r = ((3 * t) << 22) - umul32_hi (b, (t * t * t) << 8);
#endif // LARGE_TABLE
/* Perform second NR iteration for rsqrt */
s = umul32_hi (r, b);
s = 0x30000000 - umul32_hi (r, s);
r = umul32_hi (r, s);
/* Compute sqrt(a) as a * rsqrt(a); make sure it is an underestimate! */
r = r * 16;
s = umul32_hi (r, b);
s = 2 * s - 10;
/* Perform third NR iteration combined with back multiply */
rem = a - umul32_wide (s, s);
r = umul32_hi ((uint32_t)(rem >> 32), r);
s = s + r;
/* Denormalize result */
s = s >> (scal >> 1);
/* Make sure we get the floor correct; result underestimates by up to 2 */
rem = arg - umul32_wide (s, s);
if (rem >= ((uint64_t)s * 4 + 4)) s+=2;
else if (rem >= ((uint64_t)s * 2 + 1)) s++;
return s;
}
uint32_t umul32_hi (uint32_t a, uint32_t b)
{
return (uint32_t)(((uint64_t)a * b) >> 32);
}
uint64_t umul32_wide (uint32_t a, uint32_t b)
{
return (uint64_t)a * b;
}
uint32_t float_as_uint32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}
int clz32 (uint32_t a)
{
#if (CLZ_IMPL == CLZ_FPU)
// Henry S. Warren, Jr, " Hacker's Delight 2nd ed", p. 105
int n = 158 - (float_as_uint32 ((float)(int32_t)(a & ~(a >> 1))+.5f) >> 23);
return (n < 0) ? 0 : n;
#elif (CLZ_IMPL == CLZ_CPU)
static const uint8_t clz_tab[32] = {
31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
};
a |= a >> 16;
a |= a >> 8;
a |= a >> 4;
a |= a >> 2;
a |= a >> 1;
return clz_tab [0x07c4acddu * a >> 27] + (!a);
#else // CLZ_IMPL == CLZ_BUILTIN
#if defined(_MSC_VER) && defined(_WIN64)
return (int)__lzcnt (a);
#else // defined(_MSC_VER) && defined(_WIN64)
return (int)__builtin_clz (a);
#endif // defined(_MSC_VER) && defined(_WIN64)
#endif // CLZ_IMPL
}
int clz64 (uint64_t a)
{
#if (CLZ_IMPL == CLZ_BUILTIN)
#if defined(_MSC_VER) && defined(_WIN64)
return (int)__lzcnt64 (a);
#else // defined(_MSC_VER) && defined(_WIN64)
return (int)__builtin_clzll (a);
#endif // defined(_MSC_VER) && defined(_WIN64)
#else // CLZ_IMPL
uint32_t ah = (uint32_t)(a >> 32);
uint32_t al = (uint32_t)(a);
int r;
if (ah) al = ah;
r = clz32 (al);
if (!ah) r += 32;
return r;
#endif // CLZ_IMPL
}
/* Henry S. Warren, Jr., "Hacker's Delight, 2nd e.d", p. 286 */
uint32_t ref_isqrt64 (uint64_t x)
{
uint64_t m, y, b;
m = 0x4000000000000000ULL;
y = 0ULL;
while (m != 0) {
b = y | m;
y = y >> 1;
if (x >= b) {
x = x - b;
y = y | m;
}
m = m >> 2;
}
return (uint32_t)y;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...@gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
LARGE_INTEGER t;
static double oofreq;
static int checkedForHighResTimer;
static BOOL hasHighResTimer;
if (!checkedForHighResTimer) {
hasHighResTimer = QueryPerformanceFrequency (&t);
oofreq = 1.0 / (double)t.QuadPart;
checkedForHighResTimer = 1;
}
if (hasHighResTimer) {
QueryPerformanceCounter (&t);
return (double)t.QuadPart * oofreq;
} else {
return (double)GetTickCount() * 1.0e-3;
}
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
struct timeval tv;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif
int main (void)
{
#if LARGE_TABLE
printf ("64-bit integer square root implementation w/ large table\n");
#else // LARGE_TAB
printf ("64-bit integer square root implementation w/ small table\n");
#endif
#if GEN_TAB
printf ("Generating table ...\n");
for (int i = 0; i < 96; i++ ) {
double x1 = 1.0 + i * 1.0 / 32;
double x2 = 1.0 + (i + 1) * 1.0 / 32;
double y = (1.0 / sqrt (x1) + 1.0 / sqrt (x2)) * 0.5;
uint32_t val = (uint32_t)(y * 256 + 0.5);
#if LARGE_TABLE
uint32_t cube = val * val * val;
rsqrt_tab[i] = (((cube + 1) / 4) << 10) + (3 * val);
printf ("0x%08x, ", rsqrt_tab[i]);
if (i % 6 == 5) printf ("\n");
#else // LARGE_TABLE
rsqrt_tab[i] = (uint8_t)val;
printf ("0x%02x, ", rsqrt_tab[i]);
if (i % 12 == 11) printf ("\n");
#endif // LARGE_TABLE
}
#endif // GEN_TAB
printf ("Running benchmark ...\n");
double start, stop;
uint32_t sum[8] = {0, 0, 0, 0, 0, 0, 0, 0};
for (int k = 0; k < 2; k++) {
uint32_t i = 0;
start = second();
do {
sum [0] += my_isqrt64 (0x31415926ULL * i + 0);
sum [1] += my_isqrt64 (0x31415926ULL * i + 1);
sum [2] += my_isqrt64 (0x31415926ULL * i + 2);
sum [3] += my_isqrt64 (0x31415926ULL * i + 3);
sum [4] += my_isqrt64 (0x31415926ULL * i + 4);
sum [5] += my_isqrt64 (0x31415926ULL * i + 5);
sum [6] += my_isqrt64 (0x31415926ULL * i + 6);
sum [7] += my_isqrt64 (0x31415926ULL * i + 7);
i += 8;
} while (i);
stop = second();
}
printf ("%08x\relapsed=%.5f sec\n",
sum[0]+sum[1]+sum[2]+sum[3]+sum[4]+sum[5]+sum[6]+sum[7],
stop - start);
printf ("Running functional test ...\n");
uint64_t a, count = 0;
uint32_t res, ref;
do {
switch (count >> 33) {
case 0:
a = count;
break;
case 1:
a = (count & ((1ULL << 33) - 1)) * (count & ((1ULL << 33) - 1) - 1);
break;
case 2:
a = (count & ((1ULL << 33) - 1)) * (count & ((1ULL << 33) - 1));
break;
case 3:
a = (count & ((1ULL << 33) - 1)) * (count & ((1ULL << 33) - 1)) + 1;
break;
default:
a = KISS64;
break;
}
res = my_isqrt64 (a);
ref = ref_isqrt64 (a);
if (res != ref) {
printf ("\nerror: arg=%016llx res=%08x ref=%08x count=%llx\n", a, res, ref, count);
return EXIT_FAILURE;
}
count++;
if (!(count & 0xffffff)) printf ("\r%016llx", count);
} while (count);
printf ("PASSED\n");
return EXIT_SUCCESS;
}