This answer is about 64 bit integer to double conversion, without cutting corners. In a previous version of this answer (see paragraph Fast and accurate conversion by splitting ...., below),
it was shown that
it is quite efficient to split the 64-bit integers in a 32-bit low and a 32-bit high part,
convert these parts to double, and compute low + high * 2^32
.
The instruction counts of these conversions were:
int64_to_double_full_range
9 instructions (with mul
and add
as one fma
)
uint64_to_double_full_range
7 instructions (with mul
and add
as one fma
)
Inspired by Mysticial's updated answer, with better optimized accurate conversions,
I further optimized the int64_t
to double conversion:
int64_to_double_fast_precise
: 5 instructions.
uint64_to_double_fast_precise
: 5 instructions.
The int64_to_double_fast_precise
conversion takes one instruction less than Mysticial's solution.
The uint64_to_double_fast_precise
code is essentially identical to Mysticial's solution (but with a vpblendd
instead of vpblendw
). It is included here because of its similarities with the
int64_to_double_fast_precise
conversion: The instructions are identical, only the constants differ:
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
__m256d int64_to_double_fast_precise(const __m256i v)
/* Optimized full range int64_t to double conversion */
/* Emulate _mm256_cvtepi64_pd() */
{
__m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 encoded as floating-point */
__m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 encoded as floating-point */
__m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); /* 2^84 + 2^63 + 2^52 encoded as floating-point */
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
__m256i v_lo = _mm256_blend_epi32(magic_i_lo, v, 0b01010101); /* Blend the 32 lowest significant bits of v with magic_int_lo */
__m256i v_hi = _mm256_srli_epi64(v, 32); /* Extract the 32 most significant bits of v */
v_hi = _mm256_xor_si256(v_hi, magic_i_hi32); /* Flip the msb of v_hi and blend with 0x45300000 */
__m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); /* Compute in double precision: */
__m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); /* (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! */
return result; /* With gcc use -O3, then -fno-associative-math is default. Do not use -Ofast, which enables -fassociative-math! */
/* With icc use -fp-model precise */
}
__m256d uint64_to_double_fast_precise(const __m256i v)
/* Optimized full range uint64_t to double conversion */
/* This code is essentially identical to Mysticial's solution. */
/* Emulate _mm256_cvtepu64_pd() */
{
__m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 encoded as floating-point */
__m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000000000000); /* 2^84 encoded as floating-point */
__m256i magic_i_all = _mm256_set1_epi64x(0x4530000000100000); /* 2^84 + 2^52 encoded as floating-point */
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
__m256i v_lo = _mm256_blend_epi32(magic_i_lo, v, 0b01010101); /* Blend the 32 lowest significant bits of v with magic_int_lo */
__m256i v_hi = _mm256_srli_epi64(v, 32); /* Extract the 32 most significant bits of v */
v_hi = _mm256_xor_si256(v_hi, magic_i_hi32); /* Blend v_hi with 0x45300000 */
__m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); /* Compute in double precision: */
__m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); /* (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! */
return result; /* With gcc use -O3, then -fno-associative-math is default. Do not use -Ofast, which enables -fassociative-math! */
/* With icc use -fp-model precise */
}
int main(){
int i;
uint64_t j;
__m256i j_4;
__m256d v;
double x[4];
double x0, x1, a0, a1;
j = 0ull;
printf("\nAccurate int64_to_double\n");
for (i = 0; i < 260; i++){
j_4= _mm256_set_epi64x(0, 0, -j, j);
v = int64_to_double_fast_precise(j_4);
_mm256_storeu_pd(x,v);
x0 = x[0];
x1 = x[1];
a0 = _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(),j));
a1 = _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(),-j));
printf(" j =%21li v =%23.1f v=%23.1f -v=%23.1f -v=%23.1f d=%.1f d=%.1f\n", j, x0, a0, x1, a1, x0-a0, x1-a1);
j = j+(j>>2)-(j>>5)+1ull;
}
j = 0ull;
printf("\nAccurate uint64_to_double\n");
for (i = 0; i < 260; i++){
if (i==258){j=-1;}
if (i==259){j=-2;}
j_4= _mm256_set_epi64x(0, 0, -j, j);
v = uint64_to_double_fast_precise(j_4);
_mm256_storeu_pd(x,v);
x0 = x[0];
x1 = x[1];
a0 = (double)((uint64_t)j);
a1 = (double)((uint64_t)-j);
printf(" j =%21li v =%23.1f v=%23.1f -v=%23.1f -v=%23.1f d=%.1f d=%.1f\n", j, x0, a0, x1, a1, x0-a0, x1-a1);
j = j+(j>>2)-(j>>5)+1ull;
}
return 0;
}
The conversions may fail if unsafe math optimization options are enabled. With gcc, -O3
is
safe, but -Ofast
may lead to wrong results, because we may not assume associativity
of floating point addition here (the same holds for Mysticial's conversions).
With icc use -fp-model precise
.
Fast and accurate conversion by splitting the 64-bit integers in a 32-bit low and a 32-bit high part.
We assume that both the integer input and the double output are in 256 bit wide AVX registers.
Two approaches are considered:
int64_to_double_based_on_cvtsi2sd()
: as suggested in the comments on the question, use cvtsi2sd
4 times together with some data shuffling.
Unfortunately both cvtsi2sd
and the data shuffling instructions need execution port 5. This limits the performance of this approach.
int64_to_double_full_range()
: we can use Mysticial's fast conversion method twice in order to get
an accurate conversion for the full 64 bit integer range. The 64-bit integer is split in a 32-bit low and a 32-bit high part
,similarly as in the answers to this question: How to perform uint32/float conversion with SSE? .
Each of these pieces is suitable for Mysticial's integer to double conversion.
Finally the high part is multiplied by 2^32 and added to the low part.
The signed conversion is a little bit more complicted than the unsigned conversion (uint64_to_double_full_range()
),
because srai_epi64()
doesn't exist.
Code:
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
/*
gcc -O3 -Wall -m64 -mfma -mavx2 -march=broadwell cvt_int_64_double.c
./a.out A
time ./a.out B
time ./a.out C
etc.
*/
inline __m256d uint64_to_double256(__m256i x){ /* Mysticial's fast uint64_to_double. Works for inputs in the range: [0, 2^52) */
x = _mm256_or_si256(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)));
return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
}
inline __m256d int64_to_double256(__m256i x){ /* Mysticial's fast int64_to_double. Works for inputs in the range: (-2^51, 2^51) */
x = _mm256_add_epi64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
}
__m256d int64_to_double_full_range(const __m256i v)
{
__m256i msk_lo =_mm256_set1_epi64x(0xFFFFFFFF);
__m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0); /* 2^32 */
__m256i v_lo = _mm256_and_si256(v,msk_lo); /* extract the 32 lowest significant bits of v */
__m256i v_hi = _mm256_srli_epi64(v,32); /* 32 most significant bits of v. srai_epi64 doesn't exist */
__m256i v_sign = _mm256_srai_epi32(v,32); /* broadcast sign bit to the 32 most significant bits */
v_hi = _mm256_blend_epi32(v_hi,v_sign,0b10101010); /* restore the correct sign of v_hi */
__m256d v_lo_dbl = int64_to_double256(v_lo); /* v_lo is within specified range of int64_to_double */
__m256d v_hi_dbl = int64_to_double256(v_hi); /* v_hi is within specified range of int64_to_double */
v_hi_dbl = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl); /* _mm256_mul_pd and _mm256_add_pd may compile to a single fma instruction */
return _mm256_add_pd(v_hi_dbl,v_lo_dbl); /* rounding occurs if the integer doesn't exist as a double */
}
__m256d int64_to_double_based_on_cvtsi2sd(const __m256i v)
{ __m128d zero = _mm_setzero_pd(); /* to avoid uninitialized variables in_mm_cvtsi64_sd */
__m128i v_lo = _mm256_castsi256_si128(v);
__m128i v_hi = _mm256_extracti128_si256(v,1);
__m128d v_0 = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_lo));
__m128d v_2 = _mm_cvtsi64_sd(zero,_mm_cvtsi128_si64(v_hi));
__m128d v_1 = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_lo,1));
__m128d v_3 = _mm_cvtsi64_sd(zero,_mm_extract_epi64(v_hi,1));
__m128d v_01 = _mm_unpacklo_pd(v_0,v_1);
__m128d v_23 = _mm_unpacklo_pd(v_2,v_3);
__m256d v_dbl = _mm256_castpd128_pd256(v_01);
v_dbl = _mm256_insertf128_pd(v_dbl,v_23,1);
return v_dbl;
}
__m256d uint64_to_double_full_range(const __m256i v)
{
__m256i msk_lo =_mm256_set1_epi64x(0xFFFFFFFF);
__m256d cnst2_32_dbl =_mm256_set1_pd(4294967296.0); /* 2^32 */
__m256i v_lo = _mm256_and_si256(v,msk_lo); /* extract the 32 lowest significant bits of v */
__m256i v_hi = _mm256_srli_epi64(v,32); /* 32 most significant bits of v */
__m256d v_lo_dbl = uint64_to_double256(v_lo); /* v_lo is within specified range of uint64_to_double */
__m256d v_hi_dbl = uint64_to_double256(v_hi); /* v_hi is within specified range of uint64_to_double */
v_hi_dbl = _mm256_mul_pd(cnst2_32_dbl,v_hi_dbl);
return _mm256_add_pd(v_hi_dbl,v_lo_dbl); /* rounding may occur for inputs >2^52 */
}
int main(int argc, char **argv){
int i;
uint64_t j;
__m256i j_4, j_inc;
__m256d v, v_acc;
double x[4];
char test = argv[1][0];
if (test=='A'){ /* test the conversions for several integer values */
j = 1ull;
printf("\nint64_to_double_full_range\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
v = int64_to_double_full_range(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21li v =%23.1f -v=%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[1],x[2],x[3]);
j = j*7ull;
}
j = 1ull;
printf("\nint64_to_double_based_on_cvtsi2sd\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,-j,j);
v = int64_to_double_based_on_cvtsi2sd(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21li v =%23.1f -v=%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[1],x[2],x[3]);
j = j*7ull;
}
j = 1ull;
printf("\nuint64_to_double_full_range\n");
for (i = 0; i<30; i++){
j_4= _mm256_set_epi64x(j-3,j+3,j,j);
v = uint64_to_double_full_range(j_4);
_mm256_storeu_pd(x,v);
printf("j =%21lu v =%23.1f v+3=%23.1f v-3=%23.1f \n",j,x[0],x[2],x[3]);
j = j*7ull;
}
}
else{
j_4 = _mm256_set_epi64x(-123,-4004,-312313,-23412731);
j_inc = _mm256_set_epi64x(1,1,1,1);
v_acc = _mm256_setzero_pd();
switch(test){
case 'B' :{
printf("\nLatency int64_to_double_cvtsi2sd()\n"); /* simple test to get a rough idea of the latency of int64_to_double_cvtsi2sd() */
for (i = 0; i<1000000000; i++){
v =int64_to_double_based_on_cvtsi2sd(j_4);
j_4= _mm256_castpd_si256(v); /* cast without conversion, use output as an input in the next step */
}
_mm256_storeu_pd(x,v);
}
break;
case 'C' :{
printf("\nLatency int64_to_double_full_range()\n"); /* simple test to get a rough idea of the latency of int64_to_double_full_range() */
for (i = 0; i<1000000000; i++){
v = int64_to_double_full_range(j_4);
j_4= _mm256_castpd_si256(v);
}
_mm256_storeu_pd(x,v);
}
break;
case 'D' :{
printf("\nThroughput int64_to_double_cvtsi2sd()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_cvtsi2sd() */
for (i = 0; i<1000000000; i++){
j_4 = _mm256_add_epi64(j_4,j_inc); /* each step a different input */
v = int64_to_double_based_on_cvtsi2sd(j_4);
v_acc = _mm256_xor_pd(v,v_acc); /* use somehow the results */
}
_mm256_storeu_pd(x,v_acc);
}
break;
case 'E' :{
printf("\nThroughput int64_to_double_full_range()\n"); /* simple test to get a rough idea of the throughput of int64_to_double_full_range() */
for (i = 0; i<1000000000; i++){
j_4 = _mm256_add_epi64(j_4,j_inc);
v = int64_to_double_full_range(j_4);
v_acc = _mm256_xor_pd(v,v_acc);
}
_mm256_storeu_pd(x,v_acc);
}
break;
default : {}
}
printf("v =%23.1f -v =%23.1f v =%23.1f -v =%23.1f \n",x[0],x[1],x[2],x[3]);
}
return 0;
}
The actual performance of these functions may depend on the surrounding code and the cpu generation.
Timing results for 1e9 conversions (256 bit wide) with simple tests B, C, D, and E in the code above, on an intel skylake i5 6500 system:
Latency experiment int64_to_double_based_on_cvtsi2sd() (test B) 5.02 sec.
Latency experiment int64_to_double_full_range() (test C) 3.77 sec.
Throughput experiment int64_to_double_based_on_cvtsi2sd() (test D) 2.82 sec.
Throughput experiment int64_to_double_full_range() (test E) 1.07 sec.
The difference in throughput between int64_to_double_full_range()
and int64_to_double_based_on_cvtsi2sd()
is larger than I expected.