I have created a function which does 64-bit * 64-bit to 128-bit using SIMD. Currently I have implemented it using SSE2 (actually SSE4.1). This means it does two 64b*64b to 128b products at the same time. The same idea could be extended to AVX2 or AVX512, giving four or eight 64b*64b to 128b products at the same time. I based my algorithm on http://www.hackersdelight.org/hdcodetxt/muldws.c.txt
That algorithm does one unsigned multiplication, one signed multiplication, and two signed * unsigned multiplications. The signed * signed and unsigned * unsigned operations are easy to do using _mm_mul_epi32 and _mm_mul_epu32. But the mixed signed and unsigned products caused me trouble.
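For reference, here is a minimal scalar sketch of that decomposition (my own adaptation of the 32x32->64 code from the link, widened to 64x64->128 using 32-bit halves; muldws_scalar is just my name for it). The two mixed signed * unsigned terms are the troublesome part:

#include <stdint.h>

// Scalar sketch: split x and y into 32-bit halves, same decomposition as muldws.c
static void muldws_scalar(int64_t x, int64_t y, uint64_t *lo, int64_t *hi) {
    uint64_t x0 = (uint32_t)x;                  // low halves, unsigned
    uint64_t y0 = (uint32_t)y;
    int64_t  x1 = x >> 32;                      // high halves, signed
    int64_t  y1 = y >> 32;

    uint64_t w0 = x0 * y0;                      // unsigned * unsigned
    int64_t  t  = x1 * (int64_t)y0 + (int64_t)(w0 >> 32);  // signed * unsigned
    int64_t  w1 = (uint32_t)t;                  // low 32 bits of t
    int64_t  w2 = t >> 32;                      // high 32 bits (arithmetic shift)
    w1 = (int64_t)x0 * y1 + w1;                 // unsigned * signed
    *hi = x1 * y1 + w2 + (w1 >> 32);            // signed * signed plus carries
    *lo = (uint64_t)x * (uint64_t)y;            // low 64 bits wrap around correctly
}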
Consider, for example:
int32_t x = 0x80000000;
uint32_t y = 0x7fffffff;
int64_t z = (int64_t)x*y;
The double word product should be 0xc000000080000000. But how can you get this if you assume your compiler does not know how to handle mixed types? This is what I came up with:
int64_t sign = x<0; sign*=-1; //get the sign and make it all ones
uint32_t t = abs(x); //if x<0 take two's complement again
uint64_t prod = (uint64_t)t*y; //unsigned product
int64_t z = (prod ^ sign) - sign; //take two's complement based on the sign
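As a quick sanity check (a throwaway test of mine, not part of the original code), the trick reproduces the compiler's mixed-type result:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    int32_t  x = 0x80000000;
    uint32_t y = 0x7fffffff;

    int64_t  sign = -(int64_t)(x < 0);                        // all ones if x is negative
    uint32_t t    = x < 0 ? 0u - (uint32_t)x : (uint32_t)x;   // |x| without signed overflow
    uint64_t prod = (uint64_t)t * y;                          // unsigned product
    int64_t  z    = (int64_t)((prod ^ (uint64_t)sign) - (uint64_t)sign);

    printf("trick:    0x%016llx\n", (unsigned long long)z);               // 0xc000000080000000
    printf("compiler: 0x%016llx\n", (unsigned long long)((int64_t)x * y));
    return 0;
}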
Using SSE this can be done like this:
__m128i xh; //(xl2, xh2, xl1, xh1) high is signed, low unsigned
__m128i yl; //(yh2, yl2, yh1, yl1)
__m128i xs = _mm_cmpgt_epi32(_mm_setzero_si128(), xh); // get sign
xs = _mm_shuffle_epi32(xs, 0xA0); // extend sign
__m128i t = _mm_sign_epi32(xh,xh); // abs(xh)
__m128i prod = _mm_mul_epu32(t, yl); // unsigned (xh2*yl2,xh1*yl1)
__m128i inv = _mm_xor_si128(prod,xs); // invert bits if negative
__m128i z = _mm_sub_epi64(inv,xs); // add 1 if negative
This gives the correct result. But I have to do this twice (only once when squaring) and it's now a significant fraction of my function. Is there a more efficient way of doing this with SSE4.2, AVX2 (four 128-bit products), or even AVX512 (eight 128-bit products)?
Maybe there are more efficient ways of doing this than with SIMD? It's a lot of calculations to get the upper word.
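For comparison, in scalar code the whole 64x64 to 128-bit product is a single widening multiply, assuming a compiler with the __int128 extension (GCC/Clang on x86-64), where it typically compiles to one mul/imul per product. A minimal sketch (muldws_scalar128 is just my name for it):

#include <stdint.h>

// Scalar fallback using the compiler's 128-bit type (a GCC/Clang extension)
static void muldws_scalar128(int64_t x, int64_t y, uint64_t *lo, int64_t *hi) {
    __int128 p = (__int128)x * y;   // one widening signed multiply
    *lo = (uint64_t)p;              // low 64 bits
    *hi = (int64_t)(p >> 64);       // high 64 bits
}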
Edit: based on the comment by @ElderBug it looks like the way to do this is not with SIMD but with the mul instruction. For what it's worth, in case anyone wants to see how complicated this is, here is the full working function (I just got it working so I have not optimized it, but I don't think it's worth it).
void muldws1_sse(__m128i x, __m128i y, __m128i *lo, __m128i *hi) {
    __m128i lomask = _mm_set1_epi64x(0xffffffff);

    __m128i xh = _mm_shuffle_epi32(x, 0xB1);    // x0h, x0l, x1h, x1l (hi/lo dwords of each qword swapped)
    __m128i yh = _mm_shuffle_epi32(y, 0xB1);    // y0h, y0l, y1h, y1l (hi/lo dwords of each qword swapped)

    __m128i xs = _mm_cmpgt_epi32(_mm_setzero_si128(), xh);
    __m128i ys = _mm_cmpgt_epi32(_mm_setzero_si128(), yh);
    xs = _mm_shuffle_epi32(xs, 0xA0);
    ys = _mm_shuffle_epi32(ys, 0xA0);

    __m128i w0 = _mm_mul_epu32(x, y);           // x0l*y0l, x1l*y1l
    __m128i w3 = _mm_mul_epi32(xh, yh);         // x0h*y0h, x1h*y1h

    xh = _mm_sign_epi32(xh,xh);
    yh = _mm_sign_epi32(yh,yh);

    __m128i w1 = _mm_mul_epu32(x, yh);          // x0l*y0h, x1l*y1h
    __m128i w2 = _mm_mul_epu32(xh, y);          // x0h*y0l, x1h*y1l

    __m128i yinv = _mm_xor_si128(w1,ys);        // invert bits if negative
    w1 = _mm_sub_epi64(yinv,ys);                // add 1 if negative
    __m128i xinv = _mm_xor_si128(w2,xs);        // invert bits if negative
    w2 = _mm_sub_epi64(xinv,xs);                // add 1 if negative

    __m128i w0l = _mm_and_si128(w0, lomask);
    __m128i w0h = _mm_srli_epi64(w0, 32);

    __m128i s1  = _mm_add_epi64(w1, w0h);       // xl*yh + w0h
    __m128i s1l = _mm_and_si128(s1, lomask);    // lo(xl*yh + w0h)
    __m128i s1h = _mm_srai_epi64(s1, 32);

    __m128i s2  = _mm_add_epi64(w2, s1l);       // xh*yl + s1l
    __m128i s2l = _mm_slli_epi64(s2, 32);
    __m128i s2h = _mm_srai_epi64(s2, 32);       // arithmetic shift right

    __m128i hi1 = _mm_add_epi64(w3, s1h);
    hi1 = _mm_add_epi64(hi1, s2h);
    __m128i lo1 = _mm_add_epi64(w0l, s2l);

    *hi = hi1;
    *lo = lo1;
}
It gets worse. There is no _mm_srai_epi64 intrinsic/instruction until AVX512, so I had to make my own.
static inline __m128i _mm_srai_epi64(__m128i a, int b) {
    __m128i sra  = _mm_srai_epi32(a, 32);
    __m128i srl  = _mm_srli_epi64(a, 32);
    __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
    __m128i out  = _mm_blendv_epi8(srl, sra, mask);
    return out;     // note: b is ignored; this only handles a shift count of 32
}
My implementation of _mm_srai_epi64 above is incomplete. I think I was using Agner Fog's Vector Class Library. If you look in the file vectori128.h you find:
static inline Vec2q operator >> (Vec2q const & a, int32_t b) {
    // instruction does not exist. Split into 32-bit shifts
    if (b <= 32) {
        __m128i bb   = _mm_cvtsi32_si128(b);            // b
        __m128i sra  = _mm_sra_epi32(a,bb);             // a >> b signed dwords
        __m128i srl  = _mm_srl_epi64(a,bb);             // a >> b unsigned qwords
        __m128i mask = _mm_setr_epi32(0,-1,0,-1);       // mask for signed high part
        return  selectb(mask,sra,srl);
    }
    else {  // b > 32
        __m128i bm32 = _mm_cvtsi32_si128(b-32);         // b - 32
        __m128i sign = _mm_srai_epi32(a,31);            // sign of a
        __m128i sra2 = _mm_sra_epi32(a,bm32);           // a >> (b-32) signed dwords
        __m128i sra3 = _mm_srli_epi64(sra2,32);         // a >> (b-32) >> 32 (second shift unsigned qword)
        __m128i mask = _mm_setr_epi32(0,-1,0,-1);       // mask for high part containing only sign
        return  selectb(mask,sign,sra3);
    }
}
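Translated back to plain intrinsics (SSE4.1 for the blend), the same split-shift idea might look like the sketch below. This is my own illustration, not code from the library, and sra_epi64_var is just a made-up name:

#include <emmintrin.h>
#include <smmintrin.h>

// Sketch: arithmetic right shift of packed 64-bit ints by a runtime count b (0..63),
// split into 32-bit shifts the same way as the VCL operator above.
static inline __m128i sra_epi64_var(__m128i a, int b) {
    __m128i mask = _mm_setr_epi32(0, -1, 0, -1);     // selects the high dword of each qword
    if (b <= 32) {
        __m128i bb  = _mm_cvtsi32_si128(b);
        __m128i sra = _mm_sra_epi32(a, bb);          // signed shift of the dwords
        __m128i srl = _mm_srl_epi64(a, bb);          // unsigned shift of the whole qwords
        return _mm_blendv_epi8(srl, sra, mask);      // high dword from sra, low dword from srl
    } else {
        __m128i bm32 = _mm_cvtsi32_si128(b - 32);
        __m128i sign = _mm_srai_epi32(a, 31);        // each dword replaced by its sign bits
        __m128i sra2 = _mm_sra_epi32(a, bm32);       // a >> (b-32) on signed dwords
        __m128i sra3 = _mm_srli_epi64(sra2, 32);     // then >> 32 as unsigned qwords
        return _mm_blendv_epi8(sra3, sign, mask);    // high dword is pure sign bits
    }
}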