12

I'm wondering if there really is no 128-bit division intrinsic function in Visual C++?

There is a 64x64=128 bit multiplication intrinsic function called _umul128(), which nicely matches the MUL x64 assembler instruction.

Naturally, I assumed there would be a 128/64=64 bit division intrinsic as well (modelling the DIV instruction), but to my amazement neither Visual C++ nor Intel C++ seem to have it, at least it's not listed in intrin.h.

Can someone confirm that? I tried grep'ing for the function names in the compiler executable files, but couldn't find _umul128 in the first place, so I guess I looked in the wrong spot.

Update: at least I have now found the pattern umul128 (without the leading underscore) in c1.dll of Visual C++ 2010. All the other intrinsics are listed around it, but unfortunately no "udiv128" or the like :( So it seems they really have "forgotten" to implement it.

To clarify: I'm not only looking for a 128-bit data type, but a way to divide a 128-bit scalar int by a 64-bit int in C++. Either an intrinsic function or native 128-bit integer support would solve my problem.

Edit: The answer is no, there is no _udiv128 intrinsic in Visual Studio 2010 up to 2017, but it is available in Visual Studio 2019 RTM

cxxl
  • 4,939
  • 3
  • 31
  • 52
  • 1
    It isn't part of the CRT. It is an intrinsic, comes for free with the processor. But only in 64-bit mode. No freebie for the div until you get a 128-bit processor. Given the ridiculously vast range of pow(2, 128), you should be looking for arbitrary precision library. Plenty of those around. – Hans Passant Dec 10 '11 at 00:17
  • @TreeMonkie: __int18 is not supported by VS, see http://stackoverflow.com/questions/6759592/how-to-enable-int128-on-visual-studio – cxxl Dec 10 '11 at 00:17
  • 4
    @Hans: sorry, I don't understand. It's just NOT an intrinsic, not even in 64 bit mode. And I need it to *write* an arbitrary precision lib :) – cxxl Dec 10 '11 at 00:20
  • 1
    Well, no point in looking for a boxed solution then. You know how to do arbitrary precision math with paper and pencil from elementary school. 128 bits takes a lot of paper but computers have plenty. – Hans Passant Dec 10 '11 at 00:23
  • 1
    @cxxl: I believe that 128 bit int's are not supported directly... however you can use them when using SSE intrinsics. I believe -- but don't quote me on this -- that it is __m128. It's not entirely clear to me from the question whether SSE would be of use in this scenario or not... – Daniel Placek Dec 10 '11 at 09:24
  • Note that if the quotient overflows RAX, `div` and `idiv` raise a `#DE` exception. This makes it dangerous to use unless you check that `high_half < denominator` or something like that. – Peter Cordes Jan 31 '19 at 03:41

5 Answers5

12

If you don't mind little hacks, this may help (64-bit mode only, not tested):

#include <windows.h>
#include <stdio.h>

unsigned char udiv128Data[] =
{
  0x48, 0x89, 0xD0, // mov rax,rdx
  0x48, 0x89, 0xCA, // mov rdx,rcx
  0x49, 0xF7, 0xF0, // div r8
  0x49, 0x89, 0x11, // mov [r9],rdx
  0xC3              // ret
};

unsigned char sdiv128Data[] =
{
  0x48, 0x89, 0xD0, // mov rax,rdx
  0x48, 0x89, 0xCA, // mov rdx,rcx
  0x49, 0xF7, 0xF8, // idiv r8
  0x49, 0x89, 0x11, // mov [r9],rdx
  0xC3              // ret
};

unsigned __int64 (__fastcall *udiv128)(unsigned __int64 numhi,
                                       unsigned __int64 numlo,
                                       unsigned __int64 den,
                                       unsigned __int64* rem) =
  (unsigned __int64 (__fastcall *)(unsigned __int64,
                                   unsigned __int64,
                                   unsigned __int64,
                                   unsigned __int64*))udiv128Data;

__int64 (__fastcall *sdiv128)(__int64 numhi,
                              __int64 numlo,
                              __int64 den,
                              __int64* rem) =
  (__int64 (__fastcall *)(__int64,
                          __int64,
                          __int64,
                          __int64*))sdiv128Data;

int main(void)
{
  DWORD dummy;
  unsigned __int64 ur;
  __int64 sr;
  VirtualProtect(udiv128Data, sizeof(udiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
  VirtualProtect(sdiv128Data, sizeof(sdiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
  printf("0x00000123456789ABCDEF000000000000 / 0x0001000000000000 = 0x%llX\n",
         udiv128(0x00000123456789AB, 0xCDEF000000000000, 0x0001000000000000, &ur));
  printf("-6 / -2 = %lld\n",
         sdiv128(-1, -6, -2, &sr));
  return 0;
}
Alexey Frunze
  • 61,140
  • 12
  • 83
  • 180
  • 4
    For MSVC one might use #pragma section to put these functions to code segment during compilation – Marat Dukhan Dec 18 '11 at 06:31
  • Why can't you use inline assembly? – Sandeep Datta Feb 02 '18 at 15:56
  • @SandeepDatta It didn't use to be supported by the compiler in 64-bit code. Is it supported now? – Alexey Frunze Feb 06 '18 at 06:42
  • 2
    Highly recommend `const unsigned char code[]`; you want it to be `const` so it goes in `.rdata`. I don't know if that's already next to the code section and thus executable, like `.rodata` going into the TEXT segment on Linux/ELF, but it should help. And make the function pointers `const` or static const (or constexpr) so they can (hopefully) be optimized away, instead of compiled to actual memory-indirect calls. **Really no benefit to putting these in arrays vs. a separately-compiled `.asm` file.** Pure downside if the call compiles as an indirect call. – Peter Cordes Jan 31 '19 at 03:32
  • Also, reverse the order of the first 2 args so the high half is already in RDX. (You can write an inline wrapper function that will optimize away, to hide this detail if you want the source to have `hi,lo, den`.) – Peter Cordes Jan 31 '19 at 03:33
  • Also be sure to include a warning that this will FAULT with `#DE` (divide exception) if the quotient overflows a 64-bit register. – Peter Cordes Jan 31 '19 at 03:35
7

A small improvement - one less instruction

extern "C" digit64 udiv128(digit64 low, digit64 hi, digit64 divisor, digit64 *remainder);

; Arguments
; RCX       Low Digit
; RDX       High Digit
; R8        Divisor
; R9        *Remainder

; RAX       Quotient upon return

.code
udiv128 proc
    mov rax, rcx    ; Put the low digit in place (hi is already there)
    div r8      ; 128 bit divide rdx-rax/r8 = rdx remainder, rax quotient
    mov [r9], rdx   ; Save the reminder
    ret     ; Return the quotient
udiv128 endp
end
Dick Bertrand
  • 81
  • 1
  • 1
5

It's available now. You can use _div128 and _udiv128

The _div128 intrinsic divides a 128-bit integer by a 64-bit integer. The return value holds the quotient, and the intrinsic returns the remainder through a pointer parameter. _div128 is Microsoft specific.

Last year it was said to be available from "Dev16" but I'm not sure which version is that. I guess it's VS 16.0 A.K.A VS2019, but the documentation on MSDN shows that it goes further to VS2015

phuclv
  • 37,963
  • 15
  • 156
  • 475
  • According to the documentation it's available in Visual Studio 2019 RTM. I justed tested that it is not yet available in Visual Studio 2017, resp. compiler version 19.16.27030.1. – cxxl May 10 '19 at 09:07
2

I am no expert, but I dug this up:

http://research.swtch.com/2008/01/division-via-multiplication.html

Interesting stuff. Hope it helps.

EDIT: This is insightful too: http://www.gamedev.net/topic/508197-x64-div-intrinsic/

Daniel Placek
  • 765
  • 5
  • 16
  • 1
    It's actually quite a pain. Even if you find the reciprocal + shift needed, you're left having to multiply your 128bit nom with the reciprocal and taking the top 64 bits from the result, which is a serious PITA – yonil May 28 '16 at 17:16
  • 1
    Also I find it hard to believe that whole thing would somehow outperform a DIV/IDIV instruction. – yonil May 28 '16 at 18:01
0

Thanks @alexey-frunze, it worked with little tweak for VS2017, checked with same parameters with VS2019:

#include <iostream>
#include <string.h>
#include <math.h>
#include <immintrin.h>
#define no_init_all
#include <windows.h>

unsigned char udiv128Data[] =
{
    0x48, 0x89, 0xD0, // mov rax,rdx
    0x48, 0x89, 0xCA, // mov rdx,rcx
    0x49, 0xF7, 0xF0, // div r8
    0x49, 0x89, 0x11, // mov [r9],rdx
    0xC3              // ret
};

unsigned char sdiv128Data[] =
{
    0x48, 0x89, 0xD0, // mov rax,rdx
    0x48, 0x89, 0xCA, // mov rdx,rcx
    0x49, 0xF7, 0xF8, // idiv r8
    0x49, 0x89, 0x11, // mov [r9],rdx
    0xC3              // ret
};

unsigned __int64(__fastcall* udiv128)(
    unsigned __int64 numhi,
    unsigned __int64 numlo,
    unsigned __int64 den,
    unsigned __int64* rem) =
    (unsigned __int64(__fastcall*)(
        unsigned __int64,
        unsigned __int64,
        unsigned __int64,
        unsigned __int64*))
        ((unsigned __int64*)udiv128Data);

__int64(__fastcall *sdiv128)(
    __int64 numhi,
    __int64 numlo,
    __int64 den,
    __int64* rem) =
    (__int64(__fastcall *)(
        __int64,
        __int64,
        __int64,
        __int64*))
        ((__int64*)sdiv128Data);

void test1()
{
    unsigned __int64 a = 0x3c95ba9e6a637e7;
    unsigned __int64 b = 0x37e739d13a6d036;
    unsigned __int64 c = 0xa6d036507ecc7a7;
    unsigned __int64 d = 0x7ecc37a70c26e68;
    unsigned __int64 e = 0x6e68ac7e5f15726;

    DWORD dummy;
    VirtualProtect(udiv128Data, sizeof(udiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
    e = udiv128(a, b, c, &d);

    printf("d = %llx, e = %llx\n", d, e);    // d = 1ed37bdf861c50, e = 5cf9ffa49b0ec9aa

}

void test2()
{
    __int64 a = 0x3c95ba9e6a637e7;
    __int64 b = 0x37e739d13a6d036;
    __int64 c = 0xa6d036507ecc7a7;
    __int64 d = 0x7ecc37a70c26e68;
    __int64 e = 0x6e68ac7e5f15726;

    DWORD dummy;
    VirtualProtect(sdiv128Data, sizeof(sdiv128Data), PAGE_EXECUTE_READWRITE, &dummy);
    e = sdiv128(a, b, c, &d);

    printf("d = %llx, e = %llx\n", d, e);    // d = 1ed37bdf861c50, e = 5cf9ffa49b0ec9aa

}

int main()
{
    test1();
    test2();

    return 0;
}
kanha.vishva
  • 73
  • 2
  • 4