1

[EDITED] After running the code using the gdb debugger, movapd really wasn't the issue (thanks to everyone who pointed that out). Going line by line, the fault is in the second comparison being made between xmm7 and xmm0 registers. Flow control looks like this:

everything up to checking which radius is larger is fine;

comisd xmm1, xmm3 compares accordingly
(p $xmm1.v2_double $1 = {10, 0} p $xmm3.v2_double $2 = {2, 0})

concludes that xmm1 is greater and jumps to the first label

from there things go wrong when comisd somehow deduces that xmm7, is greater than xmm1

(p $xmm7.v2_double $3 = {1.4142135623730951, 1}

p $xmm1.v2_double $4 = {10, 0} )

(Also I'm pretty sure that the values in the curly brackets of the print command in gdb are in reversed order. What I mean is in xmm1 10 is actually in the lower quadword and zero in the higher, so I couldn't make sense of how is 1.41 greater than 10)

This is the assembly code

#extern int circles(int n, double* cr);

.intel_syntax   noprefix

.data

three:      .int 3

.text

.global circles

circles:

        enter   0, 0


        mov     rax, 1
        cpuid
        test    rdx, 0x2000000
        jz      notSupported

        mov     rbx, rsp
        and     rsp, 0xfffffffffffffff0
        sub     rsp, 512
        fxsave  [rsp]



        xor     r10, r10    #pair counter
        xor     r8, r8      #outter loop counter
        xor     r9, r9      #inner loop counter
        mov     rax, rdi
        mul     dword ptr three
        sub     rax, 3
        #rax value 3n-3 since our step is length of 3

        pivotCircle:

        cmp     r8, rax
        je      done

        #this is the line where it goes wrong 
        movupd  xmm0, [rsi + 8*r8]          #first circle center
        movsd   xmm1, [rsi + 8*r8 + 16]     #radius first circle

        mov     r9, r8

        nextCircle:
        add     r9, 3

        cmp     r9, rax
        jg      nextPivot

        movupd  xmm2, [rsi + 8*r9]          #second circle center
        movsd   xmm3, [rsi + 8*r9 + 16]     #second circle radius

        #calculating distance between the centers

        movapd  xmm7, xmm0
        subpd   xmm7, xmm2
        mulpd   xmm7, xmm7
        movapd  xmm2, xmm7
        shufpd  xmm2, xmm2, 0b11
        addsd   xmm7, xmm2
        sqrtsd  xmm7, xmm7                  # |c1 - c2| in xmm7



        #checking which radius is bigger

        comisd  xmm1, xmm3      

        jge     first
        jmp     second

        first:                  #first one greater

        comisd  xmm7, xmm1
        jge     nextCircle
        movsd   xmm6, xmm1
        subsd   xmm6, xmm7
        comisd  xmm3, xmm6
        jg      nextCircle

        jmp     found

        second:                 #second one greater

        comisd  xmm7, xmm3
        jge     nextCircle
        movsd   xmm6, xmm3
        subsd   xmm6, xmm7
        comisd  xmm1, xmm6
        jg      nextCircle

        found:

        inc     r10
        jmp     nextCircle

        nextPivot:
        add     r8, 3
        jmp pivotCircle

        done:

        fxrstor [rsp]
        mov     rsp, rbx

        mov     rax, r10
        leave
        ret


        notSupported:

        mov     rdi, 1
        call    exit

This is the main.c file. Elements in the array come in groups of three (a, b, radius).

#include    <stdio.h>
#include    <stdlib.h>
#include    <assert.h>


extern int circles(int n, double* cr);


int main(int argc, char const *argv[])
{
    int n;

    double* cr;

    scanf("%d", &n);
    assert(n > 0);

    cr = malloc(n * sizeof(double) * 3);
    assert(cr != NULL);

    for (int i = 0; i < n; i++)
    {
        scanf("%lf%lf%lf", cr+i*3, cr+i*3+1, cr+i*3+2);
        assert(*(cr+3*i+2) > 0);
    }

    printf("%d\n", circles(n, cr));


    return 0;
}
monolith937
  • 429
  • 1
  • 4
  • 13
  • Works fine here. Learn to use a debugger. Also make sure you enter at least 2 circles or else you will bypass that `movupd`. – Jester Aug 11 '18 at 17:15
  • I really should. But even so, how could that be ? `2 1.0 1.0 5.0 1.0 1.0 1.0 0` This is my output for 2 circles – monolith937 Aug 11 '18 at 17:31
  • That is the output, yes. But the `movupd xmm0` works (I have checked in a debugger). So your problem is elsewhere. – Jester Aug 11 '18 at 17:37
  • So, `xmm0` really does get the first point of the first circle ? – monolith937 Aug 11 '18 at 17:46
  • 1
    why do you even ask, just check it finally. (I mean check it for real, like in debugger, not by guessing and some debug logs and watching side-effects somewhere else) – Ped7g Aug 11 '18 at 17:49
  • I did and it doesn't. Deleted most of the code, just to use `movq rax, xmm0` just to see if it was zero and it was – monolith937 Aug 11 '18 at 17:50
  • 4
    `(gdb) p $xmm0.v2_double $2 = {1, 1}` Note that printing that with `%d` will give you zero. That's because the low 32 bits of `1.0` as a double are zero. – Jester Aug 11 '18 at 17:52
  • All right, thank you for clearing that up. I'll get on a debugger soon and find the issue – monolith937 Aug 11 '18 at 18:01
  • All right, thank you for clearing that up. I'll get on a debugger soon and find the issue – monolith937 Aug 11 '18 at 18:01
  • if really desperate, you can in debugger first set `xmm0` to some gibberish, right ahead of the `movupd`, then you would see if the value did change or not. Also using less trivial values for debugging, and not the same ones, like {1.23, 4.56} is often good practice for debugging/testing, becase from {1.23, 1.23} you will not catch accidental swap of arguments, etc... – Ped7g Aug 11 '18 at 18:08
  • SSE2 is baseline for x86-64; you don't need to check for it with `cpuid`. That's quite slow and *not* something you want to do on every call anyway. (If you were using AVX or something else that wasn't baseline, you'd want to check once and save the result for future calls, not check every time.) **You don't need to [`fxsave`](http://felixcloutier.com/x86/FXSAVE.html) / `fxrstor` either**. All xmm regs are call-clobbered on Linux/OS X, but Windows has call-preserved xmm6..15. It's much cheaper to save/restore xmm6 and 7 manually than to save/restore the whole x87 and SSE (not AVX) state. – Peter Cordes Aug 12 '18 at 01:06
  • See https://msdn.microsoft.com/en-us/library/9z1stfyw.aspx for volatile / non-volatile registers in Windows x64, in case that's what you're using. BTW, `enter 0,0` is very slow, too. Never use it. (`leave` is fine, but not `enter`). See https://agner.org/optimize/ and other links in https://stackoverflow.com/tags/x86/info. – Peter Cordes Aug 12 '18 at 01:10
  • 2
    `comisd` sets flags identically to `fcomi`. Use unsigned compares like `ja`, not `jg`, to branch on the result, for historical reasons (the arrangement of bits in the x86 status word vs. the order of bits in FLAGS). – Peter Cordes Aug 12 '18 at 11:46
  • Yep, that fixed it. So should I generally avoid using `jg` ? Is there ever a good time for it ? – monolith937 Aug 12 '18 at 11:50
  • 1
    @monolith937 Check out how `comisd` sets the comparison flags by looking at the documentation. Then check what comparison flags the various `jXXX` instructions check. Use a comparison instruction that checks the desired condition. The mnemonic is a read herring. – fuz Aug 12 '18 at 12:10
  • @monolith937: `jg` is useful for signed *integer* comparisons, e.g. after `cmp` or `test`, or `dec`, or `sub`. – Peter Cordes Aug 12 '18 at 18:03

0 Answers0