x87 - Issue in raymarcher

Question

Below is my raymarcher, mostly adapted from https://www.shadertoy.com/view/llt3R4#
The expected behavior is a projection of a sphere on the screen, but instead it projects empty space (signified by a yellow pixel, instead of a purple one).

This is my first program written using the x87 coprocessor, so I am unsure if there is anything I'm missing. A few programmers I know have looked it over without finding a problem, so I am unsure of what to do.

macro bareintro {
    org 0x7c00
    jmp 0x0:boot
    
    boot:
    
    xor ax, ax
    mov ds, ax
    mov ss, ax
    
}

macro zerob b {
    times b db 0
}


bareintro

mov ax, 0x13
int 0x10
mov ax, 0xA000
mov es, ax
xor di, di

y_loop:
    mov word [ix], 0
    mov bx, [iy]
    inc bx
    cmp bx, 200
    je y_end
    mov [iy], bx
x_loop:
; first calculate the raymarching direction:
; vec3 rayDirection(float fieldOfView, vec2 size, vec2 fragCoord) {
;    vec2 xy = fragCoord - size / 2.0;
;    float z = size.y / tan(radians(fieldOfView) / 2.0);
;    return normalize(vec3(xy, -z));
; }
    fld qword [h]
    fld qword [z_diri]
    fmulp
    fld qword [y_diri]
    fild word [iy]
    faddp
    fld qword [x_diri]
    fild word [ix]
    faddp

; finally, normalize vector
    ; do a couple dupes
    fld st2
    fld st2
    fld st2
    call len
    fdiv  st1, st0
    fdiv  st2, st0
    fdivp st3, st0

; store in respective memory locations
    fstp qword [dir_x]
    fstp qword [dir_y]
    fstp qword [dir_z]
    call sdts

    setc al
    add al, 13
    stosb

    inc word [ix]
    cmp word [ix], 319
    ja y_loop
    jmp x_loop

y_end:
    cli
    hlt

; int intersect(float rayx, float rayy, float rayz) {
;   float depth = MIN_DIST;
;   for(int i = 0; i < STEPS; i++) {
;       float vx = rayx * depth;
;       float vy = rayy * depth;
;       float vz = rayz * depth + 5.0;
;       float dist = sdf(vx, vy, vz);
;       if(dist < EPS) {
;           return 1;
;       }
;       depth += dist;
;       if(depth >= MAX_DIST) {
;           return 0;
;       }
;   }
;   return 0;
;}

sdts:
    fldz
    fstp qword [.depth]
    mov cx, 255
    .loop:


;   float dist = sceneSDF(eye + depth * marchingDirection);

    ; load in direction vector
    fld qword [dir_z]
    fld qword [dir_y]
    fld qword [dir_x]

    fld qword [.depth]

    fmul st1, st0
    fmul st2, st0
    fmulp st3, st0

    fld qword [eyez]
    fld qword [eyey]
    fld qword [eyex]
    faddp st3, st0
    faddp st3, st0
    faddp st3, st0

    .sdf:
        ; single sphere for now (len(p) - radius)
        call len

        ; - radius
        fld qword [radius]

        fsubp st1, st0

    .endsdf:

        ; EPS <=> Dist
        fld qword [eps]
        fcomip st0, st1
        jb .nohit
            ; cowabunga!
            clc
            ret
        .nohit:

        fld qword [.depth]
        ; MAX <=> Total
        faddp
        fld qword [.max_depth]
        fcomip st0, st1
        fstp qword[.depth]
        ja .alsofine
            stc
            ret
        .alsofine:
        loop .loop
    .endloop:
    stc
    ret

    .max_depth:
        dq 100.0
    .depth:
        dq 0.0

; st0 <- sqrt(s0^2 + s1^2 + s2^2)
len:
    fmul st0, st0
    fxch st1
    fmul st0, st0
    faddp st1, st0
    fxch st1
    fmul st0, st0
    faddp st1, st0
    fsqrt
    ret


x_diri:
    dq -160.000
y_diri:
    dq -100.000
z_diri:
    dq 482.8427
dir_x:
    dq -160.000
dir_y:
    dq -100.000
dir_z:
    dq -2.4142135623911343
w:
    dq 320.0
h:
    dq 200.0
radius:
    dq 1.0
eps:
    dq 0.001
eyex:
    dq 0.0
eyey:
    dq 0.0
eyez:
    dq 5.0

ix:
    dw 0
iy:
    dw 0


zerob 510 - ($-$$)
dw 0xaa55

(Here is a working version of the code in C)

#include <math.h>
#include <SDL2/SDL.h>

const int STEPS = 256;
const float MIN_DIST = 0.0;
const float MAX_DIST = 100.0;
const float EPS = 0.001;

float sdf(float x, float y, float z) {
    float len = sqrt(x*x + y*y + z*z);

    return len - 1.0;
}

int intersect(float rayx, float rayy, float rayz) {
    float depth = MIN_DIST;
    for(int i = 0; i < STEPS; i++) {
        float vx = rayx * depth;
        float vy = rayy * depth;
        float vz = rayz * depth + 5.0;
        float dist = sdf(vx, vy, vz);

        if(dist < EPS) {
            return 1;
        }

        depth += dist;

        if(depth >= MAX_DIST) {
            return 0;
        }
    }

    return 0;
}



int main() {
    SDL_Init(SDL_INIT_EVERYTHING);

    uint32_t buff[320*200];
    
    SDL_Window* win;
    SDL_Renderer* ren;
    SDL_CreateWindowAndRenderer(320, 200, 0, &win, &ren);
    

    SDL_Texture* tex = SDL_CreateTexture(ren, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STATIC,
        320, 200);


    for(size_t y = 0; y < 200; y++) {
        for(size_t x = 0; x < 320; x++) {
            float r_z = -200.0 / 0.41421356237;
            float r_y = ((float)y) - 100.0;
            float r_x = ((float)x) - 160.0;
            float len = sqrt(r_x*r_x + r_y*r_y + r_z*r_z);
            r_x /= len;
            r_y /= len;
            r_z /= len;

            buff[y*320 + x] = (-1) * intersect(r_x, r_y, r_z);
        }
    }

    SDL_UpdateTexture(tex, NULL, buff, 320*4);
    SDL_RenderCopy(ren, tex, NULL, NULL);
    SDL_RenderPresent(ren);


    SDL_Event e;
    while(1) {
        SDL_WaitEvent(&e);
        if(e.type == SDL_QUIT) {
            SDL_Quit();
            return 0;
        }
    }

    return 0;
}

Wouldn't be surprised if you're overflowing the 8-register x87 stack; you do a lot of `fld` early on. You might not be doing enough pops to balance them out. Single-step your code in a debugger; it should show you how many x87 regs are in use, and if you get a NaN-indefinite from an `fld` when memory is fine then that's your program. (Also note you can do stuff like `fmul qword [z_diri]` instead of `fld` / `fmulp`, but that's early on in your loop so that wouldn't help reduce the max stack depth you need, just help performance in general. Although of course x87 is mostly obsolete anyway...) — Peter Cordes, Feb 06 '21 at 23:40
SSE2 has a flat register file (xmm0..7) that's generally easier to use and keep track of what's where. (Although in freestanding code like this you'd need to [manually enable the control registers](https://stackoverflow.com/questions/31563078/how-do-i-enable-sse-for-my-freestanding-bootable-code) for SSE to not fault. And if this is for a "demo", you might need the code-size advantage of x87.) — Peter Cordes, Feb 07 '21 at 00:16
@PeterCordes I'll have to recheck the register stack, thanks! Also I figure I'm going to stick with x87, especially since I'm not sure if you can use sse from real mode (I could be wrong!), and it would allow at least some backwards compat and some size relief — Qh4os, Feb 07 '21 at 01:00
You can use SSE in real mode. What you *can't* use in real-mode is AVX, or other instructions that need to be encoded with VEX prefixes. (Like [BMI1 `andn`](https://www.felixcloutier.com/x86/andn) - the manual points out that it can't be used in real mode or virtual-8086 mode. But apparently 16-bit sub-mode of protected or long mode is fine.) VEX prefixes overlap with invalid encodings of LDS/LES (except in 64-bit mode where LES/LDS are never valid), but apparently some legacy 16-bit code used those as intentional traps or something like that. — Peter Cordes, Feb 07 '21 at 01:34

x87 - Issue in raymarcher

0 Answers0