I have the following C routine:
/*
 * Mandelbrot escape-time count for the point c = c_re + i*c_im.
 *
 * Starting from z = 0, repeats z = z*z + c and stops as soon as
 * |z|^2 = re*re + im*im exceeds 4.0, or after N_ITER iterations.
 *
 * Parameters:
 *   c_im   - imaginary part of c (note: passed FIRST)
 *   c_re   - real part of c
 *   N_ITER - iteration limit; values <= 0 yield 0
 *
 * Returns the number of iterations completed before the escape test
 * succeeded, or N_ITER if the point never escaped.
 */
int n_mandelbrot(double c_im, double c_re, int N_ITER)
{
    /* Automatic (not static) storage keeps the function reentrant and
     * thread-safe, and lets the compiler hold these in registers. */
    double re = 0.0, im = 0.0;
    double re2 = 0.0, im2 = 0.0;   /* cached squares of re and im */
    int n;

    for (n = 0; n < N_ITER; n++)
    {
        /* Order matters: im needs the old re, and re needs the squares
         * (re2, im2) of the previous step. */
        im = (re + re) * im + c_im;
        re = re2 - im2 + c_re;

        im2 = im * im;
        re2 = re * re;

        if (re2 + im2 > 4.0)
            break;                 /* escaped: n is NOT incremented */
    }
    return n;
}
I want to rewrite it in assembly, and so far I have managed to write this:
;-----------------------------------------------------------------------
; int n_mandelbrot_fpu_double(double cre, double cim, int N_ITER)
;
; Mandelbrot escape-time count on the x87 FPU (32-bit, stack arguments,
; caller cleans up). Each iteration computes
;       im  = (re+re)*im + cim
;       re  = re2 - im2 + cre
;       im2 = im*im
;       re2 = re*re
; and leaves the loop as soon as re2 + im2 > 4.0.
;
; In:    [esp+4]  = cre    (double)
;        [esp+12] = cim    (double)
;        [esp+20] = N_ITER (signed int)
; Out:   eax = iterations completed before escape, or N_ITER if the
;              point never escaped; 0 when N_ITER <= 0
; Clobb: eax, ecx, edx, flags, x87 condition codes
; FPU:   needs all 8 x87 registers free on entry (7 resident values
;        plus 1 temporary); the register stack is balanced on return
;-----------------------------------------------------------------------
n_mandelbrot_fpu_double:
        mov     edx, dword [esp+20]     ; edx = N_ITER
        xor     ecx, ecx                ; ecx = n = 0
        test    edx, edx
        jle     .return                 ; N_ITER <= 0: no iterations, return 0

        fld     qword [esp+4]           ; cre
        fld     qword [esp+12]          ; cim
        fld1
        fadd    st0, st0                ; 2.0
        fadd    st0, st0                ; 4.0 (escape threshold)
        fldz                            ; re  = 0
        fldz                            ; im  = 0
        fldz                            ; re2 = 0
        fldz                            ; im2 = 0

        ;; Invariant at .iterate (and after every "fstp st(i)" below):
        ;;   st0=im2 st1=re2 st2=im st3=re st4=4.0 st5=cim st6=cre
        ;; While a temporary is on top, every index is shifted by +1.
        ;; "fstp st(i)" overwrites slot i with the temporary and pops,
        ;; which replaces the slower "fxch st(i)" + "fstp st0" pair.
.iterate:
        ;; im = (re+re)*im + cim
        fld     st3                     ; t = re
        fadd    st0, st0                ; t = re + re
        fmul    st3                     ; t *= im   (im is st3 while t is on top)
        fadd    st6                     ; t += cim
        fstp    st3                     ; im = t

        ;; re = re2 - im2 + cre   (squares from the previous step)
        fld     st1                     ; t = re2
        fsub    st1                     ; t -= im2
        fadd    st7                     ; t += cre
        fstp    st4                     ; re = t

        ;; im2 = im*im
        fld     st2                     ; t = im
        fmul    st0, st0
        fstp    st1                     ; im2 = t

        ;; re2 = re*re
        fld     st3                     ; t = re
        fmul    st0, st0
        fstp    st2                     ; re2 = t

        ;; if (re2 + im2 > 4.0) break
        fld     st0                     ; t = im2
        fadd    st2                     ; t += re2
        fcomp   st5                     ; compare t with 4.0, pop t
        fnstsw  ax
        sahf                            ; C0/C2/C3 -> CF/PF/ZF
        ja      .escaped                ; t > 4.0 (not taken for NaN)

        inc     ecx                     ; n++
        cmp     ecx, edx
        jl      .iterate                ; signed compare: while (n < N_ITER)

.escaped:
        fstp    st0                     ; drop im2
        fstp    st0                     ; drop re2
        fstp    st0                     ; drop im
        fstp    st0                     ; drop re
        fstp    st0                     ; drop 4.0
        fstp    st0                     ; drop cim
        fstp    st0                     ; drop cre
.return:
        mov     eax, ecx                ; return n
        ret
With the C routine my program's loop runs in 150 ms, and with the assembly version it dropped to 105 ms, so this is faster (though an unrolled C routine that calculates two pixels in the inner loop takes only 115 ms, and I do not know exactly why, or how to unroll it in asm).
I think this asm code is not efficient. I tried to keep all variables on the FPU stack (before the loop I load 7 doubles onto it: cre, cim, 4.0, re, im, re2, im2), but then there is a lot of loading values onto the top of the stack, exchanging them, and popping them back with fstp, so I think it may not be too efficient.
Could someone help me improve it? (The code outside the inner loop does not matter too much, but the code in the inner loop counts a lot here.)