; ----------------------------------------------------------------------
; Syntax : NASM, 32-bit x86 (SSE/SSE2 + x87).
; Calls  : printf, with arguments pushed on the stack and removed by the
;          caller (see printh).
;
; NOTE(review): this listing had been collapsed onto a few physical
; lines, which turned everything after the first ';' of each line into
; comment text.  It is restored here to one statement per line.  Two
; stretches of the original text are missing and are marked
; "SOURCE GAP" below; nothing has been invented to fill them, so the
; %if/%endif nesting and some labels (prp1, jl1) stay unresolved until
; the missing text is recovered.
; ----------------------------------------------------------------------

extern printf

%define ACCL1
%define ACCL2

%ifdef ACCL1
%define ACCL0
%endif

%ifdef ACCL2
%define ACCL0
%endif

; NOTE(review): the next directive is reproduced verbatim; it looks like
; two output-format variants fused together -- confirm against the
; original file.
section .data data class=data progbits alloc noexec write align=16

section .data                           ; ALL DATA STRUCTURES

xrd:    dd      1.0                     ; scratch for rand: 1.0 | random mantissa

; ---------------------------------------------------------------------
; SOURCE GAP: the rest of the data section and the beginning of the
; propagation routine are missing from this view.  The matching %ifdef
; for the first %endif below is in the missing text.
; ---------------------------------------------------------------------

        movss   xmm6,[sca]
        shufps  xmm6,xmm6,0x00          ; sca
        movaps  xmm2,xmm1
        subss   xmm2,xmm2               ; ri (lane 0 cleared)
        mulps   xmm6,xmm0
        addps   xmm6,xmm2               ; rf
        movaps  xmm7,xmm2
        maxps   xmm7,xmm6               ; xmm7 = per-lane max(ri,rf)
        minps   xmm6,xmm2               ; xmm6 = per-lane min(ri,rf)
        movss   xmm2,[omr]
        shufps  xmm2,xmm2,0x00          ; omr
        subps   xmm6,xmm2
        addps   xmm7,xmm2               ; <-- accelerate --
%endif

%ifdef ACCL2
        movaps  xmm2,[rmin]             ; -- photon box -->
        movaps  xmm3,xmm6               ; box min
        movaps  xmm4,xmm7               ; box max
        subps   xmm3,xmm2
        subps   xmm4,xmm2
        movaps  xmm2,[rrst]
        mulps   xmm3,xmm2
        mulps   xmm4,xmm2
        maxps   xmm3,[pmin]
        minps   xmm4,[pmax]
        minps   xmm3,[qmax]
        maxps   xmm4,[qmin]
        cvtps2dq xmm3,xmm3              ; sse2 convert
        cvtps2dq xmm4,xmm4              ; float to int
        movaps  [sminr],xmm3
        movaps  [smaxr],xmm4

        imul    esi,[smaxx],xnum        ; pre-scale cell bounds by the strides
        mov     [smaxx],esi
        imul    edi,[smaxy],ynum
        mov     [smaxy],edi
        imul    esi,[sminx],xnum
        mov     [sminx],esi
        imul    edi,[sminy],ynum
        mov     [sminy],edi

        mov     ebp,[sminz]
        mov     esi,[sminx]
xin:    cmp     [smaxx],esi             ; loop in x cells
        js      near xout
        mov     edi,[sminy]
yin:    cmp     [smaxy],edi             ; loop in y cells
        js      near yout
        mov     ebp,[sminz]
zin:    cmp     [smaxz],ebp             ; loop in z cells
        js      near zout

        mov     eax,esi
        add     eax,edi
        add     eax,ebp                 ; eax = x*xnum + y*ynum + z
        xor     ecx,ecx
        mov     cx,[arr+2*eax]          ; ecx = offset into els for this cell
next:   xor     eax,eax
        mov     ax,[els+ecx]
        cmp     ax,0
        js      near none               ; negative entry ends the cell's list
        shl     eax,4                   ; 16 bytes per geo record
        lea     eax,[geo+eax]           ; <-- photon box --
%else
        mov     ecx,lgeo
sph3:   lea     eax,[geo-om_size+ecx]
%endif

        movaps  xmm2,[eax]              ; -- sphere -->
        subss   xmm2,xmm2               ; om position (lane 0 cleared)

%ifdef ACCL1
        ; cmpleps writes all-ones (a NaN bit pattern) in lanes where the
        ; comparison holds.  comiss against such a lane is unordered and
        ; sets PF, so jnp leaves as soon as one tested lane failed.
        ; Each shufps 0x39 rotates the next lane into lane 0, so lanes
        ; 1, 2 and 3 are tested in turn.
        movaps  xmm3,xmm6               ; -- accelerate -->
        cmpleps xmm3,xmm2               ; min r =< om r
        movaps  xmm4,xmm2
        cmpleps xmm4,xmm7               ; om r =< max r
        andps   xmm3,xmm4
        shufps  xmm3,xmm3,0x39
        comiss  xmm3,xmm0
        jnp     near sph2
        shufps  xmm3,xmm3,0x39
        comiss  xmm3,xmm0
        jnp     near sph2
        shufps  xmm3,xmm3,0x39
        comiss  xmm3,xmm0
        jnp     near sph2               ; <-- accelerate --
%endif

        subps   xmm2,xmm1               ; dr=dom-r
        movaps  xmm3,xmm2
        mulps   xmm2,xmm0               ; dr*n
        mulps   xmm3,xmm3               ; dr*dr

        shufps  xmm2,xmm2,0x39          ; calculate b
        movss   xmm4,xmm2               ; x
        shufps  xmm2,xmm2,0x39
        addss   xmm4,xmm2               ; x+y
        shufps  xmm2,xmm2,0x39
        addss   xmm4,xmm2               ; b=x+y+z
        shufps  xmm4,xmm0,0x00          ; 00bb

        shufps  xmm3,xmm3,0x39          ; calculate c
        movss   xmm4,xmm3               ; x
        shufps  xmm3,xmm3,0x39
        addss   xmm4,xmm3               ; x+y
        shufps  xmm3,xmm3,0x39
        addss   xmm4,xmm3               ; x+y+z

        ; movlps writes 8 bytes: c at [cc] and b in the next dword.
        ; NOTE(review): presumably bc is declared right after cc -- confirm
        ; in the data section.
        movlps  [cc],xmm4               ; 00bc
        fld     dword [cc]
        fld     dword [omr]
        fmul    st0
        fsubp   st1                     ; C=c-omr^2
        fld     dword [bc]
        fmul    st0
        fsubrp  st1                     ; D=b^2-C
        fldz
        fcomip  st1
        jnc     sph1                    ; D =< 0: no intersection
        fsqrt
        fld     dword [bc]
        fsubrp  st1                     ; b-sqrt(D)
        fldz
        fcomip  st1
        jnc     sph1                    ; distance =< 0: skip
        fcomi   st1                     ; compare with the value below it on the stack
        jnc     sph1                    ; not smaller: keep the old value
        fxch    st1                     ; new distance survives the pop below
        mov     ebx,[eax]               ; detected!
sph1:   fstp    st0                     ; <-- sphere --
sph2:

%ifdef ACCL2
        add     ecx,2
        jmp     next
none:   inc     ebp                     ; -- photon box -->
        jmp     zin
zout:   add     edi,ynum
        jmp     yin
yout:   add     esi,xnum
        jmp     xin
xout:                                   ; <-- photon box --
%else
        sub     ecx,om_size
        jnz     sph3
%endif

        fstp    dword [sca]
        movaps  xmm4,xmm0               ; -- advance --
        addss   xmm4,xmm5               ; 1/cm-n vector
        movss   xmm3,[sca]
        shufps  xmm3,xmm3,0x00
        mulps   xmm4,xmm3
        addps   xmm1,xmm4
        movaps  [rt],xmm1
        cmp     ebx,0
        jz      prp4
        ret                             ; done

prp4:   fld     dword [xx]
        fcomip  st1
        jnc     prp3
        call    rotxrd
        movaps  [nt],xmm0
        jmp     prp1                    ; prp1 lies in the missing text above
prp3:   fstp    st0
        ret                             ; <-- prop --

;-----------------------------------------------------------------------
; rotxrd -- MAIN ROTATION
; Draws one random number via rand, derives a cosine from it and [g],
; clamps it to [-1,+1], stores it in [zcos] and sqrt(1-cos^2) in [zsin],
; then falls through into rotate.
; In   : [g]; xmm0 = n (used by rotate)
; Out  : [zcos], [zsin]; xmm0 = new n (from rotate)
; x87  : net stack depth unchanged on every path (see fix below)
;-----------------------------------------------------------------------
rotxrd:                                 ; MAIN ROTATION
        fld     dword [g]               ; g
        call    rand
        fadd    st0                     ; 2*xi
        fld1
        fsubp   st1                     ; xi=2*xi-1
        fldz
        fcomip  st2                     ; 0 vs g
        jnz     .hg
        ; g == 0: the rescaled xi is used as the cosine directly.  Drop g
        ; here; the original jumped to rot1 with g still on the x87 stack,
        ; leaking one register per call on this path.
        fstp    st1
        jmp     rot1
.hg:
        fmul    st1                     ; g*xi
        fld1
        faddp   st1                     ; 1+g*xi
        fld     st1
        fmul    st0                     ; g^2
        fld1
        fsub    st1                     ; 1-g^2
        fdiv    st2                     ; ga=(1-g^2)/(1+g*xi)
        fmul    st0                     ; ga^2
        fld1
        faddp   st2                     ; 1+g^2
        fsubp   st1                     ; 1+g^2-ga^2
        fld     st2
        faddp   st3                     ; 2*g
        fdivrp  st2                     ; (1+g^2-ga^2)/(2*g)
        fstp    st0
rot1:   fld1
        fcomip  st1                     ; compare with +1
        jnc     rot2
        fstp    st0
        fld1                            ; clamp to +1
        jmp     rot3
rot2:   fld1
        fchs
        fcomip  st1                     ; compare with -1
        jc      rot3
        fstp    st0
        fld1
        fchs                            ; clamp to -1
rot3:   fld1
        fld     st1
        fmul    st0
        fsubp   st1
        fsqrt                           ; sqrt(1-cos^2)
        fstp    dword [zsin]
        fstp    dword [zcos]            ; <-- rotxrd --

;-----------------------------------------------------------------------
; rotate -- AUXILIARY ROTATION
; Builds two vectors p1/p2 from the components of n (xmm0), normalises
; their difference and sum (a1, a2), mixes them with the sine/cosine of a
; random angle, combines the result with n using [zcos]/[zsin], and
; renormalises n with rsqrtss.
; In   : xmm0 = n; [zcos], [zsin]
; Out  : xmm0 = new n
; Uses : xmm2-xmm4, xmm6, xmm7, x87 stack (balanced in the visible part),
;        [xcos], [xsin]
;-----------------------------------------------------------------------
rotate: call    rand                    ; AUXILIARY ROTATION
        movaps  xmm7,xmm0
        mulps   xmm7,xmm7               ; n^2
        movaps  xmm2,xmm7
        shufps  xmm2,xmm2,0xe5          ; zyxx x
        movaps  xmm3,xmm7
        shufps  xmm3,xmm3,0xe6          ; zyxy y
        movaps  xmm4,xmm7
        shufps  xmm4,xmm4,0xe7          ; zyxz z
        comiss  xmm2,xmm3
        jc      jl1                     ; if x

; ---------------------------------------------------------------------
; SOURCE GAP: the rest of the component comparison, the jl1 label and
; the instructions leading up to the shuffle below are missing from
; this view.
; ---------------------------------------------------------------------

        shufps  xmm3,xmm3,0x10          ; 0x00
        movaps  xmm4,xmm0
        shufps  xmm4,xmm4,0x08          ; 00y0
        subps   xmm3,xmm4               ; 0xY0
        movaps  xmm6,xmm7
        shufps  xmm6,xmm6,0x00          ; x+y
        mulps   xmm6,xmm3               ; p1
        movaps  xmm3,xmm0               ; <-->
        shufps  xmm3,xmm3,0x40          ; x000
        movaps  xmm4,xmm0
        shufps  xmm4,xmm4,0x0c          ; 00z0
        subps   xmm3,xmm4               ; x0Z0
        shufps  xmm7,xmm7,0x55          ; x+z
        mulps   xmm7,xmm3               ; p2
        jmp     jl2

cm1:    movaps  xmm7,xmm3
        shufps  xmm7,xmm7,0x1b          ; yxyz
        addps   xmm7,xmm3
        rsqrtps xmm7,xmm7
        movaps  xmm4,xmm0               ; <-->
        shufps  xmm4,xmm4,0x80          ; y000
        movaps  xmm2,xmm0
        shufps  xmm2,xmm2,0x30          ; 0z00
        subps   xmm4,xmm2               ; yZ00
        movaps  xmm6,xmm7
        shufps  xmm6,xmm6,0x00          ; x+z
        mulps   xmm6,xmm4               ; p1
        movaps  xmm4,xmm0               ; <-->
        shufps  xmm4,xmm4,0x08          ; 00y0
        movaps  xmm2,xmm0
        shufps  xmm2,xmm2,0x10          ; 0x00
        subps   xmm4,xmm2               ; 0Xy0
        shufps  xmm7,xmm7,0x55          ; x+y
        mulps   xmm7,xmm4               ; p2
        jmp     jl2

cm2:    movaps  xmm7,xmm4
        shufps  xmm7,xmm7,0x4e          ; xzzy
        addps   xmm7,xmm4
        rsqrtps xmm7,xmm7
        movaps  xmm2,xmm0               ; <-->
        shufps  xmm2,xmm2,0x0c          ; 00z0
        movaps  xmm3,xmm0
        shufps  xmm3,xmm3,0x40          ; x000
        subps   xmm2,xmm3               ; X0z0
        movaps  xmm6,xmm7
        shufps  xmm6,xmm6,0x55          ; x+z
        mulps   xmm6,xmm2               ; p1
        movaps  xmm2,xmm0               ; <-->
        shufps  xmm2,xmm2,0x30          ; 0z00
        movaps  xmm3,xmm0
        shufps  xmm3,xmm3,0x80          ; y000
        subps   xmm2,xmm3               ; Yz00
        shufps  xmm7,xmm7,0x00          ; y+z
        mulps   xmm7,xmm2               ; p2

jl2:    movaps  xmm3,xmm6
        subps   xmm3,xmm7               ; p1-p2
        addps   xmm7,xmm6               ; p1+p2
        movaps  xmm4,xmm3
        mulps   xmm4,xmm4               ; (p1-p2)^2
        movaps  xmm6,xmm7
        mulps   xmm6,xmm6               ; (p1+p2)^2
        movaps  xmm2,xmm4
        shufps  xmm2,xmm6,0x11          ; 0101
        shufps  xmm4,xmm6,0xee          ; 3232
        addps   xmm4,xmm2               ; z x+y z x+y
        movaps  xmm2,xmm4
        shufps  xmm2,xmm2,0xb1
        addps   xmm4,xmm2               ; x+y+z
        rsqrtps xmm4,xmm4
        movaps  xmm6,xmm4
        shufps  xmm4,xmm4,0x00          ; 0000
        shufps  xmm6,xmm6,0xaa          ; 2222
        mulps   xmm3,xmm4               ; a1
        mulps   xmm7,xmm6               ; a2

        fldpi
        fadd    st0                     ; 2*pi
        fmulp   st1                     ; angle = 2*pi*xi (xi pushed by rand)
        fsincos                         ; st0 = cos, st1 = sin
        fstp    dword [xcos]
        fstp    dword [xsin]
        ; movlps reads 8 bytes starting at [xcos].
        ; NOTE(review): presumably xsin is declared right after xcos --
        ; confirm in the data section.
        movlps  xmm4,[xcos]
        movaps  xmm6,xmm4
        shufps  xmm6,xmm6,0x00          ; cos
        shufps  xmm4,xmm4,0x55          ; sin
        mulps   xmm6,xmm3
        mulps   xmm4,xmm7
        addps   xmm4,xmm6               ; u

        ; Same 8-byte load for the zcos pair.
        ; NOTE(review): presumably zsin follows zcos -- confirm.
        movlps  xmm2,[zcos]
        movaps  xmm3,xmm2
        shufps  xmm3,xmm3,0x00          ; cos
        shufps  xmm2,xmm2,0x55          ; sin
        mulps   xmm4,xmm2
        mulps   xmm0,xmm3
        addps   xmm0,xmm4               ; new n

        movaps  xmm7,xmm0               ; renormalise n over lanes 1..3
        mulps   xmm7,xmm7
        shufps  xmm7,xmm7,0x39          ; rotate
        movss   xmm2,xmm7
        shufps  xmm7,xmm7,0x39          ; rotate
        movss   xmm3,xmm7
        shufps  xmm7,xmm7,0x39          ; rotate
        movss   xmm4,xmm7
        addss   xmm2,xmm3
        addss   xmm2,xmm4
        rsqrtss xmm2,xmm2               ; approximate 1/sqrt
        shufps  xmm2,xmm2,0x00
        mulps   xmm0,xmm2               ; new n
jl3:    ret                             ; <-- rotate --

;-----------------------------------------------------------------------
; rand -- RANDOM NUMBER GENERATOR
; Advances the state as edx:eax = [rnd]*[mtp] + [cry]; the low half
; becomes the new [rnd], the high half the new [cry].  The value shifted
; right by 9 (23 bits) is OR-ed into the bit pattern of 1.0 and 1.0 is
; subtracted.  A zero 23-bit value is rejected and the generator is
; stepped again.
; In   : [rnd], [cry], [mtp]
; Out  : st0 = random float (one value pushed on the x87 stack)
; Clobb: eax, edx, flags, [xrd]
;-----------------------------------------------------------------------
rand:   mov     eax,[rnd]               ; RANDOM NUMBER GENERATOR
        mul     dword [mtp]
        add     eax,[cry]
        adc     edx,0
        mov     [cry],edx
        mov     [rnd],eax
        shr     eax,9
        cmp     eax,0
        jz      rand                    ; reject zero, draw again
        or      eax,0x3f800000          ; or 1.0
        mov     [xrd],eax
        fld     dword [xrd]
        fld1
        fsubp   st1                     ; [xrd] - 1.0
        ret                             ; <-- rand --

;-----------------------------------------------------------------------
; printh -- PRINT HIT INFO
; Calls printf(outh, (ebx>>8)&0xff, ebx&0xff, double(xmm1 lane 0),
;              double(xmm0 lane 3)).
; Floats are widened to double through the x87 unit using [tmp] as an
; 8-byte scratch area.  Seven dwords (28 bytes) are pushed and removed
; after the call.
; In   : ebx, xmm0, xmm1
; Clobb: eax, ebx, xmm7, [tmp], plus whatever printf clobbers
;-----------------------------------------------------------------------
printh: movaps  xmm7,xmm0               ; PRINT HIT INFO
        shufps  xmm7,xmm7,0x93          ; bring lane 3 of xmm0 into lane 0
        movss   [tmp],xmm7
        fld     dword [tmp]
        fstp    qword [tmp]             ; float -> double
        push    dword [tmp+4]
        push    dword [tmp]
        movss   [tmp],xmm1
        fld     dword [tmp]
        fstp    qword [tmp]             ; float -> double
        push    dword [tmp+4]
        push    dword [tmp]
        mov     eax,ebx
        and     eax,0xff
        push    eax
        shr     ebx,8
        and     ebx,0xff
        push    ebx
        push    dword outh
        call    printf
        add     esp,28
        ret                             ; <-- printh --