#ifdef XCPU #define __device__ #define __global__ #define __constant__ #define rsqrtf 1/sqrtf #define __float2int_rn (int)lroundf #define __float2int_ru (int)ceilf #define __float2int_rd (int)floorf struct int2{ int x, y; }; struct uint4{ unsigned int x, y, z, w; }; struct float2{ float x, y; }; struct float3:float2{ float z; }; struct float4:float3{ float w; }; float int_as_float(unsigned int x){ union{ unsigned int i; float f; }; i=x; return f; } int atomicAdd(int * i, int j){ int k=*i; *i+=j; return k; } struct ThreadIdx{ int x; } threadIdx; struct BlockDim{ int x; } blockDim; struct BlockIdx{ int x; } blockIdx; struct GridDim{ int x; } gridDim; unsigned int seed=0; #endif __device__ float xrnd(uint4 & s){ unsigned int tmp; unsigned long long sda; do{ sda = s.z * (unsigned long long) s.x + s.y; s.x=sda; s.y=sda >> 32; tmp = s.x >> 9; } while(tmp==0); tmp |= 0x3f800000; return int_as_float(tmp)-1.0f; } __device__ void rotate(float & cs, float & si, float4 & n, uint4 & s){ float3 u; { float p1x, p1y, p1z, p2x, p2y, p2z; { float nnx=n.x*n.x, nny=n.y*n.y, nnz=n.z*n.z; int i=nny>nnx?nnz>nny?2:1:nnz>nnx?2:0; if(i==0){ float ry=rsqrtf(nnx+nny), rz=rsqrtf(nnx+nnz); p1x=-n.y*ry; p1y=n.x*ry; p1z=0; p2x=-n.z*rz; p2y=0; p2z=n.x*rz; } else if(i==1){ float rz=rsqrtf(nny+nnz), rx=rsqrtf(nny+nnx); p1y=-n.z*rz; p1z=n.y*rz; p1x=0; p2y=-n.x*rx; p2z=0; p2x=n.y*rx; } else{ float rx=rsqrtf(nnz+nnx), ry=rsqrtf(nnz+nny); p1z=-n.x*rx; p1x=n.z*rx; p1y=0; p2z=-n.y*ry; p2x=0; p2y=n.z*ry; } } { float q1x, q1y, q1z, q2x, q2y, q2z; float a1x=p1x-p2x; q1x=a1x; a1x*=a1x; float a1y=p1y-p2y; q1y=a1y; a1y*=a1y; float a1z=p1z-p2z; q1z=a1z; a1z*=a1z; float a2x=p1x+p2x; q2x=a2x; a2x*=a2x; float a2y=p1y+p2y; q2y=a2y; a2y*=a2y; float a2z=p1z+p2z; q2z=a2z; a2z*=a2z; float r1=rsqrtf(a1x+a1y+a1z), r2=rsqrtf(a2x+a2y+a2z); p1x=q1x*r1; p1y=q1y*r1; p1z=q1z*r1; p2x=q2x*r2; p2y=q2y*r2; p2z=q2z*r2; } { float zi=xrnd(s); zi*=2*fpi; float px, py; sincosf(zi, &py, &px); u.x=px*p1x+py*p2x; u.y=px*p1y+py*p2y; u.z=px*p1z+py*p2z; } } { float r=0; n.x=n.x*cs+u.x*si; r+=n.x*n.x; n.y=n.y*cs+u.y*si; r+=n.y*n.y; n.z=n.z*cs+u.z*si; r+=n.z*n.z; r=rsqrtf(r); n.x*=r; n.y*=r; n.z*=r; } } #ifdef TILT #ifndef XCPU __device__ int __float2int_rd(float x); __host__ int __float2int_rd(float x){ return floorf(x); } __host__ #endif __device__ float zshift(dats & d, float4 & r){ if(d.lnum==0) return 0; float z=(r.z-d.lmin)*d.lrdz; int k=min(max(__float2int_rd(z), 0), d.lpts-2); int l=k+1; float nr=d.lnx*r.x+d.lny*r.y-d.r50; for(int j=1; j0?e.nfla:(blockIdx.x+MLTP)%WNUM __global__ void propagate(dats *ed, unsigned int num){ const unsigned int idx=threadIdx.x*gridDim.x+blockIdx.x; uint4 s; #ifdef XCPU float4 r, n; dats & e = * ed; int eidx = XIDX; ices & w = * e.w[XWNM]; #else float4 r={0,0,0,0}, n={0,0,0,0}; __shared__ dats e; __shared__ ices w; int & eidx = e.hidx; if(threadIdx.x==0){ e=*ed; eidx=XIDX; w=*e.w[XWNM]; } __syncthreads(); #endif { #ifndef XCPU const unsigned int & seed = idx; #endif s.w=seed%e.rsize; s.x=e.rs[s.w]; s.y=e.rs[s.w] >> 32; s.z=e.rm[s.w]; } bool next=true; float TOT=0; for(unsigned long long i=idx; i0){ r.x=e.r[0]; r.y=e.r[1]; r.z=e.r[2]; r.w=0; float rms=0, up=0; switch(e.type){ case 1: rms=9.2f; up=0.0f; break; case 2: rms=9.7f; up=48.0f; break; case 3: rms=0.0f; up=90.0f-41.13f; break; case 4: rms=0.0f; up=41.13f-90.0f; break; } const float fcv=fpi/180.f, sq3=sqrtf(3.f); float xi=xrnd(s); xi=(up+(2*xi-1)*rms*sq3)*fcv; float nz, np; sincosf(xi, &nz, &np); xi=xrnd(s); xi*=2*fpi; sincosf(xi, &n.y, &n.x); n.x*=np; n.y*=np; n.z=nz; } else{ photon & p = e.pz[i/OVER]; n.w=p.q; n.x=p.n[0]; n.y=p.n[1]; n.z=p.n[2]; r.w=p.t; r.x=p.r[0]; r.y=p.r[1]; r.z=p.r[2]; rotate(w.coschr, w.sinchr, n, s); } TOT=-logf(xrnd(s)); next=false; } { float sca; float & nz = n.z; { // get distance for overburden #ifdef TILT float z = r.z - zshift(e, r); #else float & z = r.z; #endif int i=__float2int_rn((z-e.hmin)*e.rdh); if(i<0) i=0; else if(i>=e.size) i=e.size-1; float h=e.hmin+i*e.dh; // middle of the layer float ahx=nz<0?h-e.hdh:h+e.hdh; float SCA=-logf(xrnd(s)); float ais=(nz*SCA-(ahx-z)*w.z[i].sca)*e.rdh; float aia=(nz*TOT-(ahx-z)*w.z[i].abs)*e.rdh; int j=i; if(nz<0) for(; j>0 && ais<0 && aia<0; ahx-=e.dh, ais+=w.z[j].sca, aia+=w.z[j].abs) --j; else for(; j0 && aia>0; ahx+=e.dh, ais-=w.z[j].sca, aia-=w.z[j].abs) ++j; float tot; if(i==j || fabsf(nz)=0){ D=sqrtf(D); float h1=b-D, h2=b+D; if(h2>=0 && h1<=sca*np){ if(fabsf(np)>xx){ h1/=np, h2/=np; if(h1<0) h1=0; if(h2>sca) h2=sca; } else h1=0, h2=sca; h1=r.z+n.z*h1, h2=r.z+n.z*h2; float zl, zh; if(n.z>0) zl=h1, zh=h2; else zl=h2, zh=h1; int omin=0, omax=s.max; int n1=s.n-omin+min(omax+1, max(omin, __float2int_ru(omin-(zh-s.dl-s.h)*s.d))); int n2=s.n-omin+max(omin-1, min(omax, __float2int_rd(omin-(zl-s.dh-s.h)*s.d))); for(int l=n1; l<=n2; l++){ const DOM & dom=oms[l]; #ifdef OFLA if(l==e.fla) continue; #endif float b=0, c=0, dr; dr=dom.r[0]-r.x; b+=n.x*dr; c+=dr*dr; dr=dom.r[1]-r.y; b+=n.y*dr; c+=dr*dr; dr=dom.r[2]-r.z; b+=n.z*dr; c+=dr*dr; float D=b*b-c+R*R; if(D>=0){ float h=b-sqrtf(D); if(h>0 && h<=sca){ om=l; sca=h; } } } } } } } { // advance r.x+=sca*n.x; r.y+=sca*n.y; r.z+=sca*n.z; r.w+=sca*w.ocm; } if(om!=0){ bool flag=true; hit h; h.i=om; h.t=r.w; h.n=n.w; h.z=n.z; #ifdef ASENS float sum; { // hole ice const float s[11]={0.32752, 0.66696, 0.35389, -1.4107, -1.7738, 4.3302, 5.7151, -5.5262, -7.9000, 2.0155, 3.3965}; float & x = n.z; float y=1; sum=s[0]; for(int i=1; i<11; i++){ y*=x; sum+=s[i]*y; } } flag=MAS*xrnd(s)hidx, 1); if(j1) xi=1; else if(xi<-1) xi=-1; float si=sqrtf(1-xi*xi); rotate(xi, si, n, s); } } } { e.rs[s.w]=s.x | (unsigned long long) s.y << 32; } }