__device__ float xrnd(uint4 & s){ unsigned int tmp; unsigned long long sda; do{ sda = s.z * (unsigned long long) s.x + s.y; s.x=sda; s.y=sda >> 32; tmp = s.x >> 9; } while(tmp==0); tmp |= 0x3f800000; return int_as_float(tmp)-1.0f; } __device__ void rotate(float & cs, float & si, float4 & n, uint4 & s){ float3 u; { float p1x, p1y, p1z, p2x, p2y, p2z; { float nnx=n.x*n.x, nny=n.y*n.y, nnz=n.z*n.z; int i=nny>nnx?nnz>nny?2:1:nnz>nnx?2:0; if(i==0){ float ry=rsqrtf(nnx+nny), rz=rsqrtf(nnx+nnz); p1x=-n.y*ry; p1y=n.x*ry; p1z=0; p2x=-n.z*rz; p2y=0; p2z=n.x*rz; } else if(i==1){ float rz=rsqrtf(nny+nnz), rx=rsqrtf(nny+nnx); p1y=-n.z*rz; p1z=n.y*rz; p1x=0; p2y=-n.x*rx; p2z=0; p2x=n.y*rx; } else{ float rx=rsqrtf(nnz+nnx), ry=rsqrtf(nnz+nny); p1z=-n.x*rx; p1x=n.z*rx; p1y=0; p2z=-n.y*ry; p2x=0; p2y=n.z*ry; } } { float q1x, q1y, q1z, q2x, q2y, q2z; float a1x=p1x-p2x; q1x=a1x; a1x*=a1x; float a1y=p1y-p2y; q1y=a1y; a1y*=a1y; float a1z=p1z-p2z; q1z=a1z; a1z*=a1z; float a2x=p1x+p2x; q2x=a2x; a2x*=a2x; float a2y=p1y+p2y; q2y=a2y; a2y*=a2y; float a2z=p1z+p2z; q2z=a2z; a2z*=a2z; float r1=rsqrtf(a1x+a1y+a1z), r2=rsqrtf(a2x+a2y+a2z); p1x=q1x*r1; p1y=q1y*r1; p1z=q1z*r1; p2x=q2x*r2; p2y=q2y*r2; p2z=q2z*r2; } { float zi=xrnd(s); zi*=2*fpi; float px, py; sincosf(zi, &py, &px); u.x=px*p1x+py*p2x; u.y=px*p1y+py*p2y; u.z=px*p1z+py*p2z; } } { float r=0; n.x=n.x*cs+u.x*si; r+=n.x*n.x; n.y=n.y*cs+u.y*si; r+=n.y*n.y; n.z=n.z*cs+u.z*si; r+=n.z*n.z; r=rsqrtf(r); n.x*=r; n.y*=r; n.z*=r; } } __constant__ DOM oms[MAXGEO]; #ifdef ACCL4 #define XINC i=atomicAdd(&e.hidx, gridDim.x) #define XIDX gridDim.x*blockDim.x+blockIdx.x #else #define XINC i+=e.hidx #define XIDX gridDim.x*blockDim.x #endif #ifdef ACCL5 #define XWVL blockIdx.x+MLTP #else #define XWVL (blockIdx.x%WNUM)*13+19 #endif __global__ void propagate(dats *ed, unsigned int num){ const unsigned int idx=threadIdx.x*gridDim.x+blockIdx.x; uint4 s; float4 r={0,0,0,0}, n={0,0,0,0}; __shared__ dats e; __shared__ ices w; if(threadIdx.x==0){ e=*ed; e.hidx=XIDX; w=*e.w[e.type==0?e.nfla:(XWVL)%WNUM]; } __syncthreads(); { s.w=idx%e.rsize; s.x=e.rs[s.w]; s.y=e.rs[s.w] >> 32; s.z=e.rm[s.w]; } bool next=true; float TOT=0; for(unsigned long long i=idx; i=e.size) i=e.size-1; float h=e.hmin+i*e.dh; // middle of the layer float SCA=-logf(xrnd(s)); float sca; { // get distance for overburden x float & x = SCA; float & z = r.z; float & nz = n.z; float ai, ah; if(nz>0){ ah=h+e.hdh; ai=(nz*x-(ah-z)*w.z[i].sca)*e.rdh; for(j=i; j0; ah+=e.dh) ai-=w.z[++j].sca; } else{ ah=h-e.hdh; ai=(nz*x-(ah-z)*w.z[i].sca)*e.rdh; for(j=i; j>0 && ai<0; ah-=e.dh) ai+=w.z[--j].sca; } if(i==j || fabsf(nz)0){ ah=h+e.hdh; ai=(nz*x-(ah-z)*w.z[i].abs)*e.rdh; for(j=i; j0; ah+=e.dh) ai-=w.z[++j].abs; } else{ ah=h-e.hdh; ai=(nz*x-(ah-z)*w.z[i].abs)*e.rdh; for(j=i; j>0 && ai<0; ah-=e.dh) ai+=w.z[--j].abs; } if(i==j || fabsf(nz)0){ dn.x=ini; dx.x=fin; } else{ dn.x=fin; dx.x=ini; } dn.x-=R; dx.x+=R; ini=r.y; fin=ini+sca*n.y; if(n.y>0){ dn.y=ini; dx.y=fin; } else{ dn.y=fin; dx.y=ini; } dn.y-=R; dx.y+=R; ini=r.z; fin=ini+sca*n.z; if(n.z>0){ dn.z=ini; dx.z=fin; } else{ dn.z=fin; dx.z=ini; } dn.z-=R; dx.z+=R; #endif #ifdef ACCL2 int3 xl, xh; xl.x=min(max(__float2int_rn((dn.x-e.cl[0])*e.crst[0]), 0), e.cn[0]); xh.x=max(min(__float2int_rn((dx.x-e.cl[0])*e.crst[0]), e.cn[0]-1), -1); xl.y=min(max(__float2int_rn((dn.y-e.cl[1])*e.crst[1]), 0), e.cn[1]); xh.y=max(min(__float2int_rn((dx.y-e.cl[1])*e.crst[1]), e.cn[1]-1), -1); xl.z=min(max(__float2int_rn((dn.z-e.cl[2])*e.crst[2]), 0), e.cn[2]); xh.z=max(min(__float2int_rn((dx.z-e.cl[2])*e.crst[2]), e.cn[2]-1), -1); for(int i=xl.x; i<=xh.x; i++) for(int j=xl.y; j<=xh.y; j++) for(int k=xl.z; k<=xh.z; k++) for(int m=e.cidx[i][j][k]; e.carr[m]!=-1; m++){ int l=e.carr[m]; const DOM & dom=oms[l]; #ifdef ZZZ } #endif #else for(int l=0; ldx.z) || (dom.r[1]dx.y) || (dom.r[0]dx.x)) continue; #endif { float b=0, c=0, dr; dr=dom.r[0]-r.x; b+=n.x*dr; c+=dr*dr; dr=dom.r[1]-r.y; b+=n.y*dr; c+=dr*dr; dr=dom.r[2]-r.z; b+=n.z*dr; c+=dr*dr; c-=R*R; float D=b*b-c; if(D>=0){ float h=b-sqrtf(D); if(h>0 && h<=sca){ om=l; sca=h; } } } } } float xa; { // get overburden for distance a float & a = sca; float & z = r.z; float & nz = n.z; float y=z+nz*a; j=__float2int_rn((y-e.hmin)*e.rdh); if(j<0) j=0; else if(j>=e.size) j=e.size-1; if(i==j || fabsf(nz)0){ xa=(y-(g-e.hdh))*w.z[j].abs; while(--j>i) xa+=e.hdh*w.z[j].abs; xa+=(h+e.hdh-z)*w.z[i].abs; } else{ xa=(y-(g+e.hdh))*w.z[j].abs; while(++jhidx, 1); if(j1) xi=1; else if(xi<-1) xi=-1; float si=sqrtf(1-xi*xi); rotate(xi, si, n, s); } } } { e.rs[s.w]=s.x | (unsigned long long) s.y << 32; } }