#ifdef XCPU
// Host-only build: the CUDA function qualifiers expand to nothing.
#define __device__
#define __global__

// Host stand-ins for the CUDA intrinsics used in this file.  They are
// function-like and fully parenthesised so that each use binds as a single
// operand; an object-like `1/sqrtf` would turn `a/rsqrtf(x)` into
// `a/1/sqrtf(x)`.
// NOTE(review): CUDA documents __float2int_rn as round-to-nearest-even, while
// lroundf rounds halfway cases away from zero, so host and device results can
// differ for inputs that are exactly halfway between two integers.
#define rsqrtf(x) (1.0f/sqrtf(x))
#define __float2int_rn(x) ((int)lroundf(x))
#define __float2int_ru(x) ((int)ceilf(x))
#define __float2int_rd(x) ((int)floorf(x))

// Host replacement for the two-component integer vector type.
struct int2 {
  int x;
  int y;
};

// Host replacement for the four-component unsigned integer vector type.
struct uint4 {
  unsigned int x;
  unsigned int y;
  unsigned int z;
  unsigned int w;
};

// Reinterprets the bits of x as a float (no numeric conversion) by writing
// one member of a union and reading the other.
float int_as_float(unsigned int x){
  union{
    unsigned int bits;
    float value;
  } pun;
  pun.bits = x;
  return pun.value;
}

// Host stand-in with the call shape of atomicAdd: adds j to *i and returns
// the value *i held before the addition.  It is a plain read-modify-write
// with no synchronisation.
unsigned int atomicAdd(unsigned int * i, unsigned int j){
  const unsigned int before = *i;
  *i = before + j;
  return before;
}

// Host stand-ins for the launch variables read by propagate(); only the x
// component exists.  Both objects are zero-initialised globals.
struct ThreadIdx{
  int x;
};
ThreadIdx threadIdx;

struct BlockDim{
  int x;
};
BlockDim blockDim;

// Host build only: propagate() uses seed % e.rsize to choose its generator
// state slot.
unsigned int seed = 0;
#endif

// Advances the generator state s and returns a float strictly between 0 and 1.
//
// One step forms the 64-bit value s.z*s.x + s.y, keeps its low word in s.x and
// its high word in s.y.  The top 23 bits of the new s.x become the mantissa of
// a float in [1,2), from which 1 is subtracted.  A step whose 23 bits are all
// zero is discarded and repeated, so the result is never exactly 0.
// s.z and s.w are not modified.
__device__ float xrnd(uint4 & s){
  unsigned int mantissa;
  do{
    const unsigned long long next = s.z * (unsigned long long) s.x + s.y;
    s.x = next;
    s.y = next >> 32;
    mantissa = s.x >> 9;
  } while(mantissa == 0);
  // 0x3f800000 is the bit pattern of 1.0f.
  return int_as_float(mantissa | 0x3f800000) - 1.0f;
}

#ifdef LONG
// Draws one sample by rejection, consuming uniform numbers from xrnd(s).
//   k - shape parameter; selects which of the two rejection schemes is used
//   s - generator state, advanced by every xrnd() call
__device__ float mrnd(float k, uint4 & s){  // gamma distribution
  if(k<1){  // Weibull algorithm
    const float c=1/k;
    const float d=(1-k)*powf(k, k/(1-k));
    for(;;){
      // z and e are -log of two successive uniforms.
      const float z=-logf(xrnd(s));
      const float e=-logf(xrnd(s));
      const float x=powf(z, c);
      if(!(z+e<d+x)) return x;  // accept
    }
  }

  // Cheng's algorithm
  const float b=k-logf(4);
  const float l=sqrtf(2*k-1);
  const float c=1+logf(4.5f);
  for(;;){
    const float u=xrnd(s);
    const float v=xrnd(s);
    const float y=logf(v/(1-v))/l;
    const float x=k*expf(y);
    const float z=u*v*v;
    const float r=b+(k+l)*y-x;
    // Rejected only when both the quick test and the logarithmic test fail.
    if(!(r<4.5f*z-c && r<logf(z))) return x;
  }
}
#endif

// Exchanges the values of x and y.
__device__ void swap(float & x, float & y){
  const float oldY = y;
  y = x;
  x = oldY;
}

// Replaces the direction n by a new unit vector built from cs*n plus si times
// a unit vector perpendicular to n whose azimuth is drawn from xrnd(s).
//   cs, si - weights of the old direction and of the perpendicular part
//            (callers in this file pass a cosine and the matching sine)
//   n      - direction, updated in place and renormalised
//   s      - generator state, advanced by one xrnd() call
__device__ void rotate(float & cs, float & si, float3 & n, uint4 & s){
  // Move the component of largest magnitude into n.x; `moved` remembers which
  // component it was exchanged with (0 = none, 1 = y, 2 = z).
  float3 sq;
  sq.x=n.x*n.x;
  sq.y=n.y*n.y;
  sq.z=n.z*n.z;

  int moved=0;
  if(sq.y>sq.z){
    if(sq.y>sq.x){
      swap(n.x,n.y);
      swap(sq.x,sq.y);
      moved=1;
    }
  }
  else{
    if(sq.z>sq.x){
      swap(n.x,n.z);
      swap(sq.x,sq.z);
      moved=2;
    }
  }

  // Two unit vectors perpendicular to n, one in the xy plane, one in the xz plane.
  float3 p1, p2;
  const float invXY=rsqrtf(sq.x+sq.y);
  p1.x=-n.y*invXY;
  p1.y=n.x*invXY;
  p1.z=0;
  const float invXZ=rsqrtf(sq.x+sq.z);
  p2.x=-n.z*invXZ;
  p2.y=0;
  p2.z=n.x*invXZ;

  // Replace them by their normalised difference (p1) and normalised sum (p2).
  float3 diff;
  diff.x=p1.x-p2.x;
  diff.y=p1.y-p2.y;
  diff.z=p1.z-p2.z;
  p2.x+=p1.x;
  p2.y+=p1.y;
  p2.z+=p1.z;

  float inv=rsqrtf(diff.x*diff.x+diff.y*diff.y+diff.z*diff.z);
  p1.x=diff.x*inv;
  p1.y=diff.y*inv;
  p1.z=diff.z*inv;

  inv=rsqrtf(p2.x*p2.x+p2.y*p2.y+p2.z*p2.z);
  p2.x*=inv;
  p2.y*=inv;
  p2.z*=inv;

  // Random azimuth around the old direction.
  float xi=2*fpi*xrnd(s);
  float sinXi, cosXi;
  sincosf(xi, &sinXi, &cosXi);

  n.x=cs*n.x+si*(cosXi*p1.x+sinXi*p2.x);
  n.y=cs*n.y+si*(cosXi*p1.y+sinXi*p2.y);
  n.z=cs*n.z+si*(cosXi*p1.z+sinXi*p2.z);

  const float norm=rsqrtf(n.x*n.x+n.y*n.y+n.z*n.z);
  n.x*=norm;
  n.y*=norm;
  n.z*=norm;

  // Undo the component exchange made at the top.
  switch(moved){
  case 1: swap(n.x,n.y); break;
  case 2: swap(n.x,n.z); break;
  default: break;
  }
}

#ifdef TILT
#ifndef XCPU
// GPU build only: declare a device-side __float2int_rd and define a host-side
// one in terms of floorf (presumably so that zshift() can also be compiled
// for the host -- confirm against the callers).
__device__ int __float2int_rd(float x);
__host__ int __float2int_rd(float x){ return (int)floorf(x); }
// The bare qualifier below is not a declaration by itself: it prefixes the
// next declaration in the file, so zshift() becomes __host__ __device__ in
// GPU builds.
__host__
#endif

// Returns a value interpolated linearly in two variables from the table d.lp
// at the position r; returns 0 when d.lnum is 0.
//   - first index of d.lp: position of nr = d.lnx*r.x + d.lny*r.y - d.r50
//     among the abscissas d.lr[]
//   - second index of d.lp: position of z = (r.z - d.lmin)*d.lrdz, with the
//     lower node clamped to [0, d.lpts-2]
// propagate() subtracts the result from r.z when TILT is defined.
__device__ float zshift(dats & d, float4 & r){
  if(d.lnum==0) return 0;

  const float z=(r.z-d.lmin)*d.lrdz;
  const int lower=min(max(__float2int_rd(z), 0), d.lpts-2);
  const int upper=lower+1;

  const float nr=d.lnx*r.x+d.lny*r.y-d.r50;
  for(int hi=1; hi<LMAX; hi++){
    // Use the first interval whose upper abscissa exceeds nr, or the one
    // ending at index d.lnum-1 if none does before that.
    if(!(nr<d.lr[hi] || hi==d.lnum-1)) continue;
    const int lo=hi-1;
    return ( (d.lp[hi][upper]*(z-lower)+d.lp[hi][lower]*(upper-z))*(nr-d.lr[lo]) +
             (d.lp[lo][upper]*(z-lower)+d.lp[lo][lower]*(upper-z))*(d.lr[hi]-nr) )/(d.lr[hi]-d.lr[lo]);
  }
  return 0;
}
#endif

// Maps the 2-D point r to p.  Without ROMB this is a plain copy; with ROMB
// p is the linear combination of r.x and r.y with the coefficients d.cb.
__device__ void ctr(dats & d, float2 & r, float2 & p){
#ifndef ROMB
  p=r;
#else
  p.x=r.x*d.cb[0][0]+r.y*d.cb[1][0];
  p.y=r.x*d.cb[0][1]+r.y*d.cb[1][1];
#endif
}

#ifdef XCPU
// Host build: the DOM table is reached through a plain pointer.
DOM * oms;
#else
// GPU build: the DOM table is a __constant__ array of MAXGEO entries.
__constant__ DOM oms[MAXGEO];

// Returns the value of the PTX special register %smid.
__device__ inline unsigned int smid(){
  unsigned int id;
  asm("mov.u32 %0, %smid;" : "=r"(id));
  return id;
}
#endif

// Helper macros for the photon loop in propagate().  Both expand to bare,
// unparenthesised expressions that refer to the names `i`, `eidx` and `e` of
// the expansion site.
//   XINC - advances the photon index i (used as the for-loop increment)
//   XIDX - initial value stored in eidx
// With SM12 the next index is fetched with atomicAdd from the counter eidx,
// which starts at e.gridDim*blockDim.x+e.blockIdx and grows by e.gridDim per
// fetch; otherwise i simply advances by eidx = e.gridDim*blockDim.x.
#ifdef SM12
#define XINC i=atomicAdd(&eidx, e.gridDim)
#define XIDX e.gridDim*blockDim.x+e.blockIdx
#else
#define XINC i+=eidx
#define XIDX e.gridDim*blockDim.x
#endif

// Photon propagation kernel.
//
//   ed  - run parameters, tables and output buffers.  The GPU build copies
//         *ed into the shared variable e once per block (thread 0), the XCPU
//         build works on *ed directly.
//   num - the photon loop runs while the photon index i is below num.  In the
//         GPU build a call with num==0 does no propagation; it only resets
//         the counters hidx, tn, tx, ab and mp in *ed.
//
// Each thread starts at index threadIdx.x*e.gridDim+e.blockIdx and advances
// with XINC.  For every index a photon is initialised (from e.type/e.r for
// e.type>0, otherwise from e.pz[i/OVER]) and then stepped until it either
// reaches a DOM or its remaining budget TOT is used up:
//   1. distance to the next scattering/end point through the layered medium,
//   2. search for the first DOM sphere (radius e.R) crossed within that step,
//   3. advance position and r.w,
//   4. record a hit or scatter into a new direction.
// Hits are appended to e.hits through the counter ed->hidx; hits beyond
// e.hnum are counted but not stored.
//
// GPU build: uses threadIdx.x/blockDim.x only, one shared dats per block and
// two __syncthreads() barriers that every thread of a continuing block
// reaches.  e.gridDim and e.blockIdx are fields of dats, not the CUDA
// built-ins.
__global__ void propagate(dats * ed, unsigned int num){
  uint4 s;    // generator state of this thread, see xrnd()
  int niw=0;  // taken from photon::q, stored in hit::n
#ifdef XCPU
  float3 n;
  float4 r;
  dats & e = * ed;
  unsigned int eidx = XIDX;
#else
  float3 n={0,0,0};      // photon direction
  float4 r={0,0,0,0};    // photon position (x,y,z); w is stored in hit::t
  __shared__ dats e;
  // The shared copy of hidx is reused as the index counter/stride for XINC;
  // the hit counter proper is ed->hidx in global memory.
  unsigned int & eidx = e.hidx;

  if(num==0){  // reset call
    ed->hidx=0;
    ed->tn=-1U;
    ed->tx=0;
    ed->ab=0;
    ed->mp=0;
    __threadfence();
    return;
  }

  if(threadIdx.x==0){
    e=*ed; e.tn=clock();
    // A block running on the multiprocessor whose id equals the incoming
    // e.blockIdx is retired (marked -1); every other block takes the next
    // value of the counter ed->mp as its block index.
    e.blockIdx=smid()==e.blockIdx?-1:(int)atomicAdd(&ed->mp, 1);
    eidx=XIDX;
  }
  __syncthreads();

  // e is shared, so this test is uniform across the block.
  if(e.blockIdx==-1) return;
#endif

  ices * w;
  const unsigned int idx=threadIdx.x*e.gridDim+e.blockIdx;

  { // load the generator state for this thread
#ifndef XCPU
    const unsigned int & seed = idx;
#endif
    s.w=seed%e.rsize;
    s.x=e.rs[s.w];
    s.y=e.rs[s.w] >> 32;
    s.z=e.rm[s.w];
  }

  bool next=true;  // true: fetch and initialise a new photon
  float TOT=0;     // remaining budget until the photon is terminated

  // The loop variable must be called i: XINC refers to it by name.
  for(unsigned int i=idx; i<num; next?XINC:0){
    if(next){ // initialize photon
      w=e.w[min(__float2int_rd(WNUM*xrnd(s)), WNUM-1)];
      if(e.type>0){
        r.x=e.r[0];
        r.y=e.r[1];
        r.z=e.r[2];
        r.w=0;

        float rms=0, up=0, hms=0;
        const float fcv=fpi/180.f, sq3=sqrtf(3.f);

        switch(e.type){
        case 1: rms=9.2f; up=0.0f; hms=10.1f; break;
        case 2: rms=9.7f; up=48.f; hms=14.7f; break; // sin(hms/2)=sin(9.8/2)/cos(up)
        case 3: rms=0.0f; up=90.0f-41.13f; break;
        case 4: rms=0.0f; up=41.13f-90.0f; break;
        }

        float xi=xrnd(s);
        if(e.fldr<0) xi*=2*fpi;
        else{
          // Pick one of six sectors 60 degrees apart and a uniform offset
          // inside it.
          xi*=6;
          const int sector=__float2int_rd(xi);
          xi-=sector;
          xi=(e.fldr+(2*xi-1)*hms*sq3+sector*60)*fcv;
        }
        sincosf(xi, &n.y, &n.x);

        xi=(up+(2*xrnd(s)-1)*rms*sq3)*fcv;
        float np; sincosf(xi, &n.z, &np);
        n.x*=np; n.y*=np;
      }
      else{
        photon p=e.pz[i/OVER];
        niw=p.q, n=p.n, r=p.r;

#ifdef LONG
        if(p.b>0) p.l=p.b*mrnd(p.a, s);
#endif
        if(p.l>0){
          // Start at a uniformly chosen point along a segment of length p.l.
          p.l*=xrnd(s); r.w+=e.ocv*p.l;
          r.x+=n.x*p.l; r.y+=n.y*p.l; r.z+=n.z*p.l;
        }

#ifdef ANGW
        if(p.f<xrnd(s)){
          const float a=0.39, b=2.61;
          const float I=1-expf(-b*powf(2, a));
          float cs=max(1-powf(-logf(1-xrnd(s)*I)/b, 1/a), -1.0f);
          float si=sqrtf(1-cs*cs); rotate(cs, si, n, s);
        }
#endif
        rotate(w->coschr, w->sinchr, n, s);
      }
      TOT=-logf(xrnd(s));
      next=false;
    }

    {
      float sca;  // length of this step

      { // get distance for overburden
#ifdef TILT
        float z = r.z - zshift(e, r);
#else
        float & z = r.z;
#endif
        int layer=__float2int_rn((z-e.hmin)*e.rdh);
        if(layer<0) layer=0; else if(layer>=e.size) layer=e.size-1;
        float h=e.hmin+layer*e.dh; // middle of the layer
        float ahx=n.z<0?h-e.hdh:h+e.hdh;

        float SCA=-logf(xrnd(s));

        float ais=(n.z*SCA-(ahx-z)*w->z[layer].sca)*e.rdh;
        float aia=(n.z*TOT-(ahx-z)*w->z[layer].abs)*e.rdh;

        // Walk through the layers along the direction of travel until either
        // the scattering (ais) or the absorption (aia) budget changes sign.
        int last=layer;
        if(n.z<0){
          for(; last>0 && ais<0 && aia<0; ahx-=e.dh, ais+=w->z[last].sca, aia+=w->z[last].abs) --last;
        }
        else{
          for(; last<e.size-1 && ais>0 && aia>0; ahx+=e.dh, ais-=w->z[last].sca, aia-=w->z[last].abs) ++last;
        }

        float tot;
        if(layer==last || fabsf(n.z)<xx) sca=SCA/w->z[last].sca, tot=TOT/w->z[last].abs;
        else sca=(ais*e.dh/w->z[last].sca+ahx-z)/n.z, tot=(aia*e.dh/w->z[last].abs+ahx-z)/n.z;

        // get overburden for distance
        if(tot<sca) sca=tot, TOT=0; else TOT=(tot-sca)*w->z[last].abs;
      }

      // Index into oms of the DOM reached in this step, -1 if none.  A
      // negative sentinel is used because 0 is itself a possible index.
      int om=-1;
      { // sphere
        float2 ri, rf, pi, pf;

        ri.x=r.x; rf.x=r.x+sca*n.x;
        ri.y=r.y; rf.y=r.y+sca*n.y;

        ctr(e, ri, pi); ctr(e, rf, pf);

        // Bounding box of the step, enlarged by e.rx on every side.
        ri.x=min(pi.x, pf.x)-e.rx; rf.x=max(pi.x, pf.x)+e.rx;
        ri.y=min(pi.y, pf.y)-e.rx; rf.y=max(pi.y, pf.y)+e.rx;

        int2 xl, xh;  // first and last cell of e.is covered by the box

        xl.x=min(max(__float2int_rn((ri.x-e.cl[0])*e.crst[0]), 0), e.cn[0]);
        xh.x=max(min(__float2int_rn((rf.x-e.cl[0])*e.crst[0]), e.cn[0]-1), -1);

        xl.y=min(max(__float2int_rn((ri.y-e.cl[1])*e.crst[1]), 0), e.cn[1]);
        xh.y=max(min(__float2int_rn((rf.y-e.cl[1])*e.crst[1]), e.cn[1]-1), -1);

        // Single flattened loop over the cells (ci,cj): cj runs fastest and
        // wraps back to xl.y while ci is incremented.  For each cell, k walks
        // the list e.ls starting at e.is[ci][cj]; 0x80 ends the walk.
        for(int ci=xl.x, cj=xl.y; ci<=xh.x && cj<=xh.y; ++cj<=xh.y ? 0 : (cj=xl.y, ci++)) for(unsigned char k=e.is[ci][cj]; k!=0x80; ){
          unsigned char m=e.ls[k];
          line & str = e.sc[m&0x7f];
          // Bit 7 of the entry marks the last element of the list.
          k=(m&0x80) ? 0x80 : (unsigned char)(k+1);

          // Intersect the step with the vertical cylinder of radius str.r
          // around (str.x, str.y).
          float b=0, c=0, dr;
          dr=str.x-r.x;
          b+=n.x*dr; c+=dr*dr;
          dr=str.y-r.y;
          b+=n.y*dr; c+=dr*dr;

          float np=1-n.z*n.z;
          float D=b*b-(c-str.r*str.r)*np;
          if(D>=0){
            D=sqrtf(D);
            float h1=b-D, h2=b+D;
            if(h2>=0 && h1<=sca*np){
              if(np>xx){
                h1/=np, h2/=np;
                if(h1<0) h1=0;
                if(h2>sca) h2=sca;
              }
              else h1=0, h2=sca;
              // Convert the two path lengths into the z range covered.
              h1=r.z+n.z*h1, h2=r.z+n.z*h2;
              float zl, zh;
              if(n.z>0) zl=h1, zh=h2;
              else zl=h2, zh=h1;

              // Range [n1, n2] of oms indices on this line whose z can
              // overlap [zl, zh].
              int omin=0, omax=str.max;
              int n1=str.n-omin+min(omax+1, max(omin, __float2int_ru(omin-(zh-str.dl-str.h)*str.d)));
              int n2=str.n-omin+max(omin-1, min(omax, __float2int_rd(omin-(zl-str.dh-str.h)*str.d)));

              for(int l=n1; l<=n2; l++){
                const DOM & dom=oms[l];
#ifdef OFLA
                if(l==e.fla) continue;
#endif
                // Intersect the step with the sphere of radius e.R around dom.
                float bd=0, cd=0, dd;
                dd=dom.r[0]-r.x;
                bd+=n.x*dd; cd+=dd*dd;
                dd=dom.r[1]-r.y;
                bd+=n.y*dd; cd+=dd*dd;
                dd=dom.r[2]-r.z;
                bd+=n.z*dd; cd+=dd*dd;
                float Dd=bd*bd-cd+e.R*e.R;
                if(Dd>=0){
                  float h=bd-sqrtf(Dd);
                  // Keep the nearest entry point and shorten the step to it.
                  if(h>0 && h<=sca){ om=l; sca=h; }
                }
              }
            }
          }
        }
      }

      { // advance
        r.x+=sca*n.x;
        r.y+=sca*n.y;
        r.z+=sca*n.z;
        r.w+=sca*w->ocm;
      }

      float xi=xrnd(s);
      if(om>=0){  // the step ended on a DOM
        bool flag=true;
        hit h; h.i=om; h.t=r.w; h.n=niw; h.z=n.z;

#ifdef ASENS
        // Accept with probability sum/e.mas, where sum is the polynomial
        // with coefficients e.s[] evaluated at n.z.
        float sum;
        {
          float & x = n.z;
          float y=1;
          sum=e.s[0];
          for(int a=1; a<ANUM; a++){ y*=x; sum+=e.s[a]*y; }
        }

        flag=e.mas*xi<sum;
#endif
        if(flag){
          unsigned int slot = atomicAdd(&ed->hidx, 1);
          if(slot<e.hnum) e.hits[slot]=h;
        }
        next=true; continue;
      }

#ifndef XCPU
      // Record the multiprocessor id whenever TOT has become non-finite.
      if(!isfinite(TOT)) ed->bmp[atomicAdd(&ed->ab, 1)%4]=smid();
#endif
      if(!isfinite(TOT) || TOT<xx){ next=true; continue; }

      { // scatter: draw the cosine xi of the scattering angle
        if(xi>e.sf){
          xi=(1-xi)/(1-e.sf);
          xi=2*xi-1;
          if(e.g!=0){
            float g2=e.g*e.g;
            float ga=(1-g2)/(1+e.g*xi);
            xi=(1+g2-ga*ga)/(2*e.g);
          }
        }
        else{
          xi/=e.sf;
          xi=2*powf(xi, (1-e.g)/(1+e.g))-1;
        }

        if(xi>1) xi=1; else if(xi<-1) xi=-1;

        float si=sqrtf(1-xi*xi);
        rotate(xi, si, n, s);
      }
    }
  }

  {
    // Store the advanced generator state back into its slot.
    e.rs[s.w]=s.x | (unsigned long long) s.y << 32;
#ifndef XCPU
    __syncthreads();
    if(threadIdx.x==0){
      // Fold this block's clock() difference into the global min/max.
      e.tx=clock();
      atomicMin(&ed->tn, e.tx-e.tn);
      atomicMax(&ed->tx, e.tx-e.tn);
    }
    __threadfence();
#endif
  }

}