// When SHRT is defined, the text that follows is wrapped into a macro argument
// and stringified into kernel_source; the matching closing parentheses are at
// the very end of the file. Without SHRT only the plain declarations are seen.
#ifdef SHRT
// STRINGIFY turns its argument into a string literal; XTRINGIFY adds one level
// of indirection so that the argument is macro-expanded before stringification.
#define STRINGIFY(A) #A
#define XTRINGIFY(A) STRINGIFY(A)
string kernel_source = XTRINGIFY((
;
#endif

// Position of one optical module (OM). The propagate kernel compares r[0], r[1]
// and r[2] against the photon x, y and z coordinates.
struct DOM{
  cl_float r[3];  // x, y, z
};

// One recorded photon hit, as written to the hit buffer by the propagate kernel.
struct hit{
  cl_uint i;   // index of the OM that was hit (index into the DOM array)
  cl_float t;  // photon time at the hit
  cl_uint n;   // track segment id carried by the photon (taken from photon.q)
  cl_float z;  // z component of the photon direction at the hit
};

// Input light source segment from which the propagate kernel starts photons
// (used when the source type in struct dats is not a flasher or laser).
// The optional members change the struct layout, so the ANGW and LONG
// settings must agree everywhere this struct is used.
struct photon{
  cl_float4 r;    // location, time
  cl_float4 n;    // direction, track length
  cl_uint q;      // track segment
#ifdef ANGW
  cl_float f;     // fraction of light from muon alone (without cascades)
#endif
#ifdef LONG
  cl_float a, b;  // longitudinal development parametrization coefficients
#endif
};

// Optical properties of the ice for one wavelength block. The kernel picks one
// of these at random for every photon.
struct ices{
  cl_float wvl;             // wavelength of this block
  cl_float ocm;             // 1 / speed of light in medium
  cl_float coschr, sinchr;  // cos and sin of the cherenkov angle
  // Per-layer coefficients, indexed by depth layer. The kernel divides a
  // random optical depth by these values to obtain a distance, so they act as
  // inverse lengths (units presumably 1/m; confirm with the host code).
  struct{
    cl_float abs;           // absorption
    cl_float sca;           // scattering
  } z [MAXLYS];
};

// Description of one vertical line of OMs (a string, or part of one) used by
// the propagate kernel to pre-select candidate OMs along a photon segment.
struct line{
  cl_short n, max;  // n: index of the first OM of this line in the DOM array;
                    // max: largest OM offset on the line (offsets run 0..max)
  cl_float x, y, r; // xy position of the line and radius of the enclosing
                    // vertical cylinder tested against the photon path
  cl_float h, d;    // reference height and scale factor: the kernel computes an
                    // OM offset as (h - z)*d, so d is presumably 1/spacing
  cl_float dl, dh;  // lower and upper z tolerances applied when converting a
                    // z interval into an OM offset range
};

// Run configuration and small lookup tables. The propagate kernel copies the
// whole struct into local memory word by word, and updates hidx and ab in the
// global copy with atomic adds. The optional members change the layout, so the
// ASENS, ROMB, OFLA and TILT settings must agree everywhere this is used.
struct dats{
  cl_uint hidx;  // running count of accepted hits (may exceed hnum)
  cl_uint ab;    // count of photons dropped because TOT or the remaining
                 // distance became non-finite

  cl_int type;   // 0=cascade/1=flasher/2=flasher 45/3=laser up/4=laser down
  cl_float r[3]; // source position, used when type>0

  cl_uint hnum;  // size of hits buffer
  cl_int size;   // size of kurt table (used as the number of ice layers)
  cl_int rsize;  // count of multipliers (number of random generator slots)
  cl_int gsize;  // count of initialized OMs

  cl_float dh, hdh, rdh, hmin; // step, step/2, 1/step, and min depth

  cl_float ocv;  // 1 / speed of light in vacuum
  cl_float sf;   // scattering function: 0=HG; 1=SAM
                 // (compared against a uniform random number to pick one)
  cl_float g, g2, gr; // g=<cos(scattering angle)>, g2=g*g and gr=(1-g)/(1+g)
  cl_float R, R2, zR; // DOM radius, radius^2, and inverse "oversize" scaling factor

  // Regular 2D grid used to find nearby lines of OMs: number of cells,
  // lower edge, and 1/cell size, for each of the two grid axes.
  cl_int cn[2];
  cl_float cl[2], crst[2];

  cl_uchar is[CX][CY];   // per grid cell: start position in ls, 0x80 if empty
  cl_uchar ls[NSTR];     // list entries: low 7 bits index sc, bit 0x80 marks
                         // the last entry of a cell
  struct line sc[NSTR];  // the lines of OMs
  cl_float rx;           // margin added around a photon segment before it is
                         // mapped to grid cells

  cl_float fldr; // horizontal direction of the flasher led #1
                 // (negative: azimuth is drawn uniformly instead)
  cl_float eff;  // OM efficiency correction

#ifdef ASENS
  cl_float mas;  // maximum angular sensitivity
  cl_float s[ANUM]; // ang. sens. coefficients
                    // (polynomial in the z component of the photon direction)
#endif

#ifdef ROMB
  cl_float cb[2][2]; // 2x2 matrix applied to xy coordinates before the grid lookup
#endif
#ifdef OFLA
  cl_short fla;      // index of an OM that never records hits
                     // (presumably the flashing OM itself)
#endif
#ifdef TILT
  // Ice layer tilt table, see zshift(): lnum columns with lpts depth points each.
  cl_int lnum, lpts, l0;
  cl_float lmin, lrdz, r0; // lowest tabulated z, 1/(z step), and offset
                           // subtracted from the projected xy coordinate
  cl_float lnx, lny;       // direction onto which xy is projected
  cl_float lr[LMAX];       // projected coordinate of each column
  cl_float lp[LMAX][LYRS]; // z shift for each column and depth point
#endif
};

// Larger tables that the propagate kernel reads directly from global memory.
struct datz{
  struct ices w[WNUM];   // ice properties, one entry per wavelength block
  cl_uint rm[MAXRND];    // random generator multipliers, one per slot
  cl_ulong rs[MAXRND];   // random generator state per slot: low 32 bits are the
                         // current value, high 32 bits the carry
};

#ifdef SHRT
// Redirect the generic math names used in the code below to the OpenCL
// native_* builtins. Note that pow maps to native_powr, not native_pow.
#define sin native_sin
#define cos native_cos
#define pow native_powr
#define exp native_exp
#define log native_log
#define sqrt native_sqrt
#define rsqrt native_rsqrt

// Returns a uniform random number strictly between 0 and 1.
// The generator is a multiply-with-carry scheme: s->x is the current value,
// s->y the carry and s->z the multiplier; only x and y are updated.
float xrnd(uint4 * s){
  uint mantissa;
  do{
    const ulong mwc = (ulong) (*s).x * (*s).z + (*s).y;
    (*s).x = (uint) mwc;
    (*s).y = (uint) (mwc >> 32);
    mantissa = (*s).x >> 9;  // keep the top 23 bits
  } while(mantissa==0);      // draw again rather than return exactly 0
  // build a float in [1, 2) from the bits and shift it down to (0, 1)
  return as_float(0x3f800000 | mantissa) - 1.0f;
}

#ifdef LONG
// Draws a gamma-distributed random number for parameter k (presumably the
// shape parameter), taking uniform numbers from the generator state s.
float mrnd(float k, uint4 * s){  // gamma distribution
  float x;
  if(k<1){  // Weibull algorithm
    const float inv=1/k;
    const float lim=(1-k)*pow(k, k/(1-k));
    for(;;){
      const float z=-log(xrnd(s));
      const float e=-log(xrnd(s));
      x=pow(z, inv);
      if(!(z+e<lim+x)) break;  // candidate accepted
    }
  }
  else{  // Cheng's algorithm
    const float b=k-log(4.0f);
    const float l=sqrt(2*k-1);
    const float c=1+log(4.5f);
    for(;;){
      const float u=xrnd(s);
      const float v=xrnd(s);
      const float y=log(v/(1-v))/l;
      x=k*exp(y);
      const float z=u*v*v;
      const float r=b+(k+l)*y-x;
      // reject only when both the quick test and the exact test fail
      if(!(r<4.5f*z-c && r<log(z))) break;
    }
  }
  return x;
}
#endif

// Returns x multiplied by itself.
float square(float x){
  const float sq = x*x;
  return sq;
}

// Deflects the direction *r by a polar angle given through its cosine cs and
// sine si. The azimuth around the old direction is drawn uniformly using one
// random number from s. The result is renormalized and stored back in *r.
void turn(float cs, float si, float3 * r, uint4 * s){
  float3 dir = *r;

  // Squared components. The component with the largest square is swapped into
  // the x slot so that the two normalizations below never divide by zero.
  float3 sq;
  sq.x=dir.x*dir.x; sq.y=dir.y*dir.y; sq.z=dir.z*dir.z;

  int swapped=-1;
  if(sq.y>sq.z){
    if(sq.y>sq.x){ dir.xy=dir.yx; sq.xy=sq.yx; swapped=1; }
  }
  else{
    if(sq.z>sq.x){ dir.xz=dir.zx; sq.xz=sq.zx; swapped=2; }
  }

  // two unit vectors perpendicular to dir
  const float ixy=rsqrt(sq.x+sq.y);
  const float ixz=rsqrt(sq.x+sq.z);
  float3 u, v;
  u.x=-dir.y*ixy; u.y=dir.x*ixy; u.z=0;
  v.x=-dir.z*ixz; v.y=0; v.z=dir.x*ixz;

  // their normalized difference and sum are perpendicular to each other
  float3 dif;
  dif.x=u.x-v.x; dif.y=u.y-v.y; dif.z=u.z-v.z;
  v.x+=u.x; v.y+=u.y; v.z+=u.z;

  const float idif=rsqrt(dif.x*dif.x+dif.y*dif.y+dif.z*dif.z);
  u.x=dif.x*idif; u.y=dif.y*idif; u.z=dif.z*idif;

  const float isum=rsqrt(v.x*v.x+v.y*v.y+v.z*v.z);
  v.x*=isum; v.y*=isum; v.z*=isum;

  // random azimuth
  const float phi=2*FPI*xrnd(s);
  const float cphi=cos(phi);
  const float sphi=sin(phi);

  dir.x=cs*dir.x+si*(cphi*u.x+sphi*v.x);
  dir.y=cs*dir.y+si*(cphi*u.y+sphi*v.y);
  dir.z=cs*dir.z+si*(cphi*u.z+sphi*v.z);

  const float inorm=rsqrt(dir.x*dir.x+dir.y*dir.y+dir.z*dir.z);
  dir.x*=inorm; dir.y*=inorm; dir.z*=inorm;

  // undo the component swap done at the top
  switch(swapped){
  case 1: dir.xy=dir.yx; break;
  case 2: dir.xz=dir.zx; break;
  }

  *r=dir;
}

#ifdef TILT
// Returns the z shift of the ice layers at position r, interpolated linearly
// in depth and in the projected horizontal coordinate from the tilt table in
// d. Returns 0 when the table is empty (lnum==0).
float zshift(__local struct dats * d, float4 r){
  if(d->lnum==0) return 0;

  // fractional depth index and the two bracketing table rows
  const float z=(r.z-d->lmin)*d->lrdz;
  const int lo=min(max(convert_int_sat_rtn(z), 0), d->lpts-2);
  const int hi=lo+1;

  // horizontal coordinate along the tabulated direction
  const float nr=d->lnx*r.x+d->lny*r.y-d->r0;

  for(int j=1; j<LMAX; j++){
    // use the first column pair whose upper column lies beyond nr,
    // or the last pair when nr is past all columns
    if(!(nr<d->lr[j] || j==d->lnum-1)) continue;

    const int i=j-1;
    const float zj=d->lp[j][hi]*(z-lo)+d->lp[j][lo]*(hi-z);
    const float zi=d->lp[i][hi]*(z-lo)+d->lp[i][lo]*(hi-z);
    return ( zj*(nr-d->lr[i]) + zi*(d->lr[j]-nr) )/(d->lr[j]-d->lr[i]);
  }
  return 0;
}
#endif

// Maps the xy point r into the coordinates of the OM lookup grid and stores
// the result in *p. With ROMB the 2x2 matrix d->cb is applied; otherwise the
// point is copied unchanged.
void ctr(__local struct dats * d, float2 r, float2 * p){
#ifdef ROMB
  float2 t;
  t.x=d->cb[0][0]*r.x+d->cb[1][0]*r.y;
  t.y=d->cb[0][1]*r.x+d->cb[1][1]*r.y;
  *p=t;
#else
  *p=r;
#endif
}

// Photon propagation kernel. Each work-item tracks photons one at a time
// through layered ice until they are absorbed or hit an OM.
//
//   num  number of photons to propagate; num==0 is a special call that only
//        resets the hit counter and the abnormal-photon counter in *ed
//   ed   run configuration; copied into local memory, hidx and ab are updated
//        atomically in the global copy
//   ez   ice tables per wavelength block and random generator state
//   eh   output hit buffer with room for ed->hnum entries
//   ep   input track segments, read when ed->type is not positive
//   oms  OM positions
//
// The order of xrnd() calls determines the random sequence, so statements
// that draw random numbers must not be reordered.
__kernel void propagate(__private uint num,
			__global struct dats * ed,
			__global struct datz * ez,
			__global struct hit * eh,
			__global struct photon * ep,
			__constant struct DOM * oms){

  // reset call: clear the counters and do nothing else
  if(num==0){
    ed->hidx=0;
    ed->ab=0;
    write_mem_fence(CLK_GLOBAL_MEM_FENCE);
    return;
  }

  uint4 s;               // random generator state: x=value, y=carry, z=multiplier, w=slot
  float4 r=(float4)(0);  // photon position (xyz) and time (w)
  float3 n=(float3)(0);  // photon direction

  __global struct ices * w;  // ice properties for the wavelength of the current photon
  __local struct dats e;     // work-group local copy of *ed

  const uint lidx=get_local_id(0), lsiz=get_local_size(0);
  const uint idx=get_global_id(0), siz=get_global_size(0);

  // copy *ed into e, 4 bytes at a time, with the work-items of the group sharing the work
  // event_t ev=async_work_group_copy((__local char *) &e, (__global char *) ed, sizeof(e), 0); wait_group_events(1, &ev);
  for(uint i=lidx; 4*i<sizeof(e); i+=lsiz) ((__local int *) &e)[i]=((__global int *) ed)[i];
  barrier(CLK_LOCAL_MEM_FENCE);

  { // load the random generator state of the slot assigned to this work-item
    s.w=idx%e.rsize;
    s.x=ez->rs[s.w];
    s.y=ez->rs[s.w] >> 32;
    s.z=ez->rm[s.w];
  }

  int niw=0, old;      // niw: track segment id stored in hits; old: OM to skip (-1 for none)
  float TOT=0, sca=0;  // TOT: remaining absorption budget (0 means the photon is finished)
                       // sca: remaining distance to the end of the current straight segment

  // i only advances once the current photon is finished (TOT==0); otherwise
  // the next iteration continues with the same photon
  for(uint i=idx; i<num; i+=TOT==0?siz:0){
    if(TOT==0){ // initialize photon
      // pick a wavelength block at random
      w=&ez->w[min(convert_int_sat_rtn(WNUM*xrnd(&s)), WNUM-1)];
      if(e.type>0){
	// flasher or laser: start at the source position at time 0
	r.x=e.r[0];
	r.y=e.r[1];
	r.z=e.r[2];
	r.w=0;

	// ka: width parameter of the angular smearing (0 for none)
	// up: elevation angle of the emission direction in radians
	float ka=0, up=0;
	const float fcv=FPI/180.f;  // degrees to radians

	switch(e.type){
	case 1: ka=square(fcv*9.7f); up=fcv*0.0f; break;
	case 2: ka=square(fcv*9.7f); up=fcv*48.f; break;
	case 3: ka=0.0f;  up=fcv*(90.0f-41.13f);  break;
	case 4: ka=0.0f;  up=fcv*(41.13f-90.0f);  break;
	}

	// azimuth: uniform when fldr is negative, otherwise one of r directions
	// spaced 360/r degrees apart starting at fldr, with r taken from fldr/360
	float xi=xrnd(&s);
	if(e.fldr<0) xi*=2*FPI;
	else{
	  // note: these r and s are local ints that shadow the photon position
	  // and the random generator state inside this block; s*360/r is an
	  // integer division
	  int r=convert_int_sat_rtn(e.fldr/360)+1;
	  int s=convert_int_sat_rtn(xi*r);
	  xi=(e.fldr+s*360/r)*fcv;
	}
	n.x=cos(xi), n.y=sin(xi);
	float np=cos(up); n.z=sin(up);
	n.x*=np; n.y*=np;

	if(ka>0){
	  // smear the direction: draw cos(angle)=1+ka*log(u), retry if below -1
	  do{ xi=1+ka*log(xrnd(&s)); } while (xi<-1);
	  float si=sqrt(1-xi*xi); turn(xi, si, &n, &s);
	}
      }
      else{
	// track photon: OVER consecutive photon indices share one input segment
	struct photon p=ep[i/OVER];
	r=p.r, n.xyz=p.n.xyz;
	float l=p.n.w; niw=p.q;

	// choose the emission point along the segment: uniform over the track
	// length, or from the longitudinal profile when the length is not positive
	if(l>0) l*=xrnd(&s);
#ifdef LONG
	else if(p.b>0) l=p.b*mrnd(p.a, &s);
#endif
	if(l>0){
	  r.w+=e.ocv*l;
	  r.x+=n.x*l; r.y+=n.y*l; r.z+=n.z*l;
	}

#ifdef ANGW
	// with probability 1-p.f apply an extra deflection to the emitter direction
	if(p.f<xrnd(&s)){
	  const float a=0.39f, b=2.61f;
	  const float I=1-exp(-b*pow(2, a));
	  float cs=max(1-pow(-log(1-xrnd(&s)*I)/b, 1/a), -1.0f);
	  float si=sqrt(1-cs*cs); turn(cs, si, &n, &s);
	}
#endif
	// emit at the cherenkov angle relative to the emitter direction
	turn(w->coschr, w->sinchr, &n, &s);
      }
      // exponentially distributed absorption budget
      TOT=-log(xrnd(&s));
    }

    if(sca==0){ // get distance for overburden
      float z = r.z;
#ifdef TILT
      z-= zshift(&e, r);
#endif
      // layer containing the photon, clamped to the table (this i shadows the loop index)
      int i=convert_int_sat_rte((z-e.hmin)*e.rdh);
      if(i<0) i=0; else if(i>=e.size) i=e.size-1;
      float h=e.hmin+i*e.dh; // middle of the layer
      float ahx=n.z<0?h-e.hdh:h+e.hdh;  // layer boundary in the direction of travel

      // exponentially distributed scattering budget
      float SCA=-log(xrnd(&s));

      // scattering and absorption budgets left over at the layer boundary,
      // projected on z and expressed in units of the layer thickness
      float ais=(n.z*SCA-(ahx-z)*w->z[i].sca)*e.rdh;
      float aia=(n.z*TOT-(ahx-z)*w->z[i].abs)*e.rdh;

      // step through the layers until one of the budgets is used up or the
      // edge of the table is reached
      int j=i;
      if(n.z<0) for(; j>0 && ais<0 && aia<0; ahx-=e.dh, ais+=w->z[j].sca, aia+=w->z[j].abs) --j;
      else for(; j<e.size-1 && ais>0 && aia>0; ahx+=e.dh, ais-=w->z[j].sca, aia-=w->z[j].abs) ++j;

      // distances to the scattering point (sca) and to the absorption point (tot);
      // the simple form is used within one layer or for nearly horizontal photons
      float tot;
      if(i==j || fabs(n.z)<XXX) sca=SCA/w->z[j].sca, tot=TOT/w->z[j].abs;
      else sca=(ais*e.dh/w->z[j].sca+ahx-z)/n.z, tot=(aia*e.dh/w->z[j].abs+ahx-z)/n.z;

      // get overburden for distance
      // if absorption comes first the segment ends there and the photon is
      // finished, otherwise keep the absorption budget left after this segment
      if(tot<sca) sca=tot, TOT=0; else TOT=(tot-sca)*w->z[j].abs;
      old=-1;
    }

    int om=-1;      // index of the OM hit on this segment, -1 for none
    float del=sca;  // distance to travel; shortened to the nearest OM hit
    { // sphere
      float2 ri, rf, pi, pf;

      // end points of the segment in xy
      ri.x=r.x; rf.x=r.x+del*n.x;
      ri.y=r.y; rf.y=r.y+del*n.y;

      ctr(&e, ri, &pi); ctr(&e, rf, &pf);

      // bounding box of the segment in grid coordinates, enlarged by e.rx
      ri.x=min(pi.x, pf.x)-e.rx; rf.x=max(pi.x, pf.x)+e.rx;
      ri.y=min(pi.y, pf.y)-e.rx; rf.y=max(pi.y, pf.y)+e.rx;

      // grid cell ranges; the clamping yields an empty range when the box
      // lies completely outside the grid
      int2 xl, xh;

      xl.x=min(max(convert_int_sat_rte((ri.x-e.cl[0])*e.crst[0]), 0), e.cn[0]);
      xh.x=max(min(convert_int_sat_rte((rf.x-e.cl[0])*e.crst[0]), e.cn[0]-1), -1);

      xl.y=min(max(convert_int_sat_rte((ri.y-e.cl[1])*e.crst[1]), 0), e.cn[1]);
      xh.y=max(min(convert_int_sat_rte((rf.y-e.cl[1])*e.crst[1]), e.cn[1]-1), -1);

      // Outer loop: visit every cell (i, j) of the rectangle; the increment
      // uses the GNU "a ?: b" form to advance j and, when j passes xh.y, to
      // reset j and advance i.
      // Inner loop: walk the list of lines attached to the cell, 0x80 ends it.
      for(int i=xl.x, j=xl.y; i<=xh.x && j<=xh.y; ++j<=xh.y?:(j=xl.y,i++)) for(uchar k=e.is[i][j]; k!=0x80; ){
	uchar m=e.ls[k];
	// note: this s shadows the random generator state inside the loop body
	__local struct line * s = & e.sc[m&0x7f];
	k=m&0x80?0x80:k+1;

	// intersect the path with the vertical cylinder of radius s->r around
	// the line, working in the xy plane
	float b=0, c=0, dr;
	dr=s->x-r.x;
	b+=n.x*dr; c+=dr*dr;
	dr=s->y-r.y;
	b+=n.y*dr; c+=dr*dr;

	float np=1-n.z*n.z;  // squared horizontal part of the direction
	float D=b*b-(c-s->r*s->r)*np;
	if(D>=0){
	  D=sqrt(D);
	  // entry and exit path lengths, still multiplied by np
	  float h1=b-D, h2=b+D;
	  if(h2>=0 && h1<=del*np){
	    if(np>XXX){
	      h1/=np, h2/=np;
	      if(h1<0) h1=0; if(h2>del) h2=del;
	    }
	    else h1=0, h2=del;  // nearly vertical: use the whole segment
	    // z range covered while inside the cylinder
	    h1=r.z+n.z*h1, h2=r.z+n.z*h2;
	    float zl, zh;
	    if(n.z>0) zl=h1, zh=h2;
	    else zl=h2, zh=h1;

	    // range of OM indices on this line that can lie in that z range;
	    // the offset along the line grows as z decreases
	    int omin=0, omax=s->max;
	    int n1=s->n-omin+min(omax+1, max(omin, convert_int_sat_rtp(omin-(zh-s->dl-s->h)*s->d)));
	    int n2=s->n-omin+max(omin-1, min(omax, convert_int_sat_rtn(omin-(zl-s->dh-s->h)*s->d)));

	    for(int l=n1; l<=n2; l++) if(l!=old){
#ifdef OFLA
	      if(l==e.fla) continue;
#endif
	      // intersect the path with the sphere of radius R around the OM
	      struct DOM dom = oms[l];
	      float b=0, c=0, dr;
	      dr=dom.r[0]-r.x;
	      b+=n.x*dr; c+=dr*dr;
	      dr=dom.r[1]-r.y;
	      b+=n.y*dr; c+=dr*dr;
	      dr=dom.r[2]-r.z;
	      b+=n.z*dr; c+=dr*dr;
	      float D=b*b-c+e.R2;
	      if(D>=0){
		// keep the nearest hit within the remaining distance
		D=sqrt(D); float h=b-D*e.zR;
		if(h>0 && h<=del){ om=l; del=h; }
	      }
	    }
	  }
	}
      }
    }

    { // advance
      r.x+=del*n.x;
      r.y+=del*n.y;
      r.z+=del*n.z;
      r.w+=del*w->ocm;
      sca-=del;
    }

    // a non-finite budget or distance: count the photon as abnormal and drop it
    if(!isfinite(TOT) || !isfinite(sca)) atom_add(&ed->ab, 1), TOT=0, sca=0, om=-1;

    // one random number per step, used for the acceptance test on a hit or
    // for the scattering angle otherwise
    float xi=xrnd(&s);
    if(om!=-1){
      bool flag=true;  // whether the hit is recorded
      struct hit h; h.i=om; h.t=r.w; h.n=niw; h.z=n.z;

#ifdef ASENS
      // angular sensitivity: polynomial in the z component of the direction
      float sum;
      {
	float x = n.z;
	float y=1;
	sum=e.s[0];
	for(int i=1; i<ANUM; i++){ y*=x; sum+=e.s[i]*y; }
      }

      flag=e.mas*xi<sum;
#endif
      if(e.type>0){
	// flasher or laser: discard hits that arrive earlier than light
	// travelling straight from the source to within OMR of the OM
	float dt=0, dr;
	const struct DOM dom = oms[om];
	for(int i=0; i<3; i++, dt+=dr*dr) dr=dom.r[i]-e.r[i];
	if(h.t<(sqrt(dt)-OMR)*w->ocm) flag=false;
      }

      if(flag){
	// reserve a slot; the counter keeps growing even when the buffer is full
	uint j = atom_add(&ed->hidx, 1);
	if(j<e.hnum) eh[j]=h;
      }

      // with zR==1 the photon ends at the OM; otherwise it continues along
      // the same segment and this OM is skipped on the next step
      if(e.zR==1) TOT=0, sca=0; else old=om;
    }
    else{
      // end of segment without a hit: absorbed if the budget is used up,
      // scattered otherwise
      if(TOT<XXX) TOT=0;
      else{
	// sample the cosine of the scattering angle from a mixture of the
	// two scattering functions selected by e.sf
	if(xi>e.sf){
	  xi=(1-xi)/(1-e.sf);
	  xi=2*xi-1;
	  if(e.g!=0){
	    float ga=(1-e.g2)/(1+e.g*xi);
	    xi=(1+e.g2-ga*ga)/(2*e.g);
	  }
	}
	else{
	  xi/=e.sf;
	  xi=2*pow(xi, e.gr)-1;
	}

	if(xi>1) xi=1; else if(xi<-1) xi=-1;

	float si=sqrt(1-xi*xi);
	turn(xi, si, &n, &s);
      }
    }
  }

  { // store the random generator state back into its slot
    ez->rs[s.w]=s.x | (ulong) s.y << 32;
    barrier(CLK_LOCAL_MEM_FENCE);
    write_mem_fence(CLK_GLOBAL_MEM_FENCE);
  }

}
#endif

// When SHRT is defined, close the macro argument and the kernel_source
// initializer that were opened at the top of the file.
#ifdef SHRT
));
#endif