#include #include #include #include #include #include #include #ifndef __CUDACC__ #define XCPU #elif __CUDA_ARCH__ >= 120 #define USMA #endif #ifdef XCPU #include #include #endif using namespace std; namespace xppc{ #include "ini.cxx" #include "pro.cu" dats *eh; // copy of "d" for device dats *ed; // pointer to structure on device #ifdef XLIB void igeo(float enh){ m.geo(); d.eff*=enh; } #endif unsigned int pmax, pmxo, pn; int nblk, nthr; void ini(){ pmax=nblk*nthr*NPHO; d.hnum=pmax/HQUO; pmxo=pmax/OVER; } #ifdef XCPU void ini(int type){ ini(); { eh = new dats; d.hits = new hit[d.hnum]; if(type==0) d.pz = new photon[pmxo]; } { *eh=d; ed=eh; oms=d.oms; pn=0; eh->type=type; eh->hidx=0; for(int m=0; m<3; m++) eh->r[m]=0; } { unsigned int size=d.rsize, need=seed+1; if(sizetype==0) delete d.pz; delete d.hits; delete eh; } #else float deviceTime=0, threadMin=0, threadMax=0; cudaDeviceProp prop; cudaStream_t stream; cudaEvent_t evt1, evt2; void checkError(cudaError result){ if(result!=cudaSuccess){ cerr<<"CUDA Error: "<type=type; eh->hidx=0; for(int m=0; m<3; m++) eh->r[m]=0; } { unsigned int size=d.rsize, need=nblk*nthr; if(sizersize=need; } int tot=0; { unsigned int size=eh->rsize*sizeof(unsigned int); tot+=size; checkError(cudaMalloc((void**) &eh->rm, size)); checkError(cudaMemcpy(eh->rm, d.rm, size, cudaMemcpyHostToDevice)); } { unsigned int size=eh->rsize*sizeof(unsigned long long); tot+=size; checkError(cudaMalloc((void**) &eh->rs, size)); checkError(cudaMemcpy(eh->rs, d.rs, size, cudaMemcpyHostToDevice)); } for(int i=0; iw[i], size)); checkError(cudaMemcpy(eh->w[i], d.w[i], size, cudaMemcpyHostToDevice)); } { unsigned int size=d.hnum*sizeof(hit); tot+=size; checkError(cudaMalloc((void**) &eh->hits, size)); } if(eh->type==0){ unsigned int size=pmxo*sizeof(photon); tot+=size; checkError(cudaMallocHost((void**) &d.pz, size)); checkError(cudaMalloc((void**) &eh->pz, size)); } { unsigned int size=d.gsize*sizeof(DOM); checkError(cudaMemcpyToSymbol(oms, d.oms, size)); } { unsigned int size=sizeof(dats); tot+=size; checkError(cudaMalloc((void**) &ed, size)); checkError(cudaMemcpy(ed, eh, size, cudaMemcpyHostToDevice)); } cerr<<"Total GPU memory usage: "<rm)); checkError(cudaFree(eh->rs)); for(int i=0; iw[i])); checkError(cudaFree(eh->hits)); checkError(cudaFree(ed)); if(eh->type==0){ checkError(cudaFree(eh->pz)); checkError(cudaFreeHost(d.pz)); } checkError(cudaFreeHost(d.hits)); checkError(cudaFreeHost(eh)); checkError(cudaEventDestroy(evt1)); checkError(cudaEventDestroy(evt2)); checkError(cudaStreamDestroy(stream)); } void listDevices(){ int deviceCount, driver, runtime; cudaGetDeviceCount(&deviceCount); cudaDriverGetVersion(&driver); cudaRuntimeGetVersion(&runtime); fprintf(stderr, "\nFound %d devices, driver %d, runtime %d\n", deviceCount, driver, runtime); for(int device=0; device0){ #ifdef XCPU eh->hidx=0; for(eh->blockIdx=0, eh->gridDim=nblk, blockDim.x=nthr; eh->blockIdxgridDim; eh->blockIdx++) for(threadIdx.x=0; threadIdx.xab>0){ cerr<<"Error: TOT was a nan or an inf "<ab<<" times! Bad MP"; for(int i=0; iab, 4); i++) cerr<<" #"<bmp[i]; cerr<mp!=eh->gridDim){ cerr<<"Error: did not encounter MP #"<blockIdx<tn/(float)prop.clockRate; threadMax+=eh->tx/(float)prop.clockRate; } else threadMin=-1, threadMax=-1; } #endif cerr<<"photons: "<hidx<hidx>=eh->hnum){ eh->hidx=eh->hnum; cerr<<"Error: data buffer overflow occurred!"<hidx*sizeof(hit); checkError(cudaMemcpyAsync(d.hits, eh->hits, size, cudaMemcpyDeviceToHost, stream)); #endif } #ifndef XCPU checkError(cudaStreamSynchronize(stream)); if(num>0){ checkError(cudaEventRecord(evt1, stream)); propagate<<< 1, 1, 0, stream >>>(ed, 0); checkError(cudaGetLastError()); propagate<<< nblk, nthr, 0, stream >>>(ed, num); checkError(cudaGetLastError()); checkError(cudaEventRecord(evt2, stream)); } #endif if(old>0) print(); #ifndef XCPU old=num; #endif } void flini(int str, int dom){ int type=1; float r[3]={0, 0, 0}; if(str<0){ type=2; str=-str; } if(str==0) switch(dom){ case 1: type=3; r[0]=544.07; r[1]=55.89; r[2]=136.86; break; case 2: type=4; r[0]=11.87; r[1]=179.19; r[2]=-205.64; break; } else for(int n=0; nhidx=0; for(int m=0; m<3; m++) eh->r[m]=r[m]; #ifndef XCPU checkError(cudaMemcpyAsync(ed, eh, 4*sizeof(int), cudaMemcpyHostToDevice, stream)); #endif } void flone(unsigned long long num){ for(long long i=num; i>0; i-=pmax) kernel(min(i, (long long) pmax)); #ifndef XCPU kernel(0); #endif } void flasher(int str, int dom, unsigned long long num, int itr){ flini(str, dom); for(int j=0; j0) printf("\n"); } fin(); } void output(){ #ifndef XCPU { if(old>0) checkError(cudaStreamSynchronize(stream)); unsigned int size=pn*sizeof(photon); checkError(cudaMemcpyAsync(eh->pz, d.pz, size, cudaMemcpyHostToDevice, stream)); } #endif kernel(pn*OVER); pn=0; #ifndef XCPU flnd=flne; #endif } #ifdef XCPU void start(){} void stop(){} void choose(int device){ sv+=device; seed=device; nblk=NBLK, nthr=NTHR; } void listDevices(){} #else void start(){ cudaSetDeviceFlags(cudaDeviceBlockingSync); } void stop(){ fprintf(stderr, "\nDevice time: %2.1f (in-kernel: %2.1f...%2.1f) [ms]\n", deviceTime, threadMin, threadMax); checkError(cudaThreadExit()); } void choose(int device){ sv+=device; checkError(cudaSetDevice(device)); checkError(cudaGetDeviceProperties(&prop, device)); #if CUDART_VERSION >= 3000 checkError(cudaFuncSetCacheConfig(propagate, cudaFuncCachePreferL1)); #endif cudaFuncAttributes attr; checkError(cudaFuncGetAttributes (&attr, propagate)); nblk=prop.multiProcessorCount, nthr=attr.maxThreadsPerBlock; cerr<<"Running on "<z[i].abs<<" "<z[i].sca<1) device=atoi(arg_a[1]); choose(device); fprintf(stderr, "Processing f2k muons from stdin on device %d\n", device); f2k(); } else{ int str=0, dom=0, device=0, itr=0; unsigned long long num=1000000ULL; if(arg_c>1) str=atoi(arg_a[1]); if(arg_c>2) dom=atoi(arg_a[2]); if(arg_c>3){ num=(unsigned long long) atof(arg_a[3]); char * sub = strchr(arg_a[3], '*'); if(sub!=NULL) itr=(int) atof(++sub); } if(arg_c>4) device=atoi(arg_a[4]); choose(device); fprintf(stderr, "Running flasher simulation on device %d\n", device); flasher(str, dom, (unsigned long long)llroundf(num*(long double)d.eff), itr); } stop(); } #endif