#include #include #include #include #include #include #include #include #ifndef __CUDACC__ #define XCPU #elif __CUDA_ARCH__ >= 120 #define USMA #endif #ifdef XCPU #include #include #endif using namespace std; namespace xppc{ #include "ini.cxx" #include "pro.cu" void initialize(){ m.set(); } #ifdef XLIB void igeo(float enh){ m.geo(); d.eff*=enh; } #endif unsigned int pmax, pmxo, pn; double xgpu; #ifdef XCPU dats *e; // pointer to a copy of "d" on device int nblk, nthr, ntot; void ini(int type){ rs_ini(); pn=0; xgpu=1; ntot=nblk*nthr; pmax=ntot*NPHO; d.hnum=pmax/HQUO; pmxo=pmax/OVER; { d.hits = q.hits = new hit[d.hnum]; if(type==0) d.pz = q.pz = new photon[pmxo]; #ifdef TALL d.bf = new pbuf[pmax]; #endif } { d.z=&z; e=&d; oms=q.oms; } { unsigned int size=d.rsize, need=seed+1; if(sizedevice=device; checkError(cudaSetDevice(device)); checkError(cudaGetDeviceProperties(&prop, device)); #if CUDART_VERSION >= 3000 checkError(cudaFuncSetCacheConfig(propagate, cudaFuncCachePreferL1)); #endif cudaFuncAttributes attr; checkError(cudaFuncGetAttributes (&attr, propagate)); nblk=prop.multiProcessorCount, nthr=attr.maxThreadsPerBlock; cerr<<"Running on "<0){ cerr<<"Error: TOT was a nan or an inf "<=d.hnum){ d.hidx=d.hnum; cerr<<"Error: data buffer overflow occurred!"<0) checkError(cudaStreamSynchronize(stream)); unsigned int size=pn*sizeof(photon); checkError(cudaMemcpyAsync(d.pz, &q.pz[idx], size, cudaMemcpyHostToDevice, stream)); idx+=pn; } void kernel_f(unsigned int num){ checkError(cudaStreamSynchronize(stream)); if(num>0){ checkError(cudaEventRecord(evt1, stream)); propagate<<< 1, 1, 0, stream >>>(e, 0); checkError(cudaGetLastError()); propagate<<< nblk, nthr, 0, stream >>>(e, num); checkError(cudaGetLastError()); checkError(cudaEventRecord(evt2, stream)); } } void stop(){ fprintf(stderr, "Device time: %2.1f (in-kernel: %2.1f...%2.1f) [ms]\n", deviceTime, threadMin, threadMax); checkError(cudaThreadExit()); } }; vector gpus; void ini(int type){ d.hnum=0; pmax=0, pmxo=0, pn=0; xgpu=gpus.size()>1?XGPU:1; for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->ini(type); if(gpus.size()>1) sv++; d.hnum+=i->d.hnum; pmax+=i->pmax, pmxo+=i->pmxo; } { unsigned int size=d.hnum*sizeof(hit); checkError(cudaMallocHost((void**) &q.hits, size)); } if(d.type==0){ unsigned int size=pmxo*sizeof(photon); checkError(cudaMallocHost((void**) &q.pz, size)); } for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->fr=i->FR=i->pmax/(double)pmax; } } void fin(){ for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->fin(); } checkError(cudaFreeHost(q.hits)); if(d.type==0) checkError(cudaFreeHost(q.pz)); } void listDevices(){ int deviceCount, driver, runtime; cudaGetDeviceCount(&deviceCount); cudaDriverGetVersion(&driver); cudaRuntimeGetVersion(&runtime); fprintf(stderr, "Found %d devices, driver %d, runtime %d\n", deviceCount, driver, runtime); for(int device=0; device0){ d.hidx=0; #ifdef XCPU for(d.blockIdx=0, d.gridDim=nblk, blockDim.x=nthr; d.blockIdx=d.hnum){ d.hidx=d.hnum; cerr<<"Error: data buffer overflow occurred!"<::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->kernel_i(); } #endif cerr<<"photons: "<0) if(xgpu>1){ double sum=0; for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++) i->fr/=i->dt, sum+=i->fr; while(true){ double s1=0, s2=0; for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ const double fx=xgpu*i->FR; if(i->frfr/=sum; if(i->fr>fx) i->fr=fx, s1+=fx; else s2+=i->fr; } sum=s2/(1-s1); if(s1==0) break; } } if(d.type==0){ unsigned int idx=0; for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->kernel_c(old, lround(num*i->fr)/OVER, idx); } } #endif #ifndef XCPU for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->kernel_f(lround(num*i->fr)); } #endif if(old>0) print(); #ifndef XCPU old=num; #endif } void flini(int str, int dom){ int type=1; float r[3]={0, 0, 0}; if(str<0){ type=2; str=-str; } if(str==0) switch(dom){ case 1: type=3; r[0]=544.07; r[1]=55.89; r[2]=136.86; break; case 2: type=4; r[0]=11.87; r[1]=179.19; r[2]=-205.64; break; } else for(int n=0; n0; i-=pmax/xgpu) kernel(min(i, (long long)(pmax/xgpu))); #ifndef XCPU kernel(0); #endif } void flasher(int str, int dom, unsigned long long num, int itr){ flini(str, dom); for(int j=0; j0) printf("\n"); } fin(); } #ifdef XCPU void start(){} void stop(){} void choose(int device){ sv+=device; seed=device; nblk=NBLK, nthr=NTHR; } void listDevices(){} #else void start(){ cudaSetDeviceFlags(cudaDeviceBlockingSync); } void stop(){ fprintf(stderr, "\n"); for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ if(gpus.size()>1) checkError(cudaSetDevice(i->device)); i->stop(); } } void choose(int device){ if(device<0){ int deviceCount; cudaGetDeviceCount(&deviceCount); for(int device=0; device1) device=atoi(arg_a[1]); initialize(); choose(device); fprintf(stderr, "Processing f2k muons from stdin on device %d\n", device); f2k(); } else{ int str=0, dom=0, device=0, itr=0; unsigned long long num=1000000ULL; if(arg_c>1) str=atoi(arg_a[1]); if(arg_c>2) dom=atoi(arg_a[2]); if(arg_c>3){ num=(unsigned long long) atof(arg_a[3]); char * sub = strchr(arg_a[3], '*'); if(sub!=NULL) itr=(int) atof(++sub); } if(arg_c>4) device=atoi(arg_a[4]); initialize(); choose(device); fprintf(stderr, "Running flasher simulation on device %d\n", device); flasher(str, dom, num, itr); } stop(); } #endif