#include #include #include #include #include #include #ifdef XCPU #include #include #endif using namespace std; namespace xppc{ #include "ini.cxx" #include "pro.cu" dats *eh; // copy of "d" for device dats *ed; // pointer to structure on device #ifdef XLIB void igeo(){ m.geo(); } #endif unsigned int pmax, pmxo, pn; int nblk; #ifdef XCPU void ini(int type){ { nblk=WNUM; pmax=nblk*NTHR*NPHO; pmxo=pmax/OVER; } { eh = new dats; d.hits = new hit[d.hnum]; if(type==0) d.pz = new photon[pmxo]; } { *eh=d; ed=eh; oms=d.oms; pn=0; eh->type=type; eh->hidx=0; for(int m=0; m<3; m++) eh->r[m]=0; } } void fin(){ if(eh->type==0) delete d.pz; delete d.hits; delete eh; } #else float deviceTime=0, threadTime=0; cudaDeviceProp prop; cudaStream_t stream; cudaEvent_t evt1, evt2; void checkError(cudaError result){ if(result!=cudaSuccess){ cerr<<"CUDA Error: "<0?prop.multiProcessorCount:WNUM; pmax=nblk*NTHR*NPHO; pmxo=pmax/OVER; } { checkError(cudaStreamCreate(&stream)); checkError(cudaEventCreateWithFlags(&evt1, cudaEventBlockingSync)); checkError(cudaEventCreateWithFlags(&evt2, cudaEventBlockingSync)); checkError(cudaMallocHost((void**) &eh, sizeof(dats))); *eh=d; checkError(cudaMallocHost((void**) &d.hits, d.hnum*sizeof(hit))); } { pn=0; eh->type=type; eh->hidx=0; for(int m=0; m<3; m++) eh->r[m]=0; } { unsigned int size=d.rsize, need=nblk*NTHR; if(sizersize=need; } { unsigned int size=eh->rsize*sizeof(unsigned int); tot+=size; checkError(cudaMalloc((void**) &eh->rm, size)); checkError(cudaMemcpy(eh->rm, d.rm, size, cudaMemcpyHostToDevice)); } { unsigned int size=eh->rsize*sizeof(unsigned long long); tot+=size; checkError(cudaMalloc((void**) &eh->rs, size)); checkError(cudaMemcpy(eh->rs, d.rs, size, cudaMemcpyHostToDevice)); } for(int i=0; iw[i], size)); checkError(cudaMemcpy(eh->w[i], d.w[i], size, cudaMemcpyHostToDevice)); } { unsigned int size=d.hnum*sizeof(hit); tot+=size; checkError(cudaMalloc((void**) &eh->hits, size)); } if(eh->type==0){ unsigned int size=pmxo*sizeof(photon); tot+=size; checkError(cudaMallocHost((void**) &d.pz, size)); checkError(cudaMalloc((void**) &eh->pz, size)); } { unsigned int size=d.gsize*sizeof(DOM); checkError(cudaMemcpyToSymbol(oms, d.oms, size)); } { unsigned int size=sizeof(dats); tot+=size; checkError(cudaMalloc((void**) &ed, size)); checkError(cudaMemcpy(ed, eh, size, cudaMemcpyHostToDevice)); } cerr<<"Total GPU memory usage: "<rm)); checkError(cudaFree(eh->rs)); for(int i=0; iw[i])); checkError(cudaFree(eh->hits)); checkError(cudaFree(ed)); if(eh->type==0){ checkError(cudaFree(eh->pz)); checkError(cudaFreeHost(d.pz)); } checkError(cudaFreeHost(d.hits)); checkError(cudaFreeHost(eh)); checkError(cudaEventDestroy(evt1)); checkError(cudaEventDestroy(evt2)); checkError(cudaStreamDestroy(stream)); } void listDevices(){ int deviceCount, driver, runtime; cudaGetDeviceCount(&deviceCount); cudaDriverGetVersion(&driver); cudaRuntimeGetVersion(&runtime); fprintf(stderr, "\nFound %d devices, driver %d, runtime %d\n", deviceCount, driver, runtime); for(int device=0; devicehidx=0; for(blockIdx.x=0, gridDim.x=nblk, blockDim.x=NTHR; blockIdx.x>>(ed, num); checkError(cudaGetLastError()); checkError(cudaEventRecord(evt2, stream)); checkError(cudaMemcpyAsync(eh, ed, 6*sizeof(int), cudaMemcpyDeviceToHost, stream)); checkError(cudaStreamSynchronize(stream)); float dt; checkError(cudaEventElapsedTime(&dt, evt1, evt2)); deviceTime+=dt; if(eh->ab>0) cerr<<"Error: TOT was a nan or an inf "<ab<<" times!"<dt/prop.clockRate; else threadTime=-1; } } #endif cerr<<"photons: "<hidx<hidx>=eh->hnum){ eh->hidx=eh->hnum; cerr<<"Error: data buffer overflow occurred!"<hidx*sizeof(hit); checkError(cudaMemcpyAsync(d.hits, eh->hits, size, cudaMemcpyDeviceToHost, stream)); checkError(cudaStreamSynchronize(stream)); } #endif print(); } void flasher(int str, int dom, unsigned long long num, int itr){ int type=1; float r[3]={0, 0, 0}; if(str<0){ type=2; str=-str; } if(str==0) switch(dom){ case 1: type=3; r[0]=544.07; r[1]=55.89; r[2]=136.86; break; case 2: type=4; r[0]=11.87; r[1]=179.19; r[2]=-205.64; break; } else for(int n=0; nhidx=0; for(int m=0; m<3; m++) eh->r[m]=r[m]; #ifndef XCPU checkError(cudaMemcpyAsync(ed, eh, 4*sizeof(int), cudaMemcpyHostToDevice, stream)); #endif for(int j=0; j0; i-=pmax) kernel(min(i, (long long) pmax)); if(itr>0) printf("\n"); } fin(); } void output(){ #ifndef XCPU { unsigned int size=pn*sizeof(photon); checkError(cudaMemcpyAsync(eh->pz, d.pz, size, cudaMemcpyHostToDevice, stream)); } #endif kernel(pn*OVER); pn=0; } #ifdef XCPU void start(){} void stop(){} void choose(int device){ #ifdef MKOW srand(device); #endif seed=device; } void listDevices(){} #else void start(){ cudaSetDeviceFlags(cudaDeviceBlockingSync); } void stop(){ fprintf(stderr, "\nDevice time: %2.1f (in-kernel: %2.1f) [ms]\n", deviceTime, threadTime); checkError(cudaThreadExit()); } void choose(int device){ #ifdef MKOW srand(device); #endif checkError(cudaSetDevice(device)); checkError(cudaGetDeviceProperties(&prop, device)); } #endif } #ifndef XLIB using namespace xppc; int main(int arg_c, char *arg_a[]){ start(); if(arg_c<=1){ listDevices(); fprintf(stderr, "\nUse: %s [device] (f2k muons)\n" " %s [str] [om] [num] [device] (flasher)\n", arg_a[0], arg_a[0]); } else if(0==strcmp(arg_a[1], "-")){ ices * w = d.w[d.nfla]; cerr<<"For wv slice "<<(d.nfla+1)<<"/"<z[i].abs<<" "<z[i].sca<1) device=atoi(arg_a[1]); choose(device); fprintf(stderr, "Processing f2k muons from stdin on device %d\n", device); f2k(); } else{ int str=0, dom=0, device=0, itr=0; unsigned long long num=1000000ULL; if(arg_c>1) str=atoi(arg_a[1]); if(arg_c>2) dom=atoi(arg_a[2]); if(arg_c>3){ num=(unsigned long long) atof(arg_a[3]); char * sub = strchr(arg_a[3], '*'); if(sub!=NULL) itr=(int) atof(++sub); } if(arg_c>4) device=atoi(arg_a[4]); choose(device); fprintf(stderr, "Running flasher simulation on device %d\n", device); flasher(str, dom, (unsigned long long)(num*(long double)(maxe*EFF)), itr); } stop(); } #endif