#include<stdio.h>#include<assert.h>inlinecudaError_tcheckCuda(cudaError_tresult){if(result!=cudaSuccess){fprintf(stderr,"CUDA Runtime Error: %s\n",cudaGetErrorString(result));assert(result==cudaSuccess);}returnresult;}voidinitWith(floatnum,float*a,intN){for(inti=0;i<N;++i){a[i]=num;}}__global__voidaddVectorsInto(float*result,float*a,float*b,intN){intinitIndex=threadIdx.x+blockIdx.x*blockDim.x;intgridStride=gridDim.x*blockDim.x;for(inti=initIndex;i<N;i+=gridStride){result[i]=a[i]+b[i];}}voidcheckElementsAre(floattarget,float*array,intN){for(inti=0;i<N;i++){if(array[i]!=target){printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n",i,array[i],target);exit(1);}}printf("SUCCESS! All values added correctly.\n");}intmain(){constintN=2<<20;size_tsize=N*sizeof(float);float*a;float*b;float*c;cudaMallocManaged(&a,size);cudaMallocManaged(&b,size);cudaMallocManaged(&c,size);initWith(3,a,N);initWith(4,b,N);initWith(0,c,N);size_tthreads_per_block=1024;size_tnumber_of_blocks=(N+threads_per_block-1)/threads_per_block;addVectorsInto<<<32,1024>>>(c,a,b,N);//addVectorsInto<<<number_of_blocks,threads_per_block>>>(c, a, b, N);
checkCuda(cudaGetLastError());checkCuda(cudaDeviceSynchronize());checkElementsAre(7,c,N);cudaFree(a);cudaFree(b);cudaFree(c);}
2 加速SAXPY
#include<stdio.h>#include<assert.h>#define N 2048 * 2048 // Number of elements in each vector
/*
* Optimize this already-accelerated codebase. Work iteratively,
* and use nsys to support your work.
*
* Aim to profile `saxpy` (without modifying `N`) running under
* 20us.
*
* Some bugs have been placed in this codebase for your edification.
*/inlinecudaError_tcheckCuda(cudaError_tresult){if(result!=cudaSuccess){fprintf(stderr,"CUDA Runtime Error: %s\n",cudaGetErrorString(result));assert(result==cudaSuccess);}returnresult;}__global__voidsaxpy(float*a,float*b,float*c){inttid=blockIdx.x*blockDim.x+threadIdx.x;intstride=gridDim.x*blockDim.x;for(inti=tid;i<N;i+=stride){c[tid]=2*a[tid]+b[tid];}}intmain(){intdeviceId;intnumberOfSMs;cudaGetDevice(&deviceId);cudaDeviceGetAttribute(&numberOfSMs,cudaDevAttrMultiProcessorCount,deviceId);float*a,*b,*c;intsize=N*sizeof(float);// The total number of bytes per vector
cudaMallocManaged(&a,size);cudaMallocManaged(&b,size);cudaMallocManaged(&c,size);// Initialize memory
for(inti=0;i<N;++i){a[i]=2.0;b[i]=1.0;c[i]=0.0;}cudaMemPrefetchAsync(a,size,deviceId);cudaMemPrefetchAsync(b,size,deviceId);cudaMemPrefetchAsync(c,size,deviceId);intthreads_per_block=256;intnumber_of_blocks=numberOfSMs*32;saxpy<<<number_of_blocks,threads_per_block>>>(a,b,c);checkCuda(cudaGetLastError());checkCuda(cudaDeviceSynchronize());// Print out the first and last 5 values of c for a quality check
for(inti=0;i<5;++i)printf("c[%d] = %f, ",i,c[i]);printf("\n");for(inti=N-5;i<N;++i)printf("c[%d] = %f, ",i,c[i]);printf("\n");cudaFree(a);cudaFree(b);cudaFree(c);}
3 N-body
#include<math.h>#include<stdio.h>#include<stdlib.h>#include"timer.h"#include"files.h"#define SOFTENING 1e-9f
#include<assert.h>inlinecudaError_tcheckCuda(cudaError_tresult){if(result!=cudaSuccess){fprintf(stderr,"CUDA Runtime Error: %s\n",cudaGetErrorString(result));assert(result==cudaSuccess);}returnresult;}/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/typedefstruct{floatx,y,z,vx,vy,vz;}Body;/*
* Calculate the gravitational impact of all bodies in the system
* on all others.
*/__global__voidbodyForce(Body*p,floatdt,intn){intindex=blockIdx.x*blockDim.x+threadIdx.x;intstride=gridDim.x*blockDim.x;for(inti=index;i<n;i+=stride){floatFx=0.0f;floatFy=0.0f;floatFz=0.0f;for(intj=0;j<n;j++){floatdx=p[j].x-p[i].x;floatdy=p[j].y-p[i].y;floatdz=p[j].z-p[i].z;floatdistSqr=dx*dx+dy*dy+dz*dz+SOFTENING;floatinvDist=rsqrtf(distSqr);floatinvDist3=invDist*invDist*invDist;Fx+=dx*invDist3;Fy+=dy*invDist3;Fz+=dz*invDist3;}p[i].vx+=dt*Fx;p[i].vy+=dt*Fy;p[i].vz+=dt*Fz;}}intmain(constintargc,constchar**argv){// The assessment will test against both 2<11 and 2<15.
// Feel free to pass the command line argument 15 when you gernate ./nbody report files
intnBodies=2<<11;if(argc>1)nBodies=2<<atoi(argv[1]);// The assessment will pass hidden initialized values to check for correctness.
// You should not make changes to these files, or else the assessment will not work.
constchar*initialized_values;constchar*solution_values;if(nBodies==2<<11){initialized_values="files/initialized_4096";solution_values="files/solution_4096";}else{// nBodies == 2<<15
initialized_values="files/initialized_65536";solution_values="files/solution_65536";}if(argc>2)initialized_values=argv[2];if(argc>3)solution_values=argv[3];constfloatdt=0.01f;// Time step
constintnIters=10;// Simulation iterations
intbytes=nBodies*sizeof(Body);float*buf;cudaMallocManaged(&buf,bytes);Body*p=(Body*)buf;read_values_from_file(initialized_values,buf,bytes);doubletotalTime=0.0;/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/intdeviceId;intnumberOfSMs;cudaGetDevice(&deviceId);cudaDeviceGetAttribute(&numberOfSMs,cudaDevAttrMultiProcessorCount,deviceId);intthreads_per_block=128;intnumber_of_blocks=numberOfSMs*25;for(intiter=0;iter<nIters;iter++){StartTimer();/*
* You will likely wish to refactor the work being done in `bodyForce`,
* and potentially the work to integrate the positions.
*/bodyForce<<<threads_per_block,number_of_blocks>>>(p,dt,nBodies);// compute interbody forces
checkCuda(cudaGetLastError());checkCuda(cudaDeviceSynchronize());/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/for(inti=0;i<nBodies;i++){// integrate position
p[i].x+=p[i].vx*dt;p[i].y+=p[i].vy*dt;p[i].z+=p[i].vz*dt;}constdoubletElapsed=GetTimer()/1000.0;totalTime+=tElapsed;}doubleavgTime=totalTime/(double)(nIters);floatbillionsOfOpsPerSecond=1e-9*nBodies*nBodies/avgTime;write_values_to_file(solution_values,buf,bytes);// You will likely enjoy watching this value grow as you accelerate the application,
// but beware that a failure to correctly synchronize the device might result in
// unrealistically high values.
printf("%0.3f Billion Interactions / second",billionsOfOpsPerSecond);cudaFree(buf);}