Download presentation
Presentation is loading. Please wait.
1
1 ITCS 6/8010 CUDA Programming, UNC-Charlotte, B. Wilkinson, March 3, 2011. ConstantMemTiming.ppt. Measuring Performance of Constant Memory. These notes will introduce: results of an experiment using constant memory.
2
2 Program: The test program simply adds two vectors, A and B, together to produce a third vector, C. One version uses constant memory for A and B. Another version uses regular global memory for A and B. Note: the maximum available constant memory on the GPU (all compute capabilities so far) is 64 Kbytes total.
3
// Slide 3 -- Code: array declarations

// Number of elements in each vector. With 4-byte ints, two vectors of this
// size occupy 2 * 8192 * 4 bytes = 64 Kbytes, which is the total constant
// memory available, so this is the max size allowed for two vectors in
// constant memory.
#define N 8192

// Constants held in constant memory
__device__ __constant__ int dev_a_Cont[N];
__device__ __constant__ int dev_b_Cont[N];

// Regular global memory for comparison
__device__ int dev_a[N];
__device__ int dev_b[N];

// Result in device global memory
__device__ int dev_c[N];
4
// Slide 4 -- kernel routines

// Vector add using constant memory: dev_c[i] = dev_a_Cont[i] + dev_b_Cont[i].
//
// Takes no arguments; it works directly on the dev_* device arrays declared
// elsewhere in the program. Expects a 1D launch: each thread handles the
// element at its global thread index, and threads whose index is >= N do
// nothing. The launch must supply at least N threads in total for every
// element of dev_c to be written.
__global__ void add_Cont() {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        dev_c[tid] = dev_a_Cont[tid] + dev_b_Cont[tid];
    }
}  // this closing brace was missing on the slide

// Vector add NOT using constant memory: dev_c[i] = dev_a[i] + dev_b[i].
//
// Identical to add_Cont except that the operands are read from the regular
// global-memory arrays dev_a and dev_b. Same launch expectations: 1D launch,
// at least N threads in total, out-of-range threads are skipped by the guard.
__global__ void add() {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        dev_c[tid] = dev_a[tid] + dev_b[tid];
    }
}  // this closing brace was missing on the slide
5
// Slide 5
/*----------- GPU using constant memory ------------------------*/
printf("GPU using constant memory\n");

for (int i = 0; i < N; i++) {  // load arrays with some numbers
    a[i] = i;
    b[i] = i * 2;
}

// Copy vectors to constant memory. The fourth argument (0) is the byte offset
// into the symbol and must be given before the copy kind.
cudaMemcpyToSymbol(dev_a_Cont, a, N * sizeof(int), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(dev_b_Cont, b, N * sizeof(int), 0, cudaMemcpyHostToDevice);

cudaEventRecord(start, 0);  // start time

// NOTE(review): the <<<grid, block>>> execution configuration was lost when
// the slide was extracted ("add_Cont >>();"). This one is a reconstruction
// that gives every one of the N elements a thread; the configuration used for
// the published timings is unknown -- confirm against the original program.
add_Cont<<<(N + 255) / 256, 256>>>();  // does not need array ptrs

// wait for all threads to complete
// (cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it)
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);  // end time

// A kernel launch returns no status itself; pick up any launch or execution
// error here, outside the timed region.
cudaError_t kernel_err_Cont = cudaGetLastError();
if (kernel_err_Cont != cudaSuccess) {
    printf("add_Cont failed: %s\n", cudaGetErrorString(kernel_err_Cont));
}

// Copy operands and result back to the host. The symbols are passed directly:
// naming a symbol with a character string ("dev_c") is no longer supported by
// the CUDA runtime.
cudaMemcpyFromSymbol(a, dev_a_Cont, N * sizeof(int), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(b, dev_b_Cont, N * sizeof(int), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(c, dev_c, N * sizeof(int), 0, cudaMemcpyDeviceToHost);

cudaEventSynchronize(stop);  // make sure the stop event has completed
cudaEventElapsedTime(&elapsed_time_Cont, start, stop);

// Slide callouts (their arrow targets were lost in extraction):
//   "Watch for this zero. I missed it off and it took some time to spot"
//     -- presumably the 0 offset argument of the symbol copies; TODO confirm.
//   "Missed originally"
//     -- target unknown.
6
// Slide 6
/*----------- GPU not using constant memory ------------------------*/
// Fixed: the message previously read "GPU using constant memory", copied
// unchanged from the constant-memory section.
printf("GPU not using constant memory\n");

for (int i = 0; i < N; i++) {  // load arrays with some numbers
    a[i] = i;
    b[i] = i * 2;
}

// Copy vectors to regular global memory. The slide copied into dev_a_Cont and
// dev_b_Cont here, but the add kernel reads dev_a and dev_b, so the kernel
// under test never saw the input data. The fourth argument (0) is the byte
// offset into the symbol and must be given before the copy kind.
cudaMemcpyToSymbol(dev_a, a, N * sizeof(int), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(dev_b, b, N * sizeof(int), 0, cudaMemcpyHostToDevice);

cudaEventRecord(start, 0);  // start time

// NOTE(review): the <<<grid, block>>> execution configuration was lost when
// the slide was extracted ("add >>();"). This one is a reconstruction that
// gives every one of the N elements a thread; the configuration used for the
// published timings is unknown -- confirm against the original program.
add<<<(N + 255) / 256, 256>>>();  // does not need array ptrs

// wait for all threads to complete
// (cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it)
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);  // end time

// A kernel launch returns no status itself; pick up any launch or execution
// error here, outside the timed region.
cudaError_t kernel_err = cudaGetLastError();
if (kernel_err != cudaSuccess) {
    printf("add failed: %s\n", cudaGetErrorString(kernel_err));
}

// Copy operands and result back to the host from the global-memory arrays.
// The symbols are passed directly: naming a symbol with a character string
// ("dev_c") is no longer supported by the CUDA runtime.
cudaMemcpyFromSymbol(a, dev_a, N * sizeof(int), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(b, dev_b, N * sizeof(int), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(c, dev_c, N * sizeof(int), 0, cudaMemcpyDeviceToHost);

cudaEventSynchronize(stop);  // make sure the stop event has completed
cudaEventElapsedTime(&elapsed_time, start, stop);
7
7 Speedup around 1.2 (20%) after first launch. 1st launch: 1.6; 2nd run: 1.217; 3rd run: 1.225.
8
Questions
Similar presentations
© 2024 SlidePlayer.com. Inc.
All rights reserved.