add_matrix<<<dimGrid, dimBlock>>>(ad, bd, cd, N); cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost); for (i = 0; i < N; i++)

Presentation is loading. Please wait.

Presentation is loading. Please wait.

Lecture 6: Shared-memory Computing with GPU. Free download NVIDIA CUDA https://developer.nvidia.com/cuda-downloads CUDA programming on Visual Studio.

Similar presentations


Presentation on theme: "Lecture 6: Shared-memory Computing with GPU. Free download NVIDIA CUDA https://developer.nvidia.com/cuda-downloads CUDA programming on Visual Studio."— Presentation transcript:

1 Lecture 6: Shared-memory Computing with GPU

2 Free download NVIDIA CUDA https://developer.nvidia.com/cuda-downloads CUDA programming on Visual Studio 2010 START: download NVIDIA CUDA

3 #include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>   // printf — the header name was stripped by the HTML extraction
#include <cstdlib>  // EXIT_SUCCESS

const int N = 1024;        // matrix dimension (N x N)
const int blocksize = 16;  // block edge: 16x16 = 256 threads per block

// Element-wise addition of two N x N matrices: c = a + b.
// Expects a 2D grid of 2D blocks; each thread computes one element.
// i indexes along x, j along y; index = i + j*N assumes row-major storage.
__global__ void add_matrix(float* a, float* b, float* c, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j * N;
    if (i < N && j < N)  // guard: required whenever N is not a multiple of blocksize
        c[index] = a[index] + b[index];
}

int main()
{
    float* a = new float[N * N];
    float* b = new float[N * N];
    float* c = new float[N * N];
    int i, j;
    for (int i = 0; i < N * N; ++i) {
        a[i] = 1.0f;
        b[i] = 3.5f;
    }

    float *ad, *bd, *cd;
    const int size = N * N * sizeof(float);
    cudaMalloc((void**)&ad, size);
    cudaMalloc((void**)&bd, size);
    cudaMalloc((void**)&cd, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(blocksize, blocksize);
    dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
    // FIX: the launch configuration <<<...>>> was eaten by the HTML extraction
    // (it appeared as "add_matrix >>(...)", which does not compile).
    add_matrix<<<dimGrid, dimBlock>>>(ad, bd, cd, N);
    cudaError_t err = cudaGetLastError();  // launch-configuration errors surface here
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    // Blocking D2H copy also synchronizes with the kernel on the default stream.
    cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++)
            // FIX: c[i,j] used the C comma operator and always read c[j];
            // the row-major element is c[i*N + j].
            printf("%f", c[i * N + j]);
        printf("\n");
    }

    // FIX: device allocations were leaked — free them before exit.
    cudaFree(ad);
    cudaFree(bd);
    cudaFree(cd);
    delete[] a;
    delete[] b;  // FIX: was `delete b` — undefined behavior for new[] arrays
    delete[] c;
    return EXIT_SUCCESS;
}
START: Matrix Addition Global memory (i,j) height dimBlock.y width dimBlock.x threadIdx.x threadIdx.y

4

5

6

7

8

9

10

11

12

13

14 Memory Allocation Example

15 (xIdx, yIdy) height dimBlock.y width dimBlock.x threadIdx.x threadIdx.y

16 Memory Allocation Example

17

18 (3) Read from the shared memory & write to global memory (1) Read from global memory & write to block shared memory (2) Transposed address (X,Y) height yBlock width xBlock Global memory (threadIdx.y, threadIdx.x) shared memory (threadIdx.x, threadIdx.y) (1) (2)

19 Memory Allocation Example (X,Y) height yBlock width xBlock (threadIDx.x, threadIDx.y) (threadIDx.y, threadIDx.x) (y,x) height yBlock width xBlock Global memory shared memory Global memory (1) (2) (3) (1) (2) (3)

20 Exercise (1) Compile and execute the Matrix Addition program. (2) Write a complete version of the program for Memory Allocation. (3) Write a program to calculate π by numerical integration, where the number of intervals is a parameter (the original value was lost in extraction; a large value such as 10^6 is typical).


Download ppt "Lecture 6: Shared-memory Computing with GPU. Free download NVIDIA CUDA https://developer.nvidia.com/cuda-downloads CUDA programming on Visual Studio."

Similar presentations


Ads by Google