Download presentation
Presentation is loading. Please wait.
1
CUDA and OpenCL Kernels
Workshop on Reduction Optimizing Simple CUDA and OpenCL Kernels Chris Szalwinski December
2
Overview Reduction Algorithm Terminology – CUDA and OpenCL
Test Problem – Dot Product Test Parameters Execution Configuration Divergence Register Storage
3
Reduction Algorithm Select the Largest Value
4
Minimum - Naive Reduction
5
Terminology - Configuration
CUDA Grid Blocks Threads OpenCL Workspace Workgroups Work Items
6
Terminology - Memory CUDA Global Shared Local OpenCL Global Local
Private
7
Dot Product Vector a – n elements – a[0] a[1] … a[n-1]
Vector b – n elements – b[0] b[1] … b[n-1] Dot Product = a[0] * b[0] + a[1] * b[1] + … + a[n-1] * b[n-1] 2 stages Multiply matching elements Sum their products
8
CUDA – Naive Dot Product
const int ntpb = 128; // number of threads per block (execution configuration must match)

// Naive dot-product reduction kernel (Shared Memory version).
// Each block computes a partial dot product of a[] and b[] over its slice
// and writes one partial sum per block into c[blockIdx.x]; the host (or a
// follow-up kernel) must sum the per-block partials.
// Preconditions: blockDim.x == ntpb; c has at least gridDim.x elements.
__global__ void dot_D(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory; out-of-range threads contribute 0
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0.0f;
    __syncthreads();
    // reduce shared memory entries (interleaved addressing - divergent)
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0)
            s[tid] += s[tid + stride];
        // BUG FIX: barrier each pass - the next pass reads sums written by
        // other threads in this pass; without it the reduction races.
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
9
OpenCL – Naive Dot Product
#define NWUPWG 128 // number of work-items per workgroup (must match host NDRange)

// Naive dot-product reduction kernel (Local Memory version).
// Each workgroup computes a partial dot product of a[] and b[] over its slice
// and writes one partial sum per workgroup into c[get_group_id(0)]; the host
// must sum the per-workgroup partials.
// Preconditions: get_local_size(0) == NWUPWG; c has at least
// get_num_groups(0) elements.
__kernel void dot_D(__global const float* a, __global const float* b,
                    __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory; out-of-range work-items contribute 0
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries (interleaved addressing - divergent)
    for (int stride = 1; stride < size; stride *= 2) {
        if (i % (2 * stride) == 0)
            s[i] += s[i + stride];
        // BUG FIX: barrier each pass - the next pass reads sums written by
        // other work-items in this pass; without it the reduction races.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
10
Minimize Divergence
11
CUDA – Minimize Divergence
const int ntpb = 128; // number of threads per block (execution configuration must match)

// Dot-product reduction with minimized divergence (Shared Memory version).
// Sequential addressing: active threads are the contiguous lower half each
// pass, so whole warps retire together instead of diverging within a warp.
// One partial sum per block is written to c[blockIdx.x].
// Preconditions: blockDim.x == ntpb and a power of two; c has at least
// gridDim.x elements.
__global__ void dot_DM(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory; out-of-range threads contribute 0
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0.0f;
    __syncthreads();
    // reduce shared memory entries (sequential addressing - low divergence)
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride)
            s[tid] += s[tid + stride];
        // BUG FIX: barrier each pass - s[tid + stride] was written by another
        // thread in the previous pass; without it the reduction races.
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
12
OpenCL – Minimize Divergence
#define NWUPWG 128 // number of work-items per workgroup (must match host NDRange)

// Dot-product reduction with minimized divergence (Local Memory version).
// Sequential addressing: active work-items are the contiguous lower half each
// pass, minimizing intra-wavefront divergence. One partial sum per workgroup
// is written to c[get_group_id(0)].
// Preconditions: get_local_size(0) == NWUPWG and a power of two; c has at
// least get_num_groups(0) elements.
__kernel void dot_DM(__global const float* a, __global const float* b,
                     __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory; out-of-range work-items contribute 0
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries (sequential addressing - low divergence)
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride)
            s[i] += s[i + stride];
        // BUG FIX: barrier each pass - s[i + stride] was written by another
        // work-item in the previous pass; without it the reduction races.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
13
CUDA – Register Accumulator
const int ntpb = 128; // number of threads per block (execution configuration must match)

// Dot-product reduction with a register accumulator (Shared Memory version).
// Same sequential-addressing scheme as dot_DM, but each active thread keeps
// its running sum x in a register and mirrors it into s[tid] for the other
// threads, avoiding a re-read of its own shared-memory slot each pass.
// Thread 0 writes its register (not s[0]) to c[blockIdx.x].
// Preconditions: blockDim.x == ntpb and a power of two; c has at least
// gridDim.x elements.
__global__ void dot_DMR(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    float x = 0.0f;
    // store product in shared memory; out-of-range threads contribute 0
    if (gid < n)
        x = s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0.0f; // BUG FIX: shared memory is uninitialized - tail
                       // threads must still zero their slot or the reduction
                       // sums garbage when n is not a multiple of blockDim.x.
    __syncthreads();
    // reduce shared memory entries, accumulating in register x
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            x += s[tid + stride];
            s[tid] = x;
        }
        // BUG FIX: barrier each pass - s[tid + stride] was written by another
        // thread in the previous pass; without it the reduction races.
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = x;
}
14
OpenCL – Register Accumulator
#define NWUPWG 128 // number of work-items per workgroup (must match host NDRange)

// Dot-product reduction with a register accumulator (Local Memory version).
// Same sequential-addressing scheme as dot_DM, but each active work-item
// keeps its running sum x in a private register and mirrors it into s[i] for
// the other work-items. Work-item 0 writes its register (not s[0]) to
// c[get_group_id(0)].
// Preconditions: get_local_size(0) == NWUPWG and a power of two; c has at
// least get_num_groups(0) elements.
__kernel void dot_DMR(__global const float* a, __global const float* b,
                      __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    float x = 0.0f;
    // store product in local memory; out-of-range work-items contribute 0
    if (gid < n)
        x = s[i] = a[gid] * b[gid];
    else
        s[i] = 0.0f; // BUG FIX: local memory is uninitialized - tail
                     // work-items must still zero their slot or the reduction
                     // sums garbage when n is not a multiple of the group size.
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries, accumulating in register x
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride) {
            x += s[i + stride];
            s[i] = x;
        }
        // BUG FIX: barrier each pass - s[i + stride] was written by another
        // work-item in the previous pass; without it the reduction races.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = x;
}
15
CUDA Results
16
OpenCL Results
Similar presentations
© 2025 SlidePlayer.com, Inc.
All rights reserved.