OpenCL introduction III.

1 OpenCL introduction III.

2 Parallel reduction on large data
A combination of input elements. Associative binary operations: min, max, add, sub. (Introduction to Parallel Computing, University of Oregon, IPCC)

3 Parallel reduction on large data
Split the original input data into multiple partitions. Run parallel reduction on the partitions. Store the results. Run reduction again on the previous results. Repeat until one element remains. (Introduction to Parallel Computing, University of Oregon, IPCC)

4 Parallel reduction on large data
Use multiple work-groups. Copy the corresponding data into a local array. Perform a simple reduction. The result of each work-group shall be stored in an output array. Run the same kernel multiple times.

5 Reduction – original solution
// In-place max-reduction over `data`: on every step, work-item `id` folds
// element id + s into element id while the stride s is halved.
//
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so this version presumably expects to be launched with a single work-group
// and a power-of-two global size (the stride is halved by shifting) — confirm
// against the host code. Under those assumptions data[0] ends up holding the
// maximum of the first get_global_size(0) elements.
__kernel void reduce_global(__global float* data)
{
    int id = get_global_id(0);

    for (unsigned int s = get_global_size(0) / 2; s > 0; s >>= 1)
    {
        if (id < s)
            data[id] = max(data[id], data[id + s]);

        // The barrier belongs inside the loop but outside the `if`: every
        // work-item has to reach it on every step, and the writes of one step
        // must be complete before the next step reads them.
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

6 Parallel reduction on large data
wg 0 wg 1 Work-groups Global array 2 * work-group size 4 * work-group size offset = 2 * work-group ID * work-group size l_data[2 * l_id + 0] = g_data[offset + 2 * l_id + 0] l_data[2 * l_id + 1] = g_data[offset + 2 * l_id + 1] Local array 2 * work-group size 2 * work-group size Run normal Reduction on the local data Run normal Reduction on the local data If(0 == local ID) result[work-group ID] = l_data[0] Global array

7 Parallel reduction on large data
// Multi-work-group max-reduction kernel, first part (the body continues on the next slide).
// Each work-group stages a contiguous chunk of 2 * get_local_size(0) floats from `data`
// into the local array l_data, starting at offset = 2 * group ID * local size;
// every work-item copies two neighbouring elements (indices 2 * lid and 2 * lid + 1).
// l_data has room for 2048 floats, so the local size must not exceed 1024.
// The closing barrier ensures all copies into l_data are done before any work-item reads them.
__kernel void reduce_global(__global float* data, __global float* output) { __local float l_data[2048]; int wgid = get_group_id(0); int localSize = get_local_size(0); int lid = get_local_id(0); int offset = 2 * wgid * localSize; l_data[2 * lid + 0] = data[offset + 2 * lid + 0]; l_data[2 * lid + 1] = data[offset + 2 * lid + 1]; barrier(CLK_LOCAL_MEM_FENCE);

8 Parallel reduction on large data
for (unsigned int s = localSize; s > 0; s >>= 1) { if (lid < s) l_data[lid] = max(l_data[lid], l_data[lid + s]); } barrier(CLK_LOCAL_MEM_FENCE); if (0 == lid) output[wgid] = l_data[0];

9 Parallel reduction on large data
// Host-side driver: launches the reduction kernel repeatedly, swapping the
// `input` and `output` buffer pointers after every pass so that each pass
// consumes the previous pass's results. The loop runs while outputSize >= 1.
//
// kernelNum is a size_t because its address is passed as the global work size
// (clEnqueueNDRangeKernel expects `const size_t*`) and it is combined with
// maxWorkGroupSize (passed as the local work size) in std::max.
//
// NOTE(review): later passes launch at least maxWorkGroupSize work-items, so
// the kernel reads 2 * maxWorkGroupSize elements even when fewer partial
// results are valid. This presumably relies on the buffers being large enough
// and on the extra elements not exceeding the real maximum — TODO confirm.
// NOTE(review): the global size must be a multiple of the local size; verify
// that outputSize / 2 satisfies this whenever it exceeds maxWorkGroupSize.
for (size_t kernelNum = dataSize / 2; outputSize >= 1; )
{
    clSetKernelArg(kernel, 0, sizeof(cl_mem), input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), output);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &kernelNum, &maxWorkGroupSize, 0, NULL, NULL);
    clFinish(queue);

    // The results just written become the input of the next pass.
    cl_mem* tmp = input;
    input = output;
    output = tmp;

    // Each work-item consumes two elements; each work-group produces one result.
    kernelNum = outputSize / 2;
    outputSize = (kernelNum + maxWorkGroupSize - 1) / maxWorkGroupSize;
    kernelNum = std::max(kernelNum, maxWorkGroupSize);
}

// After the final swap `input` refers to the buffer holding the result in element 0.
float gpuMaxValue = 0.0f;
clEnqueueReadBuffer(queue, *input, CL_TRUE, 0, sizeof(float) * 1, &gpuMaxValue, 0, NULL, NULL);

10 InOrder vs OutOfOrder Execution
In-order execution: commands submitted to the command queue are executed in the order of submission. Out-of-order execution: commands in the queue can be scheduled in `any` order.

11 InOrder vs OutOfOrder Execution
Running multiple tasks In order Out of order Explicit synchronization is needed Events Write data Task Read data Write data Task Read data Write data Write data Read data Write data Read data Read data Task Task Task

12 InOrder vs OutOfOrder Execution
// Create one in-order queue (with profiling enabled) and one out-of-order queue.
inOrderQueue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, &err);
if (!CheckCLError(err)) exit(-1);

outOfOrderQueue = clCreateCommandQueue(context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
// Check this queue as well; creation fails when the requested properties are not supported.
if (!CheckCLError(err)) exit(-1);

// In-order variant: for every instance upload the input, run the kernel and
// download the result. Both transfers are blocking (CL_TRUE), and the queue
// executes commands in submission order, so no events are needed.
for (int i = 0; i < instances; i++)
{
    clEnqueueWriteBuffer(inOrderQueue, inputBuffers[i], CL_TRUE, 0, sizeof(float) * dataSize, hostBuffer, 0, NULL, NULL);

    // clSetKernelArg expects a pointer to the argument value, i.e. the
    // address of the cl_mem handle, not the handle itself.
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffers[i]);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffers[i]);

    clEnqueueNDRangeKernel(inOrderQueue, kernel, 1, NULL, &dataSize, NULL, 0, NULL, NULL);
    clEnqueueReadBuffer(inOrderQueue, outputBuffers[i], CL_TRUE, 0, sizeof(float) * dataSize, hostBuffer, 0, NULL, NULL);
}

13 InOrder vs OutOfOrder Execution
// Out-of-order variant: all transfers are non-blocking (CL_FALSE), and the
// ordering within one instance is expressed with events:
//   events[2 * i + 0] — signalled by the upload, waited on by the kernel;
//   events[2 * i + 1] — signalled by the kernel, waited on by the download.
// The download itself returns no event, so nothing in this loop waits for it.
//
// NOTE(review): every instance uploads from and downloads into the same
// hostBuffer without blocking; presumably acceptable for a scheduling demo,
// but the buffer contents are not well defined until the queue is drained —
// confirm that the caller synchronizes (e.g. clFinish) before reading it.
for (int i = 0; i < instances; i++)
{
    clEnqueueWriteBuffer(outOfOrderQueue, inputBuffers[i], CL_FALSE, 0, sizeof(float) * dataSize, hostBuffer, 0, NULL, &events[2 * i + 0]);

    // clSetKernelArg expects a pointer to the argument value, i.e. the
    // address of the cl_mem handle, not the handle itself.
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffers[i]);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffers[i]);

    clEnqueueNDRangeKernel(outOfOrderQueue, kernel, 1, NULL, &dataSize, NULL, 1, &events[2 * i + 0], &events[2 * i + 1]);
    clEnqueueReadBuffer(outOfOrderQueue, outputBuffers[i], CL_FALSE, 0, sizeof(float) * dataSize, hostBuffer, 1, &events[2 * i + 1], NULL);
}

