OpenCL introduction III.
Parallel reduction on large data
- Combines the input elements into a single result
- Requires an associative binary operation: min, max, add, multiply (subtraction is not associative, so it does not qualify)
(Source: Introduction to Parallel Computing, University of Oregon, IPCC)
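Associativity is what makes a parallel schedule legal: the elements may be grouped in any order. As a minimal host-side illustration (plain C++; the function name is ours, not part of the slides), a pairwise tree reduction produces the same maximum as a sequential left-to-right fold:

    #include <algorithm>
    #include <vector>

    // Illustrative sketch: because max is associative (and commutative), a
    // pairwise "tree" reduction may combine the elements in any grouping and
    // still match a sequential left-to-right fold.
    float reduce_tree(std::vector<float> v) {
        while (v.size() > 1) {
            std::vector<float> next;
            for (size_t i = 0; i + 1 < v.size(); i += 2)
                next.push_back(std::max(v[i], v[i + 1]));
            if (v.size() % 2 != 0)
                next.push_back(v.back());   // carry an odd leftover element
            v.swap(next);
        }
        return v[0];
    }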
Parallel reduction on large data
- Split the original input data into multiple partitions
- Run parallel reduction on the partitions
- Store the results
- Run reduction again on the previous results
- Repeat until one element remains
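To make the pass structure concrete, here is a small worked example; the sizes are assumed for illustration only: 2^20 input elements, 256 work-items per work-group, each work-group reducing 2 * 256 = 512 elements to one partial result.

    #include <cstdio>

    int main() {
        size_t n = 1 << 20;                      // assumed input size
        size_t perGroup = 2 * 256;               // elements reduced per work-group
        for (int pass = 1; n > 1; ++pass) {
            n = (n + perGroup - 1) / perGroup;   // one partial result per work-group
            std::printf("after pass %d: %zu element(s) remain\n", pass, n);
        }
        // prints 2048, then 4, then 1: three kernel launches in total
    }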
Parallel reduction on large data
- Use multiple work-groups
- Each work-group copies its portion of the data into a local array
- Each work-group performs a simple reduction
- The result of each work-group is stored in an output array
- Run the same kernel multiple times, until a single element remains
Reduction – original solution

__kernel void reduce_global(__global float* data) {
    int id = get_global_id(0);
    for (unsigned int s = get_global_size(0) / 2; s > 0; s >>= 1) {
        if (id < s)
            data[id] = max(data[id], data[id + s]);
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

Note: barrier() only synchronizes work-items within a single work-group, so this version is only correct when the whole NDRange fits into one work-group.
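A brief host-side sketch of that restriction (queue, kernel, and dataBuffer are assumed to exist): launched with the global size equal to the local size, everything stays inside one work-group and the barrier is valid; anything larger needs the multi-work-group scheme below.

    size_t globalSize = 256;                 // must not exceed one work-group
    size_t localSize  = 256;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &dataBuffer);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
    // after clFinish(queue), data[0] holds the maximum of the 256 elements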
Parallel reduction on large data

[Figure: two work-groups (wg 0, wg 1) reading from the global array. Each work-group processes 2 * work-group-size elements starting at offset = 2 * work-group ID * work-group size, loading them into its local array:
    l_data[2 * l_id + 0] = g_data[offset + 2 * l_id + 0]
    l_data[2 * l_id + 1] = g_data[offset + 2 * l_id + 1]
Each work-group then runs the normal reduction on its local data, and work-item 0 writes the partial result back to the global output array: if (0 == local ID) result[work-group ID] = l_data[0]]
Parallel reduction on large data

__kernel void reduce_global(__global float* data, __global float* output) {
    __local float l_data[2048];
    int wgid = get_group_id(0);
    int localSize = get_local_size(0);
    int lid = get_local_id(0);
    int offset = 2 * wgid * localSize;

    l_data[2 * lid + 0] = data[offset + 2 * lid + 0];
    l_data[2 * lid + 1] = data[offset + 2 * lid + 1];
    barrier(CLK_LOCAL_MEM_FENCE);
Parallel reduction on large data

    // continued: the reduction loop in local memory; the barrier must be
    // inside the loop so every step is visible to all work-items
    for (unsigned int s = localSize; s > 0; s >>= 1) {
        if (lid < s)
            l_data[lid] = max(l_data[lid], l_data[lid + s]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (0 == lid)
        output[wgid] = l_data[0];
}
Parallel reduction on large data

// input and output are cl_mem* handles that get swapped between passes;
// assumes dataSize is a power of two and the buffers are padded to a full
// work-group with the identity element (e.g. -FLT_MAX for max)
size_t kernelNum = dataSize / 2;                   // global size: one work-item per two elements
size_t outputSize = kernelNum / maxWorkGroupSize;  // one partial result per work-group
while (outputSize >= 1) {
    clSetKernelArg(kernel, 0, sizeof(cl_mem), input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), output);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &kernelNum, &maxWorkGroupSize, 0, NULL, NULL);
    clFinish(queue);

    // the output of this pass becomes the input of the next one
    cl_mem* tmp = input;
    input = output;
    output = tmp;

    kernelNum = outputSize / 2;
    outputSize = (kernelNum + maxWorkGroupSize - 1) / maxWorkGroupSize;
    kernelNum = std::max(kernelNum, maxWorkGroupSize); // pad the last pass to a full work-group
}

float gpuMaxValue = 0.0f;
clEnqueueReadBuffer(queue, *input, CL_TRUE, 0, sizeof(float) * 1, &gpuMaxValue, 0, NULL, NULL);
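A quick way to validate the result is a reference reduction on the host. This is a sketch: hostData is assumed to hold the same dataSize floats that were uploaded to the device.

    #include <algorithm>
    #include <cstdio>

    // sequential reference reduction for verification
    float cpuMaxValue = hostData[0];
    for (size_t i = 1; i < dataSize; ++i)
        cpuMaxValue = std::max(cpuMaxValue, hostData[i]);

    if (cpuMaxValue != gpuMaxValue)
        std::printf("mismatch: CPU %f vs GPU %f\n", cpuMaxValue, gpuMaxValue);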
InOrder vs OutOfOrder Execution

In-order execution
- Commands submitted to the command queue are executed in the order of submission

Out-of-order execution
- Commands in the queue can be scheduled in any order
InOrder vs OutOfOrder Execution

Running multiple tasks:
[Figure: with an in-order queue, each write data / task / read data triple executes strictly after the previous one. With an out-of-order queue, the write, task, and read commands of different instances can overlap, so explicit synchronization with events is needed to keep each task after its write and each read after its task.]
InOrder vs OutOfOrder Execution

inOrderQueue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, &err);
if (!CheckCLError(err)) exit(-1);

outOfOrderQueue = clCreateCommandQueue(context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
if (!CheckCLError(err)) exit(-1);

// in-order queue: blocking writes and reads, no events needed
for (int i = 0; i < instances; i++) {
    clEnqueueWriteBuffer(inOrderQueue, inputBuffers[i], CL_TRUE, 0,
                         sizeof(float) * dataSize, hostBuffer, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffers[i]);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffers[i]);
    clEnqueueNDRangeKernel(inOrderQueue, kernel, 1, NULL, &dataSize, NULL, 0, NULL, NULL);
    clEnqueueReadBuffer(inOrderQueue, outputBuffers[i], CL_TRUE, 0,
                        sizeof(float) * dataSize, hostBuffer, 0, NULL, NULL);
}
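The CL_QUEUE_PROFILING_ENABLE flag on the in-order queue is what allows timing individual commands. As a sketch (kernelEvent is a variable introduced here, not part of the slides): pass an event to the enqueue call and query device-side timestamps once the command has finished.

    cl_event kernelEvent;
    clEnqueueNDRangeKernel(inOrderQueue, kernel, 1, NULL, &dataSize, NULL, 0, NULL, &kernelEvent);
    clFinish(inOrderQueue);

    cl_ulong start = 0, end = 0;   // device timestamps in nanoseconds
    clGetEventProfilingInfo(kernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(kernelEvent, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &end,   NULL);
    printf("kernel time: %.3f ms\n", (end - start) * 1e-6);
    clReleaseEvent(kernelEvent);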
InOrder vs OutOfOrder Execution

// out-of-order queue: non-blocking calls, ordering enforced through events
for (int i = 0; i < instances; i++) {
    clEnqueueWriteBuffer(outOfOrderQueue, inputBuffers[i], CL_FALSE, 0,
                         sizeof(float) * dataSize, hostBuffer, 0, NULL, &events[2 * i + 0]);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffers[i]);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffers[i]);
    // the kernel waits for the write, the read waits for the kernel
    clEnqueueNDRangeKernel(outOfOrderQueue, kernel, 1, NULL, &dataSize, NULL,
                           1, &events[2 * i + 0], &events[2 * i + 1]);
    clEnqueueReadBuffer(outOfOrderQueue, outputBuffers[i], CL_FALSE, 0,
                        sizeof(float) * dataSize, hostBuffer, 1, &events[2 * i + 1], NULL);
}
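One step the slide leaves implicit: with non-blocking reads on an out-of-order queue, the host must not touch hostBuffer until everything has completed. A minimal sketch:

    // wait for all enqueued commands before reusing hostBuffer on the host
    clFinish(outOfOrderQueue);

    // the events are no longer needed once the commands have completed
    for (int i = 0; i < 2 * instances; i++)
        clReleaseEvent(events[i]);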