OpenCL introduction III.

OpenCL introduction III.

Parallel reduction on large data
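Reduction: a combination of all input elements into a single value using an associative binary operation. Typical operations: min, max, add (and sub, when treated as adding negated values, since subtraction itself is not associative). Source: Introduction to Parallel Computing, University of Oregon, IPCC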

1. Split the original input data into multiple partitions. 2. Run parallel reduction on the partitions. 3. Store the results. 4. Run reduction again on the previous results. 5. Repeat until one element remains. Source: Introduction to Parallel Computing, University of Oregon, IPCC

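Use multiple work-groups. Each work-group copies its corresponding part of the data into a local array and performs a simple reduction on it. The result of each work-group is stored in an output array. Run the same kernel multiple times, each time on the previous output, until one element remains.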

Reduction – original solution
// In-place max-reduction on a global array ("original solution").
//
// data: global buffer; on completion data[0] holds the maximum of the
//       elements covered by the NDRange.
//
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so this kernel is only correct when the whole NDRange is a single
// work-group. The halving loop also assumes that get_global_size(0) is a
// power of two -- confirm against the host code.
__kernel void reduce_global(__global float* data)
{
    int id = get_global_id(0);

    for (unsigned int s = get_global_size(0) / 2; s > 0; s >>= 1)
    {
        if (id < s)
            data[id] = max(data[id], data[id + s]);

        // The barrier belongs inside the loop and outside the `if`: every
        // work-item has to reach it in every step, and the partial results of
        // step s must be visible before step s / 2 reads them.
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

Work-groups wg 0 and wg 1: each work-group processes 2 * work-group size elements of the global input array (4 * work-group size elements in total for the two work-groups shown). Copy to the local array (2 * work-group size elements per work-group): offset = 2 * work-group ID * work-group size; l_data[2 * l_id + 0] = g_data[offset + 2 * l_id + 0]; l_data[2 * l_id + 1] = g_data[offset + 2 * l_id + 1]. Each work-group then runs the normal reduction on its local data. Finally, if (0 == local ID) result[work-group ID] = l_data[0] stores the work-group's result in the global output array.

// Work-group level max-reduction.
//
// Each work-group loads 2 * get_local_size(0) consecutive floats from `data`
// into local memory, reduces them to their maximum and stores that single
// value in output[work-group ID].
//
// data:   global input buffer.
// output: global buffer receiving one result per work-group.
//
// NOTE(review): l_data has a fixed size of 2048 floats, so the work-group size
// must not exceed 1024. The loads are not bounds-checked; presumably the host
// guarantees that `data` holds at least 2 * global size elements -- confirm.
__kernel void reduce_global(__global float* data, __global float* output)
{
    __local float l_data[2048];

    int wgid = get_group_id(0);
    int localSize = get_local_size(0);
    int lid = get_local_id(0);

    // First element of this work-group's partition in the global array.
    int offset = 2 * wgid * localSize;

    // Every work-item copies two neighbouring elements into local memory.
    l_data[2 * lid + 0] = data[offset + 2 * lid + 0];
    l_data[2 * lid + 1] = data[offset + 2 * lid + 1];
    barrier(CLK_LOCAL_MEM_FENCE);

    for (unsigned int s = localSize; s > 0; s >>= 1)
    {
        if (lid < s)
            l_data[lid] = max(l_data[lid], l_data[lid + s]);

        // Synchronize after every step (inside the loop, outside the `if`) so
        // that the next step only reads fully written partial results.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // A single work-item publishes the result of the work-group.
    if (0 == lid)
        output[wgid] = l_data[0];
}

// Host-side driver of the multi-pass reduction: the kernel is launched
// repeatedly, and after each pass the input and output buffers are swapped so
// that the next pass reduces the previous pass' per-work-group results.
//
// kernelNum is the global work size (number of work-items) of the next launch.
// It has to be a size_t because its address is passed to
// clEnqueueNDRangeKernel as `const size_t* global_work_size`, and because it
// is compared with maxWorkGroupSize through std::max.
//
// NOTE(review): the initial value of outputSize and the sizes of the two
// buffers are set up outside this snippet -- confirm they match the kernel's
// expectation of 2 * work-group size input elements per work-group. The return
// codes of the OpenCL calls are not checked here.
for (size_t kernelNum = dataSize / 2; outputSize >= 1; )
{
    clSetKernelArg(kernel, 0, sizeof(cl_mem), input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), output);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &kernelNum, &maxWorkGroupSize, 0, NULL, NULL);
    clFinish(queue);

    // The results just produced become the input of the next pass.
    cl_mem* tmp = input;
    input = output;
    output = tmp;

    // Every work-item consumes two elements; round the number of work-groups
    // up, and never launch less than one full work-group.
    kernelNum = outputSize / 2;
    outputSize = (kernelNum + maxWorkGroupSize - 1) / maxWorkGroupSize;
    kernelNum = std::max(kernelNum, maxWorkGroupSize);
}

// After the final swap the overall result is the first element of `input`.
float gpuMaxValue = 0.0f;
clEnqueueReadBuffer(queue, *input, CL_TRUE, 0, sizeof(float) * 1, &gpuMaxValue, 0, NULL, NULL);

InOrder vs OutOfOrder Execution
In-order execution: commands submitted to the command queue are executed in the order of submission. Out-of-order execution: commands in the queue can be scheduled in any order.

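Running multiple tasks. In order: each task runs as Write data, Task, Read data, and the next task's Write data starts only after the previous Read data (Write data, Task, Read data, Write data, Task, Read data, ...). Out of order: the Write data, Task and Read data commands of different tasks may overlap and be reordered; explicit synchronization is needed, which is done with events.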

// Create one in-order queue (with profiling enabled) and one out-of-order
// queue on the same device, checking the result of both creations.
inOrderQueue = clCreateCommandQueue(context, deviceID, CL_QUEUE_PROFILING_ENABLE, &err);
if (!CheckCLError(err)) exit(-1);

outOfOrderQueue = clCreateCommandQueue(context, deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
if (!CheckCLError(err)) exit(-1);

// In-order version: for every instance upload the input, run the kernel and
// download the result. The blocking write and read (CL_TRUE) together with the
// in-order queue make explicit synchronization unnecessary.
for (int i = 0; i < instances; i++)
{
    clEnqueueWriteBuffer(inOrderQueue, inputBuffers[i], CL_TRUE, 0, sizeof(float) * dataSize, hostBuffer, 0, NULL, NULL);

    // clSetKernelArg expects a pointer to the argument value, i.e. the address
    // of the cl_mem handle, not the handle itself.
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffers[i]);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffers[i]);

    clEnqueueNDRangeKernel(inOrderQueue, kernel, 1, NULL, &dataSize, NULL, 0, NULL, NULL);
    clEnqueueReadBuffer(inOrderQueue, outputBuffers[i], CL_TRUE, 0, sizeof(float) * dataSize, hostBuffer, 0, NULL, NULL);
}

// Out-of-order version: all transfers are non-blocking (CL_FALSE) and the
// write -> kernel -> read order of each instance is enforced with events.
//   events[2 * i + 0]: signalled when the upload of instance i has finished;
//                      the kernel launch waits for it.
//   events[2 * i + 1]: signalled when the kernel of instance i has finished;
//                      the download waits for it.
//
// NOTE(review): every instance uploads from and downloads into the same
// hostBuffer, and nothing orders the download of one instance against the
// upload of another -- confirm this is intended for the demo. The downloads
// are non-blocking and return no event, so the host presumably calls
// clFinish(outOfOrderQueue) outside this snippet before reading hostBuffer;
// the events also need clReleaseEvent once they are no longer used.
for (int i = 0; i < instances; i++)
{
    clEnqueueWriteBuffer(outOfOrderQueue, inputBuffers[i], CL_FALSE, 0, sizeof(float) * dataSize, hostBuffer, 0, NULL, &events[2 * i + 0]);

    // clSetKernelArg expects a pointer to the argument value, i.e. the address
    // of the cl_mem handle, not the handle itself.
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffers[i]);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffers[i]);

    clEnqueueNDRangeKernel(outOfOrderQueue, kernel, 1, NULL, &dataSize, NULL, 1, &events[2 * i + 0], &events[2 * i + 1]);
    clEnqueueReadBuffer(outOfOrderQueue, outputBuffers[i], CL_FALSE, 0, sizeof(float) * dataSize, hostBuffer, 1, &events[2 * i + 1], NULL);
}

OpenCL introduction III.

Similar presentations

Presentation on theme: "OpenCL introduction III."— Presentation transcript:

Similar presentations

About project

Feedback

Log in

Auth with social network:

OpenCL introduction III.

Similar presentations

Presentation on theme: "OpenCL introduction III."— Presentation transcript:

Similar presentations

About project

Feedback