AMD-SPL Runtime Programming Guide Jiawei
Outline
The Core of SPL Encapsulation Resource management Workflow control Optimization Based on CAL
Goal Overcome limitations of Brook+ Provide friendly programming interface for CAL Support the development of SPL
What is in SPL Runtime SPL Runtime Program Management Buffer Management Device Management
Outline
Pre-Requirements Visual Studio 2005 AMD Stream SDK 1.4 beta AMD-SPL 1.0 beta or higher Windows …… Linux
Add Include Directories Add include path in VS2005 –CAL: “$(CALROOT)\include\” –SPL: “$(SPLROOT)\include\” –Runtime: “$(SPLROOT)\include\core\cal” Note: $(SPLROOT) is the root folder of SPL
Add Library Directories Add library directories in VS2005 –CAL: “$(CALROOT)\lib\lh32\” (Vista 32bit) “$(CALROOT)\lib\lh64\” (Vista 64bit) “$(CALROOT)\lib\xp32\” (XP 32bit) “$(CALROOT)\lib\xp64\” (XP 64bit) –SPL: “$(SPLROOT)\lib” Note: $(SPLROOT) is the root folder of SPL
Add Library Dependencies Add additional dependencies in VS2005 –CAL: aticalrt.lib aticalcl.lib –SPL: amd-spl_d.lib (Debug version) amd-spl.lib (Release version)
Header and Namespaces Include proper header files –#include “cal.h” (CAL header) –#include “amdspl.h” (SPL header) –#include “RuntimeDefs.h” (Runtime header) Using namespaces –using namespace amdspl; –using namespace amdspl::core::cal;
DEFINE THE IL KERNEL
Code in IL AMD Stream Kernel Analyzer Generate IL from Brook+ kernel Easier to program Difficult to maintain and optimize Write IL manually Difficult to program and understand Easier to optimize Provide more GPU features
IL Kernel Sample il_ps_2_0 dcl_output_generic o0 dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float) dcl_input_position_interp(linear_noperspective) v0.xy__ dcl_cb cb0[1] sample_resource(0)_sampler(0) r1, v0.xy00 add o0, r1, cb0[0] endmain end The Brook+ kernel equivalent: kernel void k(out float o<>, float i<>, float c) { o = i + c; }
IL Source String const char * __sample_program_src__ = "il_ps_2_0\n" "dcl_output_generic o0\n" "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" "dcl_input_position_interp(linear_noperspective) v0.xy__\n" "dcl_cb cb0[1]\n" "sample_resource(0)_sampler(0) r1, v0.xy00\n" "add o0, r1, cb0[0]\n" "endmain\n" "end\n";
Kernel Information Define the kernel using template class ProgramInfo –Kernel Parameters –ID of the Kernel –Source of the Kernel template <int outputsT, int inputsT = 0, int constantsT = 0, bool globalsT = false> class ProgramInfo { ProgramInfo(const char* ID, const char* source) {...} ... };
Define the IL Kernel in SPL Define a global object for the kernel (one output, one input, one constant) typedef ProgramInfo<1, 1, 1> SampleProgram; SampleProgram sampleProgInfo = SampleProgram("Sample Program", __sample_program_src__);
INITIALIZE SPL RUNTIME
Initialize SPL Runtime Get runtime instance Get device manager Get buffer manager Get program manager Runtime *runtime = Runtime::getInstance(); assert(runtime); DeviceManager *devMgr = runtime->getDeviceManager(); assert(devMgr); BufferManager *bufMgr = runtime->getBufferManager(); assert(bufMgr); ProgramManager* progMgr = runtime->getProgramManager(); assert(progMgr);
Assign Device to SPL bool r; r = devMgr->assignDevice(0); assert(r); Assign a device to the device manager. The device manager will handle device initialization and destruction. SPL cannot access a device that is not assigned to it.
DO GPGPU COMPUTING
Initialize CPU Buffer void fillBuffer(float buf[], int size) { for (int i = 0; i < size; i++) { buf[i] = (float)i; } } float *cpuInBuf = new float[1024 * 512]; float *cpuOutBuf = new float[1024 * 512]; float constant = 3; fillBuffer(cpuInBuf, 1024 * 512);
Get Device Get the default device OR get a device by ID Device* device = devMgr->getDefaultDevice(); OR Device* device = devMgr->getDeviceByID(0);
Load Program Load the program using program manager –Pass in a ProgramInfo instance Program *prog = progMgr->loadProgram(sampleProgInfo); assert(prog);
Create Buffers Create local buffer for input Create remote buffer for output Get constant buffer from constant buffer pool Buffer* inBuf = bufMgr->createLocalBuffer(device, CAL_FORMAT_FLOAT_1, 1024, 512); assert(inBuf); Buffer* outBuf = bufMgr->createRemoteBuffer(CAL_FORMAT_FLOAT_1, 1024, 512); assert(outBuf); ConstBuffer* constBuf = bufMgr->getConstBuffer(1); assert(constBuf);
CPU to GPU Data Transfer Read in CPU buffer Set Constant bool r; r = inBuf->readData(cpuInBuf, 1024 * 512); assert(r); r = constBuf->setConstant(&constant); assert(r);
Bind Buffers Bind buffers to the program –Input, Output, Constant, Global r = prog->bindOutput(outBuf, 0); assert(r); r = prog->bindInput(inBuf, 0); assert(r); r = prog->bindConstant(constBuf, 0); assert(r);
Execute Program Define the execution domain Run program Check the execution event CALdomain domain = {0, 0, 1024, 512}; Event *e = prog->run(domain); assert(e);
GPU to CPU Data Transfer Write in CPU buffer r = outBuf->writeData(cpuOutBuf, 1024 * 512); assert(r);
RELEASE RESOURCE AND CLEAN UP
Unload Program Destroy program object –Unbind all the buffers Call Program::unbindAllBuffers(); –Unload module from context progMgr->unloadProgram(prog);
Destroy/Release Buffers Destroy buffers –InputBuffer, OutputBuffer Release ConstBuffer to the pool bufMgr->destroyBuffer(inBuf); bufMgr->destroyBuffer(outBuf); bufMgr->releaseConstBuffer(constBuf);
Shutdown Runtime Not necessary! –The runtime will be destroyed when the application exits. Runtime::destroy();
The Whole Program #include "cal.h" #include "amdspl.h" #include "RuntimeDefs.h" using namespace amdspl; using namespace amdspl::core::cal; void fillBuffer(float buf[], int size) { for (int i = 0; i < size; i++) { buf[i] = (float)i; } }
The Whole Program const char *__sample_program_src__ = "il_ps_2_0\n" "dcl_output_generic o0\n" "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" "dcl_input_position_interp(linear_noperspective) v0.xy__\n" "dcl_cb cb0[1]\n" "sample_resource(0)_sampler(0) r1, v0.xy00\n" "add o0, r1, cb0[0]\n" "endmain\n" "end\n"; typedef ProgramInfo<1, 1, 1> SampleProgram; SampleProgram sampleProgInfo = SampleProgram("Sample Program", __sample_program_src__);
The Whole Program int main(void) { float *cpuInBuf = new float[1024 * 512]; float *cpuOutBuf = new float[1024 * 512]; float constant = 3; fillBuffer(cpuInBuf, 1024 * 512); Runtime *runtime = Runtime::getInstance(); DeviceManager *devMgr = runtime->getDeviceManager(); BufferManager *bufMgr = runtime->getBufferManager(); ProgramManager* progMgr = runtime->getProgramManager(); devMgr->assignDevice(0); Device* device = devMgr->getDefaultDevice();
The Whole Program Program *prog = progMgr->loadProgram(sampleProgInfo); Buffer* inBuf = bufMgr->createLocalBuffer(device, CAL_FORMAT_FLOAT_1, 1024, 512); Buffer* outBuf = bufMgr->createRemoteBuffer(CAL_FORMAT_FLOAT_1, 1024, 512); ConstBuffer* constBuf = bufMgr->getConstBuffer(1); inBuf->readData(cpuInBuf, 1024 * 512); constBuf->setConstant(&constant); prog->bindOutput(outBuf, 0); prog->bindInput(inBuf, 0); prog->bindConstant(constBuf, 0); CALdomain domain = {0, 0, 1024, 512}; Event *e = prog->run(domain); outBuf->writeData(cpuOutBuf, 1024 * 512); ......
The Whole Program ..... progMgr->unloadProgram(prog); bufMgr->destroyBuffer(inBuf); bufMgr->destroyBuffer(outBuf); bufMgr->releaseConstBuffer(constBuf); Runtime::destroy(); delete [] cpuInBuf; delete [] cpuOutBuf; return 0; }
THANK YOU!