// This shows a simple usage of OpenCL #include #define CL_HPP_TARGET_OPENCL_VERSION 200 #define CL_HPP_MINIMUM_OPENCL_VERSION 200 #include #include #include #include #include #include #include #include #include #include class OpenCLAdder { public: OpenCLAdder(); void add(int const* in, int* out, int sz); private: void exitIfError(cl_int errorCode, char const* msg); void printAllDevices(); void printDeviceProperties(cl::Device const& device); const char* getErrorString(cl_int error); std::unique_ptr m_pContext; std::vector m_devices; std::string m_progText; std::unique_ptr m_pSources; std::unique_ptr m_pProgram; std::unique_ptr m_pQueue; }; OpenCLAdder::OpenCLAdder() { cl_int err; //printAllDevices(); m_pContext.reset(new cl::Context(CL_DEVICE_TYPE_GPU, nullptr, nullptr/*&errCallback*/, nullptr, &err)); exitIfError(err, "Conext::Context()"); m_devices = m_pContext->getInfo(); exitIfError(m_devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0"); std::cerr << "Using " << m_devices.size() << " device(s)\n"; for (cl::Device const& device : m_devices) { printDeviceProperties(device); } m_progText = "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n" "__kernel void hello(__global int const* in, __global int* out, long sz, __local int* tmp) {\n" " int idx = get_local_id(0);\n" " tmp[idx] = in[idx];\n" " int step;\n" " for(step = 2; step<=sz; step*=2) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " if((idx+1)%step == 0) {\n" " tmp[idx] += tmp[idx-step/2];\n" " }\n" " }\n" " for(step /= 2; step>1 ; step /=2) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " if((idx+step/2+1)%step == 0 && idx>=step/2) {\n" " tmp[idx] += tmp[idx-step/2];\n" " }\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " out[idx] = tmp[idx];\n" "}\n"; m_pSources.reset(new cl::Program::Sources{ m_progText }); m_pProgram.reset(new cl::Program(*m_pContext, *m_pSources)); err = m_pProgram->build(m_devices, ""); exitIfError(err, "program build()"); m_pQueue.reset(new cl::CommandQueue(*m_pContext, m_devices[0], 0, &err)); exitIfError(err, "CommandQueue::CommandQueue()"); } void OpenCLAdder::add(int const* in, int* out, int sz) { cl_int err; cl::Kernel kernel(*m_pProgram, "hello", &err); exitIfError(err, "Kernel::Kernel()"); cl::Buffer inBuf(*m_pContext, CL_MEM_READ_ONLY, sz * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); std::vector afterCopyIn(1); err = m_pQueue->enqueueWriteBuffer(inBuf, CL_FALSE, 0, 4 * sz, in, nullptr, &afterCopyIn[0]); exitIfError(err, "ComamndQueue::enqueueWriteBuffer()"); cl::Buffer outBuf(*m_pContext, CL_MEM_READ_WRITE, sz * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); cl::Buffer tmpBuf(*m_pContext, CL_MEM_READ_WRITE, sz * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); err = kernel.setArg(0, inBuf); exitIfError(err, "Kernel::setArg(0)"); err = kernel.setArg(1, outBuf); exitIfError(err, "Kernel::setArg(1)"); err = kernel.setArg(2, int64_t(sz)); exitIfError(err, "Kernel::setArg(2)"); err = kernel.setArg(3, 4*sz, nullptr); exitIfError(err, "Kernel::setArg(3)"); std::vector afterKernelExec(1); // afterCopyIn event will make the kernel execution wait for the buffer copy err = m_pQueue->enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(sz), cl::NDRange(sz), &afterCopyIn, &afterKernelExec[0]); exitIfError(err, "ComamndQueue::enqueueNDRangeKernel()"); std::vector afterCopyOut(1); err = m_pQueue->enqueueReadBuffer(outBuf, CL_FALSE, 0, 4 * sz, out, &afterKernelExec, &afterCopyOut[0]); exitIfError(err, "ComamndQueue::enqueueReadBuffer()"); err = m_pQueue->flush(); exitIfError(err, "ComamndQueue::flush()"); // Wait for the copy from GPU buffers to host memory to be completed afterCopyOut[0].wait(); } const char* OpenCLAdder::getErrorString(cl_int error) { switch (error) { // run-time and JIT compiler errors case CL_SUCCESS: return "CL_SUCCESS"; case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; case -8: return "CL_MEM_COPY_OVERLAP"; case -9: return "CL_IMAGE_FORMAT_MISMATCH"; case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; case -11: return "CL_BUILD_PROGRAM_FAILURE"; case -12: return "CL_MAP_FAILURE"; case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; case -15: return "CL_COMPILE_PROGRAM_FAILURE"; case -16: return "CL_LINKER_NOT_AVAILABLE"; case -17: return "CL_LINK_PROGRAM_FAILURE"; case -18: return "CL_DEVICE_PARTITION_FAILED"; case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; // compile-time errors case -30: return "CL_INVALID_VALUE"; case -31: return "CL_INVALID_DEVICE_TYPE"; case -32: return "CL_INVALID_PLATFORM"; case -33: return "CL_INVALID_DEVICE"; case -34: return "CL_INVALID_CONTEXT"; case -35: return "CL_INVALID_QUEUE_PROPERTIES"; case -36: return "CL_INVALID_COMMAND_QUEUE"; case -37: return "CL_INVALID_HOST_PTR"; case -38: return "CL_INVALID_MEM_OBJECT"; case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; case -40: return "CL_INVALID_IMAGE_SIZE"; case -41: return "CL_INVALID_SAMPLER"; case -42: return "CL_INVALID_BINARY"; case -43: return "CL_INVALID_BUILD_OPTIONS"; case -44: return "CL_INVALID_PROGRAM"; case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; case -46: return "CL_INVALID_KERNEL_NAME"; case -47: return "CL_INVALID_KERNEL_DEFINITION"; case -48: return "CL_INVALID_KERNEL"; case -49: return "CL_INVALID_ARG_INDEX"; case -50: return "CL_INVALID_ARG_VALUE"; case -51: return "CL_INVALID_ARG_SIZE"; case -52: return "CL_INVALID_KERNEL_ARGS"; case -53: return "CL_INVALID_WORK_DIMENSION"; case -54: return "CL_INVALID_WORK_GROUP_SIZE"; case -55: return "CL_INVALID_WORK_ITEM_SIZE"; case -56: return "CL_INVALID_GLOBAL_OFFSET"; case -57: return "CL_INVALID_EVENT_WAIT_LIST"; case -58: return "CL_INVALID_EVENT"; case -59: return "CL_INVALID_OPERATION"; case -60: return "CL_INVALID_GL_OBJECT"; case -61: return "CL_INVALID_BUFFER_SIZE"; case -62: return "CL_INVALID_MIP_LEVEL"; case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; case -64: return "CL_INVALID_PROPERTY"; case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; case -66: return "CL_INVALID_COMPILER_OPTIONS"; case -67: return "CL_INVALID_LINKER_OPTIONS"; case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; // extension errors case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; default: return "Unknown OpenCL error"; } } void OpenCLAdder::exitIfError(cl_int errorCode, char const* msg) { if (errorCode != CL_SUCCESS) { fprintf(stderr, "OpenCL error at %s: %s(%d)\n", msg, getErrorString(errorCode), errorCode); exit(1); } } void OpenCLAdder::printAllDevices() { cl_int err; std::vector platformList; err = cl::Platform::get(&platformList); exitIfError(err, "cl::Platform::get()"); std::cerr << "Found " << platformList.size() << " OpenCL platforms:\n"; for (cl::Platform& platform : platformList) { std::cerr << "Platform name " << platform.getInfo() << ", vendor " << platform.getInfo() << ", version " << platform.getInfo() << "\n"; std::vector deviceList; err = platform.getDevices(CL_DEVICE_TYPE_ALL, &deviceList); exitIfError(err, "cl::Platform::getDevices()"); std::cerr << " platform has " << deviceList.size() << " device(s):\n"; for (cl::Device& device : deviceList) { printDeviceProperties(device); } } } void OpenCLAdder::printDeviceProperties(cl::Device const& device) { std::cerr << " type = " << device.getInfo() << "\n"; std::cerr << " name = " << device.getInfo() << "\n"; std::cerr << " vendor = " << device.getInfo() << "\n"; std::cerr << " platform = " << device.getInfo() << "\n"; std::cerr << " version = " << device.getInfo() << "\n"; std::cerr << " driver-version = " << device.getInfo() << "\n"; std::cerr << " max CU = " << device.getInfo() << "\n"; std::cerr << " max WI dimensions = " << device.getInfo() << ":"; for (auto dim : device.getInfo()) { std::cerr << " " << dim; } std::cerr << "\n"; std::cerr << " max WG size = " << device.getInfo() << "\n"; } int main(void) { OpenCLAdder adder; //multiplier.test(); unsigned const n = 64; std::vector a(n, 1); std::vector r(n, 1); for (size_t i = 0; i < n; ++i) { a[i] = rand(); } adder.add(a.data(), r.data(), n); bool ok = true; int s = 0; for (size_t i=0 ; i