// This shows a simple usage of OpenCL #include #define CL_HPP_TARGET_OPENCL_VERSION 200 #define CL_HPP_MINIMUM_OPENCL_VERSION 200 #include #include #include #include #include #include #include #include #include #include class OpenCLMatrixMultiplier { public: OpenCLMatrixMultiplier(); void multiplyMatrices(int const* a, int const* b, int* r, int rowsA, int colsA, int colsB); private: void exitIfError(cl_int errorCode, char const* msg); void printAllDevices(); void printDeviceProperties(cl::Device const& device); const char* getErrorString(cl_int error); std::unique_ptr m_pContext; std::vector m_devices; std::string m_progText; std::unique_ptr m_pSources; std::unique_ptr m_pProgram; std::unique_ptr m_pQueue; }; OpenCLMatrixMultiplier::OpenCLMatrixMultiplier() { cl_int err; //printAllDevices(); m_pContext.reset(new cl::Context(CL_DEVICE_TYPE_GPU, nullptr, nullptr/*&errCallback*/, nullptr, &err)); exitIfError(err, "Conext::Context()"); m_devices = m_pContext->getInfo(); exitIfError(m_devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0"); std::cerr << "Using " << m_devices.size() << " device(s)\n"; for (cl::Device const& device : m_devices) { printDeviceProperties(device); } m_progText = "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n" "__kernel void hello(__global int const* a, __global int const* bt, __global int* r, long rCols, long rSize, long midSize) {\n" " int nrKernels = get_global_size(0)*get_global_size(1);\n" " int thisKernel = get_global_id(0)*get_global_size(1)+get_global_id(1);\n" " long i;\n" " long j;\n" " long chunkSize = (rSize+nrKernels-1)/nrKernels;\n" " long baseOutIdx = thisKernel*chunkSize;\n" " for(i = 0 ; ibuild(m_devices, ""); exitIfError(err, "program build()"); m_pQueue.reset(new cl::CommandQueue(*m_pContext, m_devices[0], 0, &err)); exitIfError(err, "CommandQueue::CommandQueue()"); } void OpenCLMatrixMultiplier::multiplyMatrices(int const* a, int const* b, int* r, int rowsA, int colsA, int colsB) { cl_int err; cl::Kernel kernel(*m_pProgram, "hello", &err); exitIfError(err, "Kernel::Kernel()"); cl::Buffer aBuf(*m_pContext, CL_MEM_READ_ONLY, rowsA * colsA * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); cl::Buffer bBuf(*m_pContext, CL_MEM_READ_ONLY, colsA * colsB * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); std::vector afterCopyIn(2); err = m_pQueue->enqueueWriteBuffer(aBuf, CL_FALSE, 0, 4 * rowsA * colsA, a, nullptr, &afterCopyIn[0]); exitIfError(err, "ComamndQueue::enqueueWriteBuffer()"); std::vector bTransposed(colsA * colsB); for (size_t i = 0; i < colsA; ++i) { for (size_t j = 0; j < colsB; ++j) { bTransposed[j * colsA + i] = b[i * colsB + j]; } } err = m_pQueue->enqueueWriteBuffer(bBuf, CL_FALSE, 0, 4 * colsA * colsB, bTransposed.data(), nullptr, &afterCopyIn[1]); exitIfError(err, "ComamndQueue::enqueueWriteBuffer()"); cl::Buffer rBuf(*m_pContext, CL_MEM_READ_WRITE, rowsA * colsB * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); err = kernel.setArg(0, aBuf); exitIfError(err, "Kernel::setArg(0)"); err = kernel.setArg(1, bBuf); exitIfError(err, "Kernel::setArg(1)"); err = kernel.setArg(2, rBuf); exitIfError(err, "Kernel::setArg(2)"); err = kernel.setArg(3, int64_t(colsB)); exitIfError(err, "Kernel::setArg(3)"); err = kernel.setArg(4, int64_t(rowsA*colsB)); exitIfError(err, "Kernel::setArg(4)"); err = kernel.setArg(5, int64_t(colsA)); exitIfError(err, "Kernel::setArg(5)"); int64_t nrProcs = 1024; std::vector afterKernelExec(1); // afterCopyIn event will make the kernel execution wait for the buffer copy err = m_pQueue->enqueueNDRangeKernel(kernel, cl::NDRange(0,0), cl::NDRange(1024,1024), cl::NDRange(16,16), &afterCopyIn, &afterKernelExec[0]); exitIfError(err, "ComamndQueue::enqueueNDRangeKernel()"); std::vector afterCopyOut(1); err = m_pQueue->enqueueReadBuffer(rBuf, CL_FALSE, 0, 4 * rowsA * colsB, r, &afterKernelExec, &afterCopyOut[0]); exitIfError(err, "ComamndQueue::enqueueReadBuffer()"); err = m_pQueue->flush(); exitIfError(err, "ComamndQueue::flush()"); // Wait for the copy from GPU buffers to host memory to be completed afterCopyOut[0].wait(); } const char* OpenCLMatrixMultiplier::getErrorString(cl_int error) { switch (error) { // run-time and JIT compiler errors case CL_SUCCESS: return "CL_SUCCESS"; case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; case -8: return "CL_MEM_COPY_OVERLAP"; case -9: return "CL_IMAGE_FORMAT_MISMATCH"; case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; case -11: return "CL_BUILD_PROGRAM_FAILURE"; case -12: return "CL_MAP_FAILURE"; case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; case -15: return "CL_COMPILE_PROGRAM_FAILURE"; case -16: return "CL_LINKER_NOT_AVAILABLE"; case -17: return "CL_LINK_PROGRAM_FAILURE"; case -18: return "CL_DEVICE_PARTITION_FAILED"; case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; // compile-time errors case -30: return "CL_INVALID_VALUE"; case -31: return "CL_INVALID_DEVICE_TYPE"; case -32: return "CL_INVALID_PLATFORM"; case -33: return "CL_INVALID_DEVICE"; case -34: return "CL_INVALID_CONTEXT"; case -35: return "CL_INVALID_QUEUE_PROPERTIES"; case -36: return "CL_INVALID_COMMAND_QUEUE"; case -37: return "CL_INVALID_HOST_PTR"; case -38: return "CL_INVALID_MEM_OBJECT"; case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; case -40: return "CL_INVALID_IMAGE_SIZE"; case -41: return "CL_INVALID_SAMPLER"; case -42: return "CL_INVALID_BINARY"; case -43: return "CL_INVALID_BUILD_OPTIONS"; case -44: return "CL_INVALID_PROGRAM"; case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; case -46: return "CL_INVALID_KERNEL_NAME"; case -47: return "CL_INVALID_KERNEL_DEFINITION"; case -48: return "CL_INVALID_KERNEL"; case -49: return "CL_INVALID_ARG_INDEX"; case -50: return "CL_INVALID_ARG_VALUE"; case -51: return "CL_INVALID_ARG_SIZE"; case -52: return "CL_INVALID_KERNEL_ARGS"; case -53: return "CL_INVALID_WORK_DIMENSION"; case -54: return "CL_INVALID_WORK_GROUP_SIZE"; case -55: return "CL_INVALID_WORK_ITEM_SIZE"; case -56: return "CL_INVALID_GLOBAL_OFFSET"; case -57: return "CL_INVALID_EVENT_WAIT_LIST"; case -58: return "CL_INVALID_EVENT"; case -59: return "CL_INVALID_OPERATION"; case -60: return "CL_INVALID_GL_OBJECT"; case -61: return "CL_INVALID_BUFFER_SIZE"; case -62: return "CL_INVALID_MIP_LEVEL"; case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; case -64: return "CL_INVALID_PROPERTY"; case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; case -66: return "CL_INVALID_COMPILER_OPTIONS"; case -67: return "CL_INVALID_LINKER_OPTIONS"; case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; // extension errors case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; default: return "Unknown OpenCL error"; } } void OpenCLMatrixMultiplier::exitIfError(cl_int errorCode, char const* msg) { if (errorCode != CL_SUCCESS) { fprintf(stderr, "OpenCL error at %s: %s(%d)\n", msg, getErrorString(errorCode), errorCode); exit(1); } } void OpenCLMatrixMultiplier::printAllDevices() { cl_int err; std::vector platformList; err = cl::Platform::get(&platformList); exitIfError(err, "cl::Platform::get()"); std::cerr << "Found " << platformList.size() << " OpenCL platforms:\n"; for (cl::Platform& platform : platformList) { std::cerr << "Platform name " << platform.getInfo() << ", vendor " << platform.getInfo() << ", version " << platform.getInfo() << "\n"; std::vector deviceList; err = platform.getDevices(CL_DEVICE_TYPE_ALL, &deviceList); exitIfError(err, "cl::Platform::getDevices()"); std::cerr << " platform has " << deviceList.size() << " device(s):\n"; for (cl::Device& device : deviceList) { printDeviceProperties(device); } } } void OpenCLMatrixMultiplier::printDeviceProperties(cl::Device const& device) { std::cerr << " type = " << device.getInfo() << "\n"; std::cerr << " name = " << device.getInfo() << "\n"; std::cerr << " vendor = " << device.getInfo() << "\n"; std::cerr << " platform = " << device.getInfo() << "\n"; std::cerr << " version = " << device.getInfo() << "\n"; std::cerr << " driver-version = " << device.getInfo() << "\n"; std::cerr << " max CU = " << device.getInfo() << "\n"; std::cerr << " max WI dimensions = " << device.getInfo() << ":"; for (auto dim : device.getInfo()) { std::cerr << " " << dim; } std::cerr << "\n"; std::cerr << " max WG size = " << device.getInfo() << "\n"; } int main(void) { OpenCLMatrixMultiplier multiplier; //multiplier.test(); unsigned const n = 2048; std::vector a(n * n, 1); std::vector b(n * n, 1); std::vector r(n * n, 1); for (size_t i = 0; i < n * n; ++i) { a[i] = rand(); b[i] = rand(); } std::chrono::system_clock::time_point before = std::chrono::system_clock::now(); multiplier.multiplyMatrices(a.data(), b.data(), r.data(), n, n, n); std::chrono::system_clock::time_point after = std::chrono::system_clock::now(); bool ok = true; std::chrono::system_clock::time_point beforeTest = std::chrono::system_clock::now(); std::vector bTransposed(n * n); for (size_t i = 0; i < n; ++i) { for (size_t j = 0; j < n; ++j) { bTransposed[j * n + i] = b[i * n + j]; } } for (size_t i = 0; i < n; ++i) { for (size_t j = 0; j < n; ++j) { int s = 0; for (size_t k = 0; k < n; ++k) { s += a[i * n + k] * bTransposed[j * n + k]; } if (s != r[i * n + j]) { ok = false; } } } std::chrono::system_clock::time_point afterTest = std::chrono::system_clock::now(); //for (cl_int v : r) { // std::cout << v << " "; //} //std::cout << "\n"; std::cout << "Ok=" << ok << "\n"; std::cout << "Time OpenCL=" << std::chrono::duration_cast(after-before).count() << " ms\n"; std::cout << "Time classical=" << std::chrono::duration_cast(afterTest - beforeTest).count() << " ms\n"; return EXIT_SUCCESS; }