From 77ea99f5f1a8ed359d4d3546aacb2ff3b9cd432e Mon Sep 17 00:00:00 2001
From: liuqi
Date: Mon, 13 Nov 2017 09:56:25 +0800
Subject: [PATCH] Add dynamic build opencl kernel logic.

---
Review notes (this area is ignored when the patch is applied):

* The copy of this patch under review had lost its line breaks and the
  contents of every non-empty angle-bracket pair (template arguments,
  #include targets, author e-mail). Both were reconstructed here. Context
  lines, their whitespace and the positions of blank lines are a best
  effort -- TODO confirm against the tree before applying
  (git apply --ignore-whitespace may help). In particular the two existing
  includes in opencl_runtime.h are assumed to be <map> and <mutex>, and the
  OpenCL functor is assumed to be BatchNormFunctor<DeviceType::OPENCL, T>.
* The "index" hashes below are those of the original commit and no longer
  describe the post-image.
* Fixes relative to the original commit:
  - std::unodered_map -> std::unordered_map<std::string, std::string>.
  - OpenCLRuntime::BuildProgram no longer appends to its const reference
    parameter, no longer uses the undeclared names `path` and `runtime`,
    and spells out the getBuildInfo template arguments.
  - BuildKernel passes the program *file* name (not the kernel name) to
    BuildProgram, creates the kernel from the cached program instead of a
    moved-from local, and calls cl::Kernel(program, name) in the order the
    OpenCL C++ binding declares.
  - kernel_program_map_ is keyed by "batch_norm", the name the only caller
    asks for and the name handed to cl::Kernel.
  - batch_norm_opencl.cc keeps `auto runtime = OpenCLRuntime::Get();`,
    calls BuildKernel (CreateKernel does not exist) and actually passes
    built_options.
  - MACE_KERNEL_PATH unset or empty now resolves to "./" instead of "/".
  - opencl_runtime.h includes <set> and <string>; `mutable` dropped from
    built_program_map_ (no const member touches it).
  - Dropped the hunk that only inserted a stray blank line before
    OpenCLRuntime::Get().
* Not addressed (cannot be judged from this patch alone):
  - GetDataTypeFromEnum is not declared anywhere in this patch; the
    "-DDataType=" + ... expression presumes it returns std::string.
  - The templated operator() is defined in a .cc file; it presumably needs
    explicit instantiations for the element types in use -- confirm.

 mace/core/runtime/opencl/opencl_runtime.cc | 74 +++++++++++++++++++++-
 mace/core/runtime/opencl/opencl_runtime.h  | 20 +++++-
 mace/kernels/batch_norm.h                  | 20 ++++---
 mace/kernels/opencl/batch_norm_opencl.cc   |  9 ++-
 mace/proto/mace.proto                      |  1 +
 5 files changed, 108 insertions(+), 16 deletions(-)

diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 26bc60d8..02ce4415 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -140,7 +140,13 @@ OpenCLRuntime *OpenCLRuntime::Get() {
 OpenCLRuntime::OpenCLRuntime(cl::Context context,
                              cl::Device device,
                              cl::CommandQueue command_queue)
-    : context_(context), device_(device), command_queue_(command_queue) {}
+    : context_(context), device_(device), command_queue_(command_queue) {
+  // Fall back to the working directory when MACE_KERNEL_PATH is unset or
+  // empty; appending "/" to an empty string would point at the root directory.
+  const char *kernel_path = getenv("MACE_KERNEL_PATH");
+  const bool has_path = kernel_path != nullptr && kernel_path[0] != '\0';
+  kernel_path_ = std::string(has_path ? kernel_path : ".") + "/";
+}
 
 OpenCLRuntime::~OpenCLRuntime() {}
 
@@ -162,6 +168,72 @@ cl::Program &OpenCLRuntime::program() {
   return program_;
 }
 
+// Maps an OpenCL kernel function name to the source file that defines it.
+const std::unordered_map<std::string, std::string>
+    OpenCLRuntime::kernel_program_map_ = {
+  {"batch_norm", "batch_norm.cl"}
+};
+
+bool OpenCLRuntime::BuildProgram(const std::string &program_file_name,
+                                 const std::string &build_options,
+                                 cl::Program *program) {
+  MACE_CHECK_NOTNULL(program);
+
+  const std::string filename = kernel_path_ + program_file_name;
+  std::string kernel_source;
+  MACE_CHECK(ReadSourceFile(filename, &kernel_source));
+
+  cl::Program::Sources sources;
+  sources.push_back({kernel_source.c_str(), kernel_source.length()});
+
+  *program = cl::Program(this->context(), sources);
+  // build_options is a const reference, so compose the final flags in a copy.
+  const std::string build_options_str =
+      build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" +
+      kernel_path_;
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  cl_int ret = program->build({device()}, build_options_str.c_str());
+  if (ret != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
+      LOG(INFO) << "Program build log: " << build_log;
+    }
+    LOG(FATAL) << "Build program failed: " << ret;
+  }
+
+  return true;
+}
+
+cl::Kernel OpenCLRuntime::BuildKernel(
+    const std::string &kernel_name,
+    const std::set<std::string> &build_options) {
+  auto kernel_program_it = kernel_program_map_.find(kernel_name);
+  MACE_CHECK(kernel_program_it != kernel_program_map_.end(),
+             kernel_name, " opencl kernel doesn't exist.");
+
+  const std::string &program_name = kernel_program_it->second;
+  std::string build_options_str;
+  for (const auto &option : build_options) {
+    build_options_str += " " + option;
+  }
+  const std::string built_program_key = program_name + build_options_str;
+
+  // Build each (program, options) pair once and reuse the cached program.
+  auto built_program_it = built_program_map_.find(built_program_key);
+  if (built_program_it == built_program_map_.end()) {
+    cl::Program program;
+    this->BuildProgram(program_name, build_options_str, &program);
+    built_program_it =
+        built_program_map_.emplace(built_program_key, std::move(program))
+            .first;
+  }
+  // Create the kernel from the cached entry: the local program has been moved
+  // from, and cl::Kernel takes the program first and the kernel name second.
+  return cl::Kernel(built_program_it->second, kernel_name.c_str());
+}
+
 uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   unsigned long long size = 0;
   device_.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index ed7d0c68..cdbd5d46 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -7,6 +7,9 @@
 
 #include <map>
 #include <mutex>
+#include <set>
+#include <string>
+#include <unordered_map>
 
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -17,12 +20,17 @@ class OpenCLRuntime {
  public:
   static OpenCLRuntime *Get();
 
-  uint32_t GetDeviceMaxWorkGroupSize();
-  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
   cl::Program &program();
+
+  uint32_t GetDeviceMaxWorkGroupSize();
+  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
+  // Returns the kernel named kernel_name, building (and caching) its program
+  // with build_options on first use.
+  cl::Kernel BuildKernel(const std::string &kernel_name,
+                         const std::set<std::string> &build_options);
  private:
   OpenCLRuntime(cl::Context context,
                 cl::Device device,
@@ -31,12 +39,20 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime&) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
 
+  bool BuildProgram(const std::string &program_file_name,
+                    const std::string &build_options,
+                    cl::Program *program);
+
  private:
   cl::Context context_;
   cl::Device device_;
   cl::CommandQueue command_queue_;
   cl::Program program_;
   std::once_flag build_flag_;
+  std::string kernel_path_;
+  static const std::unordered_map<std::string, std::string>
+      kernel_program_map_;
+  std::unordered_map<std::string, cl::Program> built_program_map_;
 };
 
 } // namespace mace
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index cd3fb4b9..d860dcd8 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -76,15 +76,17 @@ void BatchNormFunctor::operator()(
     const Tensor *epsilon,
     Tensor *output);
 
-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *scale,
-    const Tensor *offset,
-    const Tensor *mean,
-    const Tensor *var,
-    const Tensor *epsilon,
-    Tensor *output);
+template <typename T>
+struct BatchNormFunctor<DeviceType::OPENCL, T> {
+  void operator()(
+      const Tensor *input,
+      const Tensor *scale,
+      const Tensor *offset,
+      const Tensor *mean,
+      const Tensor *var,
+      const Tensor *epsilon,
+      Tensor *output);
+};
 
 } // namepsace kernels
 } // namespace mace
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 86f15164..0c8cf342 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -10,8 +10,8 @@
 namespace mace {
 namespace kernels {
 
-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input,
     const Tensor *scale,
     const Tensor *offset,
@@ -27,10 +27,11 @@ void BatchNormFunctor::operator()(
       static_cast<uint32_t>(input->dim(1)),
       static_cast<uint32_t>(blocks)};
 
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto bm_kernel = cl::Kernel(program, "batch_norm");
 
+  std::set<std::string> built_options;
+  built_options.emplace("-DDataType=" + GetDataTypeFromEnum(input->dtype()));
+  auto bm_kernel = runtime->BuildKernel("batch_norm", built_options);
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
   const std::vector<uint32_t> lws = {1, 1, kwg_size};
 
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index ffcffd4f..d70318c3 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -23,6 +23,7 @@ enum DataType {
   DT_INT64 = 8;
   DT_UINT16 = 9;
   DT_BOOL = 10;
+  DT_HALF = 19;
 }
 
 message TensorProto {
-- 
GitLab