From 77ea99f5f1a8ed359d4d3546aacb2ff3b9cd432e Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Mon, 13 Nov 2017 09:56:25 +0800
Subject: [PATCH] Add dynamic build opencl kernel logic.

---
 mace/core/runtime/opencl/opencl_runtime.cc | 65 +++++++++++++++++++++-
 mace/core/runtime/opencl/opencl_runtime.h  | 17 +++++-
 mace/kernels/batch_norm.h                  | 20 ++++---
 mace/kernels/opencl/batch_norm_opencl.cc   | 10 ++--
 mace/proto/mace.proto                      |  1 +
 5 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 26bc60d8..02ce4415 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -83,6 +83,7 @@ bool BuildProgram(OpenCLRuntime *runtime,
 
 }  // namespace
 
+
 OpenCLRuntime *OpenCLRuntime::Get() {
   static std::once_flag init_once;
   static OpenCLRuntime *instance = nullptr;
@@ -140,7 +141,10 @@ OpenCLRuntime *OpenCLRuntime::Get() {
 OpenCLRuntime::OpenCLRuntime(cl::Context context,
                              cl::Device device,
                              cl::CommandQueue command_queue)
-    : context_(context), device_(device), command_queue_(command_queue) {}
+    : context_(context), device_(device), command_queue_(command_queue) {
+  const char *kernel_path = getenv("MACE_KERNEL_PATH");
+  kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
+}
 
 OpenCLRuntime::~OpenCLRuntime() {}
 
@@ -162,6 +166,65 @@ cl::Program &OpenCLRuntime::program() {
   return program_;
 }
 
+const std::unodered_map<std::string, std::string>
+    OpenCLRuntime::kernel_program_map_ = {
+  {"BatchNorm", "batch_norm.cl"}
+};
+
+bool OpenCLRuntime::BuildProgram(const std::string &kernel_name,
+                                 const std::string &build_options,
+                                 cl::Program *program) {
+  MACE_CHECK_NOTNULL(program);
+
+
+  cl::Program::Sources sources;
+  std::string filename = kernel_path_ + kernel_name;
+  std::string kernel_source;
+  MACE_CHECK(ReadSourceFile(filename, &kernel_source));
+  sources.push_back({kernel_source.c_str(), kernel_source.length()});
+
+  *program = cl::Program(this->context(), sources);
+  build_options += " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  cl_int ret = program->build({runtime->device()}, build_options.c_str());
+  if (ret != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
+      LOG(INFO) << "Program build log: " << build_log;
+    }
+    LOG(FATAL) << "Build program failed: " << ret;
+  }
+
+  return true;
+}
+
+cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name,
+                        const std::set<std::string> &build_options) {
+  auto kernel_program_it = kernel_program_map_.find(kernel_name);
+  if (kernel_program_it == kernel_program_map_.end()) {
+    MACE_CHECK(false, kernel_name, " opencl kernel doesn't exist.");
+  }
+
+  std::string program_name = kernel_program_it->second;
+  std::string build_options_str;
+  for(auto &option : build_options) {
+    build_options_str += " " + option;
+  }
+  std::string built_program_key = program_name + build_options_str;
+
+  auto built_program_it = built_program_map_.find(built_program_key);
+  cl::Program program;
+  if (built_program_it != built_program_map_.end()) {
+    program = built_program_it->second;
+  } else {
+    this->BuildProgram(kernel_name, build_options_str, &program);
+    built_program_map_.emplace(built_program_key, std::move(program));
+  }
+  return cl::Kernel(kernel_name, program);
+}
+
 uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   unsigned long long size = 0;
   device_.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index ed7d0c68..cdbd5d46 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -7,6 +7,7 @@
 
 #include <map>
 #include <mutex>
+#include <unordered_map>
 
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -17,12 +18,15 @@ class OpenCLRuntime {
  public:
   static OpenCLRuntime *Get();
 
-  uint32_t GetDeviceMaxWorkGroupSize();
-  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
   cl::Program &program();
+
+  uint32_t GetDeviceMaxWorkGroupSize();
+  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
+  cl::Kernel BuildKernel(const std::string &kernel_name,
+                         const std::set<std::string> &build_options);
  private:
   OpenCLRuntime(cl::Context context,
                 cl::Device device,
@@ -31,12 +35,21 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime&) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
 
+  bool BuildProgram(const std::string &kernel_name,
+                    const std::string &build_options,
+                    cl::Program *program);
+
  private:
   cl::Context context_;
   cl::Device device_;
   cl::CommandQueue command_queue_;
   cl::Program program_;
   std::once_flag build_flag_;
+  std::string kernel_path_;
+  static const std::unordered_map<std::string,
+               std::string> kernel_program_map_;
+  mutable std::unordered_map<std::string,
+          cl::Program> built_program_map_;
 };
 
 }  // namespace mace
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index cd3fb4b9..d860dcd8 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -76,15 +76,17 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *epsilon,
     Tensor *output);
 
-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *scale,
-    const Tensor *offset,
-    const Tensor *mean,
-    const Tensor *var,
-    const Tensor *epsilon,
-    Tensor *output);
+template <typename T>
+struct BatchNormFunctor<DeviceType::OPENCL, T> {
+  void operator()(
+      const Tensor *input,
+      const Tensor *scale,
+      const Tensor *offset,
+      const Tensor *mean,
+      const Tensor *var,
+      const Tensor *epsilon,
+      Tensor *output);
+};
 
 }  //  namepsace kernels
 }  //  namespace mace
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 86f15164..0c8cf342 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -10,8 +10,8 @@
 namespace mace {
 namespace kernels {
 
-template <>
-void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
+template <typename T>
+void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input,
     const Tensor *scale,
     const Tensor *offset,
@@ -27,10 +27,10 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
                            static_cast<uint32_t>(input->dim(1)),
                            static_cast<uint32_t>(blocks)};
 
-
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto bm_kernel = cl::Kernel(program, "batch_norm");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDataType=" + GetDataTypeFromEnum(input->dtype()));
+  auto bm_kernel = runtime->CreateKernel("batch_norm");
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
   const std::vector<uint32_t> lws = {1, 1, kwg_size};
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index ffcffd4f..d70318c3 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -23,6 +23,7 @@ enum DataType {
   DT_INT64 = 8;
   DT_UINT16 = 9;
   DT_BOOL = 10;
+  DT_HALF = 19;
 }
 
 message TensorProto {
-- 
GitLab