From f07dd5163e94af61be16e13bf0b1869292cf0d2c Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Thu, 16 Nov 2017 14:49:55 +0800
Subject: [PATCH] Refactor the opencl kernel build logic.

---
 mace/core/runtime/opencl/opencl_runtime.cc    | 108 ++++---------
 mace/core/runtime/opencl/opencl_runtime.h     |  14 +-
 mace/core/types.cc                            |  29 ++++
 mace/core/types.h                             |   3 +
 mace/kernels/batch_norm.h                     |   6 +-
 mace/kernels/opencl/addn.cc                   |   6 +-
 mace/kernels/opencl/batch_norm_opencl.cc      |  10 +-
 mace/kernels/opencl/cl/addn.cl                |  13 +-
 mace/kernels/opencl/cl/batch_norm.cl          |  26 ++--
 mace/kernels/opencl/cl/common.h               |   3 +
 mace/kernels/opencl/cl/conv_2d_1x1.cl         | 131 ++++++++--------
 mace/kernels/opencl/cl/conv_2d_3x3.cl         | 120 +++++++++++----
 mace/kernels/opencl/cl/conv_helper.cl         |  41 -----
 mace/kernels/opencl/cl/conv_helper.h          |  15 --
 mace/kernels/opencl/cl/depthwise_conv_3x3.cl  | 123 +++++++++++----
 mace/kernels/opencl/cl/pooling.cl             | 145 +++++++++---------
 mace/kernels/opencl/cl/relu.cl                |  21 +--
 mace/kernels/opencl/cl/resize_bilinear.cl     |  23 +--
 mace/kernels/opencl/cl/space_to_batch.cl      |  25 +--
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     |  12 +-
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     |  13 +-
 .../opencl/depthwise_conv_opencl_3x3.cc       |  13 +-
 mace/kernels/opencl/pooling_opencl.cc         |  30 ++--
 mace/kernels/opencl/relu_opencl.cc            |   7 +-
 mace/kernels/opencl/resize_bilinear_opencl.cc |   5 +-
 mace/kernels/opencl/space_to_batch_opecl.cc   |   6 +-
 mace/proto/mace.proto                         |   1 +
 27 files changed, 509 insertions(+), 440 deletions(-)
 delete mode 100644 mace/kernels/opencl/cl/conv_helper.cl
 delete mode 100644 mace/kernels/opencl/cl/conv_helper.h
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 02ce4415..32a7bcc2 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -7,8 +7,6 @@
 #include <memory>
 #include <mutex>
 
-#include <dirent.h>
-
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 
@@ -32,55 +30,6 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
   return true;
 }
 
-bool BuildProgram(OpenCLRuntime *runtime,
-                  const std::string &path,
-                  cl::Program *program) {
-  MACE_CHECK_NOTNULL(program);
-
-  auto closer = [](DIR *d) {
-    if (d != nullptr) closedir(d);
-  };
-  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
-  MACE_CHECK_NOTNULL(dir.get());
-
-  const std::string kSourceSuffix = ".cl";
-  cl::Program::Sources sources;
-  errno = 0;
-  dirent *entry = readdir(dir.get());
-  MACE_CHECK(errno == 0);
-  while (entry != nullptr) {
-    if (entry->d_type == DT_REG) {
-      std::string d_name(entry->d_name);
-      if (d_name.size() > kSourceSuffix.size() &&
-          d_name.compare(d_name.size() - kSourceSuffix.size(),
-                         kSourceSuffix.size(), kSourceSuffix) == 0) {
-        std::string filename = path + d_name;
-        std::string kernel_source;
-        MACE_CHECK(ReadSourceFile(filename, &kernel_source));
-        sources.push_back({kernel_source.c_str(), kernel_source.length()});
-      }
-    }
-    entry = readdir(dir.get());
-    MACE_CHECK(errno == 0);
-  };
-
-  *program = cl::Program(runtime->context(), sources);
-  std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
-  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
-  cl_int ret = program->build({runtime->device()}, build_options.c_str());
-  if (ret != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
-        CL_BUILD_ERROR) {
-      std::string build_log =
-          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
-      LOG(INFO) << "Program build log: " << build_log;
-    }
-    LOG(FATAL) << "Build program failed: " << ret;
-  }
-
-  return true;
-}
-
 }  // namespace
 
 
@@ -156,58 +105,57 @@ cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
 
 cl::Program &OpenCLRuntime::program() {
   // TODO(heliangliang) Support binary format
-  static const char *kernel_path = getenv("MACE_KERNEL_PATH");
-  std::string path(kernel_path == nullptr ? "" : kernel_path);
-
-  std::call_once(build_flag_, [this, &path]() {
-    MACE_CHECK(BuildProgram(this, path, &program_));
-  });
-
   return program_;
 }
-
-const std::unodered_map<std::string, std::string>
-    OpenCLRuntime::kernel_program_map_ = {
-  {"BatchNorm", "batch_norm.cl"}
+const std::map<std::string, std::string>
+    OpenCLRuntime::program_map_ = {
+  {"addn", "addn.cl"},
+  {"batch_norm", "batch_norm.cl"},
+  {"conv_2d_1x1", "conv_2d_1x1.cl"},
+  {"conv_2d_3x3", "conv_2d_3x3.cl"},
+  {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
+  {"pooling", "pooling.cl"},
+  {"relu", "relu.cl"},
+  {"resize_bilinear", "resize_bilinear.cl"},
+  {"space_to_batch", "space_to_batch.cl"},
 };
 
-bool OpenCLRuntime::BuildProgram(const std::string &kernel_name,
+void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
                                  const std::string &build_options,
                                  cl::Program *program) {
   MACE_CHECK_NOTNULL(program);
 
-
   cl::Program::Sources sources;
-  std::string filename = kernel_path_ + kernel_name;
+  std::string filename = kernel_path_ + program_file_name;
   std::string kernel_source;
   MACE_CHECK(ReadSourceFile(filename, &kernel_source));
   sources.push_back({kernel_source.c_str(), kernel_source.length()});
 
   *program = cl::Program(this->context(), sources);
-  build_options += " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
+  std::string build_options_str = build_options +
+      " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + kernel_path_;
   // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
-  cl_int ret = program->build({runtime->device()}, build_options.c_str());
+  cl_int ret = program->build({device()}, build_options_str.c_str());
   if (ret != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
         CL_BUILD_ERROR) {
       std::string build_log =
-          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
       LOG(INFO) << "Program build log: " << build_log;
     }
     LOG(FATAL) << "Build program failed: " << ret;
   }
-
-  return true;
 }
 
-cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name,
-                        const std::set<std::string> &build_options) {
-  auto kernel_program_it = kernel_program_map_.find(kernel_name);
-  if (kernel_program_it == kernel_program_map_.end()) {
-    MACE_CHECK(false, kernel_name, " opencl kernel doesn't exist.");
+cl::Kernel OpenCLRuntime::BuildKernel(const std::string &program_name,
+                                      const std::string &kernel_name,
+                                      const std::set<std::string> &build_options) {
+  auto kernel_program_it = program_map_.find(program_name);
+  if (kernel_program_it == program_map_.end()) {
+    MACE_CHECK(false, program_name, " opencl kernel doesn't exist.");
   }
 
-  std::string program_name = kernel_program_it->second;
+  std::string program_file_name = kernel_program_it->second;
   std::string build_options_str;
   for(auto &option : build_options) {
     build_options_str += " " + option;
@@ -219,10 +167,10 @@ cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name,
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(kernel_name, build_options_str, &program);
-    built_program_map_.emplace(built_program_key, std::move(program));
+    this->BuildProgram(program_file_name, build_options_str, &program);
+    built_program_map_.emplace(built_program_key, program);
   }
-  return cl::Kernel(kernel_name, program);
+  return cl::Kernel(program, kernel_name.c_str());
 }
 
 uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index cdbd5d46..cd62ca8d 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -7,7 +7,7 @@
 
 #include <map>
 #include <mutex>
-#include <unordered_map>
+#include <set>
 
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -25,7 +25,8 @@ class OpenCLRuntime {
 
   uint32_t GetDeviceMaxWorkGroupSize();
   uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
-  cl::Kernel BuildKernel(const std::string &kernel_name,
+  cl::Kernel BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
                          const std::set<std::string> &build_options);
  private:
   OpenCLRuntime(cl::Context context,
@@ -35,7 +36,7 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime&) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
 
-  bool BuildProgram(const std::string &kernel_name,
+  void BuildProgram(const std::string &kernel_name,
                     const std::string &build_options,
                     cl::Program *program);
 
@@ -44,11 +45,10 @@ class OpenCLRuntime {
   cl::Device device_;
   cl::CommandQueue command_queue_;
   cl::Program program_;
-  std::once_flag build_flag_;
   std::string kernel_path_;
-  static const std::unordered_map<std::string,
-               std::string> kernel_program_map_;
-  mutable std::unordered_map<std::string,
+  static const std::map<std::string,
+               std::string> program_map_;
+  mutable std::map<std::string,
           cl::Program> built_program_map_;
 };
 
diff --git a/mace/core/types.cc b/mace/core/types.cc
index 8ad8fba9..18b5d6a8 100644
--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -12,6 +12,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
     case DT_DOUBLE:
     case DT_INT32:
     case DT_INT64:
+    case DT_UINT32:
     case DT_UINT16:
     case DT_UINT8:
     case DT_INT16:
@@ -23,4 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }
 
+std::string DataTypeToCLType(const DataType dt) {
+  switch (dt) {
+    case DT_FLOAT:
+      return "float";
+    case DT_HALF:
+      return "half";
+    case DT_UINT8:
+      return "uchar";
+    case DT_INT8:
+      return "char";
+    case DT_DOUBLE:
+      return "double";
+    case DT_INT32:
+      return "int";
+    case DT_UINT32:
+      return "int";
+    case DT_UINT16:
+      return "ushort";
+    case DT_INT16:
+      return "short";
+    case DT_INT64:
+      return "long";
+    default:
+      LOG(FATAL) << "Unsupported data type";
+      return "";
+  }
+}
+
 }  //  namespace mace
\ No newline at end of file
diff --git a/mace/core/types.h b/mace/core/types.h
index 2f354a14..a592ad9d 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -12,6 +12,8 @@ namespace mace {
 
 bool DataTypeCanUseMemcpy(DataType dt);
 
+std::string DataTypeToCLType(const DataType dt);
+
 template <class T>
 struct IsValidDataType;
 
@@ -50,6 +52,7 @@ MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
 MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
 MATCH_TYPE_AND_ENUM(string, DT_STRING);
 MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
+MATCH_TYPE_AND_ENUM(uint32_t, DT_UINT32);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
 
 static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index d860dcd8..b95d4895 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -76,9 +76,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
     const Tensor *epsilon,
     Tensor *output);
 
-template <typename T>
-struct BatchNormFunctor<DeviceType::OPENCL, T> {
-  void operator()(
+template <>
+void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
       const Tensor *input,
       const Tensor *scale,
       const Tensor *offset,
@@ -86,7 +85,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> {
       const Tensor *var,
       const Tensor *epsilon,
       Tensor *output);
-};
 
 }  //  namepsace kernels
 }  //  namespace mace
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 24d084ca..b906c92d 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -15,9 +15,9 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
   const uint32_t gws = blocks;
 
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-
-  auto addn_kernel = cl::Kernel(program, "add2");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
+  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
 
   const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 0c8cf342..badb3e7e 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -10,8 +10,8 @@
 namespace mace {
 namespace kernels {
 
-template <typename T>
-void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
+template <>
+void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
     const Tensor *input,
     const Tensor *scale,
     const Tensor *offset,
@@ -29,8 +29,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
-  built_options.emplace("-DDataType=" + GetDataTypeFromEnum(input->dtype()));
-  auto bm_kernel = runtime->CreateKernel("batch_norm");
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
   const std::vector<uint32_t> lws = {1, 1, kwg_size};
@@ -63,7 +63,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
         cl::NDRange(gws[0], gws[1], gws[2]),
         cl::NDRange(params[0], params[1], params[2]));
 
-    MACE_CHECK(error == CL_SUCCESS);
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index eb1be1ca..55c8d0bf 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -1,7 +1,10 @@
-__kernel void add2(__global const float *input0,
-                   __global const float *input1,
+#include <common.h>
+
+// Supported data type: half/float
+__kernel void add2(__global const DATA_TYPE *input0,
+                   __global const DATA_TYPE *input1,
                    __private const int size,
-                   __global float *output) {
+                   __global DATA_TYPE *output) {
   int idx = get_global_id(0);
 
   if (idx + 4 > size) {
@@ -9,8 +12,8 @@ __kernel void add2(__global const float *input0,
       *(output+idx) = *(input0+idx) + *(input1+idx);
     }
   } else {
-    float4 in_data0 = vload4(idx, input0);
-    float4 in_data1 = vload4(idx, input1);
+    VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
+    VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
     vstore4(in_data0+in_data1, idx, output);
   }
 }
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 6cc2a2e0..e6a52d49 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -1,13 +1,15 @@
-void kernel batch_norm(global const float *input,
-                       global const float *scale,
-                       global const float *offset,
-                       global const float *mean,
-                       global const float *var,
-                       global const float *epsilon,
+#include <common.h>
+// Supported data types: half/float
+void kernel batch_norm(global const DATA_TYPE *input,
+                       global const DATA_TYPE *scale,
+                       global const DATA_TYPE *offset,
+                       global const DATA_TYPE *mean,
+                       global const DATA_TYPE *var,
+                       global const DATA_TYPE *epsilon,
                        private const int pixels,
-                       global float *output,
-                       __local float4 *new_scale,
-                       __local float4 *new_offset) {
+                       global DATA_TYPE *output,
+                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale,
+                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) {
   const int batch = get_global_id(0);
   const int channel = get_global_id(1);
   const int channels = get_global_size(1);
@@ -23,8 +25,8 @@ void kernel batch_norm(global const float *input,
   barrier(CLK_LOCAL_MEM_FENCE);
 
   const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4;
-  const float *input_ptr = input + image_offset;
-  float *output_ptr = output + image_offset;
+  const DATA_TYPE *input_ptr = input + image_offset;
+  DATA_TYPE *output_ptr = output + image_offset;
   const int end = (batch * channels + channel + 1) * pixels;
   if ((image_offset+4) > end) {
     for (int i = image_offset; i < end; ++i) {
@@ -33,7 +35,7 @@ void kernel batch_norm(global const float *input,
       ++output_ptr;
     }
   } else {
-    float4 values = vload4(0, input_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr);
     values = values * new_scale[local_channel] + new_offset[local_channel];
     vstore4(values, 0, output_ptr);
   }
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index 74c5b67a..963ff740 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -8,4 +8,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 
+#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
+#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 8025074f..6fcc863a 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -25,31 +25,31 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
 }
 
 #define vec_conv_2d_1x1_s1                    \
-  float4 in0 = vload4(0, input_ptr);                   \
-  float4 in1 = vload4(0, input_ptr + in_pixel);        \
-  float4 in2 = vload4(0, input_ptr + 2 * in_pixel);    \
-  float4 in3 = vload4(0, input_ptr + 3 * in_pixel);
+  VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr);                   \
+  VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel);        \
+  VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel);    \
+  VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
 
 
 #define vec_conv_2d_1x1_s2                    \
-  float4 in00 = vload4(0, input_ptr);                   \
-  float3 in01 = vload3(0, input_ptr + 4);               \
-  float4 in10 = vload4(0, input_ptr + in_pixel);        \
-  float3 in11 = vload3(0, input_ptr + in_pixel + 4);    \
-  float4 in20 = vload4(0, input_ptr + 2 * in_pixel);    \
-  float3 in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
-  float4 in30 = vload4(0, input_ptr + 3 * in_pixel);    \
-  float3 in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
-  float4 in0 = (float4)(in00.s02, in01.s02);            \
-  float4 in1 = (float4)(in10.s02, in11.s02);            \
-  float4 in2 = (float4)(in20.s02, in21.s02);            \
-  float4 in3 = (float4)(in30.s02, in31.s02);
+  VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr);                   \
+  VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4);               \
+  VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel);        \
+  VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4);    \
+  VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel);    \
+  VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
+  VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel);    \
+  VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
+  VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02);            \
+  VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02);            \
+  VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02);            \
+  VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
 
 
 #define vec_conv_2d_1x1_compute_loop  \
   for (int oc = 0; oc < 4; ++oc) {                             \
-    float4 weights = vload4(0, filter_ptr + oc * in_chan_num); \
-    float4 out = vload4(0, output_ptr + oc * out_pixel);       \
+    VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
+    VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel);       \
     out += in0 * weights.x;                                    \
     out += in1 * weights.y;                                     \
     out += in2 * weights.z;                                     \
@@ -58,25 +58,27 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
   }
 
 #define vec_conv_2d_1x1_compute  \
-    float4 weights = vload4(0, filter_ptr); \
-    float4 out = vload4(0, output_ptr);       \
+    VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
+    VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr);       \
     out += in0 * weights.x;                                    \
     out += in1 * weights.y;                                     \
     out += in2 * weights.z;                                     \
     out += in3 * weights.w;                                     \
     vstore4(out, 0, output_ptr);
 
-__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
-                             __global const float *filter, /* o, i, kh, kw */
-                             __global const float *bias, /* o */
-                             __global float *output, /* n, c, h, w */
+// Supported data type: half/float
+__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
+                             __global const DATA_TYPE *filter, /* o, i, kh, kw */
+#ifdef BIAS
+                             __global const DATA_TYPE *bias, /* o */
+#endif /* defined(BIAS) */
+                             __global DATA_TYPE *output, /* n, c, h, w */
                              __private const int in_chan_num,
                              __private const int out_chan_num,
                              __private const int in_height,
                              __private const int in_width,
                              __private const int out_height,
-                             __private const int out_width,
-                             __private const int stride) {
+                             __private const int out_width) {
   int batch = get_global_id(0);
   int out_chan_blk = get_global_id(1);
   int out_pixel_blk = get_global_id(2);
@@ -92,20 +94,30 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
   const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
   const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
   const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
+
+#ifdef STRIDE_1
+  const int stride = 1;
+#else
+  const int stride = 2;
+#endif
   const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
 
   const int in_offset = batch * in_chan_num * in_pixel;
   const int out_offset = batch * out_chan_num * out_pixel;
 
-  const float *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
 
   int out_chan_len = out_chan_end - out_chan_begin;
   int pixel_len = out_pixel_end - out_pixel_begin;
 
   for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
-    float *output_ptr = output_base + out_chan * out_pixel;
-    float bias_value = bias == NULL ? 0 : bias[out_chan];
+    DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
+#ifdef BIAS
+    DATA_TYPE bias_value = bias[out_chan];
+#else
+    DATA_TYPE bias_value = 0;
+#endif
     for (int p = 0; p < pixel_len; ++p) {
       output_ptr[p] = bias_value;
     }
@@ -113,48 +125,37 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
 
   int in_chan = 0;
   if (pixel_len == 4) {
-    if (stride == 1) {
-      for (; in_chan + 3 < in_chan_num; in_chan += 4) {
-        const float *input_ptr = input_base + in_chan * in_pixel;
-        int out_chan = out_chan_begin;
-        for (; out_chan + 3 < out_chan_end; out_chan += 4) {
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-          float *output_ptr = output_base + out_chan * out_pixel;
-          vec_conv_2d_1x1_s1;
-          vec_conv_2d_1x1_compute_loop;
-        }
-        for (; out_chan < out_chan_end; ++out_chan) {
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-          float *output_ptr = output_base + out_chan * out_pixel;
-          vec_conv_2d_1x1_s1;
-          vec_conv_2d_1x1_compute;
-        }
+    for (; in_chan + 3 < in_chan_num; in_chan += 4) {
+      const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
+      int out_chan = out_chan_begin;
+      for (; out_chan + 3 < out_chan_end; out_chan += 4) {
+        const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
+        DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
+#ifdef STRIDE_1
+        vec_conv_2d_1x1_s1;
+#else
+        vec_conv_2d_1x1_s2;
+#endif
+        vec_conv_2d_1x1_compute_loop;
       }
-    } else if (stride == 2) {
-      for (; in_chan + 3 < in_chan_num; in_chan += 4) {
-        const float *input_ptr = input_base + in_chan * in_pixel;
-        int out_chan = out_chan_begin;
-        for (; out_chan + 3 < out_chan_end; out_chan += 4) {
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-          float *output_ptr = output_base + out_chan * out_pixel;
-          vec_conv_2d_1x1_s2;
-          vec_conv_2d_1x1_compute_loop;
-        }
-        for (; out_chan < out_chan_end; ++out_chan) {
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-          float *output_ptr = output_base + out_chan * out_pixel;
-          vec_conv_2d_1x1_s2;
-          vec_conv_2d_1x1_compute;
-        }
+      for (; out_chan < out_chan_end; ++out_chan) {
+        const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
+        DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
+#ifdef STRIDE_1
+        vec_conv_2d_1x1_s1;
+#else
+        vec_conv_2d_1x1_s2;
+#endif
+        vec_conv_2d_1x1_compute;
       }
     }
   }
 
   for (; in_chan < in_chan_num; ++in_chan) {
-    const float *input_ptr = input_base + in_chan * in_pixel;
+    const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
     for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
-      float weights = filter[out_chan * in_chan_num + in_chan];
-      float *output_ptr = output_base + out_chan * out_pixel;
+      DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
+      DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
 
       for (int p = 0; p < pixel_len; ++p) {
         float in = input_ptr[p*stride];
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 317daaaf..8962fdad 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,16 +1,60 @@
-#include <conv_helper.h>
-void kernel conv_2d_3x3(global const float *input,
-                        global const float *filter,
-                        global const float *bias,
-                        global float *output,
+#include <common.h>
+
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+
+// Supported data type: half/float
+DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
+                  const DATA_TYPE *filter_ptr,
+                  const int row_width) {
+  VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(1, filter_ptr);
+  res += input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(2, filter_ptr);
+  res += input_value * filter_value;
+
+  return res.s0 + res.s1 + res.s2;
+}
+
+void kernel conv_2d_3x3(global const DATA_TYPE *input,
+                        global const DATA_TYPE *filter,
+#ifdef BIAS
+                        global const DATA_TYPE *bias,
+#endif
+                        global DATA_TYPE *output,
                         private const int in_chan_num,
                         private const int out_chan_num,
                         private const int in_height,
                         private const int in_width,
                         private const int out_height,
-                        private const int out_width,
-                        private const int stride_h,
-                        private const int stride_w) {
+                        private const int out_width) {
   int batch = get_global_id(0);
   int out_chan_blk = get_global_id(1);
   int out_pixel_blk = get_global_id(2);
@@ -26,46 +70,54 @@ void kernel conv_2d_3x3(global const float *input,
   const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
   const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
   const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
+#ifdef STRIDE_1
+  const int stride = 1;
+#else
+  const int stride = 2;
+#endif
+  const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
 
   const int in_offset = batch * in_chan_num * in_pixel;
   const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
 
   const int pixels = out_pixel_end - out_pixel_begin;
 
   for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    float *output_ptr = output_base + i * out_pixel;
-    const float *filter_base = filter + i * in_chan_num * 9;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
+    const DATA_TYPE *filter_base = filter + i * in_chan_num * 9;
     if (pixels == 4) {
+#ifdef BIAS
+      VEC_DATA_TYPE(DATA_TYPE, 4) res = (VEC_DATA_TYPE(DATA_TYPE, 4))bias[i];
+#else
+      VEC_DATA_TYPE(DATA_TYPE, 4) res = 0;
+#endif
 
-      float4 res = bias == NULL ? 0 : (float4)bias[i];
-
-      if (stride_w == 1) {
-        for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
-          const float* input_ptr = input_base + in_chan_idx * in_pixel;
-          const float* filter_ptr = filter_base + in_chan_idx * 9;
-          res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-          res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-          res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-        }
-      } else {
-        for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
-          const float* input_ptr = input_base + in_chan_idx * in_pixel;
-          const float* filter_ptr = filter_base + in_chan_idx * 9;
-          res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-          res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-          res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-        }
+      for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+        const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel;
+        const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
+#ifdef STRIDE_1
+        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#else
+        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#endif
       }
       vstore4(res, 0, output_ptr);
     } else {
       for (int p = 0; p < pixels; ++p) {
-        float res = bias == NULL ? 0 : bias[i];
+#ifdef BIAS
+        DATA_TYPE res = bias[i];
+#else
+        DATA_TYPE res = 0;
+#endif
         for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
-          const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
-          const float* filter_ptr = filter_base + in_chan_idx * 9;
+          const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel + p * stride;
+          const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
           res += conv3x3(input_ptr, filter_ptr, in_width);
         }
         output_ptr[p] = res;
diff --git a/mace/kernels/opencl/cl/conv_helper.cl b/mace/kernels/opencl/cl/conv_helper.cl
deleted file mode 100644
index f569ef3a..00000000
--- a/mace/kernels/opencl/cl/conv_helper.cl
+++ /dev/null
@@ -1,41 +0,0 @@
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr) {
-  float4 row0 = vload4(0, input_ptr);
-  float2 input1 = vload2(0, input_ptr+4);
-  float4 row1 = (float4)(row0.s123, input1.s0);
-  float4 row2 = (float4)(row0.s23, input1.s01);
-  float3 filter_values = vload3(0, filter_ptr);
-  return (float4)filter_values.s0 * row0 +
-         (float4)filter_values.s1 * row1 +
-         (float4)filter_values.s2 * row2;
-}
-
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr) {
-  float8 input = vload8(0, input_ptr);
-  float4 row0 = input.even;
-  float4 row1 = input.odd;
-  float4 row2 = (float4)(row0.s123, input_ptr[8]);
-  float3 filter_values = vload3(0, filter_ptr);
-  return (float4)filter_values.s0 * row0 +
-         (float4)filter_values.s1 * row1 +
-         (float4)filter_values.s2 * row2;
-}
-
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width) {
-  float3 input_value = vload3(0, input_ptr);
-  float3 filter_value = vload3(0, filter_ptr);
-  float3 res = input_value * filter_value;
-  input_ptr += row_width;
-  input_value = vload3(0, input_ptr);
-  filter_value = vload3(1, filter_ptr);
-  res += input_value * filter_value;
-  input_ptr += row_width;
-  input_value = vload3(0, input_ptr);
-  filter_value = vload3(2, filter_ptr);
-  res += input_value * filter_value;
-
-  return res.s0 + res.s1 + res.s2;
-}
diff --git a/mace/kernels/opencl/cl/conv_helper.h b/mace/kernels/opencl/cl/conv_helper.h
deleted file mode 100644
index 553af09d..00000000
--- a/mace/kernels/opencl/cl/conv_helper.h
+++ /dev/null
@@ -1,15 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
-#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
-
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width);
-#endif //  MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
index 9f9a6fc4..29dbc340 100644
--- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
@@ -1,17 +1,60 @@
-#include <conv_helper.h>
+#include <common.h>
+
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+
+// Supported data type: half/float
+DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
+                  const DATA_TYPE *filter_ptr,
+                  const int row_width) {
+  VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(1, filter_ptr);
+  res += input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(2, filter_ptr);
+  res += input_value * filter_value;
+
+  return res.s0 + res.s1 + res.s2;
+}
 //TODO merge the depthwise with conv 3x3 to remove duplicate code.
-void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
-                               global const float *filter, /* m, i, kh, kw */
-                               global const float *bias, /* o */
-                               global float *output, /* n, c, h, w */
-                               private const int in_chan_num,
-                               private const int out_chan_num,
-                               private const int in_height,
-                               private const int in_width,
-                               private const int out_height,
-                               private const int out_width,
-                               private const int stride_h,
-                               private const int stride_w) {
+__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */
+                                 __global const DATA_TYPE *filter, /* m, i, kh, kw */
+#ifdef BIAS
+                                 __global const DATA_TYPE *bias, /* o */
+#endif
+                                 __global DATA_TYPE *output, /* n, c, h, w */
+                                 __private const int in_chan_num,
+                                 __private const int out_chan_num,
+                                 __private const int in_height,
+                                 __private const int in_width,
+                                 __private const int out_height,
+                                 __private const int out_width) {
   int batch = get_global_id(0);
   int out_chan_blk = get_global_id(1);
   int out_pixel_blk = get_global_id(2);
@@ -28,38 +71,54 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
   const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
   const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
   const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
+#ifdef STRIDE_1
+  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
+#else
+  const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4;
+#endif
 
   const int in_offset = batch * in_chan_num * in_pixel;
   const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
 
   const int pixels = out_pixel_end - out_pixel_begin;
 
   for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    float bias_value = bias[i];
-    const float *input_ptr = input_base + (i / multiplier) * in_pixel;
-    const float *filter_ptr = filter + i * 9;
-    float *output_ptr = output_base + i * out_pixel;
+    const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel;
+    const DATA_TYPE *filter_ptr = filter + i * 9;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
     if (pixels == 4) {
-      float4 res = (float4)bias[i];
-      if (stride_w == 1) {
-        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-      } else {
-        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-      }
+#ifdef BIAS
+      VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i];
+#else
+      VEC_DATA_TYPE(DATA_TYPE,4) res = 0;
+#endif /* defined(BIAS) */
+
+#ifdef STRIDE_1
+      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#else
+      res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+      res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+      res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#endif
       vstore4(res, 0, output_ptr);
     } else {
       for (int p = 0; p < pixels; ++p) {
-        float res = bias[i];
+#ifdef BIAS
+        DATA_TYPE res = bias[i];
+#else
+        DATA_TYPE res = 0;
+#endif
         res += conv3x3(input_ptr, filter_ptr, in_width);
         output_ptr[p] = res;
-        input_ptr += stride_w;
+#ifdef STRIDE_1
+        input_ptr += 1;
+#else
+        input_ptr += 2;
+#endif
       }
     }
   }
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index 9f9e38d4..bc987ddd 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -1,69 +1,75 @@
-float4 vec_pooling_3_s1(const float *input_ptr, const int in_width) {
-  float4 row00 = vload4(0, input_ptr);
-  float2 row01 = vload2(0, input_ptr + 4);
-  float4 row10 = vload4(0, input_ptr + in_width);
-  float2 row11 = vload2(0, input_ptr + in_width + 4);
-  float4 row20 = vload4(0, input_ptr + in_width * 2);
-  float2 row21 = vload2(0, input_ptr + in_width * 2 + 4);
-
-  float8 data00 = (float8)(row00.s01212323);
-  float4 data01 = (float4)(row01.s0, row00.s3, row01.s01);
-  float8 data10 = (float8)(row10.s01212323);
-  float4 data11 = (float4)(row11.s0, row10.s3, row11.s01);
-  float8 data20 = (float8)(row20.s01212323);
-  float4 data21 = (float4)(row21.s0, row20.s3, row21.s01);
-
-  float8 left = fmax(fmax(data00, data10), data20);
-  float4 right = fmax(fmax(data01, data11), data21);
-
-  float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
-  res = fmax(res, (float4)(left.s25, right.s03));
+#include <common.h>
+
+VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
+  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
+  VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
+  VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
+
+  VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
+  VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
+  VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
+  VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
+  VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
+  VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
+
+  VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
+  VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
+
+  VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
+                                        (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
+  res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
 
   return res;
 }
-float4 vec_pooling_3_s2(const float *input_ptr, const int in_width) {
-  float8 row00 = vload8(0, input_ptr);
-  float row01 = *(input_ptr + 8);
-  float8 row10 = vload8(0, input_ptr + in_width);
-  float row11 = *(input_ptr + in_width + 8);
-  float8 row20 = vload8(0, input_ptr + in_width * 2);
-  float row21 = *(input_ptr + in_width * 2 + 8);
-
-  float8 data00 = (float8)(row00.s01223445);
-  float4 data01 = (float4)(row00.s667, row01);
-  float8 data10 = (float8)(row10.s01223445);
-  float4 data11 = (float4)(row10.s667, row11);
-  float8 data20 = (float8)(row20.s01223445);
-  float4 data21 = (float4)(row20.s667, row21);
-
-  float8 left = fmax(fmax(data00, data10), data20);
-  float4 right = fmax(fmax(data01, data11), data21);
-
-  float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
-  res = fmax(res, (float4)(left.s25, right.s03));
+
+VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
+  VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
+  DATA_TYPE row01 = *(input_ptr + 8);
+  VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
+  DATA_TYPE row11 = *(input_ptr + in_width + 8);
+  VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
+  DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
+
+  VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
+  VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
+  VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
+  VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
+  VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
+  VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
+
+  VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
+  VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
+
+  VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
+                                        (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
+  res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
 
   return res;
 }
 
-float inner_pooling_3(const float *input_ptr, const int in_width) {
-  float3 row0 = vload3(0, input_ptr);
-  float3 row1 = vload3(0, input_ptr + in_width);
-  float3 row2 = vload3(0, input_ptr + in_width * 2);
+DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
+  VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
+  VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
 
-  float3 data = fmax(fmax(row0, row1), row2);
+  VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
 
-  float res = fmax(fmax(data.s0, data.s1), data.s2);
+  DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
   return res;
 }
 
-__kernel void pooling3(__global const float *input, /* n, c, h, w */
+// Supported data type: half/float
+__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
                        __private const int in_height,
                        __private const int in_width,
                        __private const int out_chan_num,
                        __private const int out_height,
                        __private const int out_width,
                        __private const int stride,
-                       __global float *output) {
+                       __global DATA_TYPE *output) {
   int batch = get_global_id(0);
   int out_chan_blk = get_global_id(1);
   int out_pixel_blk = get_global_id(2);
@@ -83,21 +89,21 @@ __kernel void pooling3(__global const float *input, /* n, c, h, w */
 
   const int in_offset = batch * out_chan_num * in_pixel;
   const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
 
   const int pixels = out_pixel_end - out_pixel_begin;
 
   for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    const float *input_ptr = input_base + i * in_pixel;
-    float *output_ptr = output_base + i * out_pixel;
+    const DATA_TYPE *input_ptr = input_base + i * in_pixel;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
     if (pixels == 4) {
-      float4 res;
-      if (stride == 1) {
-        res = vec_pooling_3_s1(input_ptr, in_width);
-      } else {
-        res = vec_pooling_3_s2(input_ptr, in_width);
-      }
+      VEC_DATA_TYPE(DATA_TYPE,4) res;
+#ifdef STRIDE_1
+      res = vec_pooling_3_s1(input_ptr, in_width);
+#else
+      res = vec_pooling_3_s2(input_ptr, in_width);
+#endif
       vstore4(res, 0, output_ptr);
     } else {
       for (int p = 0; p < pixels; ++p) {
@@ -122,7 +128,8 @@ int calculate_avg_block_size(const int pos_h,
   return (h_end - h_start) * (w_end - w_start);
 }
 
-__kernel void poolingn(__global const float *input, /* n, c, h, w */
+// Supported data type: half/float
+__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */
                        __private const int in_height,
                        __private const int in_width,
                        __private const int out_chan_num,
@@ -132,7 +139,7 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
                        __private const int pad_h,
                        __private const int pad_w,
                        __private const int pooling_size,
-                       __global float *output) {
+                       __global DATA_TYPE *output) {
   int batch = get_global_id(0);
   int out_chan_idx = get_global_id(1);
   int out_pixel_idx = get_global_id(2);
@@ -150,8 +157,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
 
   const int in_offset = batch * out_chan_num * in_pixel;
   const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_idx;
-  float *output_base = output + out_offset + out_pixel_idx;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_idx;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_idx;
 
   const int block_size = calculate_avg_block_size(
                             out_pixel_height * stride,
@@ -162,14 +169,14 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
                             in_height - pad_h,
                             in_width - pad_w);
   for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    float8 sum8 = 0.0f;
-    float sum1 = 0.0f;
-    float *output_ptr = output_base + i * out_pixel;
+    VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f;
+    DATA_TYPE sum1 = 0.0f;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
     for (int y = 0; y < pooling_size; ++y) {
-      const float *input_ptr = input_base + i * in_pixel + y * in_width;
+      const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width;
       int x = 0;
       for (; x < (pooling_size-8); x += 8) {
-        float8 data = vload8(0, input_ptr);
+        VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr);
         sum8 += data;
         input_ptr += 8;
       }
@@ -178,8 +185,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
         input_ptr++;
       }
     }
-    float4 sum4 = sum8.s0123 + sum8.s4567;
-    float2 sum2 = sum4.s01 + sum4.s23;
+    VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567;
+    VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23;
 
     *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
   }
diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl
index 390c8454..33fe65d0 100644
--- a/mace/kernels/opencl/cl/relu.cl
+++ b/mace/kernels/opencl/cl/relu.cl
@@ -1,6 +1,9 @@
-__kernel void relu(__global const float *input,
+#include <common.h>
+
+// Supported data type: half/float
+__kernel void relu(__global const DATA_TYPE *input,
                    __private const int size,
-                   __global float *output) {
+                   __global DATA_TYPE *output) {
   int idx = get_global_id(0);
 
   if (idx + 4 > size) {
@@ -8,16 +11,16 @@ __kernel void relu(__global const float *input,
       *(output+idx) = fmax(*(input+idx), 0);
     }
   } else {
-    float4 data = vload4(idx, input);
-    data = fmax(data, (float4)0);
+    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
+    data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
     vstore4(data, idx, output);
   }
 }
 
-__kernel void relux(__global const float *input,
-                    __private const float max_limit,
+__kernel void relux(__global const DATA_TYPE *input,
+                    __private const DATA_TYPE max_limit,
                     __private const int size,
-                    __global float *output) {
+                    __global DATA_TYPE *output) {
   int idx = get_global_id(0);
 
   if (idx + 4 > size) {
@@ -25,8 +28,8 @@ __kernel void relux(__global const float *input,
       *(output+idx) = clamp(*(input+idx), 0.0f, max_limit);
     }
   } else {
-    float4 data = vload4(idx, input);
-    data = clamp(data, (float4)0, (float4)max_limit);
+    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
+    data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
     vstore4(data, idx, output);
   }
 }
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index b1e987ec..f34e63cb 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -1,5 +1,8 @@
-__kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h, w */
-                                      __global float *output /* n * c, h, w */,
+#include <common.h>
+
+// Supported data type: half/float
+__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */
+                                      __global DATA_TYPE *output /* n * c, h, w */,
                                       __private const float height_scale,
                                       __private const float width_scale,
                                       __private const int in_height,
@@ -21,16 +24,16 @@ __kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h,
   const float h_lerp = h_in - h_lower;
   const float w_lerp = w_in - w_lower;
 
-  const float *input_base = input + c * in_height * in_width;
-  float *output_base = output + c * height * width;
+  const DATA_TYPE *input_base = input + c * in_height * in_width;
+  DATA_TYPE *output_base = output + c * height * width;
 
-  float top_left = input_base[h_lower * in_width + w_lower];
-  float top_right = input_base[h_lower * in_width + w_upper];
-  float bottom_left = input_base[h_upper * in_width + w_lower];
-  float bottom_right = input_base[h_upper * in_width + w_upper];
+  DATA_TYPE top_left = input_base[h_lower * in_width + w_lower];
+  DATA_TYPE top_right = input_base[h_lower * in_width + w_upper];
+  DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower];
+  DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper];
 
-  const float top = top_left + (top_right - top_left) * w_lerp;
-  const float bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
+  const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp;
+  const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
   output_base[h * width + w] = top + (bottom - top) * h_lerp;
 }
 
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 921a3bf8..5d098d86 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,14 +1,17 @@
-void kernel space_to_batch(global float *space_data_ptr,
-                           global const int *block_shape_ptr,
-                           global const int *paddings_ptr,
-                           private const int space_batch,
-                           private const int space_channel,
-                           private const int space_height,
-                           private const int space_width,
-                           private const int batch_height,
-                           private const int batch_width,
-                           private const int b2s,
-                           global float* batch_data_ptr) {
+#include <common.h>
+
+// Supported data type: all
+__kernel void space_to_batch(__global DATA_TYPE *space_data_ptr,
+                             __global const int *block_shape_ptr,
+                             __global const int *paddings_ptr,
+                             __private const int space_batch,
+                             __private const int space_channel,
+                             __private const int space_height,
+                             __private const int space_width,
+                             __private const int batch_height,
+                             __private const int batch_width,
+                             __private const int b2s,
+                             __global DATA_TYPE* batch_data_ptr) {
   int batch_idx = get_global_id(0);
   int batch_channel_idx = get_global_id(1);
   int batch_pixel_idx = get_global_id(2);
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 0c043b8c..1d89519e 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -61,16 +61,19 @@ void Conv1x1V2(const Tensor *input,
   // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
   // TODO check wired clReleaseCommandQueue latency
   // The KernelFunctor can cause segment faults in cb_retain_event
-  auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options);
+
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   uint32_t idx = 0;
   conv_2d_kernel.setArg(idx++,
                         *(static_cast<const cl::Buffer *>(input->buffer())));
   conv_2d_kernel.setArg(idx++,
                         *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias == NULL) {
-    conv_2d_kernel.setArg(idx++, NULL);
-  } else {
+  if (bias != nullptr) {
     conv_2d_kernel.setArg(idx++,
                           *(static_cast<const cl::Buffer *>(bias->buffer())));
   }
@@ -81,7 +84,6 @@ void Conv1x1V2(const Tensor *input,
   conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
   conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-  conv_2d_kernel.setArg(idx++, stride);
 
   auto command_queue = runtime->command_queue();
   cl_int error = command_queue.enqueueNDRangeKernel(
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index a0224484..452f46fd 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,14 +22,17 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");
+
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
 
   uint32_t idx = 0;
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias == nullptr) {
-    conv_kernel.setArg(idx++, NULL);
-  } else {
+  if (bias != nullptr) {
     conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
   }
   conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
@@ -39,8 +42,6 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
   conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
   conv_kernel.setArg(idx++, static_cast<int32_t>(height));
   conv_kernel.setArg(idx++, static_cast<int32_t>(width));
-  conv_kernel.setArg(idx++, stride);
-  conv_kernel.setArg(idx++, stride);
   const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                            static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(pixel_blocks)};
diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
index c37fe77f..84b73071 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -30,13 +30,18 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
   const index_t pixel_blocks = (width + 3) / 4 * height;
 
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_kernel  = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
 
   uint32_t idx = 0;
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  if (bias != nullptr) {
+    conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  }
   conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
   conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
   conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
@@ -44,8 +49,6 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
   conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
   conv_kernel.setArg(idx++, static_cast<int32_t>(height));
   conv_kernel.setArg(idx++, static_cast<int32_t>(width));
-  conv_kernel.setArg(idx++, stride);
-  conv_kernel.setArg(idx++, stride);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                            static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 8daa78f6..f3fb6812 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -30,24 +30,26 @@ static void Pooling3(const Tensor *input,
   };
 
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
+  auto pooling_kernel  = runtime->BuildKernel("pooling", "pooling3", built_options);
 
-  auto max_pooling_kernel = cl::Kernel(program, "pooling3");
 
   const uint32_t lws[3] = {1, 8, 128};
 
   uint32_t idx = 0;
-  max_pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
-  max_pooling_kernel.setArg(idx++, stride[0]);
-  max_pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
+  pooling_kernel.setArg(idx++, stride[0]);
+  pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
 
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      max_pooling_kernel, cl::NullRange,
+      pooling_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
       cl::NDRange(lws[0], lws[1], lws[2]));
   MACE_CHECK(error == CL_SUCCESS);
@@ -75,9 +77,9 @@ static void PoolingN(const Tensor *input,
   };
 
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-
-  auto pooling_kernel = cl::Kernel(program, "poolingn");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto pooling_kernel  = runtime->BuildKernel("pooling", "poolingn", built_options);
 
   const uint32_t lws[3] = {1, 8, 128};
 
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index c4d22ae8..ed562d23 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -21,9 +21,10 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
 
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
   if (max_limit_ < 0) {
-    auto relu_kernel = cl::Kernel(program, "relu");
-
+    auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
     const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
 
     uint32_t idx = 0;
@@ -37,7 +38,7 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
         cl::NDRange(lws));
     MACE_CHECK(error == CL_SUCCESS);
   } else {
-    auto relu_kernel = cl::Kernel(program, "relux");
+    auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
 
     const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
 
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 50d717ef..bf603d94 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -29,9 +29,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
   float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
 
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto rb_kernel  = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
 
-  auto rb_kernel = cl::Kernel(program, "resize_bilinear_nocache");
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
   uint32_t idx = 0;
   rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc
index 84601492..a4ec2694 100644
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -18,9 +18,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
                                                                 const Tensor *paddings_tensor,
                                                                 Tensor *batch_tensor) {
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto s2b_kernel = cl::Kernel(program, "space_to_batch");
-
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
 
   uint32_t idx = 0;
   s2b_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(space_tensor->buffer())));
diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto
index d70318c3..8e680821 100644
--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -24,6 +24,7 @@ enum DataType {
   DT_UINT16 = 9;
   DT_BOOL = 10;
   DT_HALF = 19;
+  DT_UINT32 = 22;
 }
 
 message TensorProto {
-- 
GitLab