Refactor the opencl kernel build logic.

f07dd516 · liuqi · 77ea99f5 · f07dd516 · f07dd516 · f07dd516
27 changed file
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -7,8 +7,6 @@
 #include <memory>
 #include <mutex>
-#include <dirent.h>
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
@@ -32,55 +30,6 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
  return true;
 }
-bool BuildProgram(OpenCLRuntime *runtime,
-                  const std::string &path,
-                  cl::Program *program) {
-  MACE_CHECK_NOTNULL(program);
-  auto closer = [](DIR *d) {
-    if (d != nullptr) closedir(d);
-  };
-  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
-  MACE_CHECK_NOTNULL(dir.get());
-  const std::string kSourceSuffix = ".cl";
-  cl::Program::Sources sources;
-  errno = 0;
-  dirent *entry = readdir(dir.get());
-  MACE_CHECK(errno == 0);
-  while (entry != nullptr) {
-    if (entry->d_type == DT_REG) {
-      std::string d_name(entry->d_name);
-      if (d_name.size() > kSourceSuffix.size() &&
-          d_name.compare(d_name.size() - kSourceSuffix.size(),
-                         kSourceSuffix.size(), kSourceSuffix) == 0) {
-        std::string filename = path + d_name;
-        std::string kernel_source;
-        MACE_CHECK(ReadSourceFile(filename, &kernel_source));
-        sources.push_back({kernel_source.c_str(), kernel_source.length()});
-      }
-    }
-    entry = readdir(dir.get());
-    MACE_CHECK(errno == 0);
-  };
-  *program = cl::Program(runtime->context(), sources);
-  std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
-  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
-  cl_int ret = program->build({runtime->device()}, build_options.c_str());
-  if (ret != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
-        CL_BUILD_ERROR) {
-      std::string build_log =
-          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
-      LOG(INFO) << "Program build log: " << build_log;
-    }
-    LOG(FATAL) << "Build program failed: " << ret;
-  }
-  return true;
-}
 }  // namespace
@@ -156,58 +105,57 @@ cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
 cl::Program &OpenCLRuntime::program() {
  // TODO(heliangliang) Support binary format
-  static const char *kernel_path = getenv("MACE_KERNEL_PATH");
-  std::string path(kernel_path == nullptr ? "" : kernel_path);
-  std::call_once(build_flag_, [this, &path]() {
-    MACE_CHECK(BuildProgram(this, path, &program_));
-  });
  return program_;
 }
+const std::map<std::string, std::string>
-const std::unodered_map<std::string, std::string>
+    OpenCLRuntime::program_map_ = {
-    OpenCLRuntime::kernel_program_map_ = {
+  {"addn", "addn.cl"},
-  {"BatchNorm", "batch_norm.cl"}
+  {"batch_norm", "batch_norm.cl"},
+  {"conv_2d_1x1", "conv_2d_1x1.cl"},
+  {"conv_2d_3x3", "conv_2d_3x3.cl"},
+  {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
+  {"pooling", "pooling.cl"},
+  {"relu", "relu.cl"},
+  {"resize_bilinear", "resize_bilinear.cl"},
+  {"space_to_batch", "space_to_batch.cl"},
 };
-bool OpenCLRuntime::BuildProgram(const std::string &kernel_name,
+void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
                                 const std::string &build_options,
                                 cl::Program *program) {
  MACE_CHECK_NOTNULL(program);
  cl::Program::Sources sources;
-  std::string filename = kernel_path_ + kernel_name;
+  std::string filename = kernel_path_ + program_file_name;
  std::string kernel_source;
  MACE_CHECK(ReadSourceFile(filename, &kernel_source));
  sources.push_back({kernel_source.c_str(), kernel_source.length()});
  *program = cl::Program(this->context(), sources);
-  build_options += " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
+  std::string build_options_str = build_options +
+      " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + kernel_path_;
  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
-  cl_int ret = program->build({runtime->device()}, build_options.c_str());
+  cl_int ret = program->build({device()}, build_options_str.c_str());
  if (ret != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
        CL_BUILD_ERROR) {
      std::string build_log =
-          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
      LOG(INFO) << "Program build log: " << build_log;
    }
    LOG(FATAL) << "Build program failed: " << ret;
  }
-  return true;
 }
-cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name,
+cl::Kernel OpenCLRuntime::BuildKernel(const std::string &program_name,
-                        const std::set<std::string> &build_options) {
+                                      const std::string &kernel_name,
-  auto kernel_program_it = kernel_program_map_.find(kernel_name);
+                                      const std::set<std::string> &build_options) {
-  if (kernel_program_it == kernel_program_map_.end()) {
+  auto kernel_program_it = program_map_.find(program_name);
-    MACE_CHECK(false, kernel_name, " opencl kernel doesn't exist.");
+  if (kernel_program_it == program_map_.end()) {
+    MACE_CHECK(false, program_name, " opencl kernel doesn't exist.");
  }
-  std::string program_name = kernel_program_it->second;
+  std::string program_file_name = kernel_program_it->second;
  std::string build_options_str;
  for(auto &option : build_options) {
    build_options_str += " " + option;
@@ -219,10 +167,10 @@ cl::Kernel OpenCLRuntime::BuildKernel(const std::string &kernel_name,
  if (built_program_it != built_program_map_.end()) {
    program = built_program_it->second;
  } else {
-    this->BuildProgram(kernel_name, build_options_str, &program);
+    this->BuildProgram(program_file_name, build_options_str, &program);
-    built_program_map_.emplace(built_program_key, std::move(program));
+    built_program_map_.emplace(built_program_key, program);
  }
-  return cl::Kernel(kernel_name, program);
+  return cl::Kernel(program, kernel_name.c_str());
 }
 uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {

--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -7,7 +7,7 @@
 #include <map>
 #include <mutex>
-#include <unordered_map>
+#include <set>
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -25,7 +25,8 @@ class OpenCLRuntime {
  uint32_t GetDeviceMaxWorkGroupSize();
  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
-  cl::Kernel BuildKernel(const std::string &kernel_name,
+  cl::Kernel BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);
 private:
  OpenCLRuntime(cl::Context context,
@@ -35,7 +36,7 @@ class OpenCLRuntime {
  OpenCLRuntime(const OpenCLRuntime&) = delete;
  OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
-  bool BuildProgram(const std::string &kernel_name,
+  void BuildProgram(const std::string &kernel_name,
                    const std::string &build_options,
                    cl::Program *program);
@@ -44,11 +45,10 @@ class OpenCLRuntime {
  cl::Device device_;
  cl::CommandQueue command_queue_;
  cl::Program program_;
-  std::once_flag build_flag_;
  std::string kernel_path_;
-  static const std::unordered_map<std::string,
+  static const std::map<std::string,
-               std::string> kernel_program_map_;
+               std::string> program_map_;
-  mutable std::unordered_map<std::string,
+  mutable std::map<std::string,
          cl::Program> built_program_map_;
 };

--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -12,6 +12,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
    case DT_DOUBLE:
    case DT_INT32:
    case DT_INT64:
+    case DT_UINT32:
    case DT_UINT16:
    case DT_UINT8:
    case DT_INT16:
@@ -23,4 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
  }
 }
+std::string DataTypeToCLType(const DataType dt) {
+  switch (dt) {
+    case DT_FLOAT:
+      return "float";
+    case DT_HALF:
+      return "half";
+    case DT_UINT8:
+      return "uchar";
+    case DT_INT8:
+      return "char";
+    case DT_DOUBLE:
+      return "double";
+    case DT_INT32:
+      return "int";
+    case DT_UINT32:
+      return "int";
+    case DT_UINT16:
+      return "ushort";
+    case DT_INT16:
+      return "short";
+    case DT_INT64:
+      return "long";
+    default:
+      LOG(FATAL) << "Unsupported data type";
+      return "";
+  }
+}
 }  //  namespace mace
\ No newline at end of file
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -12,6 +12,8 @@ namespace mace {
 bool DataTypeCanUseMemcpy(DataType dt);
+std::string DataTypeToCLType(const DataType dt);
 template <class T>
 struct IsValidDataType;
@@ -50,6 +52,7 @@ MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
 MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
 MATCH_TYPE_AND_ENUM(string, DT_STRING);
 MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
+MATCH_TYPE_AND_ENUM(uint32_t, DT_UINT32);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
 static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -76,9 +76,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *epsilon,
    Tensor *output);
-template <typename T>
+template <>
-struct BatchNormFunctor<DeviceType::OPENCL, T> {
+void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
-  void operator()(
      const Tensor *input,
      const Tensor *scale,
      const Tensor *offset,
@@ -86,7 +85,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> {
      const Tensor *var,
      const Tensor *epsilon,
      Tensor *output);
-};
 }  //  namepsace kernels
 }  //  namespace mace

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -15,9 +15,9 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
  const uint32_t gws = blocks;
  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
-  auto addn_kernel = cl::Kernel(program, "add2");
+  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);
  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -10,8 +10,8 @@
 namespace mace {
 namespace kernels {
-template <typename T>
+template <>
-void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
+void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
    const Tensor *input,
    const Tensor *scale,
    const Tensor *offset,
@@ -29,8 +29,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
-  built_options.emplace("-DDataType=" + GetDataTypeFromEnum(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  auto bm_kernel = runtime->CreateKernel("batch_norm");
+  auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
  const std::vector<uint32_t> lws = {1, 1, kwg_size};
@@ -63,7 +63,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
        cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(params[0], params[1], params[2]));
-    MACE_CHECK(error == CL_SUCCESS);
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    return error;
  };
  std::stringstream ss;

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
-__kernel void add2(__global const float *input0,
+#include <common.h>
-                   __global const float *input1,
+// Supported data type: half/float
+__kernel void add2(__global const DATA_TYPE *input0,
+                   __global const DATA_TYPE *input1,
                   __private const int size,
-                   __global float *output) {
+                   __global DATA_TYPE *output) {
  int idx = get_global_id(0);
  if (idx + 4 > size) {
@@ -9,8 +12,8 @@ __kernel void add2(__global const float *input0,
      *(output+idx) = *(input0+idx) + *(input1+idx);
    }
  } else {
-    float4 in_data0 = vload4(idx, input0);
+    VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
-    float4 in_data1 = vload4(idx, input1);
+    VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
    vstore4(in_data0+in_data1, idx, output);
  }
 }

--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
-void kernel batch_norm(global const float *input,
+#include <common.h>
-                       global const float *scale,
+// Supported data types: half/float
-                       global const float *offset,
+void kernel batch_norm(global const DATA_TYPE *input,
-                       global const float *mean,
+                       global const DATA_TYPE *scale,
-                       global const float *var,
+                       global const DATA_TYPE *offset,
-                       global const float *epsilon,
+                       global const DATA_TYPE *mean,
+                       global const DATA_TYPE *var,
+                       global const DATA_TYPE *epsilon,
                       private const int pixels,
-                       global float *output,
+                       global DATA_TYPE *output,
-                       __local float4 *new_scale,
+                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale,
-                       __local float4 *new_offset) {
+                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) {
  const int batch = get_global_id(0);
  const int channel = get_global_id(1);
  const int channels = get_global_size(1);
@@ -23,8 +25,8 @@ void kernel batch_norm(global const float *input,
  barrier(CLK_LOCAL_MEM_FENCE);
  const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4;
-  const float *input_ptr = input + image_offset;
+  const DATA_TYPE *input_ptr = input + image_offset;
-  float *output_ptr = output + image_offset;
+  DATA_TYPE *output_ptr = output + image_offset;
  const int end = (batch * channels + channel + 1) * pixels;
  if ((image_offset+4) > end) {
    for (int i = image_offset; i < end; ++i) {
@@ -33,7 +35,7 @@ void kernel batch_norm(global const float *input,
      ++output_ptr;
    }
  } else {
-    float4 values = vload4(0, input_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr);
    values = values * new_scale[local_channel] + new_offset[local_channel];
    vstore4(values, 0, output_ptr);
  }

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -8,4 +8,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
+#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -25,31 +25,31 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
 }
 #define vec_conv_2d_1x1_s1                    \
-  float4 in0 = vload4(0, input_ptr);                   \
+  VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr);                   \
-  float4 in1 = vload4(0, input_ptr + in_pixel);        \
+  VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel);        \
-  float4 in2 = vload4(0, input_ptr + 2 * in_pixel);    \
+  VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel);    \
-  float4 in3 = vload4(0, input_ptr + 3 * in_pixel);
+  VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
 #define vec_conv_2d_1x1_s2                    \
-  float4 in00 = vload4(0, input_ptr);                   \
+  VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr);                   \
-  float3 in01 = vload3(0, input_ptr + 4);               \
+  VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4);               \
-  float4 in10 = vload4(0, input_ptr + in_pixel);        \
+  VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel);        \
-  float3 in11 = vload3(0, input_ptr + in_pixel + 4);    \
+  VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4);    \
-  float4 in20 = vload4(0, input_ptr + 2 * in_pixel);    \
+  VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel);    \
-  float3 in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
+  VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
-  float4 in30 = vload4(0, input_ptr + 3 * in_pixel);    \
+  VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel);    \
-  float3 in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
+  VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
-  float4 in0 = (float4)(in00.s02, in01.s02);            \
+  VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02);            \
-  float4 in1 = (float4)(in10.s02, in11.s02);            \
+  VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02);            \
-  float4 in2 = (float4)(in20.s02, in21.s02);            \
+  VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02);            \
-  float4 in3 = (float4)(in30.s02, in31.s02);
+  VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
 #define vec_conv_2d_1x1_compute_loop  \
  for (int oc = 0; oc < 4; ++oc) {                             \
-    float4 weights = vload4(0, filter_ptr + oc * in_chan_num); \
+    VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
-    float4 out = vload4(0, output_ptr + oc * out_pixel);       \
+    VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel);       \
    out += in0 * weights.x;                                    \
    out += in1 * weights.y;                                     \
    out += in2 * weights.z;                                     \
@@ -58,25 +58,27 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
  }
 #define vec_conv_2d_1x1_compute  \
-    float4 weights = vload4(0, filter_ptr); \
+    VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
-    float4 out = vload4(0, output_ptr);       \
+    VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr);       \
    out += in0 * weights.x;                                    \
    out += in1 * weights.y;                                     \
    out += in2 * weights.z;                                     \
    out += in3 * weights.w;                                     \
    vstore4(out, 0, output_ptr);
-__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
+// Supported data type: half/float
-                             __global const float *filter, /* o, i, kh, kw */
+__kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
-                             __global const float *bias, /* o */
+                             __global const DATA_TYPE *filter, /* o, i, kh, kw */
-                             __global float *output, /* n, c, h, w */
+#ifdef BIAS
+                             __global const DATA_TYPE *bias, /* o */
+#endif /* defined(BIAS) */
+                             __global DATA_TYPE *output, /* n, c, h, w */
                             __private const int in_chan_num,
                             __private const int out_chan_num,
                             __private const int in_height,
                             __private const int in_width,
                             __private const int out_height,
-                             __private const int out_width,
+                             __private const int out_width) {
-                             __private const int stride) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);
@@ -92,20 +94,30 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
+#ifdef STRIDE_1
+  const int stride = 1;
+#else
+  const int stride = 2;
+#endif
  const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
  const int in_offset = batch * in_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
  int out_chan_len = out_chan_end - out_chan_begin;
  int pixel_len = out_pixel_end - out_pixel_begin;
  for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
-    float *output_ptr = output_base + out_chan * out_pixel;
+    DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-    float bias_value = bias == NULL ? 0 : bias[out_chan];
+#ifdef BIAS
+    DATA_TYPE bias_value = bias[out_chan];
+#else
+    DATA_TYPE bias_value = 0;
+#endif
    for (int p = 0; p < pixel_len; ++p) {
      output_ptr[p] = bias_value;
    }
@@ -113,48 +125,37 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
  int in_chan = 0;
  if (pixel_len == 4) {
-    if (stride == 1) {
+    for (; in_chan + 3 < in_chan_num; in_chan += 4) {
-      for (; in_chan + 3 < in_chan_num; in_chan += 4) {
+      const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
-        const float *input_ptr = input_base + in_chan * in_pixel;
+      int out_chan = out_chan_begin;
-        int out_chan = out_chan_begin;
+      for (; out_chan + 3 < out_chan_end; out_chan += 4) {
-        for (; out_chan + 3 < out_chan_end; out_chan += 4) {
+        const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
+        DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-          float *output_ptr = output_base + out_chan * out_pixel;
+#ifdef STRIDE_1
-          vec_conv_2d_1x1_s1;
+        vec_conv_2d_1x1_s1;
-          vec_conv_2d_1x1_compute_loop;
+#else
-        }
+        vec_conv_2d_1x1_s2;
-        for (; out_chan < out_chan_end; ++out_chan) {
+#endif
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
+        vec_conv_2d_1x1_compute_loop;
-          float *output_ptr = output_base + out_chan * out_pixel;
-          vec_conv_2d_1x1_s1;
-          vec_conv_2d_1x1_compute;
-        }
      }
-    } else if (stride == 2) {
+      for (; out_chan < out_chan_end; ++out_chan) {
-      for (; in_chan + 3 < in_chan_num; in_chan += 4) {
+        const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-        const float *input_ptr = input_base + in_chan * in_pixel;
+        DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
-        int out_chan = out_chan_begin;
+#ifdef STRIDE_1
-        for (; out_chan + 3 < out_chan_end; out_chan += 4) {
+        vec_conv_2d_1x1_s1;
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
+#else
-          float *output_ptr = output_base + out_chan * out_pixel;
+        vec_conv_2d_1x1_s2;
-          vec_conv_2d_1x1_s2;
+#endif
-          vec_conv_2d_1x1_compute_loop;
+        vec_conv_2d_1x1_compute;
-        }
-        for (; out_chan < out_chan_end; ++out_chan) {
-          const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
-          float *output_ptr = output_base + out_chan * out_pixel;
-          vec_conv_2d_1x1_s2;
-          vec_conv_2d_1x1_compute;
-        }
      }
    }
  }
  for (; in_chan < in_chan_num; ++in_chan) {
-    const float *input_ptr = input_base + in_chan * in_pixel;
+    const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
    for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
-      float weights = filter[out_chan * in_chan_num + in_chan];
+      DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
-      float *output_ptr = output_base + out_chan * out_pixel;
+      DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
      for (int p = 0; p < pixel_len; ++p) {
        float in = input_ptr[p*stride];

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
-#include <conv_helper.h>
+#include <common.h>
-void kernel conv_2d_3x3(global const float *input,
-                        global const float *filter,
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
-                        global const float *bias,
+                                      const DATA_TYPE *filter_ptr) {
-                        global float *output,
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+// Supported data type: half/float
+DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
+                  const DATA_TYPE *filter_ptr,
+                  const int row_width) {
+  VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(1, filter_ptr);
+  res += input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(2, filter_ptr);
+  res += input_value * filter_value;
+  return res.s0 + res.s1 + res.s2;
+}
+void kernel conv_2d_3x3(global const DATA_TYPE *input,
+                        global const DATA_TYPE *filter,
+#ifdef BIAS
+                        global const DATA_TYPE *bias,
+#endif
+                        global DATA_TYPE *output,
                        private const int in_chan_num,
                        private const int out_chan_num,
                        private const int in_height,
                        private const int in_width,
                        private const int out_height,
-                        private const int out_width,
+                        private const int out_width) {
-                        private const int stride_h,
-                        private const int stride_w) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);
@@ -26,46 +70,54 @@ void kernel conv_2d_3x3(global const float *input,
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
+#ifdef STRIDE_1
+  const int stride = 1;
+#else
+  const int stride = 2;
+#endif
+  const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
  const int in_offset = batch * in_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
  const int pixels = out_pixel_end - out_pixel_begin;
  for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    float *output_ptr = output_base + i * out_pixel;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
-    const float *filter_base = filter + i * in_chan_num * 9;
+    const DATA_TYPE *filter_base = filter + i * in_chan_num * 9;
    if (pixels == 4) {
+#ifdef BIAS
+      VEC_DATA_TYPE(DATA_TYPE, 4) res = (VEC_DATA_TYPE(DATA_TYPE, 4))bias[i];
+#else
+      VEC_DATA_TYPE(DATA_TYPE, 4) res = 0;
+#endif
-      float4 res = bias == NULL ? 0 : (float4)bias[i];
+      for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+        const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel;
-      if (stride_w == 1) {
+        const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
-        for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+#ifdef STRIDE_1
-          const float* input_ptr = input_base + in_chan_idx * in_pixel;
+        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-          const float* filter_ptr = filter_base + in_chan_idx * 9;
+        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-          res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-          res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+#else
-          res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-        }
+        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-      } else {
+        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-        for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+#endif
-          const float* input_ptr = input_base + in_chan_idx * in_pixel;
-          const float* filter_ptr = filter_base + in_chan_idx * 9;
-          res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-          res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-          res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
-        }
      }
      vstore4(res, 0, output_ptr);
    } else {
      for (int p = 0; p < pixels; ++p) {
-        float res = bias == NULL ? 0 : bias[i];
+#ifdef BIAS
+        DATA_TYPE res = bias[i];
+#else
+        DATA_TYPE res = 0;
+#endif
        for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
-          const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
+          const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel + p * stride;
-          const float* filter_ptr = filter_base + in_chan_idx * 9;
+          const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
          res += conv3x3(input_ptr, filter_ptr, in_width);
        }
        output_ptr[p] = res;

--- a/mace/kernels/opencl/cl/conv_helper.cl
+++ b/mace/kernels/opencl/cl/conv_helper.cl
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr) {
-  float4 row0 = vload4(0, input_ptr);
-  float2 input1 = vload2(0, input_ptr+4);
-  float4 row1 = (float4)(row0.s123, input1.s0);
-  float4 row2 = (float4)(row0.s23, input1.s01);
-  float3 filter_values = vload3(0, filter_ptr);
-  return (float4)filter_values.s0 * row0 +
-         (float4)filter_values.s1 * row1 +
-         (float4)filter_values.s2 * row2;
-}
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr) {
-  float8 input = vload8(0, input_ptr);
-  float4 row0 = input.even;
-  float4 row1 = input.odd;
-  float4 row2 = (float4)(row0.s123, input_ptr[8]);
-  float3 filter_values = vload3(0, filter_ptr);
-  return (float4)filter_values.s0 * row0 +
-         (float4)filter_values.s1 * row1 +
-         (float4)filter_values.s2 * row2;
-}
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width) {
-  float3 input_value = vload3(0, input_ptr);
-  float3 filter_value = vload3(0, filter_ptr);
-  float3 res = input_value * filter_value;
-  input_ptr += row_width;
-  input_value = vload3(0, input_ptr);
-  filter_value = vload3(1, filter_ptr);
-  res += input_value * filter_value;
-  input_ptr += row_width;
-  input_value = vload3(0, input_ptr);
-  filter_value = vload3(2, filter_ptr);
-  res += input_value * filter_value;
-  return res.s0 + res.s1 + res.s2;
-}
--- a/mace/kernels/opencl/cl/conv_helper.h
+++ b/mace/kernels/opencl/cl/conv_helper.h
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
-#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width);
-#endif //  MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
--- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
-#include <conv_helper.h>
+#include <common.h>
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
+                                      const DATA_TYPE *filter_ptr) {
+  VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
+  VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
+  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
+  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
+         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
+}
+// Supported data type: half/float
+DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
+                  const DATA_TYPE *filter_ptr,
+                  const int row_width) {
+  VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(1, filter_ptr);
+  res += input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(2, filter_ptr);
+  res += input_value * filter_value;
+  return res.s0 + res.s1 + res.s2;
+}
 //TODO merge the depthwise with conv 3x3 to remove duplicate code.
-void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
+__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */
-                               global const float *filter, /* m, i, kh, kw */
+                                 __global const DATA_TYPE *filter, /* m, i, kh, kw */
-                               global const float *bias, /* o */
+#ifdef BIAS
-                               global float *output, /* n, c, h, w */
+                                 __global const DATA_TYPE *bias, /* o */
-                               private const int in_chan_num,
+#endif
-                               private const int out_chan_num,
+                                 __global DATA_TYPE *output, /* n, c, h, w */
-                               private const int in_height,
+                                 __private const int in_chan_num,
-                               private const int in_width,
+                                 __private const int out_chan_num,
-                               private const int out_height,
+                                 __private const int in_height,
-                               private const int out_width,
+                                 __private const int in_width,
-                               private const int stride_h,
+                                 __private const int out_height,
-                               private const int stride_w) {
+                                 __private const int out_width) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);
@@ -28,38 +71,54 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
+#ifdef STRIDE_1
+  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
+#else
+  const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4;
+#endif
  const int in_offset = batch * in_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
  const int pixels = out_pixel_end - out_pixel_begin;
  for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    float bias_value = bias[i];
+    const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel;
-    const float *input_ptr = input_base + (i / multiplier) * in_pixel;
+    const DATA_TYPE *filter_ptr = filter + i * 9;
-    const float *filter_ptr = filter + i * 9;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
-    float *output_ptr = output_base + i * out_pixel;
    if (pixels == 4) {
-      float4 res = (float4)bias[i];
+#ifdef BIAS
-      if (stride_w == 1) {
+      VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i];
-        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+#else
-        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+      VEC_DATA_TYPE(DATA_TYPE,4) res = 0;
-        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#endif /* defined(BIAS) */
-      } else {
-        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+#ifdef STRIDE_1
-        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-      }
+      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#else
+      res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+      res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+      res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+#endif
      vstore4(res, 0, output_ptr);
    } else {
      for (int p = 0; p < pixels; ++p) {
-        float res = bias[i];
+#ifdef BIAS
+        DATA_TYPE res = bias[i];
+#else
+        DATA_TYPE res = 0;
+#endif
        res += conv3x3(input_ptr, filter_ptr, in_width);
        output_ptr[p] = res;
-        input_ptr += stride_w;
+#ifdef STRIDE_1
+        input_ptr += 1;
+#else
+        input_ptr += 2;
+#endif
      }
    }
  }

--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
-float4 vec_pooling_3_s1(const float *input_ptr, const int in_width) {
+#include <common.h>
-  float4 row00 = vload4(0, input_ptr);
-  float2 row01 = vload2(0, input_ptr + 4);
+VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
-  float4 row10 = vload4(0, input_ptr + in_width);
+  VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
-  float2 row11 = vload2(0, input_ptr + in_width + 4);
+  VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
-  float4 row20 = vload4(0, input_ptr + in_width * 2);
+  VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
-  float2 row21 = vload2(0, input_ptr + in_width * 2 + 4);
+  VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
+  VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
-  float8 data00 = (float8)(row00.s01212323);
+  VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
-  float4 data01 = (float4)(row01.s0, row00.s3, row01.s01);
-  float8 data10 = (float8)(row10.s01212323);
+  VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
-  float4 data11 = (float4)(row11.s0, row10.s3, row11.s01);
+  VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
-  float8 data20 = (float8)(row20.s01212323);
+  VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
-  float4 data21 = (float4)(row21.s0, row20.s3, row21.s01);
+  VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
+  VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
-  float8 left = fmax(fmax(data00, data10), data20);
+  VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
-  float4 right = fmax(fmax(data01, data11), data21);
+  VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
-  float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
+  VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
-  res = fmax(res, (float4)(left.s25, right.s03));
+  VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
+                                        (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
+  res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
  return res;
 }
-float4 vec_pooling_3_s2(const float *input_ptr, const int in_width) {
-  float8 row00 = vload8(0, input_ptr);
+VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
-  float row01 = *(input_ptr + 8);
+  VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
-  float8 row10 = vload8(0, input_ptr + in_width);
+  DATA_TYPE row01 = *(input_ptr + 8);
-  float row11 = *(input_ptr + in_width + 8);
+  VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
-  float8 row20 = vload8(0, input_ptr + in_width * 2);
+  DATA_TYPE row11 = *(input_ptr + in_width + 8);
-  float row21 = *(input_ptr + in_width * 2 + 8);
+  VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
+  DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
-  float8 data00 = (float8)(row00.s01223445);
-  float4 data01 = (float4)(row00.s667, row01);
+  VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
-  float8 data10 = (float8)(row10.s01223445);
+  VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
-  float4 data11 = (float4)(row10.s667, row11);
+  VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
-  float8 data20 = (float8)(row20.s01223445);
+  VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
-  float4 data21 = (float4)(row20.s667, row21);
+  VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
+  VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
-  float8 left = fmax(fmax(data00, data10), data20);
-  float4 right = fmax(fmax(data01, data11), data21);
+  VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
+  VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
-  float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
-  res = fmax(res, (float4)(left.s25, right.s03));
+  VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
+                                        (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
+  res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
  return res;
 }
-float inner_pooling_3(const float *input_ptr, const int in_width) {
+DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
-  float3 row0 = vload3(0, input_ptr);
+  VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
-  float3 row1 = vload3(0, input_ptr + in_width);
+  VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
-  float3 row2 = vload3(0, input_ptr + in_width * 2);
+  VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
-  float3 data = fmax(fmax(row0, row1), row2);
+  VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
-  float res = fmax(fmax(data.s0, data.s1), data.s2);
+  DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
  return res;
 }
-__kernel void pooling3(__global const float *input, /* n, c, h, w */
+// Supported data type: half/float
+__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
                       __private const int in_height,
                       __private const int in_width,
                       __private const int out_chan_num,
                       __private const int out_height,
                       __private const int out_width,
                       __private const int stride,
-                       __global float *output) {
+                       __global DATA_TYPE *output) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);
@@ -83,21 +89,21 @@ __kernel void pooling3(__global const float *input, /* n, c, h, w */
  const int in_offset = batch * out_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_begin;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
-  float *output_base = output + out_offset + out_pixel_begin;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
  const int pixels = out_pixel_end - out_pixel_begin;
  for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    const float *input_ptr = input_base + i * in_pixel;
+    const DATA_TYPE *input_ptr = input_base + i * in_pixel;
-    float *output_ptr = output_base + i * out_pixel;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
    if (pixels == 4) {
-      float4 res;
+      VEC_DATA_TYPE(DATA_TYPE,4) res;
-      if (stride == 1) {
+#ifdef STRIDE_1
-        res = vec_pooling_3_s1(input_ptr, in_width);
+      res = vec_pooling_3_s1(input_ptr, in_width);
-      } else {
+#else
-        res = vec_pooling_3_s2(input_ptr, in_width);
+      res = vec_pooling_3_s2(input_ptr, in_width);
-      }
+#endif
      vstore4(res, 0, output_ptr);
    } else {
      for (int p = 0; p < pixels; ++p) {
@@ -122,7 +128,8 @@ int calculate_avg_block_size(const int pos_h,
  return (h_end - h_start) * (w_end - w_start);
 }
-__kernel void poolingn(__global const float *input, /* n, c, h, w */
+// Supported data type: half/float
+__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */
                       __private const int in_height,
                       __private const int in_width,
                       __private const int out_chan_num,
@@ -132,7 +139,7 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
                       __private const int pad_h,
                       __private const int pad_w,
                       __private const int pooling_size,
-                       __global float *output) {
+                       __global DATA_TYPE *output) {
  int batch = get_global_id(0);
  int out_chan_idx = get_global_id(1);
  int out_pixel_idx = get_global_id(2);
@@ -150,8 +157,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
  const int in_offset = batch * out_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
-  const float *input_base = input + in_offset + in_pixel_idx;
+  const DATA_TYPE *input_base = input + in_offset + in_pixel_idx;
-  float *output_base = output + out_offset + out_pixel_idx;
+  DATA_TYPE *output_base = output + out_offset + out_pixel_idx;
  const int block_size = calculate_avg_block_size(
                            out_pixel_height * stride,
@@ -162,14 +169,14 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
                            in_height - pad_h,
                            in_width - pad_w);
  for (int i = out_chan_begin; i < out_chan_end; ++i) {
-    float8 sum8 = 0.0f;
+    VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f;
-    float sum1 = 0.0f;
+    DATA_TYPE sum1 = 0.0f;
-    float *output_ptr = output_base + i * out_pixel;
+    DATA_TYPE *output_ptr = output_base + i * out_pixel;
    for (int y = 0; y < pooling_size; ++y) {
-      const float *input_ptr = input_base + i * in_pixel + y * in_width;
+      const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width;
      int x = 0;
      for (; x < (pooling_size-8); x += 8) {
-        float8 data = vload8(0, input_ptr);
+        VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr);
        sum8 += data;
        input_ptr += 8;
      }
@@ -178,8 +185,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
        input_ptr++;
      }
    }
-    float4 sum4 = sum8.s0123 + sum8.s4567;
+    VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567;
-    float2 sum2 = sum4.s01 + sum4.s23;
+    VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23;
    *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
  }

--- a/mace/kernels/opencl/cl/relu.cl
+++ b/mace/kernels/opencl/cl/relu.cl
-__kernel void relu(__global const float *input,
+#include <common.h>
+// Supported data type: half/float
+__kernel void relu(__global const DATA_TYPE *input,
                   __private const int size,
-                   __global float *output) {
+                   __global DATA_TYPE *output) {
  int idx = get_global_id(0);
  if (idx + 4 > size) {
@@ -8,16 +11,16 @@ __kernel void relu(__global const float *input,
      *(output+idx) = fmax(*(input+idx), 0);
    }
  } else {
-    float4 data = vload4(idx, input);
+    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = fmax(data, (float4)0);
+    data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
    vstore4(data, idx, output);
  }
 }
-__kernel void relux(__global const float *input,
+__kernel void relux(__global const DATA_TYPE *input,
-                    __private const float max_limit,
+                    __private const DATA_TYPE max_limit,
                    __private const int size,
-                    __global float *output) {
+                    __global DATA_TYPE *output) {
  int idx = get_global_id(0);
  if (idx + 4 > size) {
@@ -25,8 +28,8 @@ __kernel void relux(__global const float *input,
      *(output+idx) = clamp(*(input+idx), 0.0f, max_limit);
    }
  } else {
-    float4 data = vload4(idx, input);
+    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
-    data = clamp(data, (float4)0, (float4)max_limit);
+    data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
    vstore4(data, idx, output);
  }
 }
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
-__kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h, w */
+#include <common.h>
-                                      __global float *output /* n * c, h, w */,
+// Supported data type: half/float
+__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */
+                                      __global DATA_TYPE *output /* n * c, h, w */,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
@@ -21,16 +24,16 @@ __kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h,
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;
-  const float *input_base = input + c * in_height * in_width;
+  const DATA_TYPE *input_base = input + c * in_height * in_width;
-  float *output_base = output + c * height * width;
+  DATA_TYPE *output_base = output + c * height * width;
-  float top_left = input_base[h_lower * in_width + w_lower];
+  DATA_TYPE top_left = input_base[h_lower * in_width + w_lower];
-  float top_right = input_base[h_lower * in_width + w_upper];
+  DATA_TYPE top_right = input_base[h_lower * in_width + w_upper];
-  float bottom_left = input_base[h_upper * in_width + w_lower];
+  DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower];
-  float bottom_right = input_base[h_upper * in_width + w_upper];
+  DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper];
-  const float top = top_left + (top_right - top_left) * w_lerp;
+  const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp;
-  const float bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
+  const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
  output_base[h * width + w] = top + (bottom - top) * h_lerp;
 }
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
-void kernel space_to_batch(global float *space_data_ptr,
+#include <common.h>
-                           global const int *block_shape_ptr,
-                           global const int *paddings_ptr,
+// Supported data type: all
-                           private const int space_batch,
+__kernel void space_to_batch(__global DATA_TYPE *space_data_ptr,
-                           private const int space_channel,
+                             __global const int *block_shape_ptr,
-                           private const int space_height,
+                             __global const int *paddings_ptr,
-                           private const int space_width,
+                             __private const int space_batch,
-                           private const int batch_height,
+                             __private const int space_channel,
-                           private const int batch_width,
+                             __private const int space_height,
-                           private const int b2s,
+                             __private const int space_width,
-                           global float* batch_data_ptr) {
+                             __private const int batch_height,
+                             __private const int batch_width,
+                             __private const int b2s,
+                             __global DATA_TYPE* batch_data_ptr) {
  int batch_idx = get_global_id(0);
  int batch_channel_idx = get_global_id(1);
  int batch_pixel_idx = get_global_id(2);

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -61,16 +61,19 @@ void Conv1x1V2(const Tensor *input,
  // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
  // TODO check wired clReleaseCommandQueue latency
  // The KernelFunctor can cause segment faults in cb_retain_event
-  auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options);
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
  uint32_t idx = 0;
  conv_2d_kernel.setArg(idx++,
                        *(static_cast<const cl::Buffer *>(input->buffer())));
  conv_2d_kernel.setArg(idx++,
                        *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias == NULL) {
+  if (bias != nullptr) {
-    conv_2d_kernel.setArg(idx++, NULL);
-  } else {
    conv_2d_kernel.setArg(idx++,
                          *(static_cast<const cl::Buffer *>(bias->buffer())));
  }
@@ -81,7 +84,6 @@ void Conv1x1V2(const Tensor *input,
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-  conv_2d_kernel.setArg(idx++, stride);
  auto command_queue = runtime->command_queue();
  cl_int error = command_queue.enqueueNDRangeKernel(

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,14 +22,17 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
-  auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
  uint32_t idx = 0;
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias == nullptr) {
+  if (bias != nullptr) {
-    conv_kernel.setArg(idx++, NULL);
-  } else {
    conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
  }
  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
@@ -39,8 +42,6 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(height));
  conv_kernel.setArg(idx++, static_cast<int32_t>(width));
-  conv_kernel.setArg(idx++, stride);
-  conv_kernel.setArg(idx++, stride);
  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};

--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -30,13 +30,18 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
  const index_t pixel_blocks = (width + 3) / 4 * height;
  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
-  auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_kernel  = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
  uint32_t idx = 0;
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  if (bias != nullptr) {
+    conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  }
  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
@@ -44,8 +49,6 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(height));
  conv_kernel.setArg(idx++, static_cast<int32_t>(width));
-  conv_kernel.setArg(idx++, stride);
-  conv_kernel.setArg(idx++, stride);
  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -30,24 +30,26 @@ static void Pooling3(const Tensor *input,
  };
  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
+  auto pooling_kernel  = runtime->BuildKernel("pooling", "pooling3", built_options);
-  auto max_pooling_kernel = cl::Kernel(program, "pooling3");
  const uint32_t lws[3] = {1, 8, 128};
  uint32_t idx = 0;
-  max_pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
-  max_pooling_kernel.setArg(idx++, stride[0]);
+  pooling_kernel.setArg(idx++, stride[0]);
-  max_pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      max_pooling_kernel, cl::NullRange,
+      pooling_kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]));
  MACE_CHECK(error == CL_SUCCESS);
@@ -75,9 +77,9 @@ static void PoolingN(const Tensor *input,
  };
  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  auto pooling_kernel = cl::Kernel(program, "poolingn");
+  auto pooling_kernel  = runtime->BuildKernel("pooling", "poolingn", built_options);
  const uint32_t lws[3] = {1, 8, 128};

--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -21,9 +21,10 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
  if (max_limit_ < 0) {
-    auto relu_kernel = cl::Kernel(program, "relu");
+    auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
    uint32_t idx = 0;
@@ -37,7 +38,7 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
        cl::NDRange(lws));
    MACE_CHECK(error == CL_SUCCESS);
  } else {
-    auto relu_kernel = cl::Kernel(program, "relux");
+    auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
    const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -29,9 +29,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
  float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto rb_kernel  = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
-  auto rb_kernel = cl::Kernel(program, "resize_bilinear_nocache");
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
  uint32_t idx = 0;
  rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));

--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -18,9 +18,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
                                                                const Tensor *paddings_tensor,
                                                                Tensor *batch_tensor) {
  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
+  std::set<std::string> built_options;
-  auto s2b_kernel = cl::Kernel(program, "space_to_batch");
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
  uint32_t idx = 0;
  s2b_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(space_tensor->buffer())));

--- a/mace/proto/mace.proto
+++ b/mace/proto/mace.proto
@@ -24,6 +24,7 @@ enum DataType {
  DT_UINT16 = 9;
  DT_BOOL = 10;
  DT_HALF = 19;
+  DT_UINT32 = 22;
 }
 message TensorProto {