提交 85cef1d8 编写于 作者: L luxuhui

adjust opencl code to minify the libmace.so's size

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 23d985f7
...@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA) ...@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA) enable_language(CUDA)
endif(MACE_ENABLE_CUDA) endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)) if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a") if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so # Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold") set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
...@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation { ...@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class MyCustomOp<DeviceType::GPU, T> : public Operation { class MyCustomOp<DeviceType::GPU, float> : public Operation {
... ...
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) { ...@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -5,7 +5,7 @@ package( ...@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"], default_visibility = ["//visibility:public"],
) )
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule") load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library( cc_library(
name = "generated_models", name = "generated_models",
...@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule() ...@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library( cc_library(
name = "generated_opencl", name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"], srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [ copts = [
"-Werror", "-Werror",
"-Wextra", "-Wextra",
......
...@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation( ...@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type) std::string key = OpKeyBuilder(op_type)
.Device(device_type) .Device(device_type)
.TypeConstraint("T", dtype) .TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build(); .Build();
if (registry_.at(op_type)->creators.count(key) == 0) { if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key; LOG(FATAL) << "Key not registered: " << key;
......
...@@ -39,7 +39,7 @@ class OpConditionContext { ...@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info); OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default; ~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def); void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const { inline const OperatorDef *operator_def() const {
return operator_def_; return operator_def_;
...@@ -49,7 +49,7 @@ class OpConditionContext { ...@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_; return ws_;
} }
inline void set_device(Device* device) { inline void set_device(Device *device) {
device_ = device; device_ = device;
} }
...@@ -110,7 +110,7 @@ class OpConstructContext { ...@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_; return ws_;
} }
inline void set_device(Device* device) { inline void set_device(Device *device) {
device_ = device; device_ = device;
} }
...@@ -166,14 +166,14 @@ class Operation { ...@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context); explicit Operation(OpConstructContext *context);
virtual ~Operation() = default; virtual ~Operation() = default;
template <typename T> template<typename T>
inline T GetOptionalArg(const std::string &name, inline T GetOptionalArg(const std::string &name,
const T &default_value) const { const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!"); MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>( return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value); *operator_def_, name, default_value);
} }
template <typename T> template<typename T>
inline std::vector<T> GetRepeatedArgs( inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const { const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!"); MACE_CHECK(operator_def_, "operator_def was null!");
...@@ -240,7 +240,6 @@ class Operation { ...@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \ #define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ } enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo { struct OpRegistrationInfo {
public: public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)> typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
...@@ -290,7 +289,6 @@ class OpConditionBuilder { ...@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_; OpRegistrationInfo::DataFormatSelector data_format_selector_;
}; };
class OpRegistryBase { class OpRegistryBase {
public: public:
OpRegistryBase() = default; OpRegistryBase() = default;
...@@ -315,7 +313,7 @@ class OpRegistryBase { ...@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type) const; DeviceType device_type) const;
template <class DerivedType> template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator( static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) { OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context)); return std::unique_ptr<Operation>(new DerivedType(context));
...@@ -334,6 +332,24 @@ class OpRegistryBase { ...@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \ DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>) OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \ #define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder) op_registry->Register(builder)
......
...@@ -18,20 +18,19 @@ ...@@ -18,20 +18,19 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <mutex> // NOLINT(build/c++11) #include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include "mace/utils/macros.h" #include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h" #include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
namespace mace { namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) { const std::string OpenCLErrorToString(cl_int error) {
switch (error) { switch (error) {
case CL_SUCCESS: case CL_SUCCESS:
...@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint, const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint, const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage, std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner): std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage), cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage), precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner), tuner_(tuner),
...@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) { if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = { std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(), CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback, CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0 CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
}; };
context_ = std::shared_ptr<cl::Context>( context_ = std::shared_ptr<cl::Context>(
...@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( ...@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true; return true;
} }
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Find program " << program_name << " failed.";
return MaceStatus::MACE_RUNTIME_ERROR;
}
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
LOG(WARNING) << "Program header(" << header << ") is empty.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource( bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name, const std::string &program_name,
const std::string &built_program_key, const std::string &built_program_key,
const std::string &build_options_str, const std::string &build_options_str,
cl::Program *program) { cl::Program *program) {
// Find from source std::string kernel_source;
auto it_source = kEncryptedProgramMap.find(program_name); MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (it_source != kEncryptedProgramMap.end()) { if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources; cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source); sources.push_back(kernel_source);
*program = cl::Program(context(), sources); *program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str()); cl_int ret = program->build({device()}, build_options_str.c_str());
......
...@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type", *net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE)); static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i); const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type); runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
......
...@@ -118,9 +118,21 @@ def mace_version_genrule(): ...@@ -118,9 +118,21 @@ def mace_version_genrule():
) )
def encrypt_opencl_kernel_genrule(): def encrypt_opencl_kernel_genrule():
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule( native.genrule(
name = "encrypt_opencl_kernel_gen", name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))], srcs = srcs,
outs = ["opencl/encrypt_opencl_kernel.cc"], outs = outs,
cmd = "cat $(SRCS) > $@;" cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
) )
...@@ -181,7 +181,6 @@ cc_library( ...@@ -181,7 +181,6 @@ cc_library(
], ],
) )
cc_library( cc_library(
name = "internal_ops", name = "internal_ops",
srcs = glob( srcs = glob(
......
...@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation { ...@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ActivationOp<DeviceType::GPU, T> : public Operation { class ActivationOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ActivationOp(OpConstructContext *context) explicit ActivationOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
ActivationType type = ops::StringToActivationType( ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
"NOOP")); "NOOP"));
auto relux_max_limit = static_cast<T>( auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
Operation::GetOptionalArg<float>("max_limit", 0.0f)); auto leakyrelu_coefficient =
auto leakyrelu_coefficient = static_cast<T>( Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>( kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient); type, relux_max_limit, leakyrelu_coefficient);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (type == ActivationType::PRELU) { if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation { ...@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) { void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Activation") OpConditionBuilder("Activation")
...@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) { ...@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -29,10 +29,10 @@ ...@@ -29,10 +29,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class AddNOp; class AddNOp;
template <> template<>
class AddNOp<DeviceType::CPU, float> : public Operation { class AddNOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
...@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation { ...@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class AddNOp<DeviceType::GPU, T> : public Operation { class AddNOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit AddNOp(OpConstructContext *context) explicit AddNOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>(); kernel_ = make_unique<opencl::image::AddNKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation { ...@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) { void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("AddN") OpConditionBuilder("AddN")
...@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) { ...@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation { ...@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BatchNormOp<DeviceType::GPU, T> : public Operation { class BatchNormOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BatchNormOp(OpConstructContext *context) explicit BatchNormOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>( kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient); epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor( const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i)); operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr); MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
...@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) { void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation { ...@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
} }
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class BatchToSpaceNDOp; class BatchToSpaceNDOp;
template <> template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
...@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase { ...@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
} }
}; };
template <> template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
...@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase { ...@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase { class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public: public:
explicit BatchToSpaceNDOp(OpConstructContext *context) explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) { : BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>(); kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase { ...@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float); BatchToSpaceNDOp, DeviceType::CPU, float);
...@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { ...@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND", MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t); BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -34,16 +34,16 @@ ...@@ -34,16 +34,16 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class BiasAddOp; class BiasAddOp;
template <> template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation { class BiasAddOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0)) has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
{} 0)) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
...@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation { ...@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class BiasAddOp<DeviceType::GPU, T> : public Operation { class BiasAddOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BiasAddOp(OpConstructContext *context) explicit BiasAddOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>(); kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) { void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("BiasAdd") OpConditionBuilder("BiasAdd")
...@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { ...@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -23,10 +23,10 @@ ...@@ -23,10 +23,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class ChannelShuffleOp; class ChannelShuffleOp;
template <typename T> template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit ChannelShuffleOp(OpConstructContext *context) explicit ChannelShuffleOp(OpConstructContext *context)
...@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation { ...@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_; const int groups_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation { class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ChannelShuffleOp(OpConstructContext *context) explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1); const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups); kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation { ...@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) { void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle", MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float); ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { ...@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1); *op, "group", 1);
if (op->output_shape(0).dims_size() != 4) { if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
index_t channels = op->output_shape(0).dims(3); index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups; index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) { if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_PAD_H_ #ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_PAD_H_ #define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -27,4 +27,4 @@ enum PadType { ...@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_PAD_H_ #endif // MACE_OPS_COMMON_PAD_TYPE_H_
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_POOLING_H_ #ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_POOLING_H_ #define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace { namespace mace {
...@@ -23,4 +23,4 @@ enum PoolingType { ...@@ -23,4 +23,4 @@ enum PoolingType {
}; };
} // namespace mace } // namespace mace
#endif // MACE_OPS_POOLING_H_ #endif // MACE_OPS_COMMON_POOLING_TYPE_H_
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_REDUCE_H_ #ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_REDUCE_H_ #define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace { namespace mace {
...@@ -28,4 +28,4 @@ enum ReduceType { ...@@ -28,4 +28,4 @@ enum ReduceType {
}; };
} // namespace mace } // namespace mace
#endif // MACE_OPS_REDUCE_H_ #endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
...@@ -12,14 +12,16 @@ ...@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_ #ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_RESIZE_BICUBIC_H_ #define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h" #include "mace/core/types.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace resize_bicubic { namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10); constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size, inline float CalculateResizeScale(index_t in_size,
...@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size, ...@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1) ? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size); : in_size / static_cast<float>(out_size);
} }
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_ #endif // MACE_OPS_COMMON_UTILS_H_
...@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation { ...@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_; int axis_;
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class ConcatOp; class ConcatOp;
template <typename T> template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase { class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public: public:
explicit ConcatOp(OpConstructContext *context) explicit ConcatOp(OpConstructContext *context)
...@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase { ...@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase { class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public: public:
explicit ConcatOp(OpConstructContext *context) explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) { : ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>(); kernel_ = make_unique<opencl::image::ConcatKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase { ...@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) { void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -228,14 +227,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -244,11 +236,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
auto tensor_shape_info = context->tensor_shape_info(); auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) { if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} else { } else {
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
...@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -256,7 +248,7 @@ void RegisterConcat(OpRegistryBase *op_registry) {
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3); *op, "axis", 3);
if (!has_data_format || axis != 3) { if (!has_data_format || axis != 3) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
bool divisible_four = true; bool divisible_four = true;
for (const std::string &input : op->input()) { for (const std::string &input : op->input()) {
...@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -268,10 +260,10 @@ void RegisterConcat(OpRegistryBase *op_registry) {
} }
// Only support not divisible 4 case with 2 inputs. // Only support not divisible 4 case with 2 inputs.
if (op->input_size() > 2 && !divisible_four) { if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase { ...@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public: public:
explicit Conv2dOp(OpConstructContext *context) explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context), : ConvPool2dOpBase(context),
...@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else { } else {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
} }
// Transform filter tensor to target format // Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) && if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
...@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(), strides_.data(),
dilations_.data(), dilations_.data(),
&wino_block_size_))) { &wino_block_size_))) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} else { } else {
wino_block_size_ = 0; wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type) OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) { ...@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -24,10 +24,10 @@ ...@@ -24,10 +24,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class CropOp; class CropOp;
template <class T> template<class T>
class CropOp<DeviceType::CPU, T> : public Operation { class CropOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit CropOp(OpConstructContext *context) explicit CropOp(OpConstructContext *context)
...@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
...@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape)); MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>(); T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>(); const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(), crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data()); output_shape, offsets.data());
...@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
} }
private: private:
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const int32_t* offsets) { const int32_t *offsets) {
const index_t out_img_size = const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3]; output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3]; const index_t out_hw = output_shape[2] * output_shape[3];
...@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) { for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) { for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) { for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr = T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3]; output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch = const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size + input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw + (c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3]; (h + offsets[2]) * input_shape[3] + offsets[3];
...@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation { ...@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class CropOp<DeviceType::GPU, T> : public Operation { class CropOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit CropOp(OpConstructContext *context) explicit CropOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>( kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset")); Operation::GetRepeatedArgs<int>("offset"));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation { ...@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) { void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp, MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Crop") OpConditionBuilder("Crop")
...@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) { ...@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
int has_data_format = int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0); *op, "has_data_format", 0);
if (!has_data_format || if (!has_data_format ||
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase { ...@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public: public:
explicit Deconv2dOp(OpConstructContext *context) explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>(); kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type) OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) { if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) { if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
} }
} else { } else {
if (operator_def_->input_size() >= 4) { if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
3, 3,
...@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { ...@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) { void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Deconv2D") OpConditionBuilder("Deconv2D")
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class DepthToSpaceOp : public Operation { class DepthToSpaceOp : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
...@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation { ...@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation { class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit DepthToSpaceOp(OpConstructContext *context) explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1); int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size); kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) { ...@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace", MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float); DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t> ...@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public: public:
explicit DepthwiseConv2dOp(OpConstructContext *context) explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) { : DepthwiseConv2dOpBase(context) {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else { } else {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
} }
Tensor *filter_tensor = context->workspace()->GetTensor( Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1)); operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) { if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format // Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
1, 1,
...@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { ...@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { ...@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t); DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", #ifdef MACE_ENABLE_OPENCL
DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("DepthwiseConv2d") OpConditionBuilder("DepthwiseConv2d")
......
...@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float> ...@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public: public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context) explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>(); kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type) OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) { if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
} }
...@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { ...@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float); DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class EltwiseOp<DeviceType::GPU, T> : public Operation { class EltwiseOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit EltwiseOp(OpConstructContext *context) explicit EltwiseOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type; MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>( kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index); type, coeff, scalar_input, scalar_input_index);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) && if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) { ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
OpenCLBufferType::ARGUMENT, OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { } else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
i, i,
...@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) { ...@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t> ...@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase { class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public: public:
explicit FullyConnectedOp(OpConstructContext *context) explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) { : FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE; mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>(); kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
// Transform filter tensor to target format // Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, context,
operator_def_.get(), operator_def_.get(),
1, 1,
OpenCLBufferType::WEIGHT_WIDTH, OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS); == MaceStatus::MACE_SUCCESS);
} }
...@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) { ...@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t); FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation { class IdentityOp : public Operation {
public: public:
explicit IdentityOp(OpConstructContext *context) explicit IdentityOp(OpConstructContext *context)
...@@ -34,15 +33,13 @@ class IdentityOp : public Operation { ...@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
}; };
void RegisterIdentity(OpRegistryBase *op_registry) { void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float); DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t); DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float); DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation { class InferConv2dShapeOp : public Operation {
public: public:
explicit InferConv2dShapeOp(OpConstructContext *context) explicit InferConv2dShapeOp(OpConstructContext *context)
...@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation { ...@@ -69,19 +68,22 @@ class InferConv2dShapeOp : public Operation {
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else { } else {
switch (padding_type) { switch (padding_type) {
case SAME: case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0]; out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1]; out_w = (in_w + strides[1] - 1) / strides[1];
break; break;
case VALID: }
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0]; out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1]; out_w = (in_w - kernels[3] + 1) / strides[1];
break; break;
default: }
default: {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
break; break;
} }
} }
}
if (isNCHW) { if (isNCHW) {
output_data[0] = out_batch; output_data[0] = out_batch;
...@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation { ...@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
}; };
void RegisterInferConv2dShape(OpRegistryBase *op_registry) { void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float); InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t); InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape", MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float); InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase { ...@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase { class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public: public:
explicit MatMulOp(OpConstructContext *context) explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) { : MatMulOpBase(context) {
...@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase { ...@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
}; };
#endif // MACE_ENABLE_NEON #endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) { void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) { ...@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
...@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter( ...@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t out_chan = input->dim(0); const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1); const index_t in_chan = input->dim(1);
...@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter( ...@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG; MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name); built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter( ...@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t multiplier = input->dim(0); const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1); const index_t in_chan = input->dim(1);
...@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter( ...@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name); built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -164,7 +164,6 @@ MaceStatus TransformArgument( ...@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
const index_t size = input->dim(0); const index_t size = input->dim(0);
...@@ -181,8 +180,9 @@ MaceStatus TransformArgument( ...@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name); built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
...@@ -229,6 +229,30 @@ MaceStatus TransformArgument( ...@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus BufferTransform::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, output);
default:
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform( ...@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformConv2DFilter( MaceStatus TransformConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformDWConv2DFilter( MaceStatus TransformDWConv2DFilter(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
MaceStatus TransformArgument( MaceStatus TransformArgument(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output); Tensor *output);
class BufferTransform : public OpenCLBufferTransformKernel {
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel { ...@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, dt, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input, dt, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform( ...@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context, OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const DataType dt,
Tensor *output) { Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
...@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform( ...@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name); built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name, kernel_name,
built_options, built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<index_t> &filter_shape,
    const std::vector<index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_block_size) {
  // The buffer-based conv2d has no winograd kernel, so most inputs (and the
  // kwg_size_ member) are deliberately unused here.
  MACE_UNUSED(kwg_size_);
  MACE_UNUSED(runtime);
  MACE_UNUSED(output_shape);
  MACE_UNUSED(wino_block_size);
  // Report winograd-eligibility only for the 3x3, unit-stride,
  // unit-dilation case.
  const bool filter_3x3 = (filter_shape[2] == 3) && (filter_shape[3] == 3);
  const bool unit_stride = (strides[0] == 1) && (strides[1] == 1);
  const bool unit_dilation = (dilations[0] == 1) && (dilations[1] == 1);
  return filter_3x3 && unit_stride && unit_dilation;
}
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int winograd_blk_size,
    Tensor *output) {
  // The GPU buffer implementation has no winograd path (CheckUseWinograd
  // never selects one), so the block size is ignored.
  MACE_UNUSED(winograd_blk_size);
  StatsFuture pad_future, conv_future;
  const index_t filter_h = filter->dim(2);
  const index_t filter_w = filter->dim(3);
  // Reshape output: derive the output shape and the paddings from the
  // padding type, or honor explicitly supplied padding values.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));

  // Calculate the padded input shape (tensors are NHWC here).
  const index_t width = output_shape[2];
  const index_t channels = output_shape[3];

  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_channels = input->dim(3);

  const int pad_top = paddings[0] >> 1;
  const int pad_left = paddings[1] >> 1;

  MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
             input_channels);

  std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;

  // Mark whether the input shape changed; a change forces the cached kernel
  // arguments to be re-set.
  bool input_changed = !IsVecEqual(input_shape_, input->shape());
  input_shape_ = input->shape();

  // 1x1 convolutions use a dedicated kernel with a narrower width tile.
  const bool use_1x1 = filter_h == 1 && filter_w == 1;

  std::vector<index_t> padded_output_shape = output_shape;
  const index_t tile_c = 4;
  const index_t tile_w = use_1x1 ? 2 : 4;
  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);

  // The padded width must cover every strided/dilated filter tap of the
  // tiled output; channels are rounded up to the vectorization tile.
  std::vector<index_t> padded_input_shape = input->shape();
  padded_input_shape[1] = input_height + paddings[0];
  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
      (filter_w - 1) * dilations[1] + 1;
  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);

  const Tensor *padded_input_ptr = input;
  // Pad the input only when the padded shape actually differs.
  std::unique_ptr<Tensor> padded_input;
  if (padded_input_shape[1] != input_height ||
      padded_input_shape[2] != input_width ||
      padded_input_shape[3] != input_channels) {
    // Decide the scratch size before allocating it.
    index_t total_scratch_size = 0;
    const index_t padded_input_size =
        std::accumulate(padded_input_shape.begin(),
                        padded_input_shape.end(),
                        1,
                        std::multiplies<index_t>())
            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
    total_scratch_size += padded_input_size;

    // Init scratch buffer.
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      // The scratch buffer was reallocated, so kernel arguments referencing
      // the old storage must be refreshed.  (The original code redundantly
      // re-tested the same condition via |= inside this guard.)
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }

    padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
                                       input->dtype());
    padded_input->Resize(padded_input_shape);
    PadInput(context, &kernels_[0], input, pad_top, pad_left,
             input_changed, padded_input.get(), &pad_future);
    padded_input_ptr = padded_input.get();
  }

  if (use_1x1) {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2d1x1(
          context, &kernels_[1], pad_input, filter, bias, strides,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  } else {
    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
      return conv2d::Conv2dGeneral(
          context, &kernels_[1], pad_input, filter, bias, strides, dilations,
          activation, relux_max_limit,
          leakyrelu_coefficient, input_changed, output, &conv_future);
    };
  }
  MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
  // Callers wait on a single future that covers both the pad and conv steps.
  MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());

  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context, ...@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, ...@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, ...@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future); StatsFuture *future);
} // namespace conv2d } // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel { class Conv2dKernel : public OpenCLConv2dKernel {
public: public:
Conv2dKernel() : old_scratch_size_(0) {} Conv2dKernel() : old_scratch_size_(0) {}
...@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { ...@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context, ...@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context, ...@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name); built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
......
...@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context, ...@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context, ...@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name); built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
......
...@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP:
...@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} }
} // namespace depthwise } // namespace depthwise
// Runs a buffer-based depthwise conv2d on the GPU.
// Flow: compute output shape/paddings from a "fake" dense-conv filter shape,
// pad the input (spatially by the padding amounts, and width/channels up to
// the tile size) into a scratch tensor when needed, then launch the
// depthwise_conv2d OpenCL kernel on the (possibly padded) input.
// Returns MACE_SUCCESS, or propagates any resize/kernel error via
// MACE_RETURN_IF_ERROR.
MaceStatus DepthwiseConv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
// Separate futures so the pad kernel and the conv kernel can be merged
// into the caller's future at the end.
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
// (depthwise filter layout is [multiplier, in_channels, kh, kw]; the fake
// shape folds multiplier*in_channels into the out-channel slot so the
// generic conv2d shape helpers apply).
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
// Either derive paddings from the padding policy, or honor explicit
// per-model padding values.
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
// Half the total padding goes on the top/left (the pad kernel presumably
// places the remainder on the bottom/right).
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
// Tile sizes the OpenCL kernel processes per work-item: width and channels
// are rounded up to multiples of 4.
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
// Padded input width must cover the receptive field of the last (rounded-up)
// output column.
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
// Only materialize a padded copy when any dimension actually changed.
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
// A reallocated (grown) scratch buffer invalidates previously-set kernel
// args, so force the kernel-arg refresh path below.
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
// The padded tensor aliases scratch memory; its lifetime ends with this
// call, after the futures are merged.
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
// Surface both launches' timing/completion through the caller's future.
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias, const Tensor *bias,
const int *strides, const int *strides,
const int *dilations, const int *dilations,
const DataType dt,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
...@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future); StatsFuture *future);
} // namespace depthwise } // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public: public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {} DepthwiseConv2dKernel() : old_scratch_size_(0) {}
...@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { ...@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private: private:
index_t old_scratch_size_; index_t old_scratch_size_;
cl::Kernel kernels_[2]; cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Runs buffer-based pooling (max or average) on the GPU.
// Flow: compute the output shape/paddings, pad the input channel dimension up
// to a multiple of 4 into scratch memory if needed, build the "pooling"
// OpenCL kernel on first use (compile options pick the in/out data types and
// the accumulator type), set kernel args when the input shape changed, then
// tune-or-run the 3D kernel.
// Dilation is not supported and is rejected up front.
MaceStatus PoolingKernel::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
// Fake filter shape (out_c, in_c, kh, kw) so the generic conv shape helpers
// can compute pooling's output size.
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
// Either derive paddings from the padding policy, or honor explicit values.
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// pad input
// Channels are rounded up to a multiple of 4 (the kernel reads 4-channel
// vectors); height/width are not padded here.
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
// A grown scratch buffer invalidates kernel args set against the old
// allocation, so force the arg-refresh path below.
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
// The padded tensor aliases scratch memory; it lives only for this call.
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
// kernels_[0] is the pad kernel, kernels_[1] the pooling kernel proper.
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
// Lazily compile the OpenCL program the first time this op runs.
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
auto input_dtype = input->dtype();
auto input_dt = DtToCLDt(input_dtype);
built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
auto output_dtype = output->dtype();
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
// MAX pooling only compares values, so the native dtype is a safe
// intermediate; AVG (or mixed dtypes) accumulates in float to avoid
// half-precision overflow/precision loss.
if (pooling_type == MAX && input_dtype == output_dtype) {
built_options.emplace("-DDATA_TYPE=" + input_dt);
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
// Global work size: (ceil(out_c/4), out_w, batch*out_h).
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
// Kernel args only need to be re-set when shapes/buffers changed.
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
// Default local work size; the tuner may override it.
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
// Surface both launches' timing/completion through the caller's future.
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace buffer { namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel { class PoolingKernel : public OpenCLPoolingKernel {
public: public:
PoolingKernel() : old_scratch_size_(0) {} PoolingKernel() : old_scratch_size_(0) {}
...@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel { ...@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// pad input
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
// Runs buffer-based softmax over the channel dimension on the GPU.
// Accepts 2D (batch, channels) or 4D NHWC logits; anything else hits
// MACE_NOT_IMPLEMENTED. The kernel is compiled lazily on first use with the
// in/out data types baked in as compile options (accumulation in float);
// args are re-set only when the input shape changes.
MaceStatus SoftmaxKernel::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
// Normalize 2D input to the 4D NHWC view the kernel expects.
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
// Channels are processed in groups of 4; remain_channels is the tail the
// kernel must mask out of the last group.
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
// Global work size: (ceil(c/4), w, batch*h).
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
// Lazily compile the OpenCL program the first time this op runs.
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
// Accumulate in float regardless of the tensor dtypes, for precision.
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
// log-softmax variant.
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
// Kernel args only need to be re-set when the logits shape changed.
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
// Default local work size; the tuner may override it.
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -29,7 +29,7 @@ namespace mace { ...@@ -29,7 +29,7 @@ namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace buffer { namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log) explicit SoftmaxKernel(bool use_log)
...@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { ...@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer } // namespace buffer
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -20,11 +20,11 @@ ...@@ -20,11 +20,11 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class BufferTransformOp; class BufferTransformOp;
template <typename T> template<>
class BufferTransformOp<DeviceType::GPU, T> : public Operation { class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit BufferTransformOp(OpConstructContext *context) explicit BufferTransformOp(OpConstructContext *context)
: Operation(context), : Operation(context),
...@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor( MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type(); operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform( return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output); context, input, type, out_mem_type_, wino_blk_size_, output);
} }
...@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_; MemoryType out_mem_type_;
}; };
void RegisterBufferTransform(OpRegistryBase *op_registry) { void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform", MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
} }
} // namespace ops } // namespace ops
......
...@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) { ...@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix; return name + postfix;
} }
// Transforms a filter tensor into the GPU-side layout (image or buffer) at
// graph-construction time, and rewires op_def's input to point at the
// transformed tensor.
// - input_idx: which input of op_def is the filter to transform.
// - buffer_type / mem_type: target OpenCL buffer type and memory type.
// - wino_blk_size: Winograd block size; presumably 0 means no Winograd —
//   TODO confirm against OpenCLBufferTransformer::Transform.
// Side effects: creates a persistent workspace tensor named by
// TransformedFilterName, rewrites op_def's input name, and marks the
// original filter tensor unused so its memory can be reclaimed.
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size) {
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
// Keep the source tensor's dtype for the transformed output.
const DataType dt = input->dtype();
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -28,17 +28,16 @@ ...@@ -28,17 +28,16 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
// Only used for GPU Operation(BufferTransform) // Only used for GPU Operation(BufferTransform)
template<typename T>
class OpenCLBufferTransformer { class OpenCLBufferTransformer {
public: public:
OpenCLBufferTransformer(const MemoryType in_mem_type, OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) { const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) { if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>(); kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) { } else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>(); kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else { } else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>(); kernel_ = make_unique<opencl::buffer::BufferTransform>();
} }
} }
...@@ -49,7 +48,7 @@ class OpenCLBufferTransformer { ...@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size, const int wino_blk_size,
Tensor *output) { Tensor *output) {
Workspace *ws = context->workspace(); Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value; DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type(); MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE || if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) { out_mem_type == MemoryType::GPU_BUFFER) {
...@@ -87,10 +86,10 @@ class OpenCLBufferTransformer { ...@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name() << " to CPU Buffer " << output->name()
<< " with data type " << dt; << " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor); Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>(); const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape()); output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>(); float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} else { } else {
LOG(FATAL) << "Unexpected error: " << out_mem_type; LOG(FATAL) << "Unexpected error: " << out_mem_type;
...@@ -110,30 +109,13 @@ class OpenCLBufferTransformer { ...@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name); std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter( MaceStatus TransformFilter(
mace::OpConstructContext *context, mace::OpConstructContext *context,
OperatorDef *op_def, OperatorDef *op_def,
const int input_idx, const int input_idx,
const OpenCLBufferType buffer_type, const OpenCLBufferType buffer_type,
const MemoryType mem_type, const MemoryType mem_type,
const int wino_blk_size = 0) { const int wino_blk_size = 0);
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
...@@ -17,8 +17,9 @@ ...@@ -17,8 +17,9 @@
#include <vector> #include <vector>
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
class OpContext; class OpContext;
......
...@@ -17,7 +17,10 @@ ...@@ -17,7 +17,10 @@
#include <vector> #include <vector>
#include "mace/ops/activation.h" #include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace { namespace mace {
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#include <vector> #include <vector>
#include "mace/ops/common/activation_type.h" #include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace { namespace mace {
......
...@@ -15,8 +15,7 @@ ...@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
......
...@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) { ...@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
} }
} }
// Maps a CPU data type to its upward-compatible OpenCL C type name:
// both DT_FLOAT and DT_HALF widen to "float" so a single float kernel
// binary can serve either storage type. Any other data type is a fatal
// error (LOG(FATAL)); the trailing return only silences the compiler.
std::string DtToUpCompatibleCLDt(const DataType dt) {
  if (dt == DT_FLOAT || dt == DT_HALF) {
    return "float";
  }
  LOG(FATAL) << "Unsupported data type";
  return "";
}
// Maps a CPU data type to the upward-compatible OpenCL command-data-type
// suffix: both DT_FLOAT and DT_HALF map to "f" (float read/write
// commands). Any other data type is a fatal error (LOG(FATAL)); the
// trailing return only silences the compiler.
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
  if (dt == DT_FLOAT || dt == DT_HALF) {
    return "f";
  }
  LOG(FATAL) << "Not supported data type for opencl cmd data type";
  return "";
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws, const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
......
...@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape( ...@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type // CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt); std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type // CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt); std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select // CPU data type to OpenCL condition data type used in select
// e.g. half -> float // e.g. half -> float
std::string DtToCLCondDt(const DataType dt); std::string DtToCLCondDt(const DataType dt);
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Applies the configured activation (RELU / RELUX / PRELU / TANH /
// SIGMOID / LEAKYRELU, chosen at kernel-compile time via a -DUSE_*
// option) element-wise to `input`, writing the result to `output`
// through the "activation" OpenCL image kernel.
//
// `alpha` is read only when activation_ == PRELU (its image holds the
// per-channel slopes); for other activation types it may be null.
// The OpenCL kernel is built once on first call — always with float
// DATA_TYPE/CMD_DATA_TYPE — and cached in kernel_; kernel arguments are
// re-bound only when the input shape changes between calls.
// Returns MACE_SUCCESS, or the error from kernel build / tuned run.
MaceStatus ActivationKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *alpha,
    Tensor *output) {
  // Input is indexed as NHWC; channels are processed 4 per work-item,
  // hence RoundUpDiv4 below.
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build; the activation type is baked into the
  // binary via -DUSE_*, and tuning_key_prefix_ is chosen to match.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    // GPU computation is pinned to float regardless of the tensor's
    // storage type (this commit drops the half specialization to
    // shrink the library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    switch (activation_) {
      case RELU: {
        tuning_key_prefix_ = "relu_opencl_kernel";
        built_options.emplace("-DUSE_RELU");
        break;
      }
      case RELUX: {
        tuning_key_prefix_ = "relux_opencl_kernel";
        built_options.emplace("-DUSE_RELUX");
        break;
      }
      case PRELU: {
        tuning_key_prefix_ = "prelu_opencl_kernel";
        built_options.emplace("-DUSE_PRELU");
        break;
      }
      case TANH: {
        tuning_key_prefix_ = "tanh_opencl_kernel";
        built_options.emplace("-DUSE_TANH");
        break;
      }
      case SIGMOID: {
        tuning_key_prefix_ = "sigmoid_opencl_kernel";
        built_options.emplace("-DUSE_SIGMOID");
        break;
      }
      case LEAKYRELU: {
        tuning_key_prefix_ = "leakyrelu_opencl_kernel";
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      }
      default: {
        LOG(FATAL) << "Unknown activation type: " << activation_;
      }
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size: (channel blocks, width, batch * height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape differs from the
  // previous invocation.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
      kernel_.setArg(idx++, *(alpha->opencl_image()));
    }
    // Scalar parameters are always passed; the kernel only uses the one
    // matching its compiled -DUSE_* variant.
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,12 +31,11 @@ namespace ops { ...@@ -31,12 +31,11 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel { class ActivationKernel : public OpenCLActivationKernel {
public: public:
ActivationKernel(ActivationType type, ActivationKernel(ActivationType type,
T relux_max_limit, float relux_max_limit,
T leakyrelu_coefficient) float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit), : activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {} leakyrelu_coefficient_(leakyrelu_coefficient) {}
...@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel { ...@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private: private:
ActivationType activation_; ActivationType activation_;
T relux_max_limit_; float relux_max_limit_;
T leakyrelu_coefficient_; float leakyrelu_coefficient_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
}; };
// Templated variant (removed by this commit in favor of the float-only
// ActivationKernel): identical dispatch logic, but the kernel's OpenCL
// data type is derived from T via the "up-compatible" mappings (half is
// widened to float), and the scalar limits are cast from T to float
// before being passed to the kernel.
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *alpha,
    Tensor *output) {
  // NHWC layout; channels handled 4 per work-item.
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // One-time kernel build keyed on T and on the activation type.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    // half and float both map to the float OpenCL types here.
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    switch (activation_) {
      case RELU:
        tuning_key_prefix_ = "relu_opencl_kernel";
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        tuning_key_prefix_ = "relux_opencl_kernel";
        built_options.emplace("-DUSE_RELUX");
        break;
      case PRELU:
        tuning_key_prefix_ = "prelu_opencl_kernel";
        built_options.emplace("-DUSE_PRELU");
        break;
      case TANH:
        tuning_key_prefix_ = "tanh_opencl_kernel";
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        tuning_key_prefix_ = "sigmoid_opencl_kernel";
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        tuning_key_prefix_ = "leakyrelu_opencl_kernel";
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size: (channel blocks, width, batch * height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Arguments are re-bound only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
      kernel_.setArg(idx++, *(alpha->opencl_image()));
    }
    // Members are of type T; the kernel expects float scalars.
    kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
    kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Element-wise sum of 2 to 4 equally-shaped NHWC image tensors into
// `output_tensor` via the "addn" OpenCL kernel.
//
// Preconditions (enforced with MACE_CHECK): at least two non-null
// inputs, all with identical batch/height/width/channels. More than 4
// inputs hits MACE_NOT_IMPLEMENTED (the input count is baked into the
// kernel binary via -DINPUT_NUM).
// The kernel is built once (always float DATA_TYPE) and cached;
// arguments and the output image are (re)set only when the first
// input's shape changes.
// Returns MACE_SUCCESS, or the error from resize / build / tuned run.
MaceStatus AddNKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor) {
  size_t size = input_tensors.size();
  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
  const index_t batch = input_tensors[0]->dim(0);
  const index_t height = input_tensors[0]->dim(1);
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // All inputs must match the first input's shape exactly.
  for (size_t i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
    MACE_CHECK(height == input_tensors[i]->dim(1));
    MACE_CHECK(width == input_tensors[i]->dim(2));
    MACE_CHECK(channels == input_tensors[i]->dim(3));
  }
  if (kernel_.get() == nullptr) {
    // INPUT_NUM is a compile-time constant of the kernel; only up to 4
    // inputs are supported.
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
    built_options.emplace("-Daddn=" + kernel_name);
    // Float-only GPU computation (half specialization removed by this
    // commit to shrink the library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 2-D global work size: x = channel_blocks * width, y = batch * height.
  std::vector<index_t> output_shape = input_tensors[0]->shape();
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    // Resize the output image to match the (shared) input shape before
    // binding it as a kernel argument.
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(
        output_tensor->ResizeImage(output_shape, output_image_shape));
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    input_shape_ = input_tensors[0]->shape();
  }
  // Fixed local work size (0 terminator); tuned/run as a 2-D kernel.
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
             output_tensor->dim(2), output_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel { class AddNKernel : public OpenCLAddNKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel { ...@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated variant (removed by this commit in favor of the float-only
// AddNKernel): identical logic, but the kernel's OpenCL data type is
// derived from T via the "up-compatible" mappings (half widens to
// float).
template <typename T>
MaceStatus AddNKernel<T>::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor) {
  // At least two non-null inputs, all with identical NHWC dims.
  size_t size = input_tensors.size();
  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
  const index_t batch = input_tensors[0]->dim(0);
  const index_t height = input_tensors[0]->dim(1);
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  for (size_t i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
    MACE_CHECK(height == input_tensors[i]->dim(1));
    MACE_CHECK(width == input_tensors[i]->dim(2));
    MACE_CHECK(channels == input_tensors[i]->dim(3));
  }
  if (kernel_.get() == nullptr) {
    // INPUT_NUM is baked into the kernel; at most 4 inputs supported.
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
    built_options.emplace("-Daddn=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 2-D global work size: x = channel_blocks * width, y = batch * height.
  std::vector<index_t> output_shape = input_tensors[0]->shape();
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    // Resize the output image before binding arguments.
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(
        output_tensor->ResizeImage(output_shape, output_image_shape));
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    input_shape_ = input_tensors[0]->shape();
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
             output_tensor->dim(2), output_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
BatchNormKernel::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
// Runs batch normalization (optionally fused with an activation) on an
// NHWC image tensor via the "batch_norm" OpenCL kernel.
//
// Two modes, decided by whether `mean` and `var` are both provided:
//  - not folded: the kernel normalizes with mean/var/epsilon_ at run
//    time, then applies scale/offset;
//  - folded (-DFOLDED_CONSTANT): mean/var were already folded into
//    scale/offset offline, so only scale/offset are bound.
// The kernel is built once (always float DATA_TYPE) and cached;
// arguments are re-bound only when the input shape changes.
// Returns MACE_SUCCESS, or the error from kernel build / tuned run.
MaceStatus BatchNormKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *scale,
    const Tensor *offset,
    const Tensor *mean,
    const Tensor *var,
    Tensor *output) {
  bool not_folded = (mean != nullptr && var != nullptr);
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed 4 per work-item in the OpenCL image.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build; activation variant and folded-ness are
  // compile-time options of the kernel binary.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    // Float-only GPU computation (half specialization removed by this
    // commit to shrink the library).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (!not_folded) {
      built_options.emplace("-DFOLDED_CONSTANT");
    }
    switch (activation_) {
      case NOOP:break;
      case RELU:built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the input shape has changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
    // mean/var/epsilon are only arguments of the non-folded kernel.
    if (not_folded) {
      kernel_.setArg(idx++, *(mean->opencl_image()));
      kernel_.setArg(idx++, *(var->opencl_image()));
      kernel_.setArg(idx++, epsilon_);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/activation.h" #include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel { class BatchNormKernel : public OpenCLBatchNormKernel {
public: public:
BatchNormKernel( BatchNormKernel(
...@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel { ...@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated constructor (removed by this commit in favor of the
// non-template BatchNormKernel): stores the batch-norm configuration.
// epsilon is forwarded to the kernel only in the non-folded mode of
// Compute(); the activation fields select the fused activation.
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
                                    const ActivationType activation,
                                    const float relux_max_limit,
                                    const float leakyrelu_coefficient)
    : epsilon_(epsilon),
      activation_(activation),
      relux_max_limit_(relux_max_limit),
      leakyrelu_coefficient_(leakyrelu_coefficient) {}
// Templated variant (removed by this commit in favor of the float-only
// BatchNormKernel): identical logic, but the kernel's OpenCL data type
// is derived from T via the "up-compatible" mappings (half widens to
// float). See the non-template version for the folded / non-folded
// mode description.
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *scale,
    const Tensor *offset,
    const Tensor *mean,
    const Tensor *var,
    Tensor *output) {
  // Non-folded mode requires both mean and var tensors.
  bool not_folded = (mean != nullptr && var != nullptr);
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // One-time kernel build keyed on T, activation, and folded-ness.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    if (!not_folded) {
      built_options.emplace("-DFOLDED_CONSTANT");
    }
    switch (activation_) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Arguments re-bound only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
    // mean/var/epsilon are only bound for the non-folded kernel.
    if (not_folded) {
      kernel_.setArg(idx++, *(mean->opencl_image()));
      kernel_.setArg(idx++, *(var->opencl_image()));
      kernel_.setArg(idx++, epsilon_);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit_);
    kernel_.setArg(idx++, leakyrelu_coefficient_);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Rearranges a batched NHWC image tensor back into spatial form
// (inverse of space-to-batch) via the "batch_to_space" OpenCL kernel.
//
// `paddings` ({top, bottom, left, right} — only indices 0 and 2 are
// passed to the kernel as crop offsets) and `block_shape` ({block_h,
// block_w}) describe the original space-to-batch transform;
// `output_shape` is the target spatial shape, to which `space_tensor`'s
// image is resized before the run.
// Unlike the other kernels in this commit, the build uses the batch
// tensor's own stored dtype rather than a fixed DT_FLOAT.
// Returns MACE_SUCCESS, or the error from resize / build / tuned run.
MaceStatus BatchToSpaceKernel::Compute(
    OpContext *context,
    const Tensor *batch_tensor,
    const std::vector<int> &paddings,
    const std::vector<int> &block_shape,
    const std::vector<index_t> &output_shape,
    Tensor *space_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      space_tensor->ResizeImage(output_shape, output_image_shape));
  // Global work size is derived from the *batch* tensor: one work-item
  // per (channel block, width, batch * height) position.
  const uint32_t chan_blk =
      static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazy one-time kernel build, typed after the batch tensor's dtype.
  if (kernel_.get() == nullptr) {
    const char *kernel_name = "batch_to_space";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    auto dt = batch_tensor->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the batch tensor's shape has changed.
  if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // paddings[0]/paddings[2]: height / width crop offsets.
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = batch_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { ...@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Adds a per-channel bias to an NHWC image tensor on the GPU:
// output[n, h, w, c] = input[n, h, w, c] + bias[c].
// The CL kernel is built lazily on first use and cached in kernel_;
// kernel arguments are re-set only when the input shape changes.
MaceStatus BiasAddKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *bias,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed 4-wide into image pixels, so the work grid
  // iterates over channel blocks rather than raw channels.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Global work size: x = channel blocks, y = width, z = batch*height.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build (and cache) the CL program on the first call only.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
    // The kernel is always compiled for float data; a single data-type
    // variant keeps the binary small.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed since the
  // previous invocation.
  // NOTE(review): output is assumed to be already sized by the caller —
  // no ResizeLike/ResizeImage happens here; confirm against the op layer.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    // Device supports non-uniform work-groups: enqueue with the exact
    // global size.
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Otherwise the global size must be a multiple of the local size:
    // round each dimension up (guarding against a zero local size).
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // If the caller wants async completion, hand back a wait function that
  // blocks on the CL event and optionally fills in profiling stats.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel { class BiasAddKernel : public OpenCLBiasAddKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel { ...@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Copies a tensor stored in an OpenCL buffer into an OpenCL 2-D image,
// choosing the layout-specific transform kernel from `type`. For
// WINOGRAD_FILTER the filter is additionally transformed for the given
// Winograd block size. Resizes `output` to the computed image shape.
MaceStatus BufferToImage::Compute(
    OpContext *context,
    const Tensor *input,
    const OpenCLBufferType type,
    const int wino_blk_size,
    Tensor *output) {
  // Normalize the buffer shape for this layout, then derive the 2-D
  // image extents the data will occupy.
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
  // One work item per output image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Pick the transform kernel matching the buffer layout.
  // NOTE(review): no default case — an OpenCLBufferType outside this list
  // leaves kernel_name empty; presumably callers only pass the listed
  // types. Confirm at the call sites.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:kernel_name = "filter_buffer_to_image";
      break;
    case DW_CONV2D_FILTER:kernel_name = "dw_filter_buffer_to_image";
      break;
    case IN_OUT_CHANNEL:kernel_name = "in_out_buffer_to_image";
      break;
    case ARGUMENT:kernel_name = "arg_buffer_to_image";
      break;
    case IN_OUT_HEIGHT:kernel_name = "in_out_height_buffer_to_image";
      break;
    case IN_OUT_WIDTH:kernel_name = "in_out_width_buffer_to_image";
      break;
    case WEIGHT_HEIGHT:kernel_name = "weight_height_buffer_to_image";
      break;
    case WEIGHT_WIDTH:kernel_name = "weight_width_buffer_to_image";
      break;
    case WINOGRAD_FILTER: {
      // The Winograd kernel writes (blk+2)^2 transformed values per
      // filter, so shrink the y work size accordingly and select the
      // kernel variant for this block size.
      std::stringstream ss_tmp;
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_buffer_to_image_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build (and cache) the CL program on the first call only.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (input->dtype() == output->dtype()) {
      // Same dtype on both sides: compile the kernel for that type.
      auto input_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    } else {
      // Mixed dtypes: fall back to float as the common representation.
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel(
        "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_buffer()));
    // The kernel addresses the buffer in elements, so the byte offset
    // must be element-aligned and is converted to an element offset.
    MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
               "buffer offset not aligned");
    kernel_.setArg(idx++,
                   static_cast<uint32_t>(input->buffer_offset() /
                       GetEnumTypeSize(input->dtype())));
    // Layout-specific geometry arguments.
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = input->dim(1) * input->dim(2) * input->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
    } else {
      // Remaining layouts use the normalized (formatted) buffer shape.
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Fixed 2-D local work size of 16 * (kwg_size / 16) work items.
  // NOTE(review): assumes kwg_size >= 16; a smaller max work-group size
  // would make lws[1] zero — confirm against supported devices.
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    // Exact global size is allowed on this device.
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Global size must be a multiple of the local size: round up.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Optional async completion: hand back a wait function that blocks on
  // the CL event and optionally fills in profiling stats.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel { class BufferToImage : public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel { ...@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus BufferToImage<T>::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = "arg_buffer_to_image";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
kernel_name = "in_out_width_buffer_to_image";
break;
case WEIGHT_HEIGHT:
kernel_name = "weight_height_buffer_to_image";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Shuffles the channel dimension of an NHWC image tensor across groups_
// groups on the GPU. Requires the channel count to be divisible by
// groups_; output is resized to match the input.
MaceStatus ChannelShuffleKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK(input->dim(3) % groups_ == 0,
             "input channels must be an integral multiple of group. ",
             input->dim(3));
  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
  const index_t batch = input->dim(0);
  const index_t height = input->dim(1);
  const index_t width = input->dim(2);
  const index_t channels = input->dim(3);
  const index_t channels_per_group = channels / groups_;
  // Channels are packed 4-wide into image pixels; the work grid iterates
  // over the channel blocks of a single group.
  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
  // Global work size: x = per-group channel blocks, y = width,
  // z = batch*height.
  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build (and cache) the CL program on the first call only.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    // The kernel is always compiled for float data; a single data-type
    // variant keeps the binary small.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("channel_shuffle", kernel_name,
                             built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Dispatch through the auto-tuner, keyed by the output shape so tuned
  // local sizes are reused for identical shapes.
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public: public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
...@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { ...@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context, ...@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) { if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); auto data_dt = input0->dtype();
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
} }
if (input0->dim(3) % 4 == 0) { if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
} }
...@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context, MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context, ...@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel)); built_options, kernel));
*kwg_size = *kwg_size =
...@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context, ...@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
} }
} // namespace concat } // namespace concat
// Concatenates the given tensors along `axis` on the GPU. Validates that
// all inputs share rank and agree on every dimension except `axis`,
// resizes the output image, then dispatches to the specialized two-input
// kernel or the generic N-input kernel.
// NOTE(review): input_list is assumed non-empty and axis in range — both
// are used unchecked here; confirm the op layer guarantees this.
MaceStatus ConcatKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    const int32_t axis,
    Tensor *output) {
  const int inputs_count = input_list.size();
  const Tensor *input0 = input_list[0];
  // Output shape = input0's shape with the axis dimension summed over
  // all inputs.
  std::vector<index_t> output_shape(input0->shape());
  for (int i = 1; i < inputs_count; ++i) {
    const Tensor *input = input_list[i];
    MACE_CHECK(input->dim_size() == input0->dim_size(),
               "Ranks of all input tensors must be same.");
    for (int j = 0; j < input->dim_size(); ++j) {
      if (j == axis) {
        continue;
      }
      MACE_CHECK(input->dim(j) == input0->dim(j),
                 "Dimensions of inputs should equal except axis.");
    }
    output_shape[axis] += input->dim(axis);
  }
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  // Two inputs get a dedicated, faster kernel; anything else goes
  // through the generic multi-input path.
  switch (inputs_count) {
    case 2:
      return concat::Concat2(
          context, &kernel_, input_list[0], input_list[1],
          &input_shape_, output, &kwg_size_);
    default:
      return concat::ConcatN(context,
                             &kernel_,
                             input_list,
                             output,
                             &kwg_size_);
  }
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context, ...@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context, ...@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context, MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
} // namespace concat } // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel { class ConcatKernel : public OpenCLConcatKernel {
public: public:
ConcatKernel() {} ConcatKernel() {}
...@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel { ...@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, &kwg_size_);
}
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
bool Conv2dKernel::CheckUseWinograd(
    OpenCLRuntime *runtime,
    const std::vector<mace::index_t> &filter_shape,
    const std::vector<mace::index_t> &output_shape,
    const int *strides,
    const int *dilations,
    int *wino_blk_size) {
  // Winograd is only applicable to 3x3, stride-1, dilation-1 convolutions.
  const bool winograd_compatible =
      filter_shape[2] == 3 && filter_shape[3] == 3 &&
      strides[0] <= 1 && strides[1] <= 1 &&
      dilations[0] <= 1 && dilations[1] <= 1;
  if (!winograd_compatible) {
    return false;
  }
  const index_t out_channels = filter_shape[0];
  const index_t in_channels = filter_shape[1];
  auto max_image_size = runtime->GetMaxImage2DSize();
  // A block size is usable only if the winograd-transformed input, filter
  // and output 2D images all fit within the device's image size limits.
  auto fits_opencl_image = [&](int blk) -> bool {
    const int sqr_block = (blk + 2) * (blk + 2);
    const uint64_t transformed_width = static_cast<uint64_t>(
        output_shape[0] *
        ((output_shape[1] + blk - 1) / blk) *
        ((output_shape[2] + blk - 1) / blk));
    if (transformed_width >= max_image_size[0]) {
      return false;
    }
    if (static_cast<uint64_t>(sqr_block * in_channels) >= max_image_size[1]) {
      return false;
    }
    return static_cast<uint64_t>(sqr_block * out_channels) <
        max_image_size[1];
  };
  // GPU only supports 4x4 and 2x2 winograd blocks: try the requested 4
  // first, falling back to 2 when 4 would exceed the image limits.
  if (*wino_blk_size == 4) {
    if (fits_opencl_image(4)) {
      return true;
    }
    *wino_blk_size = 2;
  }
  return fits_opencl_image(2);
}
MaceStatus Conv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const int wino_blk_size,
    Tensor *output) {
  const index_t kernel_h = filter->dim(2);
  const index_t kernel_w = filter->dim(3);
  // Unequal strides are unsupported, as are dilated kernels combined with
  // stride > 1 or a 1xN filter.
  if (strides[0] != strides[1] ||
      (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides[0] << "x" << strides[1]
                 << ",dilations " << dilations[0] << "x" << dilations[1]
                 << " is not implemented yet.";
    MACE_NOT_IMPLEMENTED;
  }

  // Derive the output geometry; explicit paddings take precedence over
  // the symbolic padding type.
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter->shape().data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter->shape().data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  }

  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

  // Dispatch to the most specialized implementation available:
  // winograd, then 1x1, then 3x3, then the generic conv kernel.
  if (wino_blk_size != 0) {
    cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
    uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
    return WinogradConv2dK3x3S1(context, kernels, input, filter, bias,
                                paddings.data(), activation, relux_max_limit,
                                leakyrelu_coefficient, wino_blk_size,
                                &input_shape_, output, kwg_size);
  }
  if (kernel_h == 1 && kernel_w == 1) {
    return Conv2dK1x1(context, &kernels_[0], input, filter, bias, strides[0],
                      paddings.data(), dilations, activation, relux_max_limit,
                      leakyrelu_coefficient, &input_shape_, output,
                      &kwg_size_[0]);
  }
  if (kernel_h == 3 && kernel_w == 3) {
    return Conv2dK3x3(context, &kernels_[0], input, filter, bias, strides[0],
                      paddings.data(), dilations, activation, relux_max_limit,
                      leakyrelu_coefficient, &input_shape_, output,
                      &kwg_size_[0]);
  }
  return Conv2d(context, &kernels_[0], input, filter, bias, strides[0],
                paddings.data(), dilations, activation, relux_max_limit,
                leakyrelu_coefficient, &input_shape_, output, &kwg_size_[0]);
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context, ...@@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
...@@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size, const int wino_blk_size,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size[3]); uint32_t *kwg_size[3]);
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel { class Conv2dKernel : public OpenCLConv2dKernel {
public: public:
bool CheckUseWinograd( bool CheckUseWinograd(
...@@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { ...@@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<mace::index_t> &filter_shape,
const std::vector<mace::index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_blk_size) {
if (filter_shape[2] != 3 || filter_shape[3] != 3 ||
strides[0] > 1 || strides[1] > 1 ||
dilations[0] > 1 || dilations[1] > 1) {
return false;
}
index_t out_channels = filter_shape[0];
index_t in_channels = filter_shape[1];
auto opencl_image_max_size = runtime->GetMaxImage2DSize();
auto check_opencl_limit = [&](int block_size) -> bool {
int sqr_block = (block_size + 2) * (block_size + 2);
uint64_t transformed_width = static_cast<uint64_t>(output_shape[0] *
((output_shape[1] + block_size - 1) / block_size) *
((output_shape[2] + block_size - 1) / block_size));
return (transformed_width < opencl_image_max_size[0] &&
static_cast<uint64_t>(sqr_block * in_channels)
< opencl_image_max_size[1] &&
static_cast<uint64_t>(sqr_block * out_channels)
< opencl_image_max_size[1]);
};
// GPU only supports 4x4 and 2x2 gpu winograd convolution
if (*wino_blk_size == 4) {
// if block size == 4 exceed OpenCL image size limitation, fallback to 2
if (!check_opencl_limit(4)) {
*wino_blk_size = 2;
} else {
return true;
}
}
return check_opencl_limit(2);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
std::function<MaceStatus()> conv_func;
if (wino_blk_size != 0) {
// use winograd covolution
conv_func = [&]() -> MaceStatus {
cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
return WinogradConv2dK3x3S1(context,
kernels,
input,
filter,
bias,
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
wino_blk_size,
&input_shape_,
output,
kwg_size);
};
} else if (kernel_h == 1 && kernel_w == 1) {
conv_func = [&]() -> MaceStatus {
return Conv2dK1x1(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
} else if (kernel_h == 3 && kernel_w == 3) {
conv_func = [&]() -> MaceStatus {
return Conv2dK3x3(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
} else {
conv_func = [&]() -> MaceStatus {
return Conv2d(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
&kwg_size_[0]);
};
}
return conv_func();
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -66,7 +66,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2dK1x1(OpContext *context, MaceStatus Conv2dK1x1(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -77,7 +77,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context, ...@@ -106,32 +105,39 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (bias != nullptr) { if (bias != nullptr) {
built_options.emplace("-DBIAS"); built_options.emplace("-DBIAS");
} }
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
built_options, kernel)); built_options, kernel));
......
...@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -59,7 +59,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2dK3x3(OpContext *context, MaceStatus Conv2dK3x3(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -70,7 +70,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context, ...@@ -93,30 +92,37 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
built_options, kernel)); built_options, kernel));
......
...@@ -67,7 +67,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -67,7 +67,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace } // namespace
extern MaceStatus Conv2d(OpContext *context, MaceStatus Conv2d(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -78,7 +78,6 @@ extern MaceStatus Conv2d(OpContext *context, ...@@ -78,7 +78,6 @@ extern MaceStatus Conv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -101,30 +100,37 @@ extern MaceStatus Conv2d(OpContext *context, ...@@ -101,30 +100,37 @@ extern MaceStatus Conv2d(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
built_options, kernel)); built_options, kernel));
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/crop.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus CropKernel::Compute(
    OpContext *context,
    const std::vector<const Tensor *> &input_list,
    Tensor *output) {
  // Crops input_list[0] down to the extents of input_list[1] along every
  // dimension whose configured offset_ is non-negative (a negative offset
  // means "keep the full dimension").
  const int32_t inputs_count = static_cast<int32_t>(input_list.size());
  MACE_CHECK(inputs_count >= 2)
      << "Crop opencl kernel only support 2 elements input";
  const Tensor *input0 = input_list[0];
  const Tensor *input1 = input_list[1];
  const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
  // Fix: the second rank must come from input1; the original read
  // input0->dim_size() twice, so input1's rank was never validated.
  const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
  MACE_CHECK(in0_dims == 4 && in1_dims == 4,
             "Crop op only supports 4-dims inputs now.");

  // Resolve the output shape and per-dimension crop offsets, checking that
  // the cropped window stays inside input0.
  std::vector<int32_t> offsets(4, 0);
  std::vector<index_t> output_shape(input0->shape());
  for (index_t i = 0; i < in0_dims; ++i) {
    if (offset_[i] >= 0) {
      output_shape[i] = input1->dim(i);
      offsets[i] = offset_[i];
      MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
          << "the crop for dimension " << i
          << " is out of bound, first input size "
          << input0->dim(i) << ", offset " << offsets[i]
          << ", second input size " << input1->dim(i);
    }
  }
  // The OpenCL image layout packs channels in blocks of 4, so the channel
  // offset must be 4-aligned.
  MACE_CHECK(offsets[3] % 4 == 0,
             "MACE opencl only supports cropping channel"
             " offset divisible by 4.");
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));

  const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
  const index_t channel_blk = RoundUpDiv4(output->dim(3));
  // Global work size: (channel blocks, output width, batch * output height).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1))
  };
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;

  // Build the kernel lazily on first use; the data type is taken from the
  // input tensor rather than a template parameter.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
    built_options.emplace("-Dcrop=" + kernel_name);
    auto dt = input0->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments are re-set only when the input shape changes.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(offsets[0]));
    kernel_.setArg(idx++, static_cast<int>(offsets[1]));
    kernel_.setArg(idx++, static_cast<int>(offsets[2]));
    kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
    kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class CropKernel : public OpenCLCropKernel { class CropKernel : public OpenCLCropKernel {
public: public:
explicit CropKernel( explicit CropKernel(
...@@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel { ...@@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus CropKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output) {
const int32_t inputs_count = static_cast<int32_t>(input_list.size());
MACE_CHECK(inputs_count >= 2)
<< "Crop opencl kernel only support 2 elements input";
const Tensor *input0 = input_list[0];
const Tensor *input1 = input_list[1];
const uint32_t in0_dims = static_cast<uint32_t >(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t >(input0->dim_size());
MACE_CHECK(in0_dims == 4 && in1_dims == 4,
"Crop op only supports 4-dims inputs now.");
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
for (index_t i = 0; i < in0_dims; ++i) {
if (offset_[i] >= 0) {
output_shape[i] = input1->dim(i);
offsets[i] = offset_[i];
MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i))
<< "the crop for dimension " << i
<< " is out of bound, first input size "
<< input0->dim(i) << ", offset " << offsets[i]
<< ", second input size " << input1->dim(i);
}
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE opencl only supports cropping channel"
" offset divisible by 4.");
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
const index_t channel_blk = RoundUpDiv4(output->dim(3));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
kernel_.setArg(idx++, static_cast<int>(offsets[2]));
kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/deconv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus Deconv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class Deconv2dKernel : public OpenCLDeconv2dKernel { class Deconv2dKernel : public OpenCLDeconv2dKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { ...@@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus Deconv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/depth_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the OpenCL depth_to_space kernel on image memory: rearranges
// [N, H, W, C] into [N, H*B, W*B, C/(B*B)] where B = block_size_.
// Resizes `output`, lazily builds and caches the cl::Kernel in kernel_,
// and re-binds kernel arguments only when the input shape changes.
MaceStatus DepthToSpaceKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
  // Channels must split evenly into block_size * block_size spatial positions.
  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
             "input depth should be dividable by block_size * block_size ",
             input_depth);
  const index_t output_height = input_height * block_size_;
  const index_t output_width = input_width * block_size_;
  const index_t output_depth = input_depth / (block_size_ * block_size_);
  // Only output channel counts that fill whole 4-wide texels, or the
  // special small cases 1/2/3, are supported by the kernels.
  MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
             "output channel not support:") << output_depth;
  std::vector<index_t> output_shape = {batch,
                                       output_height,
                                       output_width,
                                       output_depth};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  // Global work size: output depth 1/2 runs the d1_d2 kernel over the input
  // geometry; all other depths lay the work out over the output geometry.
  uint32_t gws[3];
  if (output_depth < 3) {
    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
    gws[1] = static_cast<uint32_t>(input_width);
    gws[2] = static_cast<uint32_t>(input_height * batch);
  } else {
    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
    gws[1] = static_cast<uint32_t>(output_width);
    gws[2] = static_cast<uint32_t>(output_height * batch);
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the program on first use only; kernel_ caches the result.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "depth_to_space";
    if (output_depth < 4) {
      built_options.emplace(MakeString("-DDEPTH", output_depth));
      // Depths 1 and 2 use the dedicated kernel; depth 3 keeps the generic
      // one (with -DDEPTH3 set above).
      if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
    }
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    // Data type macros follow the runtime dtype of the input tensor.
    auto dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the input shape changed; the setArg order
  // below must match the OpenCL kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
    kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  // Tuning key identifies this launch configuration for the auto-tuner cache.
  std::string tuning_key = Concat("depth_to_space",
                                  batch, output_height,
                                  output_width, output_depth);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
public: public:
explicit DepthToSpaceKernel(const int block_size) explicit DepthToSpaceKernel(const int block_size)
...@@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { ...@@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be dividable by block_size * block_size ",
input_depth);
const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
"output channel not support:") << output_depth;
std::vector<index_t> output_shape = {batch,
output_height,
output_width,
output_depth};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
uint32_t gws[3];
if (output_depth < 3) {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
} else {
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "depth_to_space";
if (output_depth < 4) {
built_options.emplace(MakeString("-DDEPTH", output_depth));
if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
}
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input_height));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(output_height));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
std::string tuning_key = Concat("depth_to_space",
batch, output_height,
output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size) { uint32_t *kwg_size) {
...@@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} else { } else {
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(MakeString("-DSTRIDE=", stride)); built_options.emplace(MakeString("-DSTRIDE=", stride));
switch (activation) { switch (activation) {
...@@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context,
} }
} // namespace depthwise } // namespace depthwise
// Prepares shapes/paddings for an OpenCL depthwise convolution and delegates
// the actual kernel build and launch to depthwise::DepthwiseConv2d.
// Resizes `output` to the computed NHWC shape before dispatch.
MaceStatus DepthwiseConv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  const index_t filter_h = filter->dim(2);
  const index_t filter_w = filter->dim(3);
  // Asymmetric strides are not supported by this GPU kernel.
  if (strides[0] != strides[1]) {
    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
                 << "filter" << filter_h << "x" << filter_w << ","
                 << " stride " << strides[0] << "x" << strides[1]
                 << " is not implemented yet, using slow version";
    MACE_NOT_IMPLEMENTED;
  }
  // Build an equivalent conv_2d filter shape (dim(0)*dim(1) output channels)
  // so the shared padding / output-size helpers can be reused as-is.
  const std::vector<index_t> fake_filter_shape{
      filter->dim(0) * filter->dim(1),
      filter->dim(1),
      filter->dim(2),
      filter->dim(3)};
  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  if (!padding_data.empty()) {
    // Explicit paddings given: only the output size needs computing.
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                   padding_data.data(), dilations, strides, RoundType::FLOOR,
                   output_shape.data());
  } else {
    // Derive both paddings and output size from the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), fake_filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  }
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  return depthwise::DepthwiseConv2d(
      context, &kernel_, input, filter, bias, strides[0], paddings.data(),
      dilations, activation, relux_max_limit, leakyrelu_coefficient,
      &input_shape_, output, &kwg_size_);
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context, ...@@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
uint32_t *kwg_size); uint32_t *kwg_size);
} // namespace depthwise } // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { ...@@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< " is not implemented yet, using slow version";
MACE_NOT_IMPLEMENTED;
}
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, leakyrelu_coefficient,
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/depthwise_deconv2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the OpenCL depthwise transposed convolution (deconv) kernel on image
// memory. Only true depthwise grouping is supported (group == channels,
// multiplier == 1). Resizes `output` to `output_shape`, lazily builds and
// caches the cl::Kernel, and re-binds arguments when the input shape changes.
MaceStatus DepthwiseDeconv2dKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *filter,
    const Tensor *bias,
    const int *strides,
    const int *padding_data,
    const int group,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    const std::vector<index_t> &output_shape,
    Tensor *output) {
  const index_t batch = output_shape[0];
  const index_t height = output_shape[1];
  const index_t width = output_shape[2];
  const index_t channels = output_shape[3];
  const index_t input_channels = input->dim(3);
  const index_t multiplier = filter->dim(0);
  MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
             "opencl image deconv only supports depthwise type group.");
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int stride_h = strides[0];
  const int stride_w = strides[1];
  MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
  // Each width work-item covers up to width_tile output columns; the block
  // count is rounded up per stride phase, hence the trailing * stride_w.
  const int width_tile = 5;
  const index_t n_strides = (width + stride_w - 1) / stride_w;
  const index_t width_blocks =
      ((n_strides + width_tile - 1) / width_tile) * stride_w;
  // Reciprocals precomputed so the kernel can replace division with multiply.
  const float stride_h_r = 1.f / static_cast<float>(stride_h);
  const float stride_w_r = 1.f / static_cast<float>(stride_w);
  // Per-side padding: half of padding_data rounded up — assumes padding_data
  // holds the total padding per dimension (TODO confirm with callers).
  const int padding_h = (padding_data[0] + 1) >> 1;
  const int padding_w = (padding_data[1] + 1) >> 1;
  const int align_h = stride_h - 1 - padding_h;
  const int align_w = stride_w - 1 - padding_w;
  const int kernel_size = filter->dim(2) * filter->dim(3);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the program on first use only; kernel_ caches the result.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
    built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
    // Kernel is compiled for float data only.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    // Fuse the requested activation into the kernel via a compile-time flag.
    switch (activation) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 3D work grid: channel blocks x width blocks x (rows * batch).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind arguments only when the input shape changed; the setArg order
  // below must match the OpenCL kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit);
    kernel_.setArg(idx++, leakyrelu_coefficient);
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(height));
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(channels));
    kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
    kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
    kernel_.setArg(idx++, stride_h_r);
    kernel_.setArg(idx++, stride_w_r);
    kernel_.setArg(idx++, static_cast<int32_t>(align_h));
    kernel_.setArg(idx++, static_cast<int32_t>(align_w));
    kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
    kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
    kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
    kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration for the auto-tuner cache.
  std::string tuning_key =
      Concat("depthwise_deconv2d_kernel_",
             activation,
             output->dim(0),
             output->dim(1),
             output->dim(2),
             output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { ...@@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
const index_t batch = output_shape[0];
const index_t height = output_shape[1];
const index_t width = output_shape[2];
const index_t channels = output_shape[3];
const index_t input_channels = input->dim(3);
const index_t multiplier = filter->dim(0);
MACE_CHECK(group == channels && group == input_channels && multiplier == 1,
"opencl image deconv only supports depthwise type group.");
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t channel_blocks = RoundUpDiv4(channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
const int width_tile = 5;
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + width_tile - 1) / width_tile) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d");
built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("depthwise_deconv2d_kernel_",
activation,
output->dim(0),
output->dim(1),
output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
}  // namespace image
}  // namespace opencl
}  // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/eltwise.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs the element-wise op on GPU via an OpenCL image kernel.
//
// input1 may be nullptr, in which case the stored scalar operand
// (scalar_input_) is used instead. Tensor operands must either have the
// same 4-D shape or satisfy one of the supported broadcast layouts
// (vector [C], batch-vector [N,1,1,C], or channel-broadcast [N,H,W,1]);
// anything else aborts via LOG(FATAL). The built kernel and its args are
// cached; args are re-set only when input0's shape changes.
MaceStatus EltwiseKernel::Compute(
    OpContext *context,
    const Tensor *input0,
    const Tensor *input1,
    Tensor *output) {
  // True when operands were reordered; the kernel uses this to apply
  // non-commutative ops (e.g. SUB/DIV) in the caller's original order.
  bool swapped = false;
  // Compile-time define selecting the second operand's layout in the kernel.
  std::string input1_type = "";
  if (input1 == nullptr) {
    input1_type = "INPUT_SCALAR";
  } else {
    MACE_CHECK((input0->dim_size() == input1->dim_size()
                    && input0->dim_size() == 4) ||
                   input0->dim_size() == 1 || input1->dim_size() == 1)
        << "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
    MACE_CHECK(type_ != EltwiseType::EQUAL)
        << "Eltwise op on GPU does not support EQUAL";
    // broadcast
    if (input0->size() != input1->size() ||
        input0->dim_size() != input1->dim_size()) {
      // Keep the larger operand in input0 so the kernel always broadcasts
      // input1 onto input0.
      if (input0->size() < input1->size()
          || input0->dim_size() < input1->dim_size()) {
        std::swap(input0, input1);
        swapped = true;
      }
      if (input1->dim_size() == 1
          || (input1->dim(0) == 1 && input1->dim(1) == 1
              && input1->dim(2) == 1)) {
        // Tensor-Vector element wise: input1 is effectively a length-C vector
        // that must match input0's channel dimension.
        if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
          input1_type = "INPUT_VECTOR";
        } else {
          LOG(FATAL) << "Inputs not match the broadcast logic, "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      } else {  // must be 4-D
        if (input0->dim(0) == input1->dim(0)
            && input1->dim(1) == 1
            && input1->dim(2) == 1
            && input0->dim(3) == input1->dim(3)) {
          // Per-batch vector: [N,1,1,C] broadcast over H and W.
          input1_type = "INPUT_BATCH_VECTOR";
        } else if (input0->dim(0) == input1->dim(0)
            && input0->dim(1) == input1->dim(1)
            && input0->dim(2) == input1->dim(2)
            && input1->dim(3) == 1) {
          // broadcast on channel dimension
          input1_type = "INPUT_TENSOR_BC_CHAN";
        } else {
          LOG(FATAL) << "Element-Wise op only support broadcast on"
                        " channel dimension:"
                        "Tensor-BatchVector(4D-[N,1,1,C]) "
                        "and Tensor-Tensor(4D-[N,H,W,1]). but got "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      }
    }
  }
  // If the scalar was logically the first operand, flip the swap flag so the
  // kernel still evaluates operands in the caller's order.
  if (scalar_input_index_ == 0) {
    swapped = !swapped;
  }
  // Output takes the (possibly swapped) larger operand's shape.
  std::vector<index_t> output_shape(4);
  output_shape[0] = input0->dim(0);
  output_shape[1] = input0->dim(1);
  output_shape[2] = input0->dim(2);
  output_shape[3] = input0->dim(3);
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t batch_height_pixels = batch * height;
  // Global work size: one work-item per (channel-block, width, batch*height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(batch_height_pixels)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
    built_options.emplace("-Deltwise=" + kernel_name);
    // Compute in float regardless of storage type (post-refactor behavior).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
    if (!input1_type.empty()) {
      built_options.emplace("-D" + input1_type);
    }
    if (swapped) built_options.emplace("-DSWAPPED");
    if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; the arg
  // order below must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    if (input1 == nullptr) {
      kernel_.setArg(idx++, scalar_input_);
    } else {
      kernel_.setArg(idx++, *(input1->opencl_image()));
    }
    kernel_.setArg(idx++, static_cast<int32_t>(height));
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(channels));
    if (!coeff_.empty()) {
      // Weighted-sum coefficients (only when COEFF_SUM was compiled in).
      kernel_.setArg(idx++, coeff_[0]);
      kernel_.setArg(idx++, coeff_[1]);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/eltwise.h" #include "mace/ops/common/eltwise_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -32,7 +32,6 @@ namespace ops { ...@@ -32,7 +32,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class EltwiseKernel : public OpenCLEltwiseKernel { class EltwiseKernel : public OpenCLEltwiseKernel {
public: public:
explicit EltwiseKernel( explicit EltwiseKernel(
...@@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel { ...@@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Pre-refactor templated variant of EltwiseKernel::Compute (removed in this
// commit in favor of the non-template float version above). Behavior is
// identical except the OpenCL data type is derived from the template
// parameter T instead of being fixed to float.
template <typename T>
MaceStatus EltwiseKernel<T>::Compute(
    OpContext *context,
    const Tensor *input0,
    const Tensor *input1,
    Tensor *output) {
  // True when operands were reordered; lets the kernel apply
  // non-commutative ops in the caller's original order.
  bool swapped = false;
  // Compile-time define selecting the second operand's layout in the kernel.
  std::string input1_type = "";
  if (input1 == nullptr) {
    input1_type = "INPUT_SCALAR";
  } else {
    MACE_CHECK((input0->dim_size() == input1->dim_size()
                    && input0->dim_size() == 4) ||
                   input0->dim_size() == 1 || input1->dim_size() == 1)
        << "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
    MACE_CHECK(type_ != EltwiseType::EQUAL)
        << "Eltwise op on GPU does not support EQUAL";
    // broadcast
    if (input0->size() != input1->size() ||
        input0->dim_size() != input1->dim_size()) {
      // Keep the larger operand in input0 so input1 is the one broadcast.
      if (input0->size() < input1->size()
          || input0->dim_size() < input1->dim_size()) {
        std::swap(input0, input1);
        swapped = true;
      }
      if (input1->dim_size() == 1
          || (input1->dim(0) == 1 && input1->dim(1) == 1
              && input1->dim(2) == 1)) {
        // Tensor-Vector element wise: input1 must match input0's channels.
        if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
          input1_type = "INPUT_VECTOR";
        } else {
          LOG(FATAL) << "Inputs not match the broadcast logic, "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      } else {  // must be 4-D
        if (input0->dim(0) == input1->dim(0)
            && input1->dim(1) == 1
            && input1->dim(2) == 1
            && input0->dim(3) == input1->dim(3)) {
          // Per-batch vector: [N,1,1,C] broadcast over H and W.
          input1_type = "INPUT_BATCH_VECTOR";
        } else if (input0->dim(0) == input1->dim(0)
            && input0->dim(1) == input1->dim(1)
            && input0->dim(2) == input1->dim(2)
            && input1->dim(3) == 1) {
          // broadcast on channel dimension
          input1_type = "INPUT_TENSOR_BC_CHAN";
        } else {
          LOG(FATAL) << "Element-Wise op only support broadcast on"
                        " channel dimension:"
                        "Tensor-BatchVector(4D-[N,1,1,C]) "
                        "and Tensor-Tensor(4D-[N,H,W,1]). but got "
                     << MakeString(input0->shape()) << " vs "
                     << MakeString(input1->shape());
        }
      }
    }
  }
  // If the scalar was logically the first operand, flip the swap flag.
  if (scalar_input_index_ == 0) {
    swapped = !swapped;
  }
  // Output takes the (possibly swapped) larger operand's shape.
  std::vector<index_t> output_shape(4);
  output_shape[0] = input0->dim(0);
  output_shape[1] = input0->dim(1);
  output_shape[2] = input0->dim(2);
  output_shape[3] = input0->dim(3);
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t batch_height_pixels = batch * height;
  // Global work size: (channel-blocks, width, batch*height).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(batch_height_pixels)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    // Data type follows the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
    built_options.emplace("-Deltwise=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
    if (!input1_type.empty()) {
      built_options.emplace("-D" + input1_type);
    }
    if (swapped) built_options.emplace("-DSWAPPED");
    if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; arg order
  // must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    if (input1 == nullptr) {
      kernel_.setArg(idx++, scalar_input_);
    } else {
      kernel_.setArg(idx++, *(input1->opencl_image()));
    }
    kernel_.setArg(idx++, static_cast<int32_t>(height));
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(channels));
    if (!coeff_.empty()) {
      // Weighted-sum coefficients (only when COEFF_SUM was compiled in).
      kernel_.setArg(idx++, coeff_[0]);
      kernel_.setArg(idx++, coeff_[1]);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/fully_connected.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs a fully-connected (inner product) layer on GPU with optional bias
// and fused activation. Output shape is [N, 1, 1, out_channels] where
// out_channels = weight->dim(0). The kernel and launch geometry (gws_/lws_)
// are computed once on first use; gws_[2] is refreshed when the input
// shape changes. Launches the kernel directly (no autotuning).
MaceStatus FullyConnectedKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build the kernel and pick the work-group geometry once.
  if (kernel_.get() == nullptr) {
    const index_t batch = output->dim(0);
    const index_t output_size = output->dim(3);
    // Output channels are processed in blocks of 4 (vectorized).
    const index_t output_blocks = RoundUpDiv4(output_size);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
    built_options.emplace("-Dfully_connected_width=" + kernel_name);
    // Compute in float regardless of storage type (post-refactor behavior).
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
    // Fuse the activation into the kernel via a compile-time define.
    switch (activation) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
                                              built_options, &kernel_));
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
      // NOTE(review): this emplace happens after BuildKernel, so the define
      // never reaches the compiled kernel — looks like dead code; confirm.
      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
      // On Adreno, size the Y dimension from the hardware wave size.
      const uint32_t wave_size =
          static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
      gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    } else {
      gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    }
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; arg order
  // must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    const index_t batch = output->dim(0);
    const index_t output_blocks = RoundUpDiv4(output->dim(3));
    gws_[2] = static_cast<uint32_t>(batch * output_blocks);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws_);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    // Local-memory scratch buffer for the intra-work-group reduction
    // (size arg with nullptr value => __local allocation).
    kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
    kernel_.setArg(idx++, static_cast<int>(output_blocks));
    kernel_.setArg(idx++, relux_max_limit);
    kernel_.setArg(idx++, leakyrelu_coefficient);
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws_.size());
    for (size_t i = 0; i < lws_.size(); ++i) {
      roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  }
  MACE_OUT_OF_RANGE_VALIDATION;
  MACE_CL_RET_STATUS(error);
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -30,7 +31,6 @@ namespace ops { ...@@ -30,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class FullyConnectedKernel : public OpenCLFullyConnectedKernel { class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel { ...@@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Pre-refactor templated variant of FullyConnectedKernel::Compute (removed
// in this commit in favor of the non-template float version above). Behavior
// is identical except the OpenCL data type follows the template parameter T.
template <typename T>
MaceStatus FullyConnectedKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
    const ActivationType activation,
    const float relux_max_limit,
    const float leakyrelu_coefficient,
    Tensor *output) {
  // Output is [N, 1, 1, out_channels] with out_channels = weight->dim(0).
  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build the kernel and pick the work-group geometry once.
  if (kernel_.get() == nullptr) {
    const index_t batch = output->dim(0);
    const index_t output_size = output->dim(3);
    // Output channels are processed in blocks of 4 (vectorized).
    const index_t output_blocks = RoundUpDiv4(output_size);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    // Data type follows the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
    built_options.emplace("-Dfully_connected_width=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
    // Fuse the activation into the kernel via a compile-time define.
    switch (activation) {
      case NOOP:
        break;
      case RELU:
        built_options.emplace("-DUSE_RELU");
        break;
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
      case SIGMOID:
        built_options.emplace("-DUSE_SIGMOID");
        break;
      case LEAKYRELU:
        built_options.emplace("-DUSE_LEAKYRELU");
        break;
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
                                              built_options, &kernel_));
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
      // NOTE(review): this emplace happens after BuildKernel, so the define
      // never reaches the compiled kernel — looks like dead code; confirm.
      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
      // On Adreno, size the Y dimension from the hardware wave size.
      const uint32_t wave_size =
          static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
      gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    } else {
      gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
      const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
      lws_ = {gws_[0], gws_[1], inter_local_blks};
    }
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; arg order
  // must match the kernel's parameter list exactly.
  if (!IsVecEqual(input_shape_, input->shape())) {
    const index_t batch = output->dim(0);
    const index_t output_blocks = RoundUpDiv4(output->dim(3));
    gws_[2] = static_cast<uint32_t>(batch * output_blocks);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws_);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    // Local-memory scratch buffer for the intra-work-group reduction
    // (size arg with nullptr value => __local allocation).
    kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
    kernel_.setArg(idx++, static_cast<int>(output_blocks));
    kernel_.setArg(idx++, relux_max_limit);
    kernel_.setArg(idx++, leakyrelu_coefficient);
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws_.size());
    for (size_t i = 0; i < lws_.size(); ++i) {
      roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
  }
  MACE_OUT_OF_RANGE_VALIDATION;
  MACE_CL_RET_STATUS(error);
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Copies a tensor stored as an OpenCL image back into an OpenCL buffer,
// choosing the kernel variant by buffer type (filter, in/out, argument,
// Winograd, ...). wino_blk_size is only used for WINOGRAD_FILTER images.
// The built kernel is cached; args are re-set when the input shape changes.
MaceStatus ImageToBuffer::Compute(OpContext *context,
                                  const Tensor *input,
                                  const OpenCLBufferType type,
                                  const int wino_blk_size,
                                  Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
  // 2-D global work size: one work-item per image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Select the kernel variant matching the buffer layout.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:kernel_name = "filter_image_to_buffer";
      break;
    case IN_OUT_CHANNEL:kernel_name = "in_out_image_to_buffer";
      break;
    case ARGUMENT:kernel_name = "arg_image_to_buffer";
      break;
    case IN_OUT_HEIGHT:kernel_name = "in_out_height_image_to_buffer";
      break;
    case WINOGRAD_FILTER: {
      // Winograd filter images pack (blk+2)^2 tiles along the height axis.
      std::stringstream ss_tmp;
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_image_to_buffer_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
    case WEIGHT_HEIGHT:kernel_name = "weight_height_image_to_buffer";
      break;
    case WEIGHT_WIDTH:kernel_name = "weight_width_image_to_buffer";
      break;
    case DW_CONV2D_FILTER:
    case IN_OUT_WIDTH:LOG(FATAL)
          << "IN_OUT_WIDTH only support buffer to image now";
      break;
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (output->dtype() == input->dtype()) {
      // Same dtype on both sides: copy in the tensors' native type.
      auto data_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
    } else {
      // Mixed dtypes: go through float for the conversion.
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; each layout
  // variant takes a different set of dimension args.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = output->dim(1) * output->dim(2) * output->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
    } else if (type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    input_shape_ = input->shape();
  }
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  // Fixed 16-wide local size; second dim fills the rest of the work-group.
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -28,7 +28,6 @@ namespace ops { ...@@ -28,7 +28,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ImageToBuffer : public OpenCLBufferTransformKernel { class ImageToBuffer : public OpenCLBufferTransformKernel {
public: public:
MaceStatus Compute(OpContext *context, MaceStatus Compute(OpContext *context,
...@@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { ...@@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Pre-refactor templated variant of ImageToBuffer::Compute (removed in this
// commit in favor of the non-template version above). Behavior is identical
// except the OpenCL data type is derived from the template parameter T
// instead of the tensors' runtime dtypes.
template <typename T>
MaceStatus ImageToBuffer<T>::Compute(OpContext *context,
                                     const Tensor *input,
                                     const OpenCLBufferType type,
                                     const int wino_blk_size,
                                     Tensor *output) {
  auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
                              type,
                              &image_shape,
                              wino_blk_size);
  MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
  // 2-D global work size: one work-item per image pixel.
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  // Select the kernel variant matching the buffer layout.
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:
      kernel_name = "filter_image_to_buffer";
      break;
    case IN_OUT_CHANNEL:
      kernel_name = "in_out_image_to_buffer";
      break;
    case ARGUMENT:
      kernel_name = "arg_image_to_buffer";
      break;
    case IN_OUT_HEIGHT:
      kernel_name = "in_out_height_image_to_buffer";
      break;
    case WINOGRAD_FILTER: {
      // Winograd filter images pack (blk+2)^2 tiles along the height axis.
      std::stringstream ss_tmp;
      gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
      ss_tmp << "winograd_filter_image_to_buffer_"
             << wino_blk_size << "x" << wino_blk_size;
      kernel_name = ss_tmp.str();
      break;
    }
    case WEIGHT_HEIGHT:
      kernel_name = "weight_height_image_to_buffer";
      break;
    case WEIGHT_WIDTH:
      kernel_name = "weight_width_image_to_buffer";
      break;
    case DW_CONV2D_FILTER:
    case IN_OUT_WIDTH:
      LOG(FATAL) << "IN_OUT_WIDTH only support buffer to image now";
      break;
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Lazily build and cache the kernel on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    if (output->dtype() == input->dtype()) {
      // Same dtype on both sides: copy in T's native OpenCL type.
      built_options.emplace(
          "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
      built_options.emplace("-DCMD_DATA_TYPE=" +
                            DtToCLCMDDt(DataTypeToEnum<T>::value));
    } else {
      // Mixed dtypes: use the up-compatible (wider) OpenCL type for T.
      built_options.emplace("-DDATA_TYPE=" +
                            DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
      built_options.emplace("-DCMD_DATA_TYPE=" +
                            DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Re-bind kernel arguments only when the input shape changed; each layout
  // variant takes a different set of dimension args.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(output->opencl_buffer()));
    if (type == CONV2D_FILTER) {
      const index_t
          inner_size = output->dim(1) * output->dim(2) * output->dim(3);
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
      kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
    } else if (type == ARGUMENT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
    } else if (type == WEIGHT_HEIGHT) {
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
      kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
    } else {
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[1]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[2]));
      kernel_.setArg(idx++,
                     static_cast<uint32_t>(formatted_buffer_shape[3]));
    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    input_shape_ = input->shape();
  }
  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  // Fixed 16-wide local size; second dim fills the rest of the work-group.
  const std::vector<uint32_t> lws = {16, kwg_size / 16};
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Round global sizes up to multiples of the local sizes, as required
    // when non-uniform work-groups are unsupported.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose an async wait hook so the caller can block and collect timing.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/lstm_cell.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs one LSTM cell step on the GPU using the "lstmcell" OpenCL image
// kernel: reads `input`, the previous hidden state `pre_output`, `weight`,
// `bias` and the previous cell state `pre_cell`, and writes the updated
// cell state to `cell` and the new hidden state to `output`.
// Returns MACE_SUCCESS on success, or the error from kernel build/launch.
MaceStatus LSTMCellKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *pre_output,
    const Tensor *weight,
    const Tensor *bias,
    const Tensor *pre_cell,
    Tensor *cell,
    Tensor *output) {
  // The kernel vectorizes over the hidden dimension in groups of 4
  // (see w_blocks below), so hidden_units must be a multiple of 4.
  MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
             "LSTM hidden units should be a multiple of 4");
  const index_t height = input->dim(0);
  const index_t width = input->dim(1);
  const index_t hidden_units = pre_output->dim(1);
  const index_t w_blocks = hidden_units >> 2;  // 4-wide blocks of hidden units
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
    built_options.emplace("-Dlstmcell=" + kernel_name);
    // Device-side computation is done in float regardless of tensor dtype.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 2-D global work size: one work-item per (hidden 4-block, batch row).
  const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
                           static_cast<uint32_t>(height)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Outputs are resized and kernel args rebound only when the input shape
  // changes; otherwise the previously bound args are reused.
  if (!IsVecEqual(input_shape_, input->shape())) {
    std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape_padded,
                                OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
                                             output_image_shape));
    MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
                                           output_image_shape));
    uint32_t idx = 0;
    // NOTE: argument order must match the lstmcell kernel signature exactly;
    // the out-of-range/GWS macros consume leading arg slots via idx.
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_2D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(pre_output->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(pre_cell->opencl_image()));
    kernel_.setArg(idx++, forget_bias_);
    kernel_.setArg(idx++, static_cast<int32_t>(width));
    kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
    kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
    kernel_.setArg(idx++, *(cell->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
  std::string tuning_key =
      Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,11 +30,10 @@ namespace ops { ...@@ -30,11 +30,10 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class LSTMCellKernel : public OpenCLLSTMCellKernel { class LSTMCellKernel : public OpenCLLSTMCellKernel {
public: public:
explicit LSTMCellKernel( explicit LSTMCellKernel(
const T forget_bias) const float forget_bias)
: forget_bias_(forget_bias) {} : forget_bias_(forget_bias) {}
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { ...@@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel {
Tensor *output) override; Tensor *output) override;
private: private:
T forget_bias_; float forget_bias_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus LSTMCellKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output) {
MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
"LSTM hidden units should be a multiple of 4");
const index_t height = input->dim(0);
const index_t width = input->dim(1);
const index_t hidden_units = pre_output->dim(1);
const index_t w_blocks = hidden_units >> 2;
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
built_options.emplace("-Dlstmcell=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
static_cast<uint32_t>(height)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape_padded,
OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
output_image_shape));
MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(pre_output->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(pre_cell->opencl_image()));
kernel_.setArg(idx++, static_cast<float>(forget_bias_));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
kernel_.setArg(idx++, *(cell->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/matmul.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Computes C = A * B on the GPU with the "matmul" OpenCL image kernel.
// A and B share the same rank; the last two dims of A are (height, K) and
// the last dim of B is width. All leading dims are folded into one batch.
// Transposed inputs are rejected. Returns MACE_SUCCESS or a build/launch
// error.
MaceStatus MatMulKernel::Compute(
    OpContext *context,
    const Tensor *A,
    const Tensor *B,
    Tensor *C,
    bool transpose_a,
    bool transpose_b) {
  MACE_CHECK(!transpose_a && !transpose_b,
             "GPU does not support transpose matmul");
  index_t rank = A->dim_size();
  index_t height = A->dim(rank - 2);
  index_t K = A->dim(rank - 1);
  index_t width = B->dim(rank - 1);
  // Fold every dimension before the trailing (height, K) pair into batch.
  index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
                                  std::multiplies<index_t>());
  // C keeps A's leading dims, with the last two replaced by (height, width).
  std::vector<index_t> c_shape = A->shape();
  c_shape[rank - 2] = height;
  c_shape[rank - 1] = width;
  std::vector<size_t> c_image_shape;
  std::vector<index_t> padded_c_shape = {batch, height, width, 1};
  OpenCLUtil::CalImage2DShape(padded_c_shape,
                              OpenCLBufferType::IN_OUT_HEIGHT,
                              &c_image_shape);
  MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
  // The kernel tiles the output in 4x4 blocks.
  const index_t height_blocks = RoundUpDiv4(height);
  const index_t width_blocks = RoundUpDiv4(width);
  const uint32_t gws[2] = {
      static_cast<uint32_t>(width_blocks),
      static_cast<uint32_t>(height_blocks * batch),
  };
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
    built_options.emplace("-Dmatmul=" + kernel_name);
    // Device-side computation is done in float regardless of tensor dtype.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Args are rebound on every call since the output tensor (and shapes)
  // may differ between runs. Order must match the matmul kernel signature.
  uint32_t idx = 0;
  MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
  MACE_SET_2D_GWS_ARGS(kernel_, gws);
  kernel_.setArg(idx++, *(A->opencl_image()));
  kernel_.setArg(idx++, *(B->opencl_image()));
  kernel_.setArg(idx++, *(C->opencl_image()));
  kernel_.setArg(idx++, static_cast<int>(height));
  kernel_.setArg(idx++, static_cast<int>(width));
  kernel_.setArg(idx++, static_cast<int>(K));
  kernel_.setArg(idx++, static_cast<int>(height_blocks));
  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
  std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class MatMulKernel : public OpenCLMatMulKernel { class MatMulKernel : public OpenCLMatMulKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel { ...@@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel {
uint32_t kwg_size_; uint32_t kwg_size_;
}; };
template <typename T>
MaceStatus MatMulKernel<T>::Compute(
OpContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b) {
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
std::vector<index_t> padded_c_shape = {batch, height, width, 1};
OpenCLUtil::CalImage2DShape(padded_c_shape,
OpenCLBufferType::IN_OUT_HEIGHT,
&c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height_blocks * batch),
};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/pad.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Pads an NHWC image tensor on the GPU with the "pad" OpenCL kernel.
// paddings_ holds (before, after) pairs per dimension, i.e. indices
// [0,1]=batch, [2,3]=height, [4,5]=width, [6,7]=channel; only height and
// width padding are supported. Supports CONSTANT, REFLECT and SYMMETRIC
// pad types. Returns MACE_SUCCESS or a build/launch error.
MaceStatus PadKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  // Two padding values (before/after) per input dimension.
  MACE_CHECK(this->paddings_.size() ==
      static_cast<size_t>((input->dim_size() * 2)));
  MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
      (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
    << "Mace only support height/width dimension now";
  for (int i = 2; i <= 5; ++i) {
    MACE_CHECK(paddings_[i] >= 0);
  }
  auto input_shape = input->shape();
  if (type_ == PadType::REFLECT) {
    // REFLECT mirrors without repeating the edge, so each pad amount must
    // be strictly smaller than the corresponding input extent.
    MACE_CHECK(paddings_[2] < input_shape[1] &&
        paddings_[3] < input_shape[1] &&
        paddings_[4] < input_shape[2] &&
        paddings_[5] < input_shape[2]);
  } else if (type_ == PadType::SYMMETRIC) {
    // SYMMETRIC repeats the edge, so pad amounts up to the extent are fine.
    MACE_CHECK(paddings_[2] <= input_shape[1] &&
        paddings_[3] <= input_shape[1] &&
        paddings_[4] <= input_shape[2] &&
        paddings_[5] <= input_shape[2]);
  } else {
    MACE_CHECK(type_ == PadType::CONSTANT);
  }
  // Output shape = input shape grown by the before/after pad of each dim.
  std::vector<index_t> output_shape = {
      input_shape[0] + this->paddings_[0] + this->paddings_[1],
      input_shape[1] + this->paddings_[2] + this->paddings_[3],
      input_shape[2] + this->paddings_[4] + this->paddings_[5],
      input_shape[3] + this->paddings_[6] + this->paddings_[7]};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);
  const index_t channel_blocks = RoundUpDiv4(channels);
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
    built_options.emplace("-Dpad=" + kernel_name);
    // Kernel data type follows the input tensor's dtype.
    auto dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    // Pad type is compiled into the kernel, so a type change would need a
    // rebuild (kernel_ is keyed to the type chosen at first call).
    built_options.emplace(MakeString("-DPAD_TYPE=", type_));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 3-D global work size: (channel 4-blocks, width, height * batch).
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel args are rebound only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    // The constant fill value is an extra argument only for CONSTANT pads
    // (the kernel signature differs per compiled PAD_TYPE).
    if (type_ == PadType::CONSTANT) {
      kernel_.setArg(idx++, this->constant_value_);
    }
    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
    kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
    kernel_.setArg(idx++, this->paddings_[2]);
    kernel_.setArg(idx++, this->paddings_[4]);
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                  output->dim(2), output->dim(3));
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/pad.h" #include "mace/ops/common/pad_type.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
namespace mace { namespace mace {
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class PadKernel : public OpenCLPadKernel { class PadKernel : public OpenCLPadKernel {
public: public:
PadKernel(const PadType type, PadKernel(const PadType type,
...@@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel { ...@@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus PadKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
for (int i = 2; i <= 5; ++i) {
MACE_CHECK(paddings_[i] >= 0);
}
auto input_shape = input->shape();
if (type_ == PadType::REFLECT) {
MACE_CHECK(paddings_[2] < input_shape[1] &&
paddings_[3] < input_shape[1] &&
paddings_[4] < input_shape[2] &&
paddings_[5] < input_shape[2]);
} else if (type_ == PadType::SYMMETRIC) {
MACE_CHECK(paddings_[2] <= input_shape[1] &&
paddings_[3] <= input_shape[1] &&
paddings_[4] <= input_shape[2] &&
paddings_[5] <= input_shape[2]);
} else {
MACE_CHECK(type_ == PadType::CONSTANT);
}
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace(MakeString("-DPAD_TYPE=", type_));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
if (type_ == PadType::CONSTANT) {
kernel_.setArg(idx++, this->constant_value_);
}
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
kernel_.setArg(idx++, this->paddings_[2]);
kernel_.setArg(idx++, this->paddings_[4]);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Runs 2-D max/average pooling on an NHWC image tensor via the "pooling"
// OpenCL kernel. Output spatial size and implicit padding are derived from
// `padding_type` when `padding_data` is empty, otherwise from the explicit
// `padding_data`. Dilation is not supported. Returns MACE_SUCCESS or a
// build/launch error.
MaceStatus PoolingKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const PoolingType pooling_type,
    const int *kernels,           // {kernel_h, kernel_w}
    const int *strides,           // {stride_h, stride_w}
    const Padding &padding_type,
    const std::vector<int> &padding_data,
    const int *dilations,         // must be {1, 1}
    const RoundType round_type,
    Tensor *output) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
    << "Pooling opencl kernel not support dilation yet";
  std::vector<index_t> output_shape(4);
  // Pooling keeps the channel count, so a pseudo filter shape of
  // {channels, channels, kh, kw} is used for the shared size calculators.
  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
                                       kernels[0], kernels[1]};
  std::vector<int> paddings(2);
  if (padding_data.empty()) {
    // Derive output size and total padding from the padding policy.
    ops::CalcNHWCPaddingAndOutputSize(
        input->shape().data(), filter_shape.data(), dilations, strides,
        padding_type, output_shape.data(), paddings.data());
  } else {
    // Explicit padding provided by the model.
    paddings = padding_data;
    CalcOutputSize(input->shape().data(), filter_shape.data(),
                   padding_data.data(), dilations, strides, round_type,
                   output_shape.data());
  }
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL kernel once and cache it in kernel_ for later calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
    if (pooling_type == MAX && input->dtype() == output->dtype()) {
      // Max pooling with matching dtypes can run in the tensors' own type.
      auto data_dt = input->dtype();
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
    } else {
      // Otherwise fall back to float (presumably to keep accumulation
      // precision for AVG pooling — confirm against the .cl kernel).
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    }
    if (pooling_type == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // 3-D global work size: (channel 4-blocks, out width, out height * batch).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
      static_cast<uint32_t>(output->dim(2)),
      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel args are rebound only when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
    // paddings holds the total pad; the kernel takes the leading (top/left)
    // half.
    kernel_.setArg(idx++, paddings[0] / 2);
    kernel_.setArg(idx++, paddings[1] / 2);
    kernel_.setArg(idx++, strides[0]);
    kernel_.setArg(idx++, strides[1]);
    kernel_.setArg(idx++, kernels[0]);
    kernel_.setArg(idx++, kernels[1]);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  // Launch through the auto-tuner, which picks/records the best local size.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -57,7 +57,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -57,7 +57,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace pooling } // namespace pooling
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel { class PoolingKernel : public OpenCLPoolingKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel { ...@@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides[0]);
kernel_.setArg(idx++, strides[1]);
kernel_.setArg(idx++, kernels[0]);
kernel_.setArg(idx++, kernels[1]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/reduce.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Reduces the spatial (H, W) dimensions of an NHWC image tensor on the GPU,
// producing a [batch, 1, 1, channels] output image. The reduction operator
// is baked into the kernel at build time via the -DREDUCE_TYPE compile
// option taken from reduce_type_.
MaceStatus ReduceKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input);
  index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  // Output keeps batch and channels but collapses H and W to 1.
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
    built_options.emplace("-Dreduce=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Choose the work-group shape: on Adreno, size it from the kernel's wave
  // size; otherwise use a fixed 4-wide group with up to 16 rows so each
  // work-item still has several input elements to accumulate.
  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
    const uint32_t wave_size =
        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
  } else {
    // Ensure each kernel has at least 4 input elements.
    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
    if (gws[1] == 0) {
      gws[1] = 1;
    } else if (gws[1] > 16) {
      gws[1] = 16;
    }
  }
  lws = {gws[0], gws[1], 1};
  // group_num work-items cooperate on one (batch, channel-block) slice.
  const int group_num = lws[0] * lws[1] * lws[2];
  // Each kernel intends to compute compute_size elements.
  const int compute_size = (image_size + group_num - 1) / group_num;
  const int last_index = image_size % group_num;
  // 1 / (H * W) — presumably the normalizer for the mean reduce type; it is
  // passed unconditionally and the kernel decides whether to use it.
  const float scale = 1.f / (in_width * in_height);
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments only need to be re-set when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Local scratch buffer: four floats per work-item in the group.
    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, scale);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support, the global size must be a
    // multiple of the local size in every dimension.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose an async wait hook so callers can block on, and profile, the
  // enqueued kernel.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -24,20 +24,18 @@ ...@@ -24,20 +24,18 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/reduce.h" #include "mace/ops/common/reduce_type.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class ReduceKernel : public OpenCLReduceKernel { class ReduceKernel : public OpenCLReduceKernel {
public: public:
ReduceKernel(ReduceType type, ReduceKernel(ReduceType type,
const std::vector<int> &axis, const std::vector<int> &axis)
const bool keep_dims) : reduce_type_(type), axis_(axis) {}
: reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {}
MaceStatus Compute( MaceStatus Compute(
OpContext *context, OpContext *context,
...@@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel { ...@@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel {
private: private:
ReduceType reduce_type_; ReduceType reduce_type_;
const std::vector<int> axis_; const std::vector<int> axis_;
bool keep_dims_;
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU spatial reduce: identical in
// structure to the non-templated version, but the OpenCL DATA_TYPE /
// CMD_DATA_TYPE build options are derived from the template parameter T
// (e.g. float vs half) instead of being fixed to float.
template <typename T>
MaceStatus ReduceKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input);
  index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  // Output keeps batch and channels but collapses H and W to 1.
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    // Data type is chosen by the template parameter T.
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce");
    built_options.emplace("-Dreduce=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_));
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Work-group shape: wave-size based on Adreno, fixed 4x(<=16) elsewhere.
  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
    const uint32_t wave_size =
        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
  } else {
    // Ensure each kernel has at least 4 input elements.
    gws = {4, image_size / 16, static_cast<uint32_t>(batch * channel_blocks)};
    if (gws[1] == 0) {
      gws[1] = 1;
    } else if (gws[1] > 16) {
      gws[1] = 16;
    }
  }
  lws = {gws[0], gws[1], 1};
  // group_num work-items cooperate on one (batch, channel-block) slice.
  const int group_num = lws[0] * lws[1] * lws[2];
  // Each kernel intends to compute compute_size elements.
  const int compute_size = (image_size + group_num - 1) / group_num;
  const int last_index = image_size % group_num;
  // 1 / (H * W) — presumably the mean-reduce normalizer; passed always.
  const float scale = 1.f / (in_width * in_height);
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments are only re-set when the input shape changes.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Local scratch buffer: four floats per work-item in the group.
    kernel_.setArg(idx++, (group_num * 4 * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_num));
    kernel_.setArg(idx++, static_cast<int32_t>(compute_size));
    kernel_.setArg(idx++, static_cast<int32_t>(last_index));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, scale);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support, round the global size up to a
    // multiple of the local size in every dimension.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Async wait hook for callers that need to block on / profile the kernel.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_bicubic.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Resizes an NHWC image tensor to (out_height_, out_width_) on the GPU using
// bicubic interpolation. The interpolation coefficient table size is baked in
// at kernel-build time via -DTABLE_SIZE.
MaceStatus ResizeBicubicKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  // One work-item per (channel-block, out-x, out-y*batch) cell.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    built_options.emplace(
        MakeString("-DTABLE_SIZE=", common::utils::kTableSize));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bicubic",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args only need refreshing when the input
  // shape changes (the target size is a construction-time constant).
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        common::utils::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        common::utils::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -25,13 +25,14 @@ ...@@ -25,13 +25,14 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_bicubic.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
namespace resize_bicubic { namespace resize_bicubic {
constexpr int64_t kTableSize = (1u << 10);
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws, const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
...@@ -60,7 +61,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -60,7 +61,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_bicubic } // namespace resize_bicubic
template <typename T>
class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
public: public:
ResizeBicubicKernel(bool align_corners, ResizeBicubicKernel(bool align_corners,
...@@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { ...@@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU bicubic resize: identical in
// structure to the non-templated version, but DATA_TYPE / CMD_DATA_TYPE are
// derived from the template parameter T, and the coefficient table size comes
// from mace::ops::resize_bicubic::kTableSize.
template <typename T>
MaceStatus ResizeBicubicKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    // Data type is chosen by the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    built_options.emplace(
        MakeString("-DTABLE_SIZE=",
                   mace::ops::resize_bicubic::kTableSize));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bicubic",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args refresh only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        mace::ops::resize_bicubic::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        mace::ops::resize_bicubic::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_bilinear.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Resizes an NHWC image tensor to (out_height_, out_width_) on the GPU using
// bilinear interpolation.
MaceStatus ResizeBilinearKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  // One work-item per (channel-block, out-x, out-y*batch) cell.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bilinear",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args only need refreshing when the input
  // shape changes (the target size is a construction-time constant).
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        common::utils::CalculateResizeScale(in_height,
                                            out_height,
                                            align_corners_);
    float width_scale =
        common::utils::CalculateResizeScale(in_width,
                                            out_width,
                                            align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_bilinear.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_bilinear } // namespace resize_bilinear
template <typename T>
class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
public: public:
ResizeBilinearKernel(bool align_corners, ResizeBilinearKernel(bool align_corners,
...@@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { ...@@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU bilinear resize: identical in
// structure to the non-templated version, but DATA_TYPE / CMD_DATA_TYPE are
// derived from the template parameter T instead of being fixed to float.
template <typename T>
MaceStatus ResizeBilinearKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Target dimensions are fixed at kernel-object construction.
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    // Data type is chosen by the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_bilinear",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Output allocation and kernel args refresh only on input-shape change.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        mace::ops::resize_bilinear::CalculateResizeScale(in_height,
                                                         out_height,
                                                         align_corners_);
    float width_scale =
        mace::ops::resize_bilinear::CalculateResizeScale(in_width,
                                                         out_width,
                                                         align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/resize_nearest_neighbor.h"
#include "mace/ops/common/utils.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Resizes an NHWC image tensor on the GPU using nearest-neighbor sampling.
// Unlike the bilinear/bicubic kernels, the target size is a runtime input:
// it is read from the first two int32 elements of the `size` tensor.
MaceStatus ResizeNearestNeighborKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *size,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Map the size tensor into host memory for the duration of this call.
  Tensor::MappingGuard size_mapper(size);
  const index_t out_height = size->data<int32_t>()[0];
  const index_t out_width = size->data<int32_t>()[1];
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // One work-item per (channel-block, out-x, out-y*batch) cell.
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
        "resize_nearest_neighbor_nocache");
    built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
    // Device-side computation is always done in float.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_nearest_neighbor",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // NOTE(review): output resize and kernel args are refreshed only when the
  // *input* shape changes; if only the `size` tensor's values change between
  // calls, the stale output shape/scales would be reused — confirm callers
  // never vary `size` while keeping the input shape fixed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        common::utils::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        common::utils::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h" #include "mace/ops/opencl/helper.h"
#include "mace/ops/resize_nearest_neighbor.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -65,7 +64,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace resize_nearest_neighbor } // namespace resize_nearest_neighbor
template <typename T>
class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
public: public:
explicit ResizeNearestNeighborKernel(bool align_corners) explicit ResizeNearestNeighborKernel(bool align_corners)
...@@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { ...@@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
// Templated (pre-refactor) variant of the GPU nearest-neighbor resize:
// identical in structure to the non-templated version, but DATA_TYPE /
// CMD_DATA_TYPE are derived from the template parameter T. The target size
// is read at runtime from the first two int32 elements of `size`.
template <typename T>
MaceStatus ResizeNearestNeighborKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const Tensor *size,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t in_height = input->dim(1);
  const index_t in_width = input->dim(2);
  const index_t channels = input->dim(3);
  // Map the size tensor into host memory for the duration of this call.
  Tensor::MappingGuard size_mapper(size);
  const index_t out_height = size->data<int32_t>()[0];
  const index_t out_width = size->data<int32_t>()[1];
  // Channels are packed four-per-pixel in the OpenCL image layout.
  const index_t channel_blocks = RoundUpDiv4(channels);
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use and cache it in kernel_.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL(
        "resize_nearest_neighbor_nocache");
    built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name);
    // Data type is chosen by the template parameter T.
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("resize_nearest_neighbor",
                             kernel_name,
                             built_options,
                             &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // NOTE(review): output resize and kernel args are refreshed only when the
  // *input* shape changes; a change in the `size` values alone would reuse a
  // stale output shape and scales — confirm callers never do that.
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
    std::vector<size_t> output_image_shape;
    OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                                &output_image_shape);
    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
    // Input/output coordinate scale factors, honoring align_corners_.
    float height_scale =
        mace::ops::resize_nearest_neighbor::CalculateResizeScale(
            in_height, out_height, align_corners_);
    float width_scale =
        mace::ops::resize_nearest_neighbor::CalculateResizeScale(
            in_width, out_width, align_corners_);
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
    kernel_.setArg(idx++, width_scale);
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    kernel_.setArg(idx++, static_cast<int32_t>(align_corners_));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t>
      lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_);
  // Tuning key identifies this launch configuration in the tuner cache.
  std::string tuning_key =
      Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU softmax over the channel dimension using an OpenCL image2d kernel.
// Accepts 2-D (batch, classes) or 4-D NHWC logits; any other rank aborts
// via MACE_NOT_IMPLEMENTED. The program is compiled once with a float
// data type and cached in kernel_; -DUSE_LOG selects log-softmax.
MaceStatus SoftmaxKernel::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  // Normalize both supported ranks to an NHWC view.
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Channels are padded to a multiple of 4 in the image layout; the kernel
  // is told how many padded lanes to exclude from the reduction.
  const int remain_channels = channel_blocks * 4 - channels;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    if (use_log_)
      built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the logits shape changed.
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -56,7 +56,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime, ...@@ -56,7 +56,6 @@ inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} }
} // namespace softmax } // namespace softmax
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel { class SoftmaxKernel : public OpenCLSoftmaxKernel {
public: public:
explicit SoftmaxKernel(bool use_log) explicit SoftmaxKernel(bool use_log)
...@@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { ...@@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
// Templated GPU softmax (data type T baked into the OpenCL program via
// -DDATA_TYPE). Accepts 2-D (batch, classes) or 4-D NHWC logits; any other
// rank aborts via MACE_NOT_IMPLEMENTED. -DUSE_LOG selects log-softmax.
MaceStatus SoftmaxKernel<T>::Compute(
    OpContext *context,
    const Tensor *logits,
    Tensor *output) {
  index_t batch = 0;
  index_t height = 0;
  index_t width = 0;
  index_t channels = 0;
  // Normalize both supported ranks to an NHWC view.
  if (logits->dim_size() == 2) {
    batch = logits->dim(0);
    height = 1;
    width = 1;
    channels = logits->dim(1);
  } else if (logits->dim_size() == 4) {
    batch = logits->dim(0);
    height = logits->dim(1);
    width = logits->dim(2);
    channels = logits->dim(3);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Number of padded channel lanes the kernel must exclude.
  const int remain_channels = channel_blocks * 4 - channels;
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
    if (use_log_)
      built_options.emplace("-DUSE_LOG");
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
                                              built_options, &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the logits shape changed.
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(logits->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = logits->shape();
  }
  std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat("softmax_opencl_kernel", batch, height, width, channels);
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/space_to_batch.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU space-to-batch: rearranges spatial blocks of the (optionally padded)
// input into the batch dimension, writing into batch_tensor. The OpenCL
// data type is taken from the input tensor's dtype, so one compiled kernel
// serves whatever precision the graph runs in.
MaceStatus SpaceToBatchKernel::Compute(
    OpContext *context,
    const Tensor *space_tensor,
    const std::vector<int> &paddings,
    const std::vector<int> &block_shape,
    const std::vector<index_t> &output_shape,
    Tensor *batch_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      batch_tensor->ResizeImage(output_shape, output_image_shape));
  const char *kernel_name = "space_to_batch";
  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
  // Global work size over the OUTPUT: (channel block, out w, out n*h).
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    auto input_dt = space_tensor->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // paddings[0]/paddings[2] are the top and left pads; the bottom/right
    // pads are implied by the output shape.
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = space_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { ...@@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
// Templated GPU space-to-batch (data type T baked into the OpenCL program).
// Rearranges spatial blocks of the (optionally padded) input into the batch
// dimension, writing into batch_tensor.
MaceStatus SpaceToBatchKernel<T>::Compute(
    OpContext *context,
    const Tensor *space_tensor,
    const std::vector<int> &paddings,
    const std::vector<int> &block_shape,
    const std::vector<index_t> &output_shape,
    Tensor *batch_tensor) {
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(
      batch_tensor->ResizeImage(output_shape, output_image_shape));
  const char *kernel_name = "space_to_batch";
  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
  // Global work size over the OUTPUT: (channel block, out w, out n*h).
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
        DtToCLCMDDt(DataTypeToEnum<T>::value));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
    kernel_.setArg(idx++, block_shape[0]);
    kernel_.setArg(idx++, block_shape[1]);
    // paddings[0]/paddings[2] are the top and left pads; the bottom/right
    // pads are implied by the output shape.
    kernel_.setArg(idx++, paddings[0]);
    kernel_.setArg(idx++, paddings[2]);
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    input_shape_ = space_tensor->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key =
      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
             batch_tensor->dim(2), batch_tensor->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/space_to_depth.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU space-to-depth: moves block_size_ x block_size_ spatial patches into
// the channel dimension (H/bs, W/bs, C*bs*bs). Requires H and W divisible
// by block_size_, and C either < 4 or a multiple of 4 (image packing).
// The OpenCL data type is taken from the input tensor's dtype.
MaceStatus SpaceToDepthKernel::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
  MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
             "input channel should be dividable by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
      "input width and height should be dividable by block_size");
  const index_t output_height = input_height / block_size_;
  const index_t output_width = input_width / block_size_;
  const index_t output_depth = input_depth * block_size_ * block_size_;
  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
  std::vector<index_t> output_shape = {batch, output_height, output_width,
                                       output_depth};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "space_to_depth";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    // For C < 4 the kernel needs a specialized path, selected by -DDEPTHn.
    if (input_depth < 4) {
      built_options.emplace(MakeString("-DDEPTH", input_depth));
    }
    built_options.emplace(kernel_name_ss.str());
    auto input_dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size over the OUTPUT: (depth block, out w, out n*h).
  const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
                           static_cast<uint32_t>(output_width),
                           static_cast<uint32_t>(output_height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
    kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("space_to_depth", input->dim(0),
                                  input->dim(1), input->dim(2), input->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
public: public:
explicit SpaceToDepthKernel(const int block_size) explicit SpaceToDepthKernel(const int block_size)
...@@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { ...@@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
// Templated GPU space-to-depth (data type T baked into the OpenCL program).
// Moves block_size_ x block_size_ spatial patches into the channel
// dimension. Requires H and W divisible by block_size_, and C either < 4
// or a multiple of 4 (image packing).
MaceStatus SpaceToDepthKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    Tensor *output) {
  const index_t batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_depth = input->dim(3);
  MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0,
             "input channel should be dividable by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
      "input width and height should be dividable by block_size");
  const index_t output_height = input_height / block_size_;
  const index_t output_width = input_width / block_size_;
  const index_t output_depth = input_depth * block_size_ * block_size_;
  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
  std::vector<index_t> output_shape = {batch, output_height, output_width,
                                       output_depth};
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    const char *kernel_name = "space_to_depth";
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    // For C < 4 the kernel needs a specialized path, selected by -DDEPTHn.
    if (input_depth < 4) {
      built_options.emplace(MakeString("-DDEPTH", input_depth));
    }
    built_options.emplace(kernel_name_ss.str());
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
                                              obfuscated_kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Global work size over the OUTPUT: (depth block, out w, out n*h).
  const uint32_t gws[3] = {static_cast<uint32_t>(output_depth_blocks),
                           static_cast<uint32_t>(output_width),
                           static_cast<uint32_t>(output_height * batch)};
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Refresh kernel arguments only when the input shape changed.
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
    kernel_.setArg(idx++, static_cast<int32_t>(input_depth));
    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  std::string tuning_key = Concat("space_to_depth", input->dim(0),
                                  input->dim(1), input->dim(2), input->dim(3));
  // Auto-tunes (or reuses a tuned) local work size and enqueues the kernel.
  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                           gws, lws, context->future()));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/split.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// GPU split: carves the input's channel dimension into outputs_count equal
// slices, launching the same kernel once per output with a different
// channel offset. Only the channel axis (dim 3) is handled here; axis_ is
// deliberately not consulted (MACE_UNUSED below) — presumably callers
// guarantee axis == 3 before reaching this kernel (TODO confirm upstream).
MaceStatus SplitKernel::Compute(
    OpContext *context,
    const Tensor *input,
    const std::vector<Tensor *> &output_list) {
  MACE_UNUSED(axis_);
  const index_t input_channels = input->dim(3);
  const size_t outputs_count = output_list.size();
  // Even split: every output gets input_channels / outputs_count channels.
  const index_t output_channels = input_channels / outputs_count;
  std::vector<index_t> output_shape(
      {input->dim(0), input->dim(1), input->dim(2), output_channels});
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  for (size_t i = 0; i < outputs_count; ++i) {
    MACE_RETURN_IF_ERROR(
        output_list[i]->ResizeImage(output_shape, image_shape));
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use; the data type is taken
  // from the input tensor's dtype.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
    built_options.emplace("-Dsplit=" + kernel_name);
    auto input_dt = input->dtype();
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const index_t channel_blk = RoundUpDiv4(output_channels);
  // Global work size per slice: (channel block, w, n*h).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(kernel_);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  // Aggregate profiling stats across all per-output launches:
  // earliest start, summed durations.
  CallStats call_stats{INT64_MAX, 0};
  for (size_t i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Channel-block offset of this slice within the input.
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
    cl_int error;
    if (runtime->IsNonUniformWorkgroupsSupported()) {
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    } else {
      // Devices without non-uniform workgroups need gws rounded up to a
      // multiple of lws; the kernel guards out-of-range work-items.
      std::vector<uint32_t> roundup_gws(lws.size());
      for (size_t j = 0; j < 3; ++j) {
        roundup_gws[j] = RoundUp(gws[j], lws[j]);
      }
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange,
          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    }
    MACE_CL_RET_STATUS(error);
    MACE_OUT_OF_RANGE_VALIDATION;
    if (context->future() != nullptr && runtime->is_profiling_enabled()) {
      event.wait();
      CallStats tmp_stats;
      runtime->GetCallStats(event, &tmp_stats);
      call_stats.start_micros =
          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
    }
  }
  // Report the aggregated timing to the caller's future, if requested.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -31,7 +31,6 @@ namespace ops { ...@@ -31,7 +31,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SplitKernel : public OpenCLSplitKernel { class SplitKernel : public OpenCLSplitKernel {
public: public:
explicit SplitKernel(const int32_t axis) : axis_(axis) {} explicit SplitKernel(const int32_t axis) : axis_(axis) {}
...@@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel { ...@@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel {
uint32_t kwg_size_; uint32_t kwg_size_;
}; };
template <typename T>
// Templated GPU split (data type T baked into the OpenCL program). Carves
// the input's channel dimension into outputs_count equal slices, launching
// the same kernel once per output with a different channel offset.
MaceStatus SplitKernel<T>::Compute(
    OpContext *context,
    const Tensor *input,
    const std::vector<Tensor *> &output_list) {
  const index_t input_channels = input->dim(3);
  const size_t outputs_count = output_list.size();
  // Even split: every output gets input_channels / outputs_count channels.
  const index_t output_channels = input_channels / outputs_count;
  std::vector<index_t> output_shape(
      {input->dim(0), input->dim(1), input->dim(2), output_channels});
  std::vector<size_t> image_shape;
  OpenCLUtil::CalImage2DShape(output_shape,
                              OpenCLBufferType::IN_OUT_CHANNEL,
                              &image_shape);
  for (size_t i = 0; i < outputs_count; ++i) {
    MACE_RETURN_IF_ERROR(
        output_list[i]->ResizeImage(output_shape, image_shape));
  }
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the OpenCL program lazily on first use.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
    built_options.emplace("-Dsplit=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
        DtToCLCMDDt(DataTypeToEnum<T>::value));
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const index_t channel_blk = RoundUpDiv4(output_channels);
  // Global work size per slice: (channel block, w, n*h).
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
  MACE_OUT_OF_RANGE_INIT(kernel_);
  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
  cl::Event event;
  // Aggregate profiling stats across all per-output launches:
  // earliest start, summed durations.
  CallStats call_stats{INT64_MAX, 0};
  for (size_t i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input->opencl_image()));
    // Channel-block offset of this slice within the input.
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
    cl_int error;
    if (runtime->IsNonUniformWorkgroupsSupported()) {
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    } else {
      // Devices without non-uniform workgroups need gws rounded up to a
      // multiple of lws; the kernel guards out-of-range work-items.
      std::vector<uint32_t> roundup_gws(lws.size());
      for (size_t j = 0; j < 3; ++j) {
        roundup_gws[j] = RoundUp(gws[j], lws[j]);
      }
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel_, cl::NullRange,
          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
    }
    MACE_CL_RET_STATUS(error);
    MACE_OUT_OF_RANGE_VALIDATION;
    if (context->future() != nullptr && runtime->is_profiling_enabled()) {
      event.wait();
      CallStats tmp_stats;
      runtime->GetCallStats(event, &tmp_stats);
      call_stats.start_micros =
          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
    }
  }
  // Report the aggregated timing to the caller's future, if requested.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/sqrdiff_mean.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
// Computes, on the GPU (OpenCL image memory), a reduction of input0/input1
// over the spatial dimensions: the output shape is {batch, 1, 1, channels},
// i.e. one value per (batch, channel). The actual per-element math lives in
// the "sqrdiff_mean" OpenCL kernel (sqrdiff_mean.cl, not visible here);
// presumably it computes mean((input0 - input1)^2) over H*W — confirm
// against the kernel source.
//
// Args:
//   context: op context providing the device, scratch memory and future.
//   input0:  4-D NHWC tensor to be reduced.
//   input1:  4-D NHWC tensor; must match input0 in batch (dim 0) and
//            channels (dim 3). H/W are not checked here.
//   output:  resized in place to {batch, 1, 1, channels} as an OpenCL image.
//
// Returns MaceStatus::MACE_SUCCESS on success, or the error status from
// kernel build / resize / enqueue.
MaceStatus SqrDiffMeanKernel::Compute(
    OpContext *context,
    const Tensor *input0,
    const Tensor *input1,
    Tensor *output) {
  MACE_CHECK_NOTNULL(input0);
  MACE_CHECK_NOTNULL(input1);
  // Only batch and channel dims are required to match between the two inputs.
  MACE_CHECK(input0->dim(0) == input1->dim(0) &&
      input0->dim(3) == input1->dim(3));
  MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
             "SqrDiffMean gpu only support 4-dim input");
  index_t batch = input0->dim(0);
  const index_t in_height = input0->dim(1);
  const index_t in_width = input0->dim(2);
  const index_t channels = input0->dim(3);
  // Channels are packed 4-wide into image pixels, hence RoundUpDiv4.
  const index_t channel_blocks = RoundUpDiv4(channels);
  // Number of spatial elements each (batch, channel_block) work-group reduces.
  const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
  std::vector<uint32_t> gws(3);
  std::vector<uint32_t> lws(3);
  // Spatial dims collapse to 1x1; channel count is preserved.
  std::vector<index_t> output_shape{batch, 1, 1, channels};
  std::vector<size_t> output_image_shape;
  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
                              &output_image_shape);
  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
  // Build the kernel lazily on first use; the built program is cached in
  // kernel_ for subsequent calls.
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
    built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
    // Kernel always runs in float; inputs are not templated on data type here.
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
    // Non-Adreno GPUs take a different code path inside the .cl kernel.
    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
      built_options.emplace("-DNON_QUALCOMM_ADRENO");
    }
    MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
                                              kernel_name,
                                              built_options,
                                              &kernel_));
    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  // Work-group geometry: on Adreno, size the second dimension from the
  // hardware wave size; otherwise use a fixed 4x16 group. One group per
  // (batch, channel_block) pair along gws[2].
  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
    const uint32_t wave_size =
        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
  } else {
    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
  }
  lws = {gws[0], gws[1], 1};
  const int group_size = lws[0] * lws[1] * lws[2];
  // Each work-item reduces partial_len spatial elements (ceiling division);
  // remain_index tells the kernel which items own one element fewer.
  const int partial_len = (image_size + group_size - 1) / group_size;
  const int remain_index = image_size % group_size;
  // Multiply by the reciprocal in-kernel instead of dividing per element.
  const float img_size_reciprocal = 1.f / (in_width * in_height);
  MACE_OUT_OF_RANGE_INIT(kernel_);
  // Kernel arguments only need to be (re)set when the input shape changes;
  // they are cached together with input_shape_ across calls.
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
    MACE_SET_3D_GWS_ARGS(kernel_, gws);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    kernel_.setArg(idx++, *(input1->opencl_image()));
    // Local (work-group shared) scratch: 4 floats per work-item.
    // setArg(index, size, nullptr) is the cl.hpp form for __local memory.
    kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
                   nullptr);
    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
    kernel_.setArg(idx++, static_cast<int32_t>(batch));
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, img_size_reciprocal);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
  }
  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  } else {
    // Without non-uniform work-group support, the global size must be a
    // multiple of the local size in every dimension.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  }
  MACE_CL_RET_STATUS(error);
  MACE_OUT_OF_RANGE_VALIDATION;
  // Expose profiling stats lazily: the caller's wait_fn blocks on the event
  // and, if requested, fills in timing data from the OpenCL runtime.
  if (context->future() != nullptr) {
    context->future()->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
  return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
...@@ -30,7 +30,6 @@ namespace ops { ...@@ -30,7 +30,6 @@ namespace ops {
namespace opencl { namespace opencl {
namespace image { namespace image {
template <typename T>
class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel {
public: public:
MaceStatus Compute( MaceStatus Compute(
...@@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { ...@@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
template <typename T>
MaceStatus SqrDiffMeanKernel<T>::Compute(
OpContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output) {
MACE_CHECK_NOTNULL(input0);
MACE_CHECK_NOTNULL(input1);
MACE_CHECK(input0->dim(0) == input1->dim(0) &&
input0->dim(3) == input1->dim(3));
MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
"SqrDiffMean gpu only support 4-dim input");
index_t batch = input0->dim(0);
const index_t in_height = input0->dim(1);
const index_t in_width = input0->dim(2);
const index_t channels = input0->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float img_size_reciprocal = 1.f / (in_width * in_height);
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image } // namespace image
} // namespace opencl } // namespace opencl
} // namespace ops } // namespace ops
......
...@@ -29,7 +29,6 @@ namespace { ...@@ -29,7 +29,6 @@ namespace {
MaceStatus WinogradInputTransform(OpContext *context, MaceStatus WinogradInputTransform(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input_tensor, const Tensor *input_tensor,
const DataType dt,
const int *paddings, const int *paddings,
const index_t round_h, const index_t round_h,
const index_t round_w, const index_t round_w,
...@@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context, ...@@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context,
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name, obfuscated_kernel_name,
built_options, built_options,
...@@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context, ...@@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context,
kernel->setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel->setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
} }
const std::vector<uint32_t> lws = {*kwg_size / 8, 8, 0}; const std::vector<uint32_t> lws = {*kwg_size / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel", std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0), output_tensor->dim(0),
...@@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context, ...@@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context,
cl::Kernel *kernel, cl::Kernel *kernel,
const Tensor *input_tensor, const Tensor *input_tensor,
const Tensor *bias, const Tensor *bias,
const DataType dt,
const index_t round_h, const index_t round_h,
const index_t round_w, const index_t round_w,
const int wino_blk_size, const int wino_blk_size,
...@@ -145,33 +142,41 @@ MaceStatus WinogradOutputTransform(OpContext *context, ...@@ -145,33 +142,41 @@ MaceStatus WinogradOutputTransform(OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) { switch (activation) {
case NOOP: case NOOP: {
break; break;
case RELU: }
case RELU: {
built_options.emplace("-DUSE_RELU"); built_options.emplace("-DUSE_RELU");
break; break;
case RELUX: }
case RELUX: {
built_options.emplace("-DUSE_RELUX"); built_options.emplace("-DUSE_RELUX");
break; break;
case PRELU: }
case PRELU: {
built_options.emplace("-DUSE_PRELU"); built_options.emplace("-DUSE_PRELU");
break; break;
case TANH: }
case TANH: {
built_options.emplace("-DUSE_TANH"); built_options.emplace("-DUSE_TANH");
break; break;
case SIGMOID: }
case SIGMOID: {
built_options.emplace("-DUSE_SIGMOID"); built_options.emplace("-DUSE_SIGMOID");
break; break;
case LEAKYRELU: }
case LEAKYRELU: {
built_options.emplace("-DUSE_LEAKYRELU"); built_options.emplace("-DUSE_LEAKYRELU");
break; break;
default: }
default: {
LOG(FATAL) << "Unknown activation type: " << activation; LOG(FATAL) << "Unknown activation type: " << activation;
} }
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name, obfuscated_kernel_name,
...@@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const float leakyrelu_coefficient, const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size, const int wino_blk_size,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
...@@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
OpenCLBufferType::IN_OUT_HEIGHT, OpenCLBufferType::IN_OUT_HEIGHT,
&t_input_image_shape); &t_input_image_shape);
ScratchImage transformed_input_image(scratch_manager); ScratchImage transformed_input_image(scratch_manager);
std::unique_ptr<Tensor> transformed_input = make_unique<Tensor>( auto input_dt = input->dtype();
transformed_input_image.Scratch(context->device()->allocator(), auto image = transformed_input_image.Scratch(context->device()->allocator(),
t_input_image_shape, dt), dt); t_input_image_shape, input_dt);
auto transformed_input = make_unique<Tensor>(image, input_dt);
MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape,
t_input_image_shape)); t_input_image_shape));
MACE_RETURN_IF_ERROR(WinogradInputTransform( MACE_RETURN_IF_ERROR(WinogradInputTransform(
context, kernels[0], input, dt, paddings, context, kernels[0], input, paddings,
round_h, round_w, wino_blk_size, round_h, round_w, wino_blk_size,
input_changed, transformed_input.get(), input_changed, transformed_input.get(),
kwg_size[0], &t_input_future)); kwg_size[0], &t_input_future));
...@@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
&mm_output_image_shape); &mm_output_image_shape);
ScratchImage mm_output_image(scratch_manager); ScratchImage mm_output_image(scratch_manager);
auto output_dt = input->dtype();
std::unique_ptr<Tensor> mm_output = make_unique<Tensor>( std::unique_ptr<Tensor> mm_output = make_unique<Tensor>(
mm_output_image.Scratch(context->device()->allocator(), mm_output_image.Scratch(context->device()->allocator(),
mm_output_image_shape, dt), dt); mm_output_image_shape, output_dt), output_dt);
MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape,
mm_output_image_shape)); mm_output_image_shape));
...@@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG; MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, kernels[1])); built_options, kernels[1]));
...@@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, ...@@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
// t_output (blk_sqr, out_chan, out_width) -> output(NHWC) // t_output (blk_sqr, out_chan, out_width) -> output(NHWC)
MACE_RETURN_IF_ERROR(WinogradOutputTransform( MACE_RETURN_IF_ERROR(WinogradOutputTransform(
context, kernels[2], mm_output.get(), bias, context, kernels[2], mm_output.get(), bias,
dt, round_h, round_w, wino_blk_size, activation, relux_max_limit, round_h, round_w, wino_blk_size, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, kwg_size[2], leakyrelu_coefficient, input_changed, output, kwg_size[2],
&t_output_future)) &t_output_future))
......
...@@ -25,21 +25,20 @@ ...@@ -25,21 +25,20 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class LSTMCellOp; class LSTMCellOp;
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class LSTMCellOp<DeviceType::GPU, T> : public Operation { class LSTMCellOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit LSTMCellOp(OpConstructContext *context) explicit LSTMCellOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
T forget_bias = static_cast<T>( float forget_bias = Operation::GetOptionalArg<float>("scalar_input",
Operation::GetOptionalArg<float>("scalar_input", 0.0);
0.0));
MemoryType mem_type = MemoryType::GPU_IMAGE; MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::LSTMCellKernel<T>>(forget_bias); kernel_ = make_unique<opencl::image::LSTMCellKernel>(forget_bias);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -47,30 +46,26 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation { ...@@ -47,30 +46,26 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
const Tensor *pre_output = context->workspace()->GetTensor( const Tensor *pre_output = context->workspace()->GetTensor(
operator_def_->input(1)); operator_def_->input(1));
if (pre_output->is_weight()) { if (pre_output->is_weight()) {
MACE_CHECK(TransformFilter<T>(context, auto status = TransformFilter(context, operator_def_.get(),
operator_def_.get(), 1, OpenCLBufferType::IN_OUT_CHANNEL,
1, mem_type);
OpenCLBufferType::IN_OUT_CHANNEL, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
mem_type) == MaceStatus::MACE_SUCCESS);
} }
MACE_CHECK(TransformFilter<T>(context, auto status = TransformFilter(context, operator_def_.get(),
operator_def_.get(), 2, OpenCLBufferType::IN_OUT_CHANNEL,
2, mem_type);
OpenCLBufferType::IN_OUT_CHANNEL, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
mem_type) == MaceStatus::MACE_SUCCESS); status = TransformFilter(context, operator_def_.get(),
MACE_CHECK(TransformFilter<T>(context, 3, OpenCLBufferType::ARGUMENT,
operator_def_.get(), mem_type);
3, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
OpenCLBufferType::ARGUMENT, const Tensor *pre_cell =
mem_type) == MaceStatus::MACE_SUCCESS); context->workspace()->GetTensor(operator_def_->input(4));
const Tensor *pre_cell = context->workspace()->GetTensor(
operator_def_->input(4));
if (pre_cell->is_weight()) { if (pre_cell->is_weight()) {
MACE_CHECK(TransformFilter<T>(context, status = TransformFilter(context, operator_def_.get(),
operator_def_.get(), 4, OpenCLBufferType::IN_OUT_CHANNEL,
4, mem_type);
OpenCLBufferType::IN_OUT_CHANNEL, MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
mem_type) == MaceStatus::MACE_SUCCESS);
} }
} }
...@@ -92,14 +87,10 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation { ...@@ -92,14 +87,10 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL);
MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT);
}; };
#endif #endif // MACE_ENABLE_OPENCL
void RegisterLSTMCell(OpRegistryBase *op_registry) { void RegisterLSTMCell(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, MACE_REGISTER_GPU_OP(op_registry, "LSTMCell", LSTMCellOp);
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp,
DeviceType::GPU, half);
} }
} // namespace ops } // namespace ops
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <vector> #include <vector>
#include "mace/ops/pooling.h" #include "mace/ops/common/pooling_type.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
namespace mace { namespace mace {
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/pad.h" #include "mace/ops/common/pad_type.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pad.h" #include "mace/ops/opencl/image/pad.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -26,10 +26,10 @@ ...@@ -26,10 +26,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template<DeviceType D, typename T>
class PadOp; class PadOp;
template <typename T> template<typename T>
class PadOp<DeviceType::CPU, T> : public Operation { class PadOp<DeviceType::CPU, T> : public Operation {
public: public:
explicit PadOp(OpConstructContext *context) explicit PadOp(OpConstructContext *context)
...@@ -116,10 +116,10 @@ class PadOp<DeviceType::CPU, T> : public Operation { ...@@ -116,10 +116,10 @@ class PadOp<DeviceType::CPU, T> : public Operation {
for (index_t c = 0; c < o_channel; ++c) { for (index_t c = 0; c < o_channel; ++c) {
index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add); index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add);
const index_t in_offset = (((b_in * channel + c_in) * height) + const index_t in_offset =
h_in) * width; (((b_in * channel + c_in) * height) + h_in) * width;
index_t out_offset = (((b * o_channel + c) * o_height) + index_t out_offset =
h) * o_width; (((b * o_channel + c) * o_height) + h) * o_width;
for (index_t i = 0, j = paddings_[6] + l_add; for (index_t i = 0, j = paddings_[6] + l_add;
i < paddings_[6]; ++i, --j) { i < paddings_[6]; ++i, --j) {
...@@ -169,8 +169,8 @@ class PadOp<DeviceType::CPU, T> : public Operation { ...@@ -169,8 +169,8 @@ class PadOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class PadOp<DeviceType::GPU, T> : public Operation { class PadOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit PadOp(OpConstructContext *context) explicit PadOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -180,7 +180,7 @@ class PadOp<DeviceType::GPU, T> : public Operation { ...@@ -180,7 +180,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
float constant_value = Operation::GetOptionalArg<float>( float constant_value = Operation::GetOptionalArg<float>(
"constant_value", 0.0); "constant_value", 0.0);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PadKernel<T>>( kernel_ = make_unique<opencl::image::PadKernel>(
type, paddings, constant_value); type, paddings, constant_value);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -198,18 +198,11 @@ class PadOp<DeviceType::GPU, T> : public Operation { ...@@ -198,18 +198,11 @@ class PadOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterPad(OpRegistryBase *op_registry) { void RegisterPad(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Pad", PadOp, MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp);
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Pad", PadOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -16,8 +16,6 @@ ...@@ -16,8 +16,6 @@
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include "mace/ops/pooling.h"
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <memory> #include <memory>
...@@ -28,6 +26,7 @@ ...@@ -28,6 +26,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/conv_pool_2d_base.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/common/pooling_type.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/pooling.h" #include "mace/ops/opencl/image/pooling.h"
#include "mace/ops/opencl/buffer/pooling.h" #include "mace/ops/opencl/buffer/pooling.h"
...@@ -486,15 +485,15 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase { ...@@ -486,15 +485,15 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase { class PoolingOp<DeviceType::GPU, float> : public PoolingOpBase {
public: public:
explicit PoolingOp(OpConstructContext *context) explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) { : PoolingOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>(); kernel_ = make_unique<opencl::image::PoolingKernel>();
} else { } else {
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>(); kernel_ = make_unique<opencl::buffer::PoolingKernel>();
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) { ...@@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Pooling", PoolingOp);
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -12,13 +12,12 @@ ...@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/reduce.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <set> #include <set>
#include <vector> #include <vector>
#include "mace/ops/common/reduce_type.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/runtime/cpu/cpu_runtime.h"
...@@ -868,15 +867,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims( ...@@ -868,15 +867,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase { class ReduceOp<DeviceType::GPU, float> : public ReduceOpBase {
public: public:
explicit ReduceOp(OpConstructContext *context) explicit ReduceOp(OpConstructContext *context)
: ReduceOpBase(context) { : ReduceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ReduceKernel<T>>(reduce_type_, kernel_ = make_unique<opencl::image::ReduceKernel>(reduce_type_,
axis_, axis_);
keep_dims_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) { ...@@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Reduce", ReduceOp);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Reduce") OpConditionBuilder("Reduce")
...@@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) { ...@@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
bool keep_dims = bool keep_dims =
ProtoArgHelper::GetOptionalArg<OperatorDef, bool>( ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
*op, "keepdims", false); *op, "keepdims", false);
if (!keep_dims) { if (!keep_dims) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
auto axis = auto axis =
ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>( ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
*op, "axis"); *op, "axis");
if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
auto tensor_shape_info = context->tensor_shape_info(); auto tensor_shape_info = context->tensor_shape_info();
if (tensor_shape_info->count(op->input(0)) == 0 if (tensor_shape_info->count(op->input(0)) == 0
|| tensor_shape_info->at(op->input(0)).size() != 4) { || tensor_shape_info->at(op->input(0)).size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -12,14 +12,13 @@ ...@@ -12,14 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/resize_bicubic.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_bicubic.h" #include "mace/ops/opencl/image/resize_bicubic.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -33,12 +32,12 @@ inline const std::shared_ptr<float> InitCoeffsTable() { ...@@ -33,12 +32,12 @@ inline const std::shared_ptr<float> InitCoeffsTable() {
// convolution algorithm. // convolution algorithm.
// https://en.wikipedia.org/wiki/Bicubic_interpolation // https://en.wikipedia.org/wiki/Bicubic_interpolation
auto coeffs_tab = std::shared_ptr<float>( auto coeffs_tab = std::shared_ptr<float>(
new float[(resize_bicubic::kTableSize + 1) * 2], new float[(common::utils::kTableSize + 1) * 2],
std::default_delete<float[]>()); std::default_delete<float[]>());
float *coeffs_tab_ptr = coeffs_tab.get(); float *coeffs_tab_ptr = coeffs_tab.get();
static const float A = -0.75f; static const float A = -0.75f;
for (int i = 0; i <= resize_bicubic::kTableSize; ++i) { for (int i = 0; i <= common::utils::kTableSize; ++i) {
float x = i * 1.0f / resize_bicubic::kTableSize; float x = i * 1.0f / common::utils::kTableSize;
coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
x += 1.0; x += 1.0;
coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
...@@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, ...@@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit,
std::vector<int64_t> *indices) { std::vector<int64_t> *indices) {
auto in_loc = static_cast<int64_t>(scale * out_loc); auto in_loc = static_cast<int64_t>(scale * out_loc);
const float delta = scale * out_loc - in_loc; const float delta = scale * out_loc - in_loc;
const int64_t offset = lrintf(delta * resize_bicubic::kTableSize); const int64_t offset = lrintf(delta * common::utils::kTableSize);
const float *coeffs_tab = GetCoeffsTable(); const float *coeffs_tab = GetCoeffsTable();
*weights = {coeffs_tab[offset * 2 + 1], *weights = {coeffs_tab[offset * 2 + 1],
coeffs_tab[offset * 2], coeffs_tab[offset * 2],
coeffs_tab[(resize_bicubic::kTableSize - offset) * 2], coeffs_tab[(common::utils::kTableSize - offset) * 2],
coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]}; coeffs_tab[(common::utils::kTableSize - offset) * 2 + 1]};
*indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit),
Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)};
} }
...@@ -173,11 +172,11 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation { ...@@ -173,11 +172,11 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation {
} }
float height_scale = float height_scale =
resize_bicubic::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_bicubic::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
...@@ -202,8 +201,8 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation { ...@@ -202,8 +201,8 @@ class ResizeBicubicOp<DeviceType::CPU, float> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ResizeBicubicOp<DeviceType::GPU, T> : public Operation { class ResizeBicubicOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ResizeBicubicOp(OpConstructContext *context) explicit ResizeBicubicOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -213,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation { ...@@ -213,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
"size", {-1, -1}); "size", {-1, -1});
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBicubicKernel<T>>( kernel_ = make_unique<opencl::image::ResizeBicubicKernel>(
align_corners, size[0], size[1]); align_corners, size[0], size[1]);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) { ...@@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ResizeBicubic", ResizeBicubicOp);
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/resize_bilinear.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
...@@ -21,6 +19,7 @@ ...@@ -21,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
#include "mace/core/quantize.h" #include "mace/core/quantize.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_bilinear.h" #include "mace/ops/opencl/image/resize_bilinear.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -223,11 +222,11 @@ class ResizeBilinearOp<DeviceType::CPU, T> : public Operation { ...@@ -223,11 +222,11 @@ class ResizeBilinearOp<DeviceType::CPU, T> : public Operation {
} }
float height_scale = float height_scale =
resize_bilinear::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_bilinear::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
...@@ -299,11 +298,11 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -299,11 +298,11 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
} }
float height_scale = float height_scale =
resize_bilinear::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_bilinear::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
...@@ -336,8 +335,8 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -336,8 +335,8 @@ class ResizeBilinearOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ResizeBilinearOp<DeviceType::GPU, T> : public Operation { class ResizeBilinearOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ResizeBilinearOp(OpConstructContext *context) explicit ResizeBilinearOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
...@@ -347,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation { ...@@ -347,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
"size", {-1, -1}); "size", {-1, -1});
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBilinearKernel<T>>( kernel_ = make_unique<opencl::image::ResizeBilinearKernel>(
align_corners, size[0], size[1]); align_corners, size[0], size[1]);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) { ...@@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ResizeBilinear", ResizeBilinearOp);
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -12,13 +12,12 @@ ...@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/resize_nearest_neighbor.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/common/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/resize_nearest_neighbor.h" #include "mace/ops/opencl/image/resize_nearest_neighbor.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -115,11 +114,11 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation { ...@@ -115,11 +114,11 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
} }
float height_scale = float height_scale =
resize_nearest_neighbor::CalculateResizeScale(in_height, common::utils::CalculateResizeScale(in_height,
out_height, out_height,
align_corners_); align_corners_);
float width_scale = float width_scale =
resize_nearest_neighbor::CalculateResizeScale(in_width, common::utils::CalculateResizeScale(in_width,
out_width, out_width,
align_corners_); align_corners_);
ResizeImageNCHW(context, ResizeImageNCHW(context,
...@@ -142,15 +141,15 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation { ...@@ -142,15 +141,15 @@ class ResizeNearestNeighborOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation { class ResizeNearestNeighborOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit ResizeNearestNeighborOp(OpConstructContext *context) explicit ResizeNearestNeighborOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
bool align_corners = Operation::GetOptionalArg<bool>( bool align_corners = Operation::GetOptionalArg<bool>(
"align_corners", false); "align_corners", false);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel<T>>( kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel>(
align_corners); align_corners);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) { ...@@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::CPU, float); ResizeNearestNeighborOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor",
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp);
ResizeNearestNeighborOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor",
ResizeNearestNeighborOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
#define MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_nearest_neighbor {
inline float CalculateResizeScale(index_t in_size,
index_t out_size,
bool align_corners) {
return (align_corners && out_size > 1)
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_nearest_neighbor
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_
...@@ -35,10 +35,10 @@ ...@@ -35,10 +35,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template<DeviceType D, typename T>
class SoftmaxOp; class SoftmaxOp;
template <> template<>
class SoftmaxOp<DeviceType::CPU, float> : public Operation { class SoftmaxOp<DeviceType::CPU, float> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
...@@ -407,17 +407,17 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -407,17 +407,17 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SoftmaxOp<DeviceType::GPU, T> : public Operation { class SoftmaxOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SoftmaxOp(OpConstructContext *context) explicit SoftmaxOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
bool use_log = ( bool use_log = (
Operation::GetOptionalArg<bool>("use_log", false)); Operation::GetOptionalArg<bool>("use_log", false));
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log); kernel_ = make_unique<opencl::image::SoftmaxKernel>(use_log);
} else { } else {
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log); kernel_ = make_unique<opencl::buffer::SoftmaxKernel>(use_log);
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -433,7 +433,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation { ...@@ -433,7 +433,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterSoftmax(OpRegistryBase *op_registry) { void RegisterSoftmax(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::CPU, float); DeviceType::CPU, float);
...@@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { ...@@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Softmax", SoftmaxOp);
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
...@@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { ...@@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
if (op->output_shape(0).dims_size() != 2 && if (op->output_shape(0).dims_size() != 2 &&
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation { ...@@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation {
} }
}; };
template <DeviceType D, class T> template<DeviceType D, class T>
class SpaceToBatchNDOp; class SpaceToBatchNDOp;
template <> template<>
class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase { class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
public: public:
explicit SpaceToBatchNDOp(OpConstructContext *context) explicit SpaceToBatchNDOp(OpConstructContext *context)
...@@ -302,13 +302,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase { ...@@ -302,13 +302,13 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase { class SpaceToBatchNDOp<DeviceType::GPU, float> : public SpaceToBatchOpBase {
public: public:
explicit SpaceToBatchNDOp(OpConstructContext *context) explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) { : SpaceToBatchOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToBatchKernel<T>>(); kernel_ = make_unique<opencl::image::SpaceToBatchKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { ...@@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) {
SpaceToBatchNDOp, DeviceType::CPU, uint8_t); SpaceToBatchNDOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp);
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SpaceToBatchND",
SpaceToBatchNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, class T> template<DeviceType D, class T>
class SpaceToDepthOp : public Operation { class SpaceToDepthOp : public Operation {
public: public:
explicit SpaceToDepthOp(OpConstructContext *context) explicit SpaceToDepthOp(OpConstructContext *context)
...@@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation { ...@@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SpaceToDepthOp<DeviceType::GPU, T> : public Operation { class SpaceToDepthOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SpaceToDepthOp(OpConstructContext *context) explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1); int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size); kernel_ = make_unique<opencl::image::SpaceToDepthKernel>(block_size);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) { ...@@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SpaceToDepth", MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::CPU, float); SpaceToDepthOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp);
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SpaceToDepth",
SpaceToDepthOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -100,14 +100,14 @@ class SplitOp<DeviceType::CPU, T> : public Operation { ...@@ -100,14 +100,14 @@ class SplitOp<DeviceType::CPU, T> : public Operation {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SplitOp<DeviceType::GPU, T> : public Operation { class SplitOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SplitOp(OpConstructContext *context) explicit SplitOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
int32_t axis = Operation::GetOptionalArg<int>("axis", 3); int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SplitKernel<T>>(axis); kernel_ = make_unique<opencl::image::SplitKernel>(axis);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { ...@@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Split", SplitOp, MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp);
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Split", SplitOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template<DeviceType D, typename T>
class SqrDiffMeanOp : public Operation { class SqrDiffMeanOp : public Operation {
public: public:
explicit SqrDiffMeanOp(OpConstructContext *context) explicit SqrDiffMeanOp(OpConstructContext *context)
...@@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation { ...@@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation {
} }
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template<>
class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation { class SqrDiffMeanOp<DeviceType::GPU, float> : public Operation {
public: public:
explicit SqrDiffMeanOp(OpConstructContext *context) explicit SqrDiffMeanOp(OpConstructContext *context)
: Operation(context) { : Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SqrDiffMeanKernel<T>>(); kernel_ = make_unique<opencl::image::SqrDiffMeanKernel>();
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -101,18 +100,11 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation { ...@@ -101,18 +100,11 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
void RegisterSqrDiffMean(OpRegistryBase *op_registry) { void RegisterSqrDiffMean(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::CPU, float); DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp);
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -20,18 +20,21 @@ ...@@ -20,18 +20,21 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> class SqueezeOpRaw : public Operation {
class SqueezeOp : public Operation {
public: public:
explicit SqueezeOp(OpConstructContext *context) explicit SqueezeOpRaw(OpConstructContext *context,
DeviceType device_type,
DataType data_type)
: Operation(context), : Operation(context),
axis_(Operation::GetRepeatedArgs<int>("axis", {})), axis_(Operation::GetRepeatedArgs<int>("axis", {})),
checked_(false) {} checked_(false),
data_type_(data_type),
device_type_(device_type) {}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
if (!checked_ && D == DeviceType::CPU if (!checked_ && device_type_ == DeviceType::CPU
&& DataTypeToEnum<T>::value != DT_UINT8) { && data_type_ != DT_UINT8) {
auto has_df = Operation::GetOptionalArg<int>( auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0); "has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) { if (has_df && this->Input(0)->dim_size() == 4) {
...@@ -62,6 +65,16 @@ class SqueezeOp : public Operation { ...@@ -62,6 +65,16 @@ class SqueezeOp : public Operation {
private: private:
std::vector<int> axis_; std::vector<int> axis_;
bool checked_; bool checked_;
DataType data_type_;
DeviceType device_type_;
};
template<DeviceType D, typename T>
class SqueezeOp : public SqueezeOpRaw {
public:
explicit SqueezeOp(OpConstructContext *context)
: SqueezeOpRaw(context, D, DataTypeToEnum<T>::value) {
}
}; };
void RegisterSqueeze(OpRegistryBase *op_registry) { void RegisterSqueeze(OpRegistryBase *op_registry) {
...@@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { ...@@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_GPU_OP(op_registry, "Squeeze", SqueezeOp);
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION( MACE_REGISTER_OP_CONDITION(
op_registry, op_registry,
OpConditionBuilder("Squeeze") OpConditionBuilder("Squeeze")
...@@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { ...@@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
} }
if (op->output_shape(0).dims_size() != 2 && if (op->output_shape(0).dims_size() != 2 &&
op->output_shape(0).dims_size() != 4) { op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return {DeviceType::CPU};
} }
return { DeviceType::CPU, DeviceType::GPU }; return {DeviceType::CPU, DeviceType::GPU};
})); }));
} }
......
...@@ -37,55 +37,73 @@ def encrypt_code(code_str): ...@@ -37,55 +37,73 @@ def encrypt_code(code_str):
return encrypted_arr return encrypted_arr
def create_output_dir(dir_path):
if os.path.exists(dir_path):
if os.path.isdir(dir_path):
try:
shutil.rmtree(dir_path)
except OSError:
raise RuntimeError(
"Cannot delete directory %s due to permission "
"error, inspect and remove manually" % dir_path)
else:
raise RuntimeError(
"Cannot delete non-directory %s, inspect ",
"and remove manually" % dir_path)
os.makedirs(dir_path)
def write_cl_encrypted_kernel_to_file(
encrypted_code_maps, template_path, output_path):
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cl_encrypted_kernel = env.get_template(template_path).render(
tag='codegen',
maps=encrypted_code_maps,
data_type='unsigned char',
variable_name='kEncryptedProgramMap')
with open(output_path, "w") as w_file:
w_file.write(cl_encrypted_kernel)
def get_module_key(file_name):
module_key = None
if file_name[-3:] == ".cl":
module_key = file_name[:-3]
elif file_name[-2:] == ".h":
module_key = file_name
return module_key
def encrypt_opencl_codegen(cl_kernel_dir, output_path): def encrypt_opencl_codegen(cl_kernel_dir, output_path):
if not os.path.exists(cl_kernel_dir): if not os.path.exists(cl_kernel_dir):
print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!") print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!")
header_code = ""
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-2:] == ".h":
with open(file_path, "r") as f:
header_code += f.read()
encrypted_code_maps = {} encrypted_code_maps = {}
for file_name in os.listdir(cl_kernel_dir): for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name) file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-3:] == ".cl": module_key = get_module_key(file_name)
if len(module_key) > 0:
with open(file_path, "r") as f: with open(file_path, "r") as f:
code_str = "" code_str = ""
headers = []
for line in f.readlines(): for line in f.readlines():
if "#include <common.h>" in line: if "#include <common.h>" in line:
code_str += header_code headers.append(get_module_key("common.h"))
else: else:
code_str += line code_str += line
encrypted_code_arr = encrypt_code(code_str) encrypted_code_arr = encrypt_code(code_str)
encrypted_code_maps[file_name[:-3]] = encrypted_code_arr encrypted_code = {}
encrypted_code['headers'] = headers
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) encrypted_code['code'] = encrypted_code_arr
cpp_cl_encrypted_kernel = env.get_template( encrypted_code_maps[module_key] = encrypted_code
'str2vec_maps.cc.jinja2').render(
maps=encrypted_code_maps, create_output_dir(os.path.dirname(output_path))
data_type='unsigned char', write_cl_encrypted_kernel_to_file(
variable_name='kEncryptedProgramMap') encrypted_code_maps, 'str2vec_maps.cc.jinja2', output_path)
output_path_h = output_path.replace('.cc', '.h')
output_dir = os.path.dirname(output_path) write_cl_encrypted_kernel_to_file(
if os.path.exists(output_dir): encrypted_code_maps, 'str2vec_maps.h.jinja2', output_path_h)
if os.path.isdir(output_dir):
try:
shutil.rmtree(output_dir)
except OSError:
raise RuntimeError(
"Cannot delete directory %s due to permission "
"error, inspect and remove manually" % output_dir)
else:
raise RuntimeError(
"Cannot delete non-directory %s, inspect ",
"and remove manually" % output_dir)
os.makedirs(output_dir)
with open(output_path, "w") as w_file:
w_file.write(cpp_cl_encrypted_kernel)
print('Generate OpenCL kernel done.') print('Generate OpenCL kernel done.')
......
...@@ -14,24 +14,32 @@ ...@@ -14,24 +14,32 @@
// This is a generated file. DO NOT EDIT! // This is a generated file. DO NOT EDIT!
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include <map> #include <map>
#include <string> #include <string>
#include <vector>
namespace mace { namespace mace {
namespace {{tag}} {
extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} = const std::map<std::string, ClProgramInfo> {{variable_name}} = {
{ {% for key, encrypted_code in maps.items() %}
{% for key, value in maps.items() %} {
"{{key}}", {
{ {
"{{key}}", {%- for header in encrypted_code['headers'] -%}
"{{header}}",
{%- endfor -%}
},
{ {
{%- for ele in value -%} {%- for ele in encrypted_code['code'] -%}
{{ele}}, {{ele}},
{%- endfor -%} {%- endfor -%}
} }
}
}, // {{key}} }, // {{key}}
{% endfor %} {% endfor %}
}; };
} // {{tag}}
} // namespace mace } // namespace mace
...@@ -12,23 +12,21 @@ ...@@ -12,23 +12,21 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_RESIZE_BILINEAR_H_ // This is a generated file. DO NOT EDIT!
#define MACE_OPS_RESIZE_BILINEAR_H_
#include "mace/core/types.h" #include <map>
#include <string>
#include <vector>
namespace mace { namespace mace {
namespace ops { namespace {{tag}} {
namespace resize_bilinear {
inline float CalculateResizeScale(index_t in_size, struct ClProgramInfo {
index_t out_size, const std::vector<std::string> headers_;
bool align_corners) { const std::vector<{{data_type}}> encrypted_code_;
return (align_corners && out_size > 1) };
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bilinear
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BILINEAR_H_ extern const std::map<std::string, ClProgramInfo> {{variable_name}};
} // {{tag}}
} // namespace mace
...@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx): ...@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
unused_var = repository_ctx.path(Label("//:.git/refs/heads/master")) unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
ret = repository_ctx.execute( ret = repository_ctx.execute(
["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path], ["test", "-f", "%s/mace/ops/opencl/cl/common.cl" % mace_root_path],
) )
if ret.return_code == 0: if ret.return_code == 0:
unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl")) unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
...@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx): ...@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
python_bin_path, python_bin_path,
"%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path, "%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path,
"--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path, "--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path,
"--output_path=%s/encrypt_opencl_kernel" % generated_files_path, "--output_path=%s/encrypt_opencl_kernel.cc" % generated_files_path,
], quiet = False) ], quiet = False)
encrypt_opencl_kernel_repository = repository_rule( encrypt_opencl_kernel_repository = repository_rule(
......
...@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters, ...@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters,
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
auto transform_func = [&]() { auto transform_func = [&]() {
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, .Transform(&context,
net.ws()->GetTensor("Input"), net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL, OpenCLBufferType::IN_OUT_CHANNEL,
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include "mace/benchmark_utils/test_benchmark.h" #include "mace/benchmark_utils/test_benchmark.h"
#include "mace/ops/common/pad_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/pad.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "mace/benchmark_utils/test_benchmark.h" #include "mace/benchmark_utils/test_benchmark.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/pooling.h" #include "mace/ops/common/pooling_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
......
...@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, ...@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
Tensor *b2i_output = net.ws()->CreateTensor( Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output); type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform // Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor( Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output, .Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output); type, MemoryType::GPU_BUFFER, 0, i2b_output);
...@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, ...@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
Tensor *b2i_output = net.ws()->CreateTensor( Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output); type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform // Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor( Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT); "I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output, .Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output); type, MemoryType::GPU_BUFFER, 0, i2b_output);
...@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, ...@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Transform // Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output); type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform // Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor( Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value); "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output, .Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output); type, MemoryType::GPU_BUFFER, 0, i2b_output);
......
...@@ -45,7 +45,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, ...@@ -45,7 +45,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
"BtOutput", context.device()->allocator(), "BtOutput", context.device()->allocator(),
DataTypeToEnum<DstType>::value); DataTypeToEnum<DstType>::value);
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER, OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER) MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, bt_output); type, MemoryType::GPU_BUFFER, 0, bt_output);
...@@ -54,7 +54,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, ...@@ -54,7 +54,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
Tensor *output = net.ws()->CreateTensor( Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(), "Output", context.device()->allocator(),
DataTypeToEnum<OrgType>::value); DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER, OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER) MemoryType::GPU_BUFFER)
.Transform(&context, bt_output, .Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output); type, MemoryType::GPU_BUFFER, 0, output);
...@@ -90,7 +90,7 @@ void TestArgumentTransform(const index_t input_size) { ...@@ -90,7 +90,7 @@ void TestArgumentTransform(const index_t input_size) {
Tensor *output = net.ws()->CreateTensor( Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(), "Output", context.device()->allocator(),
DataTypeToEnum<T>::value); DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER) MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"), .Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
......
...@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context, ...@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
DtToCLCMDDt(DataTypeToEnum<float>::value)); DtToCLCMDDt(DataTypeToEnum<float>::value));
} else { } else {
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<float>::value)); DtToCLDt(DataTypeToEnum<float>::value));
built_options.emplace( built_options.emplace(
"-DCMD_DATA_TYPE=" + "-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<float>::value)); DtToCLCMDDt(DataTypeToEnum<float>::value));
} }
cl::Kernel kernel; cl::Kernel kernel;
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/ops/common/pad_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/pad.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include <vector> #include <vector>
#include "mace/ops/pooling.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/common/pooling_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include <vector> #include <vector>
#include "mace/ops/reduce.h" #include "mace/ops/common/reduce_type.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册