Commit 0792637f authored by Liangliang He

Merge branch 'minify_opencl' into 'master'

Minify opencl

Drop the per-type template parameter from the OpenCL ops: GPU kernels become plain float specializations registered through a new MACE_REGISTER_GPU_OP macro, half-typed requests are mapped to the float registrations at lookup time, and the encrypted OpenCL kernel sources are generated as a .cc/.h pair whose programs carry their own header lists.

See merge request !1104
......@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
enable_language(CUDA)
endif(MACE_ENABLE_CUDA)
if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA))
if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(ANDROID_ABI STREQUAL "arm64-v8a")
# Use gold linker to avoid linking check of libcdsprpc.so
set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
......
......@@ -33,8 +33,8 @@ class MyCustomOp<DeviceType::CPU, float> : public Operation {
}
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MyCustomOp<DeviceType::GPU, T> : public Operation {
template<>
class MyCustomOp<DeviceType::GPU, float> : public Operation {
...
};
#endif // MACE_ENABLE_OPENCL
......@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
}
} // namespace ops
......
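Putting the three hunks above together, a custom op under the new convention looks roughly like the sketch below. It is assembled from the fragments in this diff, with an assumed include path and placeholder Run bodies; the GPU variant is now a full float specialization and its registration goes through MACE_REGISTER_GPU_OP:

#include "mace/core/operator.h"  // assumed include path

namespace mace {
namespace ops {

template <DeviceType D, typename T>
class MyCustomOp;

template <>
class MyCustomOp<DeviceType::CPU, float> : public Operation {
 public:
  explicit MyCustomOp(OpConstructContext *context) : Operation(context) {}
  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    return MaceStatus::MACE_SUCCESS;  // placeholder body
  }
};

#ifdef MACE_ENABLE_OPENCL
// GPU ops are full specializations on float now; half-typed models
// reuse this kernel through the DT_HALF -> DT_FLOAT key mapping.
template <>
class MyCustomOp<DeviceType::GPU, float> : public Operation {
 public:
  explicit MyCustomOp(OpConstructContext *context) : Operation(context) {}
  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    return MaceStatus::MACE_SUCCESS;  // placeholder body
  }
};
#endif  // MACE_ENABLE_OPENCL

void RegisterMyCustomOp(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                   DeviceType::CPU, float);
  // Expands to nothing when MACE_ENABLE_OPENCL is undefined.
  MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
}

}  // namespace ops
}  // namespace mace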
......@@ -5,7 +5,7 @@ package(
default_visibility = ["//visibility:public"],
)
load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule")
load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
cc_library(
name = "generated_models",
......@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
cc_library(
name = "generated_opencl",
srcs = ["opencl/encrypt_opencl_kernel.cc"],
hdrs = ["opencl/encrypt_opencl_kernel.h"],
copts = [
"-Werror",
"-Wextra",
......
......@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.TypeConstraint("T", dtype == DT_HALF ? DT_FLOAT : dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
......
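This one-line change is what lets GPU kernels be registered for float only: a half-typed request now builds the same lookup key as a float one. A minimal sketch of the effect, assuming OpKeyBuilder simply folds the device and type constraints into the key string as its usage here suggests:

DataType lookup_dtype = (dtype == DT_HALF) ? DT_FLOAT : dtype;
std::string key = OpKeyBuilder(op_type)
                      .Device(device_type)
                      .TypeConstraint("T", lookup_dtype)
                      .Build();
// A "Conv2D" requested with DT_HALF on GPU resolves to the creator
// registered for DT_FLOAT, so each GPU op needs only one registration.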
......@@ -39,7 +39,7 @@ class OpConditionContext {
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(const OperatorDef* operator_def);
void set_operator_def(const OperatorDef *operator_def);
inline const OperatorDef *operator_def() const {
return operator_def_;
......@@ -49,7 +49,7 @@ class OpConditionContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -110,7 +110,7 @@ class OpConstructContext {
return ws_;
}
inline void set_device(Device* device) {
inline void set_device(Device *device) {
device_ = device;
}
......@@ -166,14 +166,14 @@ class Operation {
explicit Operation(OpConstructContext *context);
virtual ~Operation() = default;
template <typename T>
template<typename T>
inline T GetOptionalArg(const std::string &name,
const T &default_value) const {
MACE_CHECK(operator_def_, "operator_def was null!");
return ProtoArgHelper::GetOptionalArg<OperatorDef, T>(
*operator_def_, name, default_value);
}
template <typename T>
template<typename T>
inline std::vector<T> GetRepeatedArgs(
const std::string &name, const std::vector<T> &default_value = {}) const {
MACE_CHECK(operator_def_, "operator_def was null!");
......@@ -240,7 +240,6 @@ class Operation {
#define MACE_OP_OUTPUT_TAGS(first_input, ...) \
enum _OutputTags { first_input = 0, __VA_ARGS__ }
struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
......@@ -290,7 +289,6 @@ class OpConditionBuilder {
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
class OpRegistryBase {
public:
OpRegistryBase() = default;
......@@ -315,7 +313,7 @@ class OpRegistryBase {
OpConstructContext *context,
DeviceType device_type) const;
template <class DerivedType>
template<class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
OpConstructContext *context) {
return std::unique_ptr<Operation>(new DerivedType(context));
......@@ -334,6 +332,24 @@ class OpRegistryBase {
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name<device, dt>>)
#define MACE_REGISTER_OP_BY_CLASS( \
op_registry, op_type, class_name, device, dt) \
op_registry->Register(op_type, \
device, \
DataTypeToEnum<dt>::value, \
OpRegistryBase::DefaultCreator<class_name>)
#ifdef MACE_ENABLE_OPENCL
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
op_registry->Register( \
op_type, \
DeviceType::GPU, \
DT_FLOAT, \
OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
#else
#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
#endif
#define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
op_registry->Register(builder)
......
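For reference, here is what a registration written with the new macro expands to when MACE_ENABLE_OPENCL is defined, derived directly from the definition above (with OpenCL disabled the macro expands to nothing):

MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
// becomes:
op_registry->Register(
    "AddN",
    DeviceType::GPU,
    DT_FLOAT,
    OpRegistryBase::DefaultCreator<AddNOp<DeviceType::GPU, float>>);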
......@@ -18,20 +18,19 @@
#include <fstream>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <sstream>
#include <string>
#include <vector>
#include <utility>
#include "mace/utils/macros.h"
#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
#include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/macros.h"
#include "mace/utils/tuner.h"
namespace mace {
extern const std::map<std::string, std::vector<unsigned char>>
kEncryptedProgramMap;
const std::string OpenCLErrorToString(cl_int error) {
switch (error) {
case CL_SUCCESS:
......@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint,
std::shared_ptr<KVStorage> precompiled_binary_storage,
std::shared_ptr<Tuner<uint32_t>> tuner):
std::shared_ptr<Tuner<uint32_t>> tuner) :
cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner),
......@@ -332,7 +331,7 @@ OpenCLRuntime::OpenCLRuntime(
cl_int err;
if (gpu_type_ == GPUType::QUALCOMM_ADRENO
&& opencl_version_ == OpenCLVersion::CL_VER_2_0) {
&& opencl_version_ == OpenCLVersion::CL_VER_2_0) {
std::vector<cl_context_properties> context_properties;
context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties,
......@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
std::vector<cl_context_properties> context_properties = {
CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback,
CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
};
context_ = std::shared_ptr<cl::Context>(
......@@ -399,7 +398,7 @@ OpenCLRuntime::OpenCLRuntime(
if (cached_binary_platform_info != platform_info_) {
if (precompiled_binary_storage_ == nullptr) {
VLOG(1) << "There is no precompiled OpenCL binary in"
" any of the OpenCL binary paths.";
" any of the OpenCL binary paths.";
} else {
if (precompiled_binary_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
......@@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true;
}
MaceStatus GetProgramSourceByName(const std::string &program_name,
std::string *source) {
MACE_CHECK_NOTNULL(source);
std::stringstream source_stream;
const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap;
const auto &it_program = kEncryptedProgramMap.find(program_name);
if (it_program == kEncryptedProgramMap.end()) {
LOG(ERROR) << "Failed to find program " << program_name << ".";
return MaceStatus::MACE_RUNTIME_ERROR;
}
const std::vector<std::string> &headers = it_program->second.headers_;
for (const std::string &header : headers) {
const auto &header_program = kEncryptedProgramMap.find(header);
if (header_program == kEncryptedProgramMap.end()) {
LOG(WARNING) << "Program header (" << header << ") not found.";
continue;
}
const auto &header_source = header_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(header_source.begin(), header_source.end()));
}
const auto &it_source = it_program->second.encrypted_code_;
source_stream << ObfuscateString(
std::string(it_source.begin(), it_source.end()));
*source = source_stream.str();
return MaceStatus::MACE_SUCCESS;
}
bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program) {
// Find from source
auto it_source = kEncryptedProgramMap.find(program_name);
if (it_source != kEncryptedProgramMap.end()) {
std::string kernel_source;
MaceStatus status = GetProgramSourceByName(program_name, &kernel_source);
if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) {
cl::Program::Sources sources;
std::string source(it_source->second.begin(), it_source->second.end());
std::string kernel_source = ObfuscateString(source);
sources.push_back(kernel_source);
*program = cl::Program(context(), sources);
cl_int ret = program->build({device()}, build_options_str.c_str());
......
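GetProgramSourceByName reads from the generated mace/codegen/opencl/encrypt_opencl_kernel.h. The generated declarations themselves are not shown in this diff, but from the accesses above (kEncryptedProgramMap, headers_, encrypted_code_) the header is shaped roughly like the following sketch; the struct name and field types are assumptions inferred from usage:

// Hypothetical shape of the generated encrypt_opencl_kernel.h
#include <map>
#include <string>
#include <vector>

namespace mace {
namespace codegen {

struct EncryptedProgram {                      // assumed name
  std::vector<std::string> headers_;           // header programs to prepend
  std::vector<unsigned char> encrypted_code_;  // obfuscated kernel source
};

extern const std::map<std::string, EncryptedProgram> kEncryptedProgramMap;

}  // namespace codegen
}  // namespace mace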
......@@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
*net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
return MaceStatus::MACE_SUCCESS;
......
......@@ -118,9 +118,21 @@ def mace_version_genrule():
)
def encrypt_opencl_kernel_genrule():
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))],
outs = ["opencl/encrypt_opencl_kernel.cc"],
cmd = "cat $(SRCS) > $@;"
)
srcs = [
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc",
)),
str(Label(
"@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h",
)),
]
outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"]
native.genrule(
name = "encrypt_opencl_kernel_gen",
srcs = srcs,
outs = outs,
cmd = " && ".join([
"cat $(location %s) > $(location %s)" % (srcs[i], outs[i])
for i in range(0, len(outs))
]),
)
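For the two src/out pairs above, the list comprehension evaluates to a single shell command (wrapped here for readability):

cat $(location @local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc) > $(location opencl/encrypt_opencl_kernel.cc) &&
cat $(location @local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h) > $(location opencl/encrypt_opencl_kernel.h)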
......@@ -181,7 +181,6 @@ cc_library(
],
)
cc_library(
name = "internal_ops",
srcs = glob(
......@@ -239,10 +238,10 @@ cc_library(
name = "ops",
srcs = [
"registry/ops_registry.cc",
],
],
hdrs = [
"registry/ops_registry.h",
],
],
copts = [
"-Werror",
"-Wextra",
......
......@@ -83,28 +83,27 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationOp<DeviceType::GPU, T> : public Operation {
template<>
class ActivationOp<DeviceType::GPU, float> : public Operation {
public:
explicit ActivationOp(OpConstructContext *context)
: Operation(context) {
ActivationType type = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
auto relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
auto leakyrelu_coefficient =
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
kernel_ = make_unique<opencl::image::ActivationKernel>(
type, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
}
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -126,14 +125,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
......@@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -29,10 +29,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class AddNOp;
template <>
template<>
class AddNOp<DeviceType::CPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
......@@ -62,13 +62,13 @@ class AddNOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class AddNOp<DeviceType::GPU, T> : public Operation {
template<>
class AddNOp<DeviceType::GPU, float> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>();
kernel_ = make_unique<opencl::image::AddNKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -92,15 +92,9 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
......@@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -161,8 +161,8 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchNormOp<DeviceType::GPU, T> : public Operation {
template<>
class BatchNormOp<DeviceType::GPU, float> : public Operation {
public:
explicit BatchNormOp(OpConstructContext *context)
: Operation(context) {
......@@ -176,7 +176,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
kernel_ = make_unique<opencl::image::BatchNormKernel>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -187,7 +187,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -235,14 +235,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
void RegisterBatchNorm(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp);
}
} // namespace ops
......
......@@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation {
}
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class BatchToSpaceNDOp;
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -175,7 +175,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
}
};
template <>
template<>
class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
......@@ -259,13 +259,13 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
template<>
class BatchToSpaceNDOp<DeviceType::GPU, float> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
kernel_ = make_unique<opencl::image::BatchToSpaceKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -285,7 +285,6 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, float);
......@@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::CPU, uint8_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BatchToSpaceND",
BatchToSpaceNDOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp);
}
} // namespace ops
......
......@@ -34,16 +34,16 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BiasAddOp;
template <>
template<>
class BiasAddOp<DeviceType::CPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
{}
has_data_format_(Operation::GetOptionalArg<int>("has_data_format",
0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -96,8 +96,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class BiasAddOp<DeviceType::GPU, T> : public Operation {
template<>
class BiasAddOp<DeviceType::GPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
......@@ -105,11 +105,11 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
kernel_ = make_unique<opencl::image::BiasAddKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -133,18 +133,10 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
......@@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -23,10 +23,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class ChannelShuffleOp;
template <typename T>
template<typename T>
class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
......@@ -74,16 +74,15 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
const int groups_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
template<>
class ChannelShuffleOp<DeviceType::GPU, float> : public Operation {
public:
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
kernel_ = make_unique<opencl::image::ChannelShuffleKernel>(groups);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -99,18 +98,11 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterChannelShuffle(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ChannelShuffle",
ChannelShuffleOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
......@@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1);
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
index_t channels = op->output_shape(0).dims(3);
index_t channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_PAD_H_
#define MACE_OPS_PAD_H_
#ifndef MACE_OPS_COMMON_PAD_TYPE_H_
#define MACE_OPS_COMMON_PAD_TYPE_H_
namespace mace {
namespace ops {
......@@ -27,4 +27,4 @@ enum PadType {
} // namespace ops
} // namespace mace
#endif // MACE_OPS_PAD_H_
#endif // MACE_OPS_COMMON_PAD_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_POOLING_H_
#define MACE_OPS_POOLING_H_
#ifndef MACE_OPS_COMMON_POOLING_TYPE_H_
#define MACE_OPS_COMMON_POOLING_TYPE_H_
namespace mace {
......@@ -23,4 +23,4 @@ enum PoolingType {
};
} // namespace mace
#endif // MACE_OPS_POOLING_H_
#endif // MACE_OPS_COMMON_POOLING_TYPE_H_
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_REDUCE_H_
#define MACE_OPS_REDUCE_H_
#ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_
#define MACE_OPS_COMMON_REDUCE_TYPE_H_
namespace mace {
......@@ -28,4 +28,4 @@ enum ReduceType {
};
} // namespace mace
#endif // MACE_OPS_REDUCE_H_
#endif // MACE_OPS_COMMON_REDUCE_TYPE_H_
......@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_RESIZE_BICUBIC_H_
#define MACE_OPS_RESIZE_BICUBIC_H_
#ifndef MACE_OPS_COMMON_UTILS_H_
#define MACE_OPS_COMMON_UTILS_H_
#include "mace/core/types.h"
namespace mace {
namespace ops {
namespace resize_bicubic {
namespace common {
namespace utils {
constexpr int64_t kTableSize = (1u << 10);
inline float CalculateResizeScale(index_t in_size,
......@@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size,
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
} // namespace resize_bicubic
} // namespace utils
} // namespace common
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BICUBIC_H_
#endif // MACE_OPS_COMMON_UTILS_H_
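A quick worked example of the relocated helper (assuming the elided condition is the usual align_corners check): resizing in_size = 10 to out_size = 5 yields a scale of 10 / 5 = 2.0 normally, but (10 - 1) / (5 - 1) = 2.25 with align_corners, since align_corners pins the first and last samples to the input corners.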
......@@ -46,10 +46,10 @@ class ConcatOpBase : public Operation {
int axis_;
};
template <DeviceType D, class T>
template<DeviceType D, class T>
class ConcatOp;
template <typename T>
template<typename T>
class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
......@@ -194,13 +194,13 @@ class ConcatOp<DeviceType::CPU, uint8_t> : public ConcatOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
template<>
class ConcatOp<DeviceType::GPU, float> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
kernel_ = make_unique<opencl::image::ConcatKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -215,7 +215,6 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
};
#endif // MACE_ENABLE_OPENCL
void RegisterConcat(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::CPU, float);
......@@ -228,51 +227,44 @@ void RegisterConcat(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Concat", ConcatOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Concat")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return { DeviceType::CPU };
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
if (tensor_shape_info->find(input)
!= tensor_shape_info->end()) {
divisible_four = divisible_four
&& (tensor_shape_info->at(input)[3] % 4 == 0);
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return {DeviceType::CPU};
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (!has_data_format || axis != 3) {
return {DeviceType::CPU};
}
bool divisible_four = true;
for (const std::string &input : op->input()) {
if (tensor_shape_info->find(input)
!= tensor_shape_info->end()) {
divisible_four = divisible_four
&& (tensor_shape_info->at(input)[3] % 4 == 0);
}
}
// Channel counts not divisible by 4 are only supported with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return {DeviceType::CPU};
}
}
// Channel counts not divisible by 4 are only supported with 2 inputs.
if (op->input_size() > 2 && !divisible_four) {
return { DeviceType::CPU };
}
}
return { DeviceType::CPU, DeviceType::GPU };
}));
return {DeviceType::CPU, DeviceType::GPU};
}));
}
} // namespace ops
......
......@@ -446,8 +446,8 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
template<>
class Conv2dOp<DeviceType::GPU, float> : public ConvPool2dOpBase {
public:
explicit Conv2dOp(OpConstructContext *context)
: ConvPool2dOpBase(context),
......@@ -461,10 +461,10 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Conv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::Conv2dKernel>();
}
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
......@@ -477,19 +477,19 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp);
}
} // namespace ops
......
......@@ -24,10 +24,10 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class CropOp;
template <class T>
template<class T>
class CropOp<DeviceType::CPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
......@@ -43,7 +43,6 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs.");
......@@ -71,7 +70,7 @@ class CropOp<DeviceType::CPU, T> : public Operation {
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
T *output_data = output->mutable_data<T>();
const T * input_data = input0->data<T>();
const T *input_data = input0->data<T>();
crop_copy(input_data, output_data, input0->shape(),
output_shape, offsets.data());
......@@ -80,10 +79,10 @@ class CropOp<DeviceType::CPU, T> : public Operation {
}
private:
void crop_copy(const T* input_data, T* output_data,
void crop_copy(const T *input_data, T *output_data,
const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape,
const int32_t* offsets) {
const int32_t *offsets) {
const index_t out_img_size =
output_shape[1] * output_shape[2] * output_shape[3];
const index_t out_hw = output_shape[2] * output_shape[3];
......@@ -94,9 +93,9 @@ class CropOp<DeviceType::CPU, T> : public Operation {
for (int b = 0; b < output_shape[0]; ++b) {
for (int c = 0; c < output_shape[1]; ++c) {
for (int h = 0; h < output_shape[2]; ++h) {
T* out_ptr =
T *out_ptr =
output_data + b * out_img_size + c * out_hw + h * output_shape[3];
const T* in_ptr_bch =
const T *in_ptr_bch =
input_data + (b + offsets[0]) * in_img_size +
(c + offsets[1]) * in_hw +
(h + offsets[2]) * input_shape[3] + offsets[3];
......@@ -112,13 +111,13 @@ class CropOp<DeviceType::CPU, T> : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class CropOp<DeviceType::GPU, T> : public Operation {
template<>
class CropOp<DeviceType::GPU, float> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
kernel_ = make_unique<opencl::image::CropKernel>(
Operation::GetRepeatedArgs<int>("offset"));
} else {
MACE_NOT_IMPLEMENTED;
......@@ -133,18 +132,10 @@ class CropOp<DeviceType::GPU, T> : public Operation {
};
#endif // MACE_ENABLE_OPENCL
void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
......@@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
return {DeviceType::CPU};
}
return { DeviceType::CPU, DeviceType::GPU };
return {DeviceType::CPU, DeviceType::GPU};
}));
}
......
......@@ -167,30 +167,30 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class Deconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::Deconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else {
if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
3,
......@@ -256,13 +256,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::CPU, float);
MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class DepthToSpaceOp : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
......@@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation {
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
template<>
class DepthToSpaceOp<DeviceType::GPU, float> : public Operation {
public:
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
kernel_ = make_unique<opencl::image::DepthToSpaceKernel>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthToSpace",
DepthToSpaceOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp);
}
} // namespace ops
......
......@@ -369,24 +369,24 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
template<>
class DepthwiseConv2dOp<DeviceType::GPU, float> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel>();
}
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
......@@ -394,7 +394,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type) == MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DepthwiseConv2dOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, float);
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp);
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
......@@ -467,8 +464,8 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return {op_data_format, DataFormat::OIHW, DataFormat::NONE};
}));
}
......
......@@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
template<>
class DepthwiseDeconv2dOp<DeviceType::GPU, float> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
......@@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d",
DepthwiseDeconv2dOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp);
}
} // namespace ops
......
......@@ -1158,8 +1158,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class EltwiseOp<DeviceType::GPU, T> : public Operation {
template<>
class EltwiseOp<DeviceType::GPU, float> : public Operation {
public:
explicit EltwiseOp(OpConstructContext *context)
: Operation(context) {
......@@ -1178,7 +1178,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
MemoryType mem_type;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
kernel_ = make_unique<opencl::image::EltwiseKernel>(
type, coeff, scalar_input, scalar_input_index);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -1190,14 +1190,14 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
} else if (ws->GetTensor(operator_def_->input(i))->dim_size() == 4) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
i,
......@@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp);
}
} // namespace ops
......
......@@ -184,27 +184,27 @@ class FullyConnectedOp<DeviceType::CPU, uint8_t>
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
template<>
class FullyConnectedOp<DeviceType::GPU, float> : public FullyConnectedOpBase {
public:
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
kernel_ = make_unique<opencl::image::FullyConnectedKernel>();
} else {
MACE_NOT_IMPLEMENTED;
}
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context,
operator_def_.get(),
1,
OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
MACE_CHECK(TransformFilter(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
......@@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) {
FullyConnectedOp, DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "FullyConnected",
FullyConnectedOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp);
}
} // namespace ops
......
......@@ -18,7 +18,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class IdentityOp : public Operation {
public:
explicit IdentityOp(OpConstructContext *context)
......@@ -34,15 +33,13 @@ class IdentityOp : public Operation {
};
void RegisterIdentity(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, float);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Identity", IdentityOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp,
DeviceType::GPU, float);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -19,7 +19,6 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
class InferConv2dShapeOp : public Operation {
public:
explicit InferConv2dShapeOp(OpConstructContext *context)
......@@ -66,20 +65,23 @@ class InferConv2dShapeOp : public Operation {
int32_t out_h = 0, out_w = 0;
if (!paddings.empty()) {
out_h = (in_h - kernels[2] + paddings[0]) / strides[0] + 1;
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1;
} else {
switch (padding_type) {
case SAME:
case SAME: {
out_h = (in_h + strides[0] - 1) / strides[0];
out_w = (in_w + strides[1] - 1) / strides[1];
break;
case VALID:
}
case VALID: {
out_h = (in_h - kernels[2] + 1) / strides[0];
out_w = (in_w - kernels[3] + 1) / strides[1];
break;
default:
}
default: {
MACE_NOT_IMPLEMENTED;
break;
}
}
}
......@@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation {
};
void RegisterInferConv2dShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, float);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, half);
MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape",
InferConv2dShapeOp, DeviceType::GPU, float);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -77,7 +77,7 @@ class MatMulOpBase : public Operation {
} else {
MACE_CHECK(lhs_rank == 2 || rhs_rank == 2,
"Either lhs or rhs matrix should has rank 2 "
"for non-batched matrix multiplication");
"for non-batched matrix multiplication");
}
index_t
......@@ -492,8 +492,8 @@ class MatMulOp<DeviceType::CPU, uint8_t> : public MatMulOpBase {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
template<>
class MatMulOp<DeviceType::GPU, float> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
......@@ -592,7 +592,6 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
};
#endif // MACE_ENABLE_NEON
void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, float);
......@@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
......
......@@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter(
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
......@@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -164,7 +164,6 @@ MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
const index_t size = input->dim(0);
......@@ -181,8 +180,9 @@ MaceStatus TransformArgument(
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
std::string data_dt = DtToCLDt(input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DDATA_TYPE=" + data_dt);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......@@ -229,6 +229,30 @@ MaceStatus TransformArgument(
return MaceStatus::MACE_SUCCESS;
}
MaceStatus BufferTransform::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, output);
default:
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformDWConv2DFilter(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
MaceStatus TransformArgument(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
class BufferTransform : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpContext *context,
......@@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input, dt, output);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input, dt, output);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input, dt, output);
} else {
SetFutureDefaultWaitFn(context->future());
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform(
OpContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
......@@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform(
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
bool Conv2dKernel::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(kwg_size_);
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
MaceStatus Conv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
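The padded-input geometry in Compute follows standard convolution arithmetic: the output width is rounded up to the tile width, then the required input width is derived back through strides and dilations. A standalone worked example of those two formulas:

#include <cstdint>
#include <iostream>

using index_t = int64_t;

template <typename T>
T RoundUp(T v, T m) { return (v + m - 1) / m * m; }

int main() {
  // Example: the 1x1 conv branch (tile_w = 2), stride 1, dilation 1.
  index_t out_w = 7, filter_w = 1, stride_w = 1, dilation_w = 1;
  index_t tile_w = 2;
  index_t padded_out_w = RoundUp<index_t>(out_w, tile_w);  // 8
  index_t padded_in_w =
      (padded_out_w - 1) * stride_w + (filter_w - 1) * dilation_w + 1;  // 8
  std::cout << padded_out_w << " " << padded_in_w << "\n";
  return 0;
}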
......@@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
......@@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
bool Conv2dKernel<T>::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<index_t> &filter_shape,
const std::vector<index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_block_size) {
MACE_UNUSED(runtime);
MACE_UNUSED(output_shape);
MACE_UNUSED(wino_block_size);
return (filter_shape[2] == 3 && filter_shape[3] == 3 &&
strides[0] == 1 && strides[1] == 1 &&
dilations[0] == 1 && dilations[1] == 1);
}
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w, tile_c = 4;
if (use_1x1) {
tile_w = 2;
} else {
tile_w = 4;
}
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
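Both the old template Compute and the new one select the kernel body through a std::function so the shared padding step stays common. A standalone sketch of that dispatch shape (stub types; the real lambdas capture the OpenCL kernels and futures):

#include <functional>
#include <iostream>

struct Tensor {};  // stub
enum class Status { OK };

Status Conv2d1x1(const Tensor *, Tensor *) {
  std::cout << "1x1 path\n";
  return Status::OK;
}
Status Conv2dGeneral(const Tensor *, Tensor *) {
  std::cout << "general path\n";
  return Status::OK;
}

int main() {
  long filter_h = 1, filter_w = 1;
  const bool use_1x1 = (filter_h == 1 && filter_w == 1);
  std::function<Status(const Tensor *, Tensor *)> conv_func =
      use_1x1 ? Conv2d1x1 : Conv2dGeneral;
  Tensor padded_input, output;
  conv_func(&padded_input, &output);  // padding already handled upstream
  return 0;
}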
......@@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + data_dt);
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
std::string pad_data_dt = DtToCLDt(padded_input->dtype());
built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt);
std::string out_data_dt = DtToCLDt(output->dtype());
built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......
......@@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
......@@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context,
}
} // namespace depthwise
MaceStatus DepthwiseConv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
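The "fake" conv2d filter above exists so depthwise convolution can reuse the shared padding and output-size helpers: with a depthwise filter laid out as {multiplier, in_channels, kh, kw}, the equivalent conv filter has multiplier * in_channels output channels. A standalone worked example:

#include <cstdint>
#include <iostream>
#include <vector>

using index_t = int64_t;

int main() {
  // Depthwise filter layout here: {multiplier, in_channels, kh, kw}.
  std::vector<index_t> filter = {1, 32, 3, 3};
  // Fake conv2d filter: output channels = multiplier * in_channels, so the
  // shared padding/output-size helpers can be reused unchanged.
  std::vector<index_t> fake_filter = {filter[0] * filter[1], filter[1],
                                      filter[2], filter[3]};
  std::cout << fake_filter[0] << "x" << fake_filter[1] << "x"
            << fake_filter[2] << "x" << fake_filter[3] << "\n";  // 32x32x3x3
  return 0;
}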
......@@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
......@@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
......@@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
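The scratch sizing used by conv, depthwise conv, and pooling is the product of the padded dims times the element size, plus a safety pad. A standalone worked example (kExtraBufferPadSize is an illustrative stand-in for MACE_EXTRA_BUFFER_PAD_SIZE, whose actual value is not shown in this diff):

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using index_t = int64_t;

constexpr index_t kExtraBufferPadSize = 64;  // illustrative stand-in value

int main() {
  std::vector<index_t> padded_input_shape = {1, 34, 34, 32};  // NHWC
  const index_t elem_size = 2;  // e.g. half storage
  index_t padded_input_size =
      std::accumulate(padded_input_shape.begin(), padded_input_shape.end(),
                      index_t(1), std::multiplies<index_t>()) *
          elem_size +
      kExtraBufferPadSize;
  std::cout << padded_input_size << "\n";  // 1*34*34*32*2 + 64 = 74048
  return 0;
}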
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/pooling.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
MaceStatus PoolingKernel::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// calculate padded input shape
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
auto input_dtype = input->dtype();
auto input_dt = DtToCLDt(input_dtype);
built_options.emplace("-DIN_DATA_TYPE=" + input_dt);
auto output_dtype = output->dtype();
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype));
if (pooling_type == MAX && input_dtype == output_dtype) {
built_options.emplace("-DDATA_TYPE=" + input_dt);
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
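Pooling is the one kernel here that keeps a non-float compute type: a MAX reduction only compares values, so when the input and output storage types match it can run entirely in that type, while AVG accumulates and therefore stays in float. A standalone sketch of that choice (simplified stand-in helper):

#include <iostream>
#include <string>

enum DataType { DT_FLOAT, DT_HALF };
enum PoolingType { AVG, MAX };

std::string DtToCLDt(DataType dt) {  // simplified stand-in
  return dt == DT_HALF ? "half" : "float";
}

std::string ComputeType(PoolingType pooling, DataType in, DataType out) {
  if (pooling == MAX && in == out) return DtToCLDt(in);  // compare-only path
  return DtToCLDt(DT_FLOAT);  // AVG accumulates, so stay in float
}

int main() {
  std::cout << ComputeType(MAX, DT_HALF, DT_HALF) << "\n";  // half
  std::cout << ComputeType(AVG, DT_HALF, DT_HALF) << "\n";  // float
  return 0;
}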
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
......@@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const RoundType round_type,
Tensor *output) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, round_type,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
// calculate padded input shape
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
if (old_scratch_size_ != scratch->size()) {
input_changed |= scratch->size() != old_scratch_size_;
old_scratch_size_ = scratch->size();
}
padded_input = make_unique<Tensor>(scratch->Scratch(padded_input_size),
input->dtype());
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
MACE_OUT_OF_RANGE_VALIDATION
MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/buffer/softmax.h"
namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
MaceStatus SoftmaxKernel::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
} // namespace mace
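The channel math above processes channels four at a time: channel_blocks counts the 4-wide vector blocks and remain_channels is how many lanes of the last block are padding that the kernel must mask out. A standalone worked example:

#include <cstdint>
#include <iostream>

using index_t = int64_t;

index_t RoundUpDiv4(index_t v) { return (v + 3) / 4; }

int main() {
  index_t channels = 10;
  index_t channel_blocks = RoundUpDiv4(channels);  // 3 blocks of 4 lanes
  int remain_channels = static_cast<int>(channel_blocks * 4 - channels);  // 2
  // The kernel masks out the 2 padding lanes in the last 4-wide block.
  std::cout << channel_blocks << " " << remain_channels << "\n";
  return 0;
}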
......@@ -29,7 +29,7 @@ namespace mace {
namespace ops {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
explicit SoftmaxKernel(bool use_log)
......@@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpContext *context,
const Tensor *logits,
Tensor *output) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
if (use_log_) built_options.emplace("-DUSE_LOG");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace ops
......
......@@ -20,11 +20,11 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class BufferTransformOp;
template <typename T>
class BufferTransformOp<DeviceType::GPU, T> : public Operation {
template<>
class BufferTransformOp<DeviceType::GPU, float> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
......@@ -42,7 +42,7 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
......@@ -51,13 +51,8 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
MemoryType out_mem_type_;
};
void RegisterBufferTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferTransform",
BufferTransformOp, DeviceType::GPU, half);
MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp);
}
} // namespace ops
......
......@@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) {
return name + postfix;
}
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size) {
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
const DataType dt = input->dtype();
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
} // namespace ops
} // namespace mace
......@@ -28,17 +28,16 @@
namespace mace {
namespace ops {
// Only used for the GPU Operation (BufferTransform)
template<typename T>
class OpenCLBufferTransformer {
public:
OpenCLBufferTransformer(const MemoryType in_mem_type,
const MemoryType out_mem_type) {
if (out_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BufferToImage<T>>();
kernel_ = make_unique<opencl::image::BufferToImage>();
} else if (in_mem_type == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ImageToBuffer<T>>();
kernel_ = make_unique<opencl::image::ImageToBuffer>();
} else {
kernel_ = make_unique<opencl::buffer::BufferTransform<T>>();
kernel_ = make_unique<opencl::buffer::BufferTransform>();
}
}
......@@ -49,7 +48,7 @@ class OpenCLBufferTransformer {
const int wino_blk_size,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
DataType dt = output->dtype();
MemoryType in_mem_type = input->memory_type();
if (out_mem_type == MemoryType::GPU_IMAGE ||
out_mem_type == MemoryType::GPU_BUFFER) {
......@@ -87,10 +86,10 @@ class OpenCLBufferTransformer {
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
float *output_ptr = output->mutable_data<float>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
......@@ -110,30 +109,13 @@ class OpenCLBufferTransformer {
std::string TransformedFilterName(const std::string &name);
template<typename T>
MaceStatus TransformFilter(
mace::OpConstructContext *context,
OperatorDef *op_def,
const int input_idx,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
const int wino_blk_size = 0) {
const DataType dt = DataTypeToEnum<T>::value;
OpContext op_context(context->workspace(), context->device());
Workspace *ws = context->workspace();
std::string input_name = op_def->input(input_idx);
Tensor *input = ws->GetTensor(input_name);
std::string output_name = TransformedFilterName(input_name);
Tensor *output =
ws->CreateTensor(output_name, context->device()->allocator(), dt, true);
// update the information
op_def->set_input(input_idx, output_name);
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
output);
}
const int wino_blk_size = 0);
} // namespace ops
} // namespace mace
......
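The constructor above picks the transform kernel purely from the two memory types: the destination wins first, then the source, and plain buffer-to-buffer falls through to BufferTransform. A standalone sketch of that selection:

#include <iostream>
#include <string>

enum MemoryType { CPU_BUFFER, GPU_BUFFER, GPU_IMAGE };

// Mirrors the constructor: destination decides first, then source; plain
// buffer-to-buffer falls through to BufferTransform.
std::string PickKernel(MemoryType in, MemoryType out) {
  if (out == GPU_IMAGE) return "BufferToImage";
  if (in == GPU_IMAGE) return "ImageToBuffer";
  return "BufferTransform";
}

int main() {
  std::cout << PickKernel(GPU_BUFFER, GPU_IMAGE) << "\n";   // BufferToImage
  std::cout << PickKernel(GPU_IMAGE, GPU_BUFFER) << "\n";   // ImageToBuffer
  std::cout << PickKernel(GPU_BUFFER, GPU_BUFFER) << "\n";  // BufferTransform
  return 0;
}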
......@@ -17,8 +17,9 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
class OpContext;
......
......@@ -17,7 +17,10 @@
#include <vector>
#include "mace/ops/activation.h"
#include "mace/core/types.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
namespace mace {
......
......@@ -19,6 +19,9 @@
#include <vector>
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/core/types.h"
namespace mace {
......
......@@ -15,8 +15,7 @@
#ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#define MACE_OPS_OPENCL_FULLY_CONNECTED_H_
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/public/mace.h"
#include "mace/utils/math.h"
......
......@@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) {
}
}
std::string DtToUpCompatibleCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
......
......@@ -100,17 +100,9 @@ std::vector<index_t> FormatBufferShape(
// CPU data type to OpenCL command data type
std::string DtToCLCMDDt(const DataType dt);
// CPU data type to upward compatible OpenCL command data type
// e.g. half -> float
std::string DtToUpCompatibleCLCMDDt(const DataType dt);
// CPU data type to OpenCL data type
std::string DtToCLDt(const DataType dt);
// CPU data type to upward compatible OpenCL data type
// e.g. half -> float
std::string DtToUpCompatibleCLDt(const DataType dt);
// CPU data type to OpenCL condition data type used in select
// e.g. half -> float
std::string DtToCLCondDt(const DataType dt);
......
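The two DtToUpCompatible* helpers could be deleted because, for both types they accepted (float and half), they returned the float mapping, which call sites now spell directly as DtToCLDt(DT_FLOAT). A standalone demonstration of that equivalence (stand-in versions of both helpers):

#include <cassert>
#include <string>

enum DataType { DT_FLOAT, DT_HALF };

std::string DtToCLDt(DataType dt) {  // simplified stand-in
  return dt == DT_HALF ? "half" : "float";
}

std::string DtToUpCompatibleCLDt(DataType) {  // the removed helper, in essence
  return "float";  // half and float both map "up" to float
}

int main() {
  for (DataType dt : {DT_FLOAT, DT_HALF}) {
    assert(DtToUpCompatibleCLDt(dt) == DtToCLDt(DT_FLOAT));
  }
  return 0;
}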
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/activation.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ActivationKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
switch (activation_) {
case RELU: {
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
}
case RELUX: {
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
}
case PRELU: {
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
}
case TANH: {
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
}
case SIGMOID: {
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
}
case LEAKYRELU: {
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
}
default: {
LOG(FATAL) << "Unknown activation type: " << activation_;
}
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
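The 3D global work size used here, and by most image kernels in this diff, flattens NHWC as {C/4 blocks, W, N*H}, one work item per 4-channel pixel. A standalone example of the mapping:

#include <cstdint>
#include <iostream>

int main() {
  int64_t batch = 2, height = 8, width = 8, channels = 10;
  int64_t channel_blocks = (channels + 3) / 4;  // RoundUpDiv4
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),   // 3
                           static_cast<uint32_t>(width),            // 8
                           static_cast<uint32_t>(height * batch)};  // 16
  std::cout << gws[0] << " " << gws[1] << " " << gws[2] << "\n";
  return 0;
}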
......@@ -31,12 +31,11 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit,
T leakyrelu_coefficient)
float relux_max_limit,
float leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
......@@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel {
private:
ActivationType activation_;
T relux_max_limit_;
T leakyrelu_coefficient_;
float relux_max_limit_;
float leakyrelu_coefficient_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
std::string tuning_key_prefix_;
};
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
tuning_key_prefix_ = "leakyrelu_opencl_kernel";
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/addn.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus AddNKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
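AddN flattens the same image layout into a 2D range instead, folding the channel blocks into the width axis, and bakes the input count into the program via -DINPUT_NUM, which is why more than four inputs hit MACE_NOT_IMPLEMENTED. A worked example of the 2D range:

#include <cstdint>
#include <iostream>

int main() {
  int64_t batch = 1, height = 8, width = 8, channels = 16;
  int64_t channel_blocks = (channels + 3) / 4;    // 4
  int64_t width_pixels = channel_blocks * width;  // 32
  int64_t batch_height_pixels = batch * height;   // 8
  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  std::cout << gws[0] << " " << gws[1] << "\n";  // 32 8
  return 0;
}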
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel {
public:
MaceStatus Compute(
......@@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_norm.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
BatchNormKernel::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
MaceStatus BatchNormKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
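// When mean and var are provided, the kernel normalizes at run time;
// otherwise scale/offset are assumed pre-folded by the model converter
// (conceptually scale' = scale / sqrt(var + epsilon) and
// offset' = offset - mean * scale'), selected via -DFOLDED_CONSTANT below.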
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
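// IN_OUT_CHANNEL images pack 4 consecutive channels per pixel, so the
// work grid is (channel_blocks, width, height * batch).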
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
  case NOOP:
    break;
  case RELU:
    built_options.emplace("-DUSE_RELU");
    break;
  case RELUX:
    built_options.emplace("-DUSE_RELUX");
    break;
  case TANH:
    built_options.emplace("-DUSE_TANH");
    break;
  case SIGMOID:
    built_options.emplace("-DUSE_SIGMOID");
    break;
  case LEAKYRELU:
    built_options.emplace("-DUSE_LEAKYRELU");
    break;
  default:
    LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/activation.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
namespace mace {
......@@ -31,7 +31,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel {
public:
BatchNormKernel(
......@@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
bool not_folded = (mean != nullptr && var != nullptr);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (!not_folded) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (not_folded) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon_);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/batch_to_space.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BatchToSpaceKernel::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = batch_tensor->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
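// Block sizes are passed first; the crops are presumably laid out as
// [top, bottom, left, right], so only paddings[0] (top) and paddings[2]
// (left) are needed by the kernel.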
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public:
MaceStatus Compute(
......@@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor) {
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/bias_add.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BiasAddKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
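// Devices without non-uniform work-group support require the global size
// to be a multiple of the local size, so gws is rounded up; the surplus
// work items are expected to be masked out in the kernel via the gws
// arguments set above.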
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
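// The enqueue is asynchronous; the future lets the caller block on the
// event later and, when profiling, collect per-call stats.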
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel {
public:
MaceStatus Compute(
......@@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus BufferToImage::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
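// Each OpenCLBufferType maps to a dedicated transform kernel. For
// WINOGRAD_FILTER, filters are stored as (blk + 2) x (blk + 2) transformed
// tiles, so the image height is divided by that tile area below.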
switch (type) {
  case CONV2D_FILTER:
    kernel_name = "filter_buffer_to_image";
    break;
  case DW_CONV2D_FILTER:
    kernel_name = "dw_filter_buffer_to_image";
    break;
  case IN_OUT_CHANNEL:
    kernel_name = "in_out_buffer_to_image";
    break;
  case ARGUMENT:
    kernel_name = "arg_buffer_to_image";
    break;
  case IN_OUT_HEIGHT:
    kernel_name = "in_out_height_buffer_to_image";
    break;
  case IN_OUT_WIDTH:
    kernel_name = "in_out_width_buffer_to_image";
    break;
  case WEIGHT_HEIGHT:
    kernel_name = "weight_height_buffer_to_image";
    break;
  case WEIGHT_WIDTH:
    kernel_name = "weight_width_buffer_to_image";
    break;
  case WINOGRAD_FILTER: {
    std::stringstream ss_tmp;
    gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
    ss_tmp << "winograd_filter_buffer_to_image_"
           << wino_blk_size << "x" << wino_blk_size;
    kernel_name = ss_tmp.str();
    break;
  }
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
auto input_dt = input->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class BufferToImage : public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
......@@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferToImage<T>::Compute(
OpContext *context,
const Tensor *input,
const OpenCLBufferType type,
const int wino_blk_size,
Tensor *output) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(formatted_buffer_shape,
type,
&image_shape,
wino_blk_size);
MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
std::string kernel_name;
switch (type) {
case CONV2D_FILTER:
kernel_name = "filter_buffer_to_image";
break;
case DW_CONV2D_FILTER:
kernel_name = "dw_filter_buffer_to_image";
break;
case IN_OUT_CHANNEL:
kernel_name = "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = "arg_buffer_to_image";
break;
case IN_OUT_HEIGHT:
kernel_name = "in_out_height_buffer_to_image";
break;
case IN_OUT_WIDTH:
kernel_name = "in_out_width_buffer_to_image";
break;
case WEIGHT_HEIGHT:
kernel_name = "weight_height_buffer_to_image";
break;
case WEIGHT_WIDTH:
kernel_name = "weight_width_buffer_to_image";
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_buffer_to_image_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
}
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (input->dtype() == output->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
} else {
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel_.setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
if (type == CONV2D_FILTER) {
const index_t
inner_size = input->dim(1) * input->dim(2) * input->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[2]));
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/channel_shuffle.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
MaceStatus ChannelShuffleKernel::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of groups. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
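// Channel shuffle transposes the (groups, channels_per_group) channel
// layout to (channels_per_group, groups), interleaving channels across
// groups.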
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
......@@ -30,7 +30,6 @@ namespace ops {
namespace opencl {
namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
......@@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpContext *context,
const Tensor *input,
Tensor *output) {
MACE_CHECK(input->dim(3) % groups_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(3));
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->gpu_runtime()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, context->future()));
MACE_OUT_OF_RANGE_VALIDATION;
return MaceStatus::MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size) {
......@@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context,
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
auto data_dt = input0->dtype();
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
}
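// With the template parameter gone, the CL data type is now derived from
// the tensors themselves; when input and output dtypes differ, the kernel
// falls back to float compute.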
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
......@@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
......@@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context,
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
......@@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context,
}
} // namespace concat
MaceStatus ConcatKernel::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be the same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should be equal except along the axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
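// Two inputs use the specialized concat_channel kernel; more inputs fall
// back to the generic concat_channel_multi kernel.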
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
&input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context,
&kernel_,
input_list,
output,
&kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace ops
......
......@@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
uint32_t *kwg_size);
......@@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context,
MaceStatus ConcatN(OpContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
uint32_t *kwg_size);
} // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
ConcatKernel() {}
......@@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel {
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpContext *context,
const std::vector<const Tensor *> &input_list,
const int32_t axis,
Tensor *output) {
const int inputs_count = input_list.size();
const Tensor *input0 = input_list[0];
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should equal except axis.");
}
output_shape[axis] += input->dim(axis);
}
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape(output_shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
default:
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, &kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace ops
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/image/conv_2d.h"
namespace mace {
namespace ops {
namespace opencl {
namespace image {
bool Conv2dKernel::CheckUseWinograd(
OpenCLRuntime *runtime,
const std::vector<mace::index_t> &filter_shape,
const std::vector<mace::index_t> &output_shape,
const int *strides,
const int *dilations,
int *wino_blk_size) {
if (filter_shape[2] != 3 || filter_shape[3] != 3 ||
strides[0] > 1 || strides[1] > 1 ||
dilations[0] > 1 || dilations[1] > 1) {
return false;
}
index_t out_channels = filter_shape[0];
index_t in_channels = filter_shape[1];
auto opencl_image_max_size = runtime->GetMaxImage2DSize();
auto check_opencl_limit = [&](int block_size) -> bool {
int sqr_block = (block_size + 2) * (block_size + 2);
uint64_t transformed_width = static_cast<uint64_t>(output_shape[0] *
((output_shape[1] + block_size - 1) / block_size) *
((output_shape[2] + block_size - 1) / block_size));
return (transformed_width < opencl_image_max_size[0] &&
static_cast<uint64_t>(sqr_block * in_channels)
< opencl_image_max_size[1] &&
static_cast<uint64_t>(sqr_block * out_channels)
< opencl_image_max_size[1]);
};
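// Worked example (illustrative numbers): for an output of 1 x 64 x 64 and
// block size 4, transformed_width = 1 * 16 * 16 = 256 and sqr_block =
// (4 + 2) * (4 + 2) = 36, so the check requires 256 < max image width and
// 36 * in/out channels < max image height.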
// The GPU path supports only 2x2 and 4x4 Winograd convolution.
if (*wino_blk_size == 4) {
// If block size 4 exceeds the OpenCL image size limit, fall back to 2.
if (!check_opencl_limit(4)) {
*wino_blk_size = 2;
} else {
return true;
}
}
return check_opencl_limit(2);
}
MaceStatus Conv2dKernel::Compute(
OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
ops::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
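// Dispatch: Winograd when a block size was selected above, otherwise the
// specialized 1x1 / 3x3 kernels, and the generic conv kernel as fallback.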
std::function<MaceStatus()> conv_func;
if (wino_blk_size != 0) {
// Use Winograd convolution.
conv_func = [&]() -> MaceStatus {
cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]};
uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]};
return WinogradConv2dK3x3S1(context,
kernels,
input,
filter,
bias,
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
wino_blk_size,
&input_shape_,
output,
kwg_size);
};
} else if (kernel_h == 1 && kernel_w == 1) {
conv_func = [&]() -> MaceStatus {
return Conv2dK1x1(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
} else if (kernel_h == 3 && kernel_w == 3) {
conv_func = [&]() -> MaceStatus {
return Conv2dK3x3(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
} else {
conv_func = [&]() -> MaceStatus {
return Conv2d(context,
&kernels_[0],
input,
filter,
bias,
strides[0],
paddings.data(),
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
&input_shape_,
output,
&kwg_size_[0]);
};
}
return conv_func();
}
} // namespace image
} // namespace opencl
} // namespace ops
} // namespace mace
(48 more file diffs are collapsed.)