diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fc705a53d1b8c266d1c6104982fda5caf33426a..9714b1456a1ecdc227b5861d114f09f7464cae97 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,7 @@ if(MACE_ENABLE_CUDA)
   enable_language(CUDA)
 endif(MACE_ENABLE_CUDA)
 
-if((MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA))
+if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
   if(ANDROID_ABI STREQUAL "arm64-v8a")
     # Use gold linker to avoid linking check of libcdsprpc.so
     set(MACE_LINKER_FLAGS "${MACE_LINKER_FLAGS} -fuse-ld=gold")
diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md
index 3e4616717767f46894f461d25abe599561639a91..2bf0af810845070f77ac174bcbfb7ccfc8f40113 100644
--- a/docs/development/adding_a_new_op.md
+++ b/docs/development/adding_a_new_op.md
@@ -33,8 +33,8 @@ class MyCustomOp : public Operation {
 }
 
 #ifdef MACE_ENABLE_OPENCL
-template <typename T>
-class MyCustomOp<DeviceType::GPU, T> : public Operation {
+template<>
+class MyCustomOp<DeviceType::GPU, float> : public Operation {
 ...
 };
 #endif // MACE_ENABLE_OPENCL
@@ -43,13 +43,7 @@ void RegisterMyCustomOp(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
                    DeviceType::CPU, float);
 
-#ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
-                   DeviceType::GPU, float);
-
-  MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp,
-                   DeviceType::GPU, half);
-#endif // MACE_ENABLE_OPENCL
+  MACE_REGISTER_GPU_OP(op_registry, "MyCustomOp", MyCustomOp);
 }
 
 } // namespace ops
diff --git a/mace/codegen/BUILD.bazel b/mace/codegen/BUILD.bazel
index 0e5bad98a70cfdb6ebe37fc8112c535ef0ca6e8b..a2a750156f7efd9127d71045b20d3fdd72fcf37f 100644
--- a/mace/codegen/BUILD.bazel
+++ b/mace/codegen/BUILD.bazel
@@ -5,7 +5,7 @@ package(
     default_visibility = ["//visibility:public"],
 )
 
-load("//mace:mace.bzl", "mace_version_genrule", "encrypt_opencl_kernel_genrule")
+load("//mace:mace.bzl", "encrypt_opencl_kernel_genrule", "mace_version_genrule")
 
 cc_library(
     name = "generated_models",
@@ -28,6 +28,7 @@ encrypt_opencl_kernel_genrule()
 cc_library(
     name = "generated_opencl",
     srcs = ["opencl/encrypt_opencl_kernel.cc"],
+    hdrs = ["opencl/encrypt_opencl_kernel.h"],
     copts = [
         "-Werror",
         "-Wextra",
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 605ae3a759b9beae2d930263f20316490c15fd1b..883bc1eb828faaeeda015402d1f9f40059f28d5c 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -318,7 +318,7 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
   std::string key = OpKeyBuilder(op_type)
       .Device(device_type)
-      .TypeConstraint("T", dtype)
+      .TypeConstraint("T", dtype == DT_HALF ?
DT_FLOAT : dtype) .Build(); if (registry_.at(op_type)->creators.count(key) == 0) { LOG(FATAL) << "Key not registered: " << key; diff --git a/mace/core/operator.h b/mace/core/operator.h index 9430d90d05be00ac2ae1e7034c4ea3f8c5dadfe2..fbcbfd2ead3f8d70552464420f450fae17b04b0a 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -39,7 +39,7 @@ class OpConditionContext { OpConditionContext(const Workspace *ws, TensorShapeMap *info); ~OpConditionContext() = default; - void set_operator_def(const OperatorDef* operator_def); + void set_operator_def(const OperatorDef *operator_def); inline const OperatorDef *operator_def() const { return operator_def_; @@ -49,7 +49,7 @@ class OpConditionContext { return ws_; } - inline void set_device(Device* device) { + inline void set_device(Device *device) { device_ = device; } @@ -110,7 +110,7 @@ class OpConstructContext { return ws_; } - inline void set_device(Device* device) { + inline void set_device(Device *device) { device_ = device; } @@ -166,14 +166,14 @@ class Operation { explicit Operation(OpConstructContext *context); virtual ~Operation() = default; - template + template inline T GetOptionalArg(const std::string &name, const T &default_value) const { MACE_CHECK(operator_def_, "operator_def was null!"); return ProtoArgHelper::GetOptionalArg( *operator_def_, name, default_value); } - template + template inline std::vector GetRepeatedArgs( const std::string &name, const std::vector &default_value = {}) const { MACE_CHECK(operator_def_, "operator_def was null!"); @@ -240,7 +240,6 @@ class Operation { #define MACE_OP_OUTPUT_TAGS(first_input, ...) \ enum _OutputTags { first_input = 0, __VA_ARGS__ } - struct OpRegistrationInfo { public: typedef std::function(OpConstructContext *)> @@ -290,7 +289,6 @@ class OpConditionBuilder { OpRegistrationInfo::DataFormatSelector data_format_selector_; }; - class OpRegistryBase { public: OpRegistryBase() = default; @@ -315,7 +313,7 @@ class OpRegistryBase { OpConstructContext *context, DeviceType device_type) const; - template + template static std::unique_ptr DefaultCreator( OpConstructContext *context) { return std::unique_ptr(new DerivedType(context)); @@ -334,6 +332,24 @@ class OpRegistryBase { DataTypeToEnum
::value, \
                         OpRegistryBase::DefaultCreator<class_name<device, dt>>)
 
+#define MACE_REGISTER_OP_BY_CLASS( \
+    op_registry, op_type, class_name, device, dt) \
+  op_registry->Register(op_type, \
+                        device, \
+                        DataTypeToEnum<dt>::value, \
+                        OpRegistryBase::DefaultCreator<class_name>)
+
+#ifdef MACE_ENABLE_OPENCL
+#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name) \
+  op_registry->Register( \
+      op_type, \
+      DeviceType::GPU, \
+      DT_FLOAT, \
+      OpRegistryBase::DefaultCreator<class_name<DeviceType::GPU, float>>)
+#else
+#define MACE_REGISTER_GPU_OP(op_registry, op_type, class_name)
+#endif
+
 #define MACE_REGISTER_OP_CONDITION(op_registry, builder) \
   op_registry->Register(builder)
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 022010246da6c59b6bf29da2acfe88b98fabf9be..4875cc228c00effeff3d12d676df410103ae16d2 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -18,20 +18,19 @@
 #include
 #include
 #include  // NOLINT(build/c++11)
+#include
 #include
 #include
 #include
 
-#include "mace/utils/macros.h"
+#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
 #include "mace/core/kv_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
+#include "mace/utils/macros.h"
 #include "mace/utils/tuner.h"
 
 namespace mace {
 
-extern const std::map<std::string, std::vector<unsigned char>>
-    kEncryptedProgramMap;
-
 const std::string OpenCLErrorToString(cl_int error) {
   switch (error) {
     case CL_SUCCESS:
@@ -265,7 +264,7 @@ OpenCLRuntime::OpenCLRuntime(
     const GPUPriorityHint priority_hint,
     const GPUPerfHint perf_hint,
     std::shared_ptr<KVStorage> precompiled_binary_storage,
-    std::shared_ptr<Tuner<uint32_t>> tuner):
+    std::shared_ptr<Tuner<uint32_t>> tuner) :
     cache_storage_(cache_storage),
     precompiled_binary_storage_(precompiled_binary_storage),
     tuner_(tuner),
@@ -332,7 +331,7 @@ OpenCLRuntime::OpenCLRuntime(
   cl_int err;
   if (gpu_type_ == GPUType::QUALCOMM_ADRENO
-    && opencl_version_ == OpenCLVersion::CL_VER_2_0) {
+      && opencl_version_ == OpenCLVersion::CL_VER_2_0) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties,
@@ -345,8 +344,8 @@ OpenCLRuntime::OpenCLRuntime(
 #if CL_HPP_TARGET_OPENCL_VERSION >= 200
     if (is_profiling_enabled_ && gpu_type_ == GPUType::MALI) {
       std::vector<cl_context_properties> context_properties = {
-          CL_CONTEXT_PLATFORM, (cl_context_properties)default_platform(),
-          CL_PRINTF_CALLBACK_ARM, (cl_context_properties)OpenCLPrintfCallback,
+          CL_CONTEXT_PLATFORM, (cl_context_properties) default_platform(),
+          CL_PRINTF_CALLBACK_ARM, (cl_context_properties) OpenCLPrintfCallback,
           CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0
       };
       context_ = std::shared_ptr<cl::Context>(
@@ -399,7 +398,7 @@ OpenCLRuntime::OpenCLRuntime(
   if (cached_binary_platform_info != platform_info_) {
     if (precompiled_binary_storage_ == nullptr) {
       VLOG(1) << "There is no precompiled OpenCL binary in"
-                 " all OpenCL binary paths.";
+              " all OpenCL binary paths.";
     } else {
       if (precompiled_binary_storage_->Load() != 0) {
         LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
" @@ -530,17 +529,47 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( return true; } +MaceStatus GetProgramSourceByName(const std::string &program_name, + std::string *source) { + MACE_CHECK_NOTNULL(source); + std::stringstream source_stream; + const auto &kEncryptedProgramMap = mace::codegen::kEncryptedProgramMap; + const auto &it_program = kEncryptedProgramMap.find(program_name); + if (it_program == kEncryptedProgramMap.end()) { + LOG(ERROR) << "Find program " << program_name << " failed."; + return MaceStatus::MACE_RUNTIME_ERROR; + } + + const std::vector &headers = it_program->second.headers_; + for (const std::string &header : headers) { + const auto &header_program = kEncryptedProgramMap.find(header); + if (header_program == kEncryptedProgramMap.end()) { + LOG(WARNING) << "Program header(" << header << ") is empty."; + continue; + } + + const auto &header_source = header_program->second.encrypted_code_; + source_stream << ObfuscateString( + std::string(header_source.begin(), header_source.end())); + } + + const auto &it_source = it_program->second.encrypted_code_; + source_stream << ObfuscateString( + std::string(it_source.begin(), it_source.end())); + *source = source_stream.str(); + + return MaceStatus::MACE_SUCCESS; +} + bool OpenCLRuntime::BuildProgramFromSource( const std::string &program_name, const std::string &built_program_key, const std::string &build_options_str, cl::Program *program) { - // Find from source - auto it_source = kEncryptedProgramMap.find(program_name); - if (it_source != kEncryptedProgramMap.end()) { + std::string kernel_source; + MaceStatus status = GetProgramSourceByName(program_name, &kernel_source); + if (status == MaceStatus::MACE_SUCCESS && !kernel_source.empty()) { cl::Program::Sources sources; - std::string source(it_source->second.begin(), it_source->second.end()); - std::string kernel_source = ObfuscateString(source); sources.push_back(kernel_source); *program = cl::Program(context(), sources); cl_int ret = program->build({device()}, build_options_str.c_str()); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 6b566d181ff3a7074be3e31ef2eb5ed725bf30d7..500b84eff39c4d9e8cd578a2b90949bc7524d27f 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -66,7 +66,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { *net_def, "opencl_mem_type", static_cast(MemoryType::GPU_IMAGE)); const MemoryType mem_type = static_cast(mem_type_i); - runtime->set_mem_type(mem_type); return MaceStatus::MACE_SUCCESS; diff --git a/mace/mace.bzl b/mace/mace.bzl index cef0a5d225a46de6357c9db3bb464fee899040be..47d44edb38e90ebf61f6c1ed9d2dcff23126214d 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -118,9 +118,21 @@ def mace_version_genrule(): ) def encrypt_opencl_kernel_genrule(): - native.genrule( - name = "encrypt_opencl_kernel_gen", - srcs = [str(Label("@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel"))], - outs = ["opencl/encrypt_opencl_kernel.cc"], - cmd = "cat $(SRCS) > $@;" - ) + srcs = [ + str(Label( + "@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.cc", + )), + str(Label( + "@local_opencl_kernel_encrypt//:gen/encrypt_opencl_kernel.h", + )), + ] + outs = ["opencl/encrypt_opencl_kernel.cc", "opencl/encrypt_opencl_kernel.h"] + native.genrule( + name = "encrypt_opencl_kernel_gen", + srcs = srcs, + outs = outs, + cmd = " && ".join([ + "cat $(location %s) > $(location %s)" % (srcs[i], outs[i]) + for i in range(0, len(outs)) + ]), + ) diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 
a80b556dda4d759c8be28cffcb8ed4c1c45fea52..9861198aaa49b99dec5302a0c934f2947e39fc7d 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -181,7 +181,6 @@ cc_library( ], ) - cc_library( name = "internal_ops", srcs = glob( @@ -239,10 +238,10 @@ cc_library( name = "ops", srcs = [ "registry/ops_registry.cc", - ], + ], hdrs = [ "registry/ops_registry.h", - ], + ], copts = [ "-Werror", "-Wextra", diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 6cb21b5c525ee0b6529348bcfcddd7acd9cfef7b..255370568b6eb7a8702900b85b0e2c99d4606a6b 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -83,28 +83,27 @@ class ActivationOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class ActivationOp : public Operation { +template<> +class ActivationOp : public Operation { public: explicit ActivationOp(OpConstructContext *context) : Operation(context) { ActivationType type = ops::StringToActivationType( Operation::GetOptionalArg("activation", "NOOP")); - auto relux_max_limit = static_cast( - Operation::GetOptionalArg("max_limit", 0.0f)); - auto leakyrelu_coefficient = static_cast( - Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)); + auto relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + auto leakyrelu_coefficient = + Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f); MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>( + kernel_ = make_unique( type, relux_max_limit, leakyrelu_coefficient); } else { MACE_NOT_IMPLEMENTED; } if (type == ActivationType::PRELU) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -126,14 +125,7 @@ class ActivationOp : public Operation { void RegisterActivation(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Activation", ActivationOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Activation") @@ -141,16 +133,16 @@ void RegisterActivation(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 523557cffdec564ba9706c4279dd4f20f0d933a7..5b98ba8554caa69929adacefe27b94499d274cd9 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -29,10 +29,10 @@ namespace mace { namespace ops { -template +template class AddNOp; -template <> +template<> class AddNOp : public Operation { public: explicit AddNOp(OpConstructContext *context) @@ -62,13 +62,13 @@ class AddNOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class AddNOp : public Operation { +template<> +class AddNOp : public Operation { public: 
explicit AddNOp(OpConstructContext *context) : Operation(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -92,15 +92,9 @@ class AddNOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterAddN(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "AddN", AddNOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("AddN") @@ -108,16 +102,16 @@ void RegisterAddN(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index 4e303d07e79b1a5cc9d847720aede92de462f980..a27e46c5739428e6b08952db83f0dfce5b60e798 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -161,8 +161,8 @@ class BatchNormOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class BatchNormOp : public Operation { +template<> +class BatchNormOp : public Operation { public: explicit BatchNormOp(OpConstructContext *context) : Operation(context) { @@ -176,7 +176,7 @@ class BatchNormOp : public Operation { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>( + kernel_ = make_unique( epsilon, activation, relux_max_limit, leakyrelu_coefficient); } else { MACE_NOT_IMPLEMENTED; @@ -187,7 +187,7 @@ class BatchNormOp : public Operation { const Tensor *input_tensor = context->workspace()->GetTensor( operator_def_->input(i)); MACE_CHECK(input_tensor != nullptr); - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), i, @@ -235,14 +235,7 @@ class BatchNormOp : public Operation { void RegisterBatchNorm(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "BatchNorm", BatchNormOp); } } // namespace ops diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 03ac91ffb146d4e54c12d94497fb19bdec23337a..937387fc6be78587c0898a5ab5d00a3640b87d3b 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -80,10 +80,10 @@ class BatchToSpaceOpBase : public Operation { } }; -template +template class BatchToSpaceNDOp; -template <> +template<> class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) @@ -175,7 +175,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { } }; -template <> +template<> class BatchToSpaceNDOp 
: public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) @@ -259,13 +259,13 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { }; #ifdef MACE_ENABLE_OPENCL -template -class BatchToSpaceNDOp : public BatchToSpaceOpBase { +template<> +class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) : BatchToSpaceOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -285,7 +285,6 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, float); @@ -293,13 +292,7 @@ void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp, DeviceType::CPU, uint8_t); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "BatchToSpaceND", - BatchToSpaceNDOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BatchToSpaceND", - BatchToSpaceNDOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "BatchToSpaceND", BatchToSpaceNDOp); } } // namespace ops diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 72e93fece0850710fd26aefab0cdddcddaedfc3e..f8c5b352d54bd80dd622d8fba8d5a81b8429a88b 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -34,16 +34,16 @@ namespace mace { namespace ops { -template +template class BiasAddOp; -template <> +template<> class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - has_data_format_(Operation::GetOptionalArg("has_data_format", 0)) - {} + has_data_format_(Operation::GetOptionalArg("has_data_format", + 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -96,8 +96,8 @@ class BiasAddOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class BiasAddOp : public Operation { +template<> +class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), @@ -105,11 +105,11 @@ class BiasAddOp : public Operation { MemoryType mem_type = MemoryType::CPU_BUFFER; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -133,18 +133,10 @@ class BiasAddOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterBiasAdd(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "BiasAdd", BiasAddOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("BiasAdd") @@ -152,16 +144,16 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return 
{DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index d68ebbbec9d8c03ee4045c92cf4258f9326dcca8..a7fababb3e9a2806d4de0eb4b9d91600c4180a30 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -23,10 +23,10 @@ namespace mace { namespace ops { -template +template class ChannelShuffleOp; -template +template class ChannelShuffleOp : public Operation { public: explicit ChannelShuffleOp(OpConstructContext *context) @@ -74,16 +74,15 @@ class ChannelShuffleOp : public Operation { const int groups_; }; - #ifdef MACE_ENABLE_OPENCL -template -class ChannelShuffleOp : public Operation { +template<> +class ChannelShuffleOp : public Operation { public: explicit ChannelShuffleOp(OpConstructContext *context) : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(groups); + kernel_ = make_unique(groups); } else { MACE_NOT_IMPLEMENTED; } @@ -99,18 +98,11 @@ class ChannelShuffleOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterChannelShuffle(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ChannelShuffle", ChannelShuffleOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ChannelShuffle", - ChannelShuffleOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ChannelShuffle", - ChannelShuffleOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ChannelShuffle", ChannelShuffleOp); MACE_REGISTER_OP_CONDITION( op_registry, @@ -119,19 +111,19 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int groups = ProtoArgHelper::GetOptionalArg( *op, "group", 1); if (op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } index_t channels = op->output_shape(0).dims(3); index_t channels_per_group = channels / groups; if (groups % 4 != 0 || channels_per_group % 4 != 0) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/pad.h b/mace/ops/common/pad_type.h similarity index 87% rename from mace/ops/pad.h rename to mace/ops/common/pad_type.h index e2139e27e0ae319a8ebe4a441eebc5e53187b965..e244b5e6cbd5fcf1354c7e625b83f60abebb3d56 100644 --- a/mace/ops/pad.h +++ b/mace/ops/common/pad_type.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_PAD_H_ -#define MACE_OPS_PAD_H_ +#ifndef MACE_OPS_COMMON_PAD_TYPE_H_ +#define MACE_OPS_COMMON_PAD_TYPE_H_ namespace mace { namespace ops { @@ -27,4 +27,4 @@ enum PadType { } // namespace ops } // namespace mace -#endif // MACE_OPS_PAD_H_ +#endif // MACE_OPS_COMMON_PAD_TYPE_H_ diff --git a/mace/ops/pooling.h b/mace/ops/common/pooling_type.h similarity index 85% rename from mace/ops/pooling.h rename to mace/ops/common/pooling_type.h index c49b2669975bf856d30c0d2cf6ab7deef01e09e1..c7adccbf4c2dabdea6f10d25b7a8e8ae4f1eecbc 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/common/pooling_type.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_POOLING_H_ -#define MACE_OPS_POOLING_H_ +#ifndef MACE_OPS_COMMON_POOLING_TYPE_H_ +#define MACE_OPS_COMMON_POOLING_TYPE_H_ namespace mace { @@ -23,4 +23,4 @@ enum PoolingType { }; } // namespace mace -#endif // MACE_OPS_POOLING_H_ +#endif // MACE_OPS_COMMON_POOLING_TYPE_H_ diff --git a/mace/ops/reduce.h b/mace/ops/common/reduce_type.h similarity index 86% rename from mace/ops/reduce.h rename to mace/ops/common/reduce_type.h index 2888bb721ff9fb9f55a28786593da988734f19de..667f6bece40be4bfb4d0594c9920bcdb6a3e0918 100644 --- a/mace/ops/reduce.h +++ b/mace/ops/common/reduce_type.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_REDUCE_H_ -#define MACE_OPS_REDUCE_H_ +#ifndef MACE_OPS_COMMON_REDUCE_TYPE_H_ +#define MACE_OPS_COMMON_REDUCE_TYPE_H_ namespace mace { @@ -28,4 +28,4 @@ enum ReduceType { }; } // namespace mace -#endif // MACE_OPS_REDUCE_H_ +#endif // MACE_OPS_COMMON_REDUCE_TYPE_H_ diff --git a/mace/ops/resize_bicubic.h b/mace/ops/common/utils.h similarity index 85% rename from mace/ops/resize_bicubic.h rename to mace/ops/common/utils.h index 97323b8665c1ada6b3c16e8e95ee52230f0350b8..06648942bb48492d946793401920cc246ae77b1a 100644 --- a/mace/ops/resize_bicubic.h +++ b/mace/ops/common/utils.h @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_RESIZE_BICUBIC_H_ -#define MACE_OPS_RESIZE_BICUBIC_H_ +#ifndef MACE_OPS_COMMON_UTILS_H_ +#define MACE_OPS_COMMON_UTILS_H_ #include "mace/core/types.h" namespace mace { namespace ops { -namespace resize_bicubic { +namespace common { +namespace utils { + constexpr int64_t kTableSize = (1u << 10); inline float CalculateResizeScale(index_t in_size, @@ -29,9 +31,10 @@ inline float CalculateResizeScale(index_t in_size, ? 
(in_size - 1) / static_cast(out_size - 1) : in_size / static_cast(out_size); } -} // namespace resize_bicubic +} // namespace utils +} // namespace common } // namespace ops } // namespace mace -#endif // MACE_OPS_RESIZE_BICUBIC_H_ +#endif // MACE_OPS_COMMON_UTILS_H_ diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 518e9cc2b5b9b0d8ff54308e60bc5a3c55e52f42..ccdb0b2db551d2ce26121b38335918ddae306c68 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -46,10 +46,10 @@ class ConcatOpBase : public Operation { int axis_; }; -template +template class ConcatOp; -template +template class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) @@ -194,13 +194,13 @@ class ConcatOp : public ConcatOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class ConcatOp : public ConcatOpBase { +template<> +class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -215,7 +215,6 @@ class ConcatOp : public ConcatOpBase { }; #endif // MACE_ENABLE_OPENCL - void RegisterConcat(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, DeviceType::CPU, float); @@ -228,51 +227,44 @@ void RegisterConcat(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, - DeviceType::GPU, half); - -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Concat", ConcatOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Concat") .SetDevicePlacerFunc( - [](OpConditionContext *context) -> std::set { - auto op = context->operator_def(); - if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; - } - auto tensor_shape_info = context->tensor_shape_info(); - if (op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; - } else { - int has_data_format = - ProtoArgHelper::GetOptionalArg( - *op, "has_data_format", 0); - int axis = ProtoArgHelper::GetOptionalArg( - *op, "axis", 3); - if (!has_data_format || axis != 3) { - return { DeviceType::CPU }; + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return {DeviceType::CPU, DeviceType::GPU}; } - bool divisible_four = true; - for (const std::string &input : op->input()) { - if (tensor_shape_info->find(input) - != tensor_shape_info->end()) { - divisible_four = divisible_four - && (tensor_shape_info->at(input)[3] % 4 == 0); + auto tensor_shape_info = context->tensor_shape_info(); + if (op->output_shape(0).dims_size() != 4) { + return {DeviceType::CPU}; + } else { + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); + int axis = ProtoArgHelper::GetOptionalArg( + *op, "axis", 3); + if (!has_data_format || axis != 3) { + return {DeviceType::CPU}; + } + bool divisible_four = true; + for (const std::string &input : op->input()) { + if (tensor_shape_info->find(input) + != tensor_shape_info->end()) { + divisible_four = divisible_four + && (tensor_shape_info->at(input)[3] % 4 == 0); + } + } + // Only support not divisible 4 case with 2 inputs. 
+ if (op->input_size() > 2 && !divisible_four) { + return {DeviceType::CPU}; } } - // Only support not divisible 4 case with 2 inputs. - if (op->input_size() > 2 && !divisible_four) { - return { DeviceType::CPU }; - } - } - return { DeviceType::CPU, DeviceType::GPU }; - })); + return {DeviceType::CPU, DeviceType::GPU}; + })); } } // namespace ops diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 1963fc865af60c532754345278a9e0f85d9ebc38..c2666d073c370240e3945f166b4ce18a9d9dc0ff 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -446,8 +446,8 @@ class Conv2dOp : public ConvPool2dOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class Conv2dOp : public ConvPool2dOpBase { +template<> +class Conv2dOp : public ConvPool2dOpBase { public: explicit Conv2dOp(OpConstructContext *context) : ConvPool2dOpBase(context), @@ -461,10 +461,10 @@ class Conv2dOp : public ConvPool2dOpBase { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { mem_type = MemoryType::GPU_BUFFER; - kernel_ = make_unique>(); + kernel_ = make_unique(); } // Transform filter tensor to target format if ((wino_block_size_ == 2 || wino_block_size_ == 4) && @@ -477,19 +477,19 @@ class Conv2dOp : public ConvPool2dOpBase { strides_.data(), dilations_.data(), &wino_block_size_))) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) == MaceStatus::MACE_SUCCESS); } else { wino_block_size_ = 0; - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); } if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -527,13 +527,7 @@ void RegisterConv2D(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Conv2D", Conv2dOp); } } // namespace ops diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 20146c8d05eb728ae54711af0883da5cf6e38bca..acaa73f1cfe82834af09d098a7cfc2b12fe70880 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -24,10 +24,10 @@ namespace mace { namespace ops { -template +template class CropOp; -template +template class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) @@ -43,7 +43,6 @@ class CropOp : public Operation { } } - MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); @@ -71,7 +70,7 @@ class CropOp : public Operation { MACE_RETURN_IF_ERROR(output->Resize(output_shape)); T *output_data = output->mutable_data(); - const T * input_data = input0->data(); + const T *input_data = input0->data(); crop_copy(input_data, output_data, input0->shape(), output_shape, offsets.data()); @@ -80,10 +79,10 @@ class CropOp : public Operation { } private: - void crop_copy(const T* input_data, T* output_data, + void crop_copy(const T *input_data, T *output_data, const std::vector &input_shape, const std::vector 
&output_shape, - const int32_t* offsets) { + const int32_t *offsets) { const index_t out_img_size = output_shape[1] * output_shape[2] * output_shape[3]; const index_t out_hw = output_shape[2] * output_shape[3]; @@ -94,9 +93,9 @@ class CropOp : public Operation { for (int b = 0; b < output_shape[0]; ++b) { for (int c = 0; c < output_shape[1]; ++c) { for (int h = 0; h < output_shape[2]; ++h) { - T* out_ptr = + T *out_ptr = output_data + b * out_img_size + c * out_hw + h * output_shape[3]; - const T* in_ptr_bch = + const T *in_ptr_bch = input_data + (b + offsets[0]) * in_img_size + (c + offsets[1]) * in_hw + (h + offsets[2]) * input_shape[3] + offsets[3]; @@ -112,13 +111,13 @@ class CropOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class CropOp : public Operation { +template<> +class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) : Operation(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( Operation::GetRepeatedArgs("offset")); } else { MACE_NOT_IMPLEMENTED; @@ -133,18 +132,10 @@ class CropOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterCrop(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::CPU, float); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Crop", CropOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Crop", CropOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Crop", CropOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Crop") @@ -152,16 +143,16 @@ void RegisterCrop(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 2b7623e6d48cf5738bccbbed6c7cf30820342f19..6453544ae92c75efc5560ef5f157dcbbfedb13d5 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -167,30 +167,30 @@ class Deconv2dOp : public Deconv2dOpBase { }; #ifdef MACE_ENABLE_OPENCL -template -class Deconv2dOp : public Deconv2dOpBase { +template<> +class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); if (model_type_ == FrameworkType::CAFFE) { if (operator_def_->input_size() >= 3) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } else { if (operator_def_->input_size() >= 4) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 3, @@ -256,13 +256,8 @@ class Deconv2dOp : public Deconv2dOpBase { void 
RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::CPU, float); - + MACE_REGISTER_GPU_OP(op_registry, "Deconv2D", Deconv2dOp); #ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, - DeviceType::GPU, half); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Deconv2D") diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index a57ddecfae2ddbcc78b93d601382c3a2933fafac..ba87830a9038ac2c791787a148b114d0a5c0c8f6 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -24,7 +24,7 @@ namespace mace { namespace ops { -template +template class DepthToSpaceOp : public Operation { public: explicit DepthToSpaceOp(OpConstructContext *context) @@ -90,14 +90,14 @@ class DepthToSpaceOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class DepthToSpaceOp : public Operation { +template<> +class DepthToSpaceOp : public Operation { public: explicit DepthToSpaceOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(block_size); + kernel_ = make_unique(block_size); } else { MACE_NOT_IMPLEMENTED; } @@ -118,13 +118,7 @@ void RegisterDepthToSpace(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "DepthToSpace", DepthToSpaceOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "DepthToSpace", - DepthToSpaceOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "DepthToSpace", - DepthToSpaceOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "DepthToSpace", DepthToSpaceOp); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index d53b67463996e8a27b9d0af62227cbc0c8cdbc1e..06964ee038088d6921b5d9244eac3c14913522ae 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -369,24 +369,24 @@ class DepthwiseConv2dOp #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { +template<> +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { mem_type = MemoryType::GPU_BUFFER; - kernel_ = make_unique>(); + kernel_ = make_unique(); } Tensor *filter_tensor = context->workspace()->GetTensor( operator_def_->input(1)); if (filter_tensor != nullptr && filter_tensor->is_weight()) { // Transform filter tensor to target format - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, @@ -394,7 +394,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { mem_type) == MaceStatus::MACE_SUCCESS); } if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -431,12 +431,9 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { DepthwiseConv2dOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", 
- DepthwiseConv2dOp, DeviceType::GPU, float); + MACE_REGISTER_GPU_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp); - MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", - DepthwiseConv2dOp, DeviceType::GPU, half); +#ifdef MACE_ENABLE_OPENCL MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("DepthwiseConv2d") @@ -467,8 +464,8 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { DataFormat op_data_format = static_cast( ProtoArgHelper::GetOptionalArg( - *context->operator_def(), "data_format", - static_cast(DataFormat::NONE))); + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; })); } diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 31b634af11ed9756fbb14eddd91d519a7224d1d6..96f6d575fd2c8663d7c2c860dbbdbd7d0801713d 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -184,23 +184,23 @@ class DepthwiseDeconv2dOp }; #ifdef MACE_ENABLE_OPENCL -template -class DepthwiseDeconv2dOp : public Deconv2dOpBase { +template<> +class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::DW_CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() >= 3) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -255,13 +255,7 @@ void RegisterDepthwiseDeconv2d(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", - DepthwiseDeconv2dOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "DepthwiseDeconv2d", - DepthwiseDeconv2dOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "DepthwiseDeconv2d", DepthwiseDeconv2dOp); } } // namespace ops diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index c31d5e55a881bb89fde61001c938ba785012d8c0..f597f70c9682a372e28e6602f0b38fa065b9edec 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1158,8 +1158,8 @@ class EltwiseOp : public Operation { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class EltwiseOp : public Operation { +template<> +class EltwiseOp : public Operation { public: explicit EltwiseOp(OpConstructContext *context) : Operation(context) { @@ -1178,7 +1178,7 @@ class EltwiseOp : public Operation { MemoryType mem_type; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>( + kernel_ = make_unique( type, coeff, scalar_input, scalar_input_index); } else { MACE_NOT_IMPLEMENTED; @@ -1190,14 +1190,14 @@ class EltwiseOp : public Operation { if (ws->HasTensor(operator_def_->input(i)) && ws->GetTensor(operator_def_->input(i))->is_weight()) { if (ws->GetTensor(operator_def_->input(i))->dim_size() == 1) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), i, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } else if 
(ws->GetTensor(operator_def_->input(i))->dim_size() == 4) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), i, @@ -1236,13 +1236,7 @@ void RegisterEltwise(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Eltwise", EltwiseOp); } } // namespace ops diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 9a371b16566c714cc8c352bc7b6a4b1382a9695e..d863a2843a493d3186021d6621f226fc89689e7b 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -184,27 +184,27 @@ class FullyConnectedOp #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class FullyConnectedOp : public FullyConnectedOpBase { +template<> +class FullyConnectedOp : public FullyConnectedOpBase { public: explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { MemoryType mem_type = MemoryType::CPU_BUFFER; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } // Transform filter tensor to target format - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 1, OpenCLBufferType::WEIGHT_WIDTH, mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() > 2) { - MACE_CHECK(TransformFilter( + MACE_CHECK(TransformFilter( context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } @@ -240,13 +240,7 @@ void RegisterFullyConnected(OpRegistryBase *op_registry) { FullyConnectedOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "FullyConnected", - FullyConnectedOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "FullyConnected", - FullyConnectedOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "FullyConnected", FullyConnectedOp); } } // namespace ops diff --git a/mace/ops/identity.cc b/mace/ops/identity.cc index 892cef297e10f41a30163c369f6a62a10768e454..1c7a037ee2b8c1ec445b8c638958209cde7792f0 100644 --- a/mace/ops/identity.cc +++ b/mace/ops/identity.cc @@ -18,7 +18,6 @@ namespace mace { namespace ops { -template class IdentityOp : public Operation { public: explicit IdentityOp(OpConstructContext *context) @@ -34,15 +33,13 @@ class IdentityOp : public Operation { }; void RegisterIdentity(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::CPU, float); - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::CPU, int32_t); + MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::CPU, float); + MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::CPU, int32_t); #ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, - DeviceType::GPU, half); + MACE_REGISTER_OP_BY_CLASS(op_registry, "Identity", IdentityOp, + DeviceType::GPU, float); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index 
38f711f57ad824f146a4cd0abf306300b5122735..fb7bfecc90ccb80d2cedaf321d65b207be988892 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -19,7 +19,6 @@ namespace mace { namespace ops { -template class InferConv2dShapeOp : public Operation { public: explicit InferConv2dShapeOp(OpConstructContext *context) @@ -66,20 +65,23 @@ class InferConv2dShapeOp : public Operation { int32_t out_h = 0, out_w = 0; if (!paddings.empty()) { out_h = (in_h - kernels[2] + paddings[0]) / strides[0] + 1; - out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; + out_w = (in_w - kernels[3] + paddings[1]) / strides[1] + 1; } else { switch (padding_type) { - case SAME: + case SAME: { out_h = (in_h + strides[0] - 1) / strides[0]; out_w = (in_w + strides[1] - 1) / strides[1]; break; - case VALID: + } + case VALID: { out_h = (in_h - kernels[2] + 1) / strides[0]; out_w = (in_w - kernels[3] + 1) / strides[1]; break; - default: + } + default: { MACE_NOT_IMPLEMENTED; break; + } } } @@ -100,15 +102,13 @@ class InferConv2dShapeOp : public Operation { }; void RegisterInferConv2dShape(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::CPU, float); - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::CPU, int32_t); + MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, float); + MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, int32_t); #ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "InferConv2dShape", - InferConv2dShapeOp, DeviceType::GPU, half); + MACE_REGISTER_OP_BY_CLASS(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::GPU, float); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 592d25ae724ed8a93191049a31097a4e95c91d2a..f9bfec53cfb0127e123b50b65587dbf34399cd07 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -77,7 +77,7 @@ class MatMulOpBase : public Operation { } else { MACE_CHECK(lhs_rank == 2 || rhs_rank == 2, "Either lhs or rhs matrix should has rank 2 " - "for non-batched matrix multiplication"); + "for non-batched matrix multiplication"); } index_t @@ -492,8 +492,8 @@ class MatMulOp : public MatMulOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class MatMulOp : public MatMulOpBase { +template<> +class MatMulOp : public MatMulOpBase { public: explicit MatMulOp(OpConstructContext *context) : MatMulOpBase(context) { @@ -592,7 +592,6 @@ class MatMulOp : public MatMulOpBase { }; #endif // MACE_ENABLE_NEON - void RegisterMatMul(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, DeviceType::CPU, float); @@ -602,13 +601,7 @@ void RegisterMatMul(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "MatMul", MatMulOp); #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, diff --git a/mace/ops/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc index 
58ae277ec1a493f0baa5d584089736a4f86aeb38..f3685b985196e8afe956652be2fa85c2f8769b8c 100644 --- a/mace/ops/opencl/buffer/buffer_transform.cc +++ b/mace/ops/opencl/buffer/buffer_transform.cc @@ -27,7 +27,6 @@ MaceStatus TransformConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { const index_t out_chan = input->dim(0); const index_t in_chan = input->dim(1); @@ -55,8 +54,9 @@ MaceStatus TransformConv2DFilter( MACE_OUT_OF_RANGE_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter"); built_options.emplace("-Dtransform_conv_filter=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, @@ -98,7 +98,6 @@ MaceStatus TransformDWConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { const index_t multiplier = input->dim(0); const index_t in_chan = input->dim(1); @@ -124,8 +123,9 @@ MaceStatus TransformDWConv2DFilter( MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter"); built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, @@ -164,7 +164,6 @@ MaceStatus TransformArgument( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { const index_t size = input->dim(0); @@ -181,8 +180,9 @@ MaceStatus TransformArgument( MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg"); built_options.emplace("-Dtransform_arg=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, @@ -229,6 +229,30 @@ MaceStatus TransformArgument( return MaceStatus::MACE_SUCCESS; } +MaceStatus BufferTransform::Compute(OpContext *context, + const Tensor *input, + const OpenCLBufferType type, + const int wino_blk_size, + Tensor *output) { + MACE_UNUSED(wino_blk_size); + switch (type) { + case CONV2D_FILTER: + return TransformConv2DFilter(context, &kernel_, input, output); + case DW_CONV2D_FILTER: + return TransformDWConv2DFilter(context, &kernel_, input, output); + case ARGUMENT: + return TransformArgument(context, &kernel_, input, output); + default: + if (input->dtype() != output->dtype()) { + return BufferTypeTransform(context, &kernel_, input, output); + } else { + SetFutureDefaultWaitFn(context->future()); + output->ReuseTensorBuffer(*input); + return MaceStatus::MACE_SUCCESS; + } + } +} + } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/buffer_transform.h 
b/mace/ops/opencl/buffer/buffer_transform.h index 762518047dfefeebe3665dcb8d327e09ccb6b17d..c32ccbb13069ea800aa30b2ac8cd8a2eb6cac2b5 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -32,33 +32,27 @@ MaceStatus BufferTypeTransform( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); MaceStatus TransformConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); MaceStatus TransformDWConv2DFilter( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); MaceStatus TransformArgument( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output); - -template -class BufferTransform: public OpenCLBufferTransformKernel { +class BufferTransform : public OpenCLBufferTransformKernel { public: MaceStatus Compute( OpContext *context, @@ -72,32 +66,6 @@ class BufferTransform: public OpenCLBufferTransformKernel { std::vector input_shape_; }; -template -MaceStatus BufferTransform::Compute(OpContext *context, - const Tensor *input, - const OpenCLBufferType type, - const int wino_blk_size, - Tensor *output) { - MACE_UNUSED(wino_blk_size); - const DataType dt = DataTypeToEnum::value; - switch (type) { - case CONV2D_FILTER: - return TransformConv2DFilter(context, &kernel_, input, dt, output); - case DW_CONV2D_FILTER: - return TransformDWConv2DFilter(context, &kernel_, input, dt, output); - case ARGUMENT: - return TransformArgument(context, &kernel_, input, dt, output); - default: - if (input->dtype() != dt) { - return BufferTypeTransform(context, &kernel_, input, dt, output); - } else { - SetFutureDefaultWaitFn(context->future()); - output->ReuseTensorBuffer(*input); - return MaceStatus::MACE_SUCCESS; - } - } -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc index 6899ba4053c7433c4340af91f9708387d2f02844..2cb3ae0043df20ddfa25421572db5377f0c12363 100644 --- a/mace/ops/opencl/buffer/buffer_type_transform.cc +++ b/mace/ops/opencl/buffer/buffer_type_transform.cc @@ -27,7 +27,6 @@ MaceStatus BufferTypeTransform( OpContext *context, cl::Kernel *kernel, const Tensor *input, - const DataType dt, Tensor *output) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); @@ -43,7 +42,7 @@ MaceStatus BufferTypeTransform( std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type"); built_options.emplace("-Dtransform_data_type=" + kernel_name); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype())); MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, built_options, diff --git a/mace/ops/opencl/buffer/conv_2d.cc b/mace/ops/opencl/buffer/conv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..50109b6e2341f488ff39de17360d448dd238dc72 --- /dev/null +++ b/mace/ops/opencl/buffer/conv_2d.cc @@ -0,0 +1,170 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer/conv_2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace buffer { + +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) { + MACE_UNUSED(kwg_size_); + MACE_UNUSED(runtime); + MACE_UNUSED(output_shape); + MACE_UNUSED(wino_block_size); + return (filter_shape[2] == 3 && filter_shape[3] == 3 && + strides[0] == 1 && strides[1] == 1 && + dilations[0] == 1 && dilations[1] == 1); +} + +MaceStatus Conv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const int winograd_blk_size, + Tensor *output) { + MACE_UNUSED(winograd_blk_size); + StatsFuture pad_future, conv_future; + index_t filter_h = filter->dim(2); + index_t filter_w = filter->dim(3); + // Reshape output + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter->shape().data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // calculate padded input shape + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); + + int pad_top = paddings[0] >> 1; + int pad_left = paddings[1] >> 1; + + MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + std::function conv_func; + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + bool use_1x1 = filter_h == 1 && filter_w == 1; + + std::vector padded_output_shape = output_shape; + index_t tile_w, tile_c = 4; + if (use_1x1) { + tile_w = 2; + } else { + tile_w = 4; + } + padded_output_shape[2] = RoundUp(width, tile_w); + + std::vector padded_input_shape = input->shape(); + padded_input_shape[1] = input_height + paddings[0]; + padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + + (filter_w - 1) * dilations[1] + 1; + padded_input_shape[3] = RoundUp(input_channels, tile_c); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[1] != input_height || + padded_input_shape[2] != input_width || + padded_input_shape[3] != input_channels) { + // decide scratch size before allocate it + index_t total_scratch_size = 0; + index_t 
padded_input_size = 0; + + padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + 1, + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + if (old_scratch_size_ != scratch->size()) { + input_changed |= scratch->size() != old_scratch_size_; + old_scratch_size_ = scratch->size(); + } + + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); + + padded_input->Resize(padded_input_shape); + PadInput(context, &kernels_[0], input, pad_top, pad_left, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + if (use_1x1) { + conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { + return conv2d::Conv2d1x1( + context, &kernels_[1], pad_input, filter, bias, strides, + activation, relux_max_limit, + leakyrelu_coefficient, input_changed, output, &conv_future); + }; + } else { + conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { + return conv2d::Conv2dGeneral( + context, &kernels_[1], pad_input, filter, bias, strides, dilations, + activation, relux_max_limit, + leakyrelu_coefficient, input_changed, output, &conv_future); + }; + } + MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); + MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future()); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index 4ef8d79d9304143d29ba35125ad0b0970af310cb..c50752c3bc6abeaaabc961084d72e8f7afba9f76 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -36,7 +36,6 @@ extern MaceStatus Conv2d1x1(OpContext *context, const Tensor *filter, const Tensor *bias, const int *strides, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -51,7 +50,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -60,7 +58,6 @@ extern MaceStatus Conv2dGeneral(OpContext *context, StatsFuture *future); } // namespace conv2d -template class Conv2dKernel : public OpenCLConv2dKernel { public: Conv2dKernel() : old_scratch_size_(0) {} @@ -95,153 +92,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { std::vector input_shape_; }; - -template -bool Conv2dKernel::CheckUseWinograd( - OpenCLRuntime *runtime, - const std::vector &filter_shape, - const std::vector &output_shape, - const int *strides, - const int *dilations, - int *wino_block_size) { - MACE_UNUSED(runtime); - MACE_UNUSED(output_shape); - MACE_UNUSED(wino_block_size); - return (filter_shape[2] == 3 && filter_shape[3] == 3 && - strides[0] == 1 && strides[1] == 1 && - dilations[0] == 1 && dilations[1] == 1); -} - -template -MaceStatus Conv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - 
const int winograd_blk_size, - Tensor *output) { - MACE_UNUSED(winograd_blk_size); - StatsFuture pad_future, conv_future; - index_t filter_h = filter->dim(2); - index_t filter_w = filter->dim(3); - // Reshape output - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter->shape().data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - // calculate padded input shape - index_t width = output_shape[2]; - index_t channels = output_shape[3]; - - index_t input_height = input->dim(1); - index_t input_width = input->dim(2); - index_t input_channels = input->dim(3); - - int pad_top = paddings[0] >> 1; - int pad_left = paddings[1] >> 1; - - MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - std::function conv_func; - - // Mark whether input changed or not - bool input_changed = !IsVecEqual(input_shape_, input->shape()); - input_shape_ = input->shape(); - - bool use_1x1 = filter_h == 1 && filter_w == 1; - - std::vector padded_output_shape = output_shape; - index_t tile_w, tile_c = 4; - if (use_1x1) { - tile_w = 2; - } else { - tile_w = 4; - } - padded_output_shape[2] = RoundUp(width, tile_w); - - std::vector padded_input_shape = input->shape(); - padded_input_shape[1] = input_height + paddings[0]; - padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + - (filter_w - 1) * dilations[1] + 1; - padded_input_shape[3] = RoundUp(input_channels, tile_c); - - const Tensor *padded_input_ptr = input; - // pad input - std::unique_ptr padded_input; - if (padded_input_shape[1] != input_height || - padded_input_shape[2] != input_width || - padded_input_shape[3] != input_channels) { - // decide scratch size before allocate it - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - - padded_input_size = - std::accumulate(padded_input_shape.begin(), - padded_input_shape.end(), - 1, - std::multiplies()) - * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - if (old_scratch_size_ != scratch->size()) { - input_changed |= scratch->size() != old_scratch_size_; - old_scratch_size_ = scratch->size(); - } - - padded_input = make_unique(scratch->Scratch(padded_input_size), - input->dtype()); - - padded_input->Resize(padded_input_shape); - PadInput(context, &kernels_[0], input, pad_top, pad_left, - input_changed, padded_input.get(), &pad_future); - padded_input_ptr = padded_input.get(); - } - - if (use_1x1) { - conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { - return conv2d::Conv2d1x1( - context, &kernels_[1], pad_input, filter, bias, strides, - DataTypeToEnum::v(), activation, relux_max_limit, - leakyrelu_coefficient, input_changed, output, &conv_future); - }; - } else { - conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus { - return conv2d::Conv2dGeneral( - context, &kernels_[1], pad_input, filter, bias, strides, dilations, - DataTypeToEnum::v(), activation, 
relux_max_limit, - leakyrelu_coefficient, input_changed, output, &conv_future); - }; - } - MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); - MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future()); - - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc index bfe6775e91b0bf673365e2db4b634a57e10029bc..6eeb0f1d1584eb4eb14fd749602895437286e766 100644 --- a/mace/ops/opencl/buffer/conv_2d_1x1.cc +++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc @@ -29,7 +29,6 @@ MaceStatus Conv2d1x1(OpContext *context, const Tensor *filter, const Tensor *bias, const int *strides, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -53,9 +52,10 @@ MaceStatus Conv2d1x1(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); built_options.emplace("-Dconv2d=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + std::string data_dt = DtToCLDt(padded_input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc index f2090a1bb6d5d69b89a14bedb9118470c59c8c01..b19b702083bbdeb2f94b2d6ab8e7e13a02c3ab12 100644 --- a/mace/ops/opencl/buffer/conv_2d_general.cc +++ b/mace/ops/opencl/buffer/conv_2d_general.cc @@ -30,7 +30,6 @@ MaceStatus Conv2dGeneral(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -58,9 +57,11 @@ MaceStatus Conv2dGeneral(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d"); built_options.emplace("-Dconv2d=" + kernel_name); - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + std::string pad_data_dt = DtToCLDt(padded_input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + pad_data_dt); + std::string out_data_dt = DtToCLDt(output->dtype()); + built_options.emplace("-DOUT_DATA_TYPE=" + out_data_dt); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); switch (activation) { case NOOP: diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.cc b/mace/ops/opencl/buffer/depthwise_conv2d.cc index d9e1c2c054ee3280b3515a39f480c72ce9f96c43..48c9829f4cd3ad04daf95b5d1964807b9e0a0e67 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.cc +++ b/mace/ops/opencl/buffer/depthwise_conv2d.cc @@ -30,7 +30,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -59,8 +58,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { case NOOP: @@ -136,6 +135,118 @@ MaceStatus DepthwiseConv2d(OpContext *context, } } // namespace depthwise + +MaceStatus DepthwiseConv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + Tensor *output) { + StatsFuture pad_future, dw_conv_future; + index_t filter_w = filter->dim(3); + + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = filter->dim(3); + + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), fake_filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // calculate padded input shape + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); + + int pad_top = paddings[0] >> 1; + int pad_left = paddings[1] >> 1; + + MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported"); + MACE_CHECK(filter->dim(0) * input_channels == channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + std::vector padded_output_shape = output_shape; + index_t tile_w = 4, tile_c = 4; + padded_output_shape[2] = RoundUp(width, tile_w); + + std::vector padded_input_shape = input->shape(); + padded_input_shape[1] = input_height + paddings[0]; + padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + + (filter_w - 1) * 
dilations[1] + 1; + padded_input_shape[3] = RoundUp(input_channels, tile_c); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[1] != input_height || + padded_input_shape[2] != input_width || + padded_input_shape[3] != input_channels) { + index_t total_scratch_size = 0; + index_t padded_input_size = 0; + + padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + 1, + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + if (old_scratch_size_ != scratch->size()) { + input_changed |= scratch->size() != old_scratch_size_; + old_scratch_size_ = scratch->size(); + } + + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); + + padded_input->Resize(padded_input_shape); + PadInput(context, &kernels_[0], input, pad_top, pad_left, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + MACE_RETURN_IF_ERROR( + depthwise::DepthwiseConv2d( + context, &kernels_[1], padded_input_ptr, filter, bias, strides, + dilations, activation, relux_max_limit, + leakyrelu_coefficient, input_changed, output, &dw_conv_future)); + MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future()); + return MaceStatus::MACE_SUCCESS; +} + } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h index 6a46334a787378441d84d020cf578042e6bd24b9..98dffa12734b8404221869d147420a2e76866224 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.h +++ b/mace/ops/opencl/buffer/depthwise_conv2d.h @@ -37,7 +37,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, const Tensor *bias, const int *strides, const int *dilations, - const DataType dt, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, @@ -46,8 +45,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, StatsFuture *future); } // namespace depthwise - -template class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: DepthwiseConv2dKernel() : old_scratch_size_(0) {} @@ -68,122 +65,9 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { private: index_t old_scratch_size_; cl::Kernel kernels_[2]; - uint32_t kwg_size_; std::vector input_shape_; }; -template -MaceStatus DepthwiseConv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - Tensor *output) { - StatsFuture pad_future, dw_conv_future; - index_t filter_w = filter->dim(3); - - // Create a fake conv_2d filter to calculate the paddings and output size - std::vector fake_filter_shape(4); - fake_filter_shape[0] = filter->dim(0) * filter->dim(1); - fake_filter_shape[1] = filter->dim(1); - fake_filter_shape[2] = filter->dim(2); - fake_filter_shape[3] = filter->dim(3); - - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations, strides, - padding_type, 
output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), fake_filter_shape.data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - // calculate padded input shape - index_t width = output_shape[2]; - index_t channels = output_shape[3]; - - index_t input_height = input->dim(1); - index_t input_width = input->dim(2); - index_t input_channels = input->dim(3); - - int pad_top = paddings[0] >> 1; - int pad_left = paddings[1] >> 1; - - MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported"); - MACE_CHECK(filter->dim(0) * input_channels == channels); - MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", - input_channels); - - // Mark whether input changed or not - bool input_changed = !IsVecEqual(input_shape_, input->shape()); - input_shape_ = input->shape(); - - std::vector padded_output_shape = output_shape; - index_t tile_w = 4, tile_c = 4; - padded_output_shape[2] = RoundUp(width, tile_w); - - std::vector padded_input_shape = input->shape(); - padded_input_shape[1] = input_height + paddings[0]; - padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] + - (filter_w - 1) * dilations[1] + 1; - padded_input_shape[3] = RoundUp(input_channels, tile_c); - - const Tensor *padded_input_ptr = input; - // pad input - std::unique_ptr padded_input; - if (padded_input_shape[1] != input_height || - padded_input_shape[2] != input_width || - padded_input_shape[3] != input_channels) { - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - - padded_input_size = - std::accumulate(padded_input_shape.begin(), - padded_input_shape.end(), - 1, - std::multiplies()) - * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - if (old_scratch_size_ != scratch->size()) { - input_changed |= scratch->size() != old_scratch_size_; - old_scratch_size_ = scratch->size(); - } - - padded_input = make_unique(scratch->Scratch(padded_input_size), - input->dtype()); - - padded_input->Resize(padded_input_shape); - PadInput(context, &kernels_[0], input, pad_top, pad_left, - input_changed, padded_input.get(), &pad_future); - padded_input_ptr = padded_input.get(); - } - - MACE_RETURN_IF_ERROR( - depthwise::DepthwiseConv2d( - context, &kernels_[1], padded_input_ptr, filter, bias, strides, - dilations, DataTypeToEnum::v(), activation, relux_max_limit, - leakyrelu_coefficient, input_changed, output, &dw_conv_future)); - MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future()); - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/pooling.cc b/mace/ops/opencl/buffer/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..e19d1ab04ebd3faea1067e6c0d4ec548c61a0cc5 --- /dev/null +++ b/mace/ops/opencl/buffer/pooling.cc @@ -0,0 +1,174 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
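The new Conv2dKernel::Compute and DepthwiseConv2dKernel::Compute above, like the PoolingKernel that follows, size their padded-input scratch space the same way: the product of the padded NHWC dims, times the element size, plus MACE's safety pad. A standalone sketch of that arithmetic, assuming plain int64_t in place of index_t and an explicit pad argument in place of MACE_EXTRA_BUFFER_PAD_SIZE:

  // Illustrative sketch, not part of the patch.
  #include <cstdint>
  #include <functional>
  #include <numeric>
  #include <vector>

  int64_t PaddedInputScratchBytes(const std::vector<int64_t> &padded_shape,
                                  int64_t bytes_per_element,
                                  int64_t extra_pad_bytes) {
    // Element count is the product of all padded dimensions.
    const int64_t elements =
        std::accumulate(padded_shape.begin(), padded_shape.end(), int64_t{1},
                        std::multiplies<int64_t>());
    // The kernels grow the scratch buffer by this amount and re-set their
    // kernel arguments whenever the resulting size changes (input_changed).
    return elements * bytes_per_element + extra_pad_bytes;
  }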
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer/pooling.h" + + +namespace mace { +namespace ops { +namespace opencl { +namespace buffer { + +MaceStatus PoolingKernel::Compute( + OpContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const RoundType round_type, + Tensor *output) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) + << "Pooling opencl kernel not support dilation yet"; + + StatsFuture pad_future, pooling_future; + + index_t input_channels = input->dim(3); + + std::vector output_shape(4); + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, round_type, + output_shape.data()); + } + + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + + // Mark whether input changed or not + bool input_changed = !IsVecEqual(input_shape_, input->shape()); + input_shape_ = input->shape(); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + + // pad input + std::vector padded_input_shape = input->shape(); + padded_input_shape[3] = RoundUp(input_channels, 4); + + const Tensor *padded_input_ptr = input; + // pad input + std::unique_ptr padded_input; + if (padded_input_shape[3] != input_channels) { + index_t total_scratch_size = 0; + index_t padded_input_size = 0; + + padded_input_size = + std::accumulate(padded_input_shape.begin(), + padded_input_shape.end(), + 1, + std::multiplies()) + * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; + total_scratch_size += padded_input_size; + + // Init scratch buffer + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(total_scratch_size); + if (old_scratch_size_ != scratch->size()) { + input_changed |= scratch->size() != old_scratch_size_; + old_scratch_size_ = scratch->size(); + } + + padded_input = make_unique(scratch->Scratch(padded_input_size), + input->dtype()); + + padded_input->Resize(padded_input_shape); + PadInput(context, &kernels_[0], input, 0, 0, + input_changed, padded_input.get(), &pad_future); + padded_input_ptr = padded_input.get(); + } + + cl::Kernel *kernel = &kernels_[1]; + MACE_OUT_OF_RANGE_DEFINITION + + if (kernel->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + auto input_dtype = input->dtype(); + auto input_dt = DtToCLDt(input_dtype); + built_options.emplace("-DIN_DATA_TYPE=" + input_dt); + auto output_dtype = output->dtype(); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output_dtype)); + if (pooling_type == MAX && 
input_dtype == output_dtype) { + built_options.emplace("-DDATA_TYPE=" + input_dt); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + } + if (pooling_type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer", + kernel_name, + built_options, + kernel)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); + MACE_SET_3D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer())); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(1))); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(2))); + kernel->setArg(idx++, static_cast(padded_input_ptr->dim(3))); + kernel->setArg(idx++, static_cast(output->dim(1))); + kernel->setArg(idx++, static_cast(output->dim(3))); + kernel->setArg(idx++, paddings[0] / 2); + kernel->setArg(idx++, paddings[1] / 2); + kernel->setArg(idx++, strides[0]); + kernel->setArg(idx++, strides[1]); + kernel->setArg(idx++, kernels[0]); + kernel->setArg(idx++, kernels[1]); + kernel->setArg(idx++, *(output->opencl_buffer())); + } + + const std::vector lws = {4, 4, 4, 0}; + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, + gws, lws, &pooling_future)); + MACE_OUT_OF_RANGE_VALIDATION + MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future()); + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h index 4f153e4acfff75ab179e567803e05e14f67ceebf..9e675e29aa14bd12409f0a1315fe34c023a73b5d 100644 --- a/mace/ops/opencl/buffer/pooling.h +++ b/mace/ops/opencl/buffer/pooling.h @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace buffer { -template class PoolingKernel : public OpenCLPoolingKernel { public: PoolingKernel() : old_scratch_size_(0) {} @@ -54,158 +53,6 @@ class PoolingKernel : public OpenCLPoolingKernel { std::vector input_shape_; }; -template -MaceStatus PoolingKernel::Compute( - OpContext *context, - const Tensor *input, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const RoundType round_type, - Tensor *output) { - MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) - << "Pooling opencl kernel not support dilation yet"; - - StatsFuture pad_future, pooling_future; - - index_t input_channels = input->dim(3); - - std::vector output_shape(4); - std::vector filter_shape = {input->dim(3), input->dim(3), - kernels[0], kernels[1]}; - - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter_shape.data(), - padding_data.data(), dilations, strides, round_type, - output_shape.data()); - } - - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - - // Mark 
whether input changed or not - bool input_changed = !IsVecEqual(input_shape_, input->shape()); - input_shape_ = input->shape(); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - - // pad input - std::vector padded_input_shape = input->shape(); - padded_input_shape[3] = RoundUp(input_channels, 4); - - const Tensor *padded_input_ptr = input; - // pad input - std::unique_ptr padded_input; - if (padded_input_shape[3] != input_channels) { - index_t total_scratch_size = 0; - index_t padded_input_size = 0; - - padded_input_size = - std::accumulate(padded_input_shape.begin(), - padded_input_shape.end(), - 1, - std::multiplies()) - * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE; - total_scratch_size += padded_input_size; - - // Init scratch buffer - ScratchBuffer *scratch = context->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(total_scratch_size); - if (old_scratch_size_ != scratch->size()) { - input_changed |= scratch->size() != old_scratch_size_; - old_scratch_size_ = scratch->size(); - } - - padded_input = make_unique(scratch->Scratch(padded_input_size), - input->dtype()); - - padded_input->Resize(padded_input_shape); - PadInput(context, &kernels_[0], input, 0, 0, - input_changed, padded_input.get(), &pad_future); - padded_input_ptr = padded_input.get(); - } - - cl::Kernel *kernel = &kernels_[1]; - MACE_OUT_OF_RANGE_DEFINITION - - if (kernel->get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); - built_options.emplace("-Dpooling=" + kernel_name); - - if (pooling_type == MAX && input->dtype() == output->dtype()) { - built_options.emplace("-DIN_DATA_TYPE=" + - DtToCLDt(input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - } else { - built_options.emplace("-DIN_DATA_TYPE=" + - DtToCLDt(input->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - } - if (pooling_type == AVG) { - built_options.emplace("-DPOOL_AVG"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer", - kernel_name, - built_options, - kernel)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - } - - const uint32_t gws[3] = { - static_cast(RoundUpDiv4(output->dim(3))), - static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)), - }; - - MACE_OUT_OF_RANGE_INIT(*kernel); - if (input_changed) { - uint32_t idx = 0; - MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size()); - MACE_SET_3D_GWS_ARGS(*kernel, gws); - kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer())); - kernel->setArg(idx++, static_cast(padded_input_ptr->dim(1))); - kernel->setArg(idx++, static_cast(padded_input_ptr->dim(2))); - kernel->setArg(idx++, static_cast(padded_input_ptr->dim(3))); - kernel->setArg(idx++, static_cast(output->dim(1))); - kernel->setArg(idx++, static_cast(output->dim(3))); - kernel->setArg(idx++, paddings[0] / 2); - kernel->setArg(idx++, paddings[1] / 2); - kernel->setArg(idx++, strides[0]); - kernel->setArg(idx++, strides[1]); - kernel->setArg(idx++, kernels[0]); - kernel->setArg(idx++, kernels[1]); - kernel->setArg(idx++, *(output->opencl_buffer())); - } - - const std::vector lws = {4, 4, 4, 0}; - std::string tuning_key = - Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), - 
output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, &pooling_future)); - MACE_OUT_OF_RANGE_VALIDATION - MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future()); - - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer/softmax.cc b/mace/ops/opencl/buffer/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc70ea93a07f35d3daa44617f983a954392b8485 --- /dev/null +++ b/mace/ops/opencl/buffer/softmax.cc @@ -0,0 +1,99 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/buffer/softmax.h" + + +namespace mace { +namespace ops { +namespace opencl { +namespace buffer { + +MaceStatus SoftmaxKernel::Compute( + OpContext *context, + const Tensor *logits, + Tensor *output) { + index_t batch = 0; + index_t height = 0; + index_t width = 0; + index_t channels = 0; + + if (logits->dim_size() == 2) { + batch = logits->dim(0); + height = 1; + width = 1; + channels = logits->dim(1); + + } else if (logits->dim_size() == 4) { + batch = logits->dim(0); + height = logits->dim(1); + width = logits->dim(2); + channels = logits->dim(3); + } else { + MACE_NOT_IMPLEMENTED; + } + + const index_t channel_blocks = RoundUpDiv4(channels); + const int remain_channels = channel_blocks * 4 - channels; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); + built_options.emplace("-Dsoftmax=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype())); + built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(output->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + if (use_log_) built_options.emplace("-DUSE_LOG"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, logits->shape())) { + uint32_t idx = 0; + MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size()); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(logits->opencl_buffer())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, remain_channels); + kernel_.setArg(idx++, *(output->opencl_buffer())); + + input_shape_ = logits->shape(); + } + + std::vector lws = {4, 4, 4, 0}; + std::string tuning_key = + Concat("softmax_opencl_kernel", batch, height, width, channels); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, 
tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION + return MaceStatus::MACE_SUCCESS; +} + +} // namespace buffer +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h index 3ab6a7cef1bd1d760ea70e1409f687d664f51996..05d27cac7f4bdcd408c6b25b958e6414bde8249a 100644 --- a/mace/ops/opencl/buffer/softmax.h +++ b/mace/ops/opencl/buffer/softmax.h @@ -29,7 +29,7 @@ namespace mace { namespace ops { namespace opencl { namespace buffer { -template + class SoftmaxKernel : public OpenCLSoftmaxKernel { public: explicit SoftmaxKernel(bool use_log) @@ -47,81 +47,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { std::vector input_shape_; }; -template -MaceStatus SoftmaxKernel::Compute( - OpContext *context, - const Tensor *logits, - Tensor *output) { - index_t batch = 0; - index_t height = 0; - index_t width = 0; - index_t channels = 0; - - if (logits->dim_size() == 2) { - batch = logits->dim(0); - height = 1; - width = 1; - channels = logits->dim(1); - - } else if (logits->dim_size() == 4) { - batch = logits->dim(0); - height = logits->dim(1); - width = logits->dim(2); - channels = logits->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - - const index_t channel_blocks = RoundUpDiv4(channels); - const int remain_channels = channel_blocks * 4 - channels; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); - built_options.emplace("-Dsoftmax=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype())); - built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - if (use_log_) built_options.emplace("-DUSE_LOG"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, logits->shape())) { - uint32_t idx = 0; - MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size()); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(logits->opencl_buffer())); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, remain_channels); - kernel_.setArg(idx++, *(output->opencl_buffer())); - - input_shape_ = logits->shape(); - } - - std::vector lws = {4, 4, 4, 0}; - std::string tuning_key = - Concat("softmax_opencl_kernel", batch, height, width, channels); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION - return MaceStatus::MACE_SUCCESS; -} - } // namespace buffer } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/buffer_transform.cc b/mace/ops/opencl/buffer_transform.cc index 7e59b339642b571b7bc08f09af1b07814096eaf0..fc1d9dcc2c514d289baa3f56bced871723e778fc 100644 --- a/mace/ops/opencl/buffer_transform.cc +++ b/mace/ops/opencl/buffer_transform.cc @@ -20,11 +20,11 @@ namespace mace { namespace ops { -template +template class BufferTransformOp; -template -class 
BufferTransformOp : public Operation { +template<> +class BufferTransformOp : public Operation { public: explicit BufferTransformOp(OpConstructContext *context) : Operation(context), @@ -42,7 +42,7 @@ class BufferTransformOp : public Operation { MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); - return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( + return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( context, input, type, out_mem_type_, wino_blk_size_, output); } @@ -51,13 +51,8 @@ class BufferTransformOp : public Operation { MemoryType out_mem_type_; }; - void RegisterBufferTransform(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "BufferTransform", - BufferTransformOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BufferTransform", - BufferTransformOp, DeviceType::GPU, half); + MACE_REGISTER_GPU_OP(op_registry, "BufferTransform", BufferTransformOp); } } // namespace ops diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc index cda7c1331c918d8a685dc1a07fa11865afce8602..dae8385644bc8e74c8c4059b75c110600588ba91 100644 --- a/mace/ops/opencl/buffer_transformer.cc +++ b/mace/ops/opencl/buffer_transformer.cc @@ -23,5 +23,29 @@ std::string TransformedFilterName(const std::string &name) { return name + postfix; } +MaceStatus TransformFilter( + mace::OpConstructContext *context, + OperatorDef *op_def, + const int input_idx, + const OpenCLBufferType buffer_type, + const MemoryType mem_type, + const int wino_blk_size) { + OpContext op_context(context->workspace(), context->device()); + Workspace *ws = context->workspace(); + std::string input_name = op_def->input(input_idx); + Tensor *input = ws->GetTensor(input_name); + const DataType dt = input->dtype(); + std::string output_name = TransformedFilterName(input_name); + Tensor *output = + ws->CreateTensor(output_name, context->device()->allocator(), dt, true); + + // update the information + op_def->set_input(input_idx, output_name); + input->MarkUnused(); + return OpenCLBufferTransformer(input->memory_type(), mem_type). 
+ Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, + output); +} + } // namespace ops } // namespace mace diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index d2ef505825eceee5dfb43629ddc250636f952540..f3df8bc4452766b8a15d579f55aae09722c9a48e 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -28,17 +28,16 @@ namespace mace { namespace ops { // Only used for GPU Operation(BufferTransform) -template class OpenCLBufferTransformer { public: OpenCLBufferTransformer(const MemoryType in_mem_type, const MemoryType out_mem_type) { if (out_mem_type == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else if (in_mem_type == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { - kernel_ = make_unique>(); + kernel_ = make_unique(); } } @@ -49,7 +48,7 @@ class OpenCLBufferTransformer { const int wino_blk_size, Tensor *output) { Workspace *ws = context->workspace(); - DataType dt = DataTypeToEnum::value; + DataType dt = output->dtype(); MemoryType in_mem_type = input->memory_type(); if (out_mem_type == MemoryType::GPU_IMAGE || out_mem_type == MemoryType::GPU_BUFFER) { @@ -87,10 +86,10 @@ class OpenCLBufferTransformer { << " to CPU Buffer " << output->name() << " with data type " << dt; Tensor::MappingGuard guard(&internal_tensor); - const T *internal_ptr = internal_tensor.data(); + const float *internal_ptr = internal_tensor.data(); output->Resize(internal_tensor.shape()); - T *output_ptr = output->mutable_data(); - memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + float *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(float)); return MaceStatus::MACE_SUCCESS; } else { LOG(FATAL) << "Unexpected error: " << out_mem_type; @@ -110,30 +109,13 @@ class OpenCLBufferTransformer { std::string TransformedFilterName(const std::string &name); -template MaceStatus TransformFilter( mace::OpConstructContext *context, OperatorDef *op_def, const int input_idx, const OpenCLBufferType buffer_type, const MemoryType mem_type, - const int wino_blk_size = 0) { - const DataType dt = DataTypeToEnum::value; - OpContext op_context(context->workspace(), context->device()); - Workspace *ws = context->workspace(); - std::string input_name = op_def->input(input_idx); - Tensor *input = ws->GetTensor(input_name); - std::string output_name = TransformedFilterName(input_name); - Tensor *output = - ws->CreateTensor(output_name, context->device()->allocator(), dt, true); - - // update the information - op_def->set_input(input_idx, output_name); - input->MarkUnused(); - return OpenCLBufferTransformer(input->memory_type(), mem_type). 
- Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, - output); -} + const int wino_blk_size = 0); } // namespace ops } // namespace mace diff --git a/mace/ops/opencl/conv_2d.h b/mace/ops/opencl/conv_2d.h index a9ec131d18ef898cb493f4f7ba0bc73fcacc7f07..d6dd40bd6d05c5e5d96af649190c6b9a1ef60822 100644 --- a/mace/ops/opencl/conv_2d.h +++ b/mace/ops/opencl/conv_2d.h @@ -17,8 +17,9 @@ #include -#include "mace/ops/activation.h" +#include "mace/ops/common/activation_type.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" namespace mace { class OpContext; diff --git a/mace/ops/opencl/deconv_2d.h b/mace/ops/opencl/deconv_2d.h index 282a6dd888ff3ed0bb442067846b836fbad7291a..3335bebf967ba0321d30cce0ff0b249fcffcacd0 100644 --- a/mace/ops/opencl/deconv_2d.h +++ b/mace/ops/opencl/deconv_2d.h @@ -17,7 +17,10 @@ #include -#include "mace/ops/activation.h" +#include "mace/core/types.h" +#include "mace/ops/common/activation_type.h" +#include "mace/public/mace.h" +#include "mace/utils/macros.h" namespace mace { diff --git a/mace/ops/opencl/depthwise_deconv2d.h b/mace/ops/opencl/depthwise_deconv2d.h index b2460fcda74e67ff33c9e3dee10ba53dc840fff4..462010729589fcee949f6d64c2387de55f0e44a8 100644 --- a/mace/ops/opencl/depthwise_deconv2d.h +++ b/mace/ops/opencl/depthwise_deconv2d.h @@ -19,6 +19,9 @@ #include #include "mace/ops/common/activation_type.h" +#include "mace/public/mace.h" +#include "mace/utils/macros.h" +#include "mace/core/types.h" namespace mace { diff --git a/mace/ops/opencl/fully_connected.h b/mace/ops/opencl/fully_connected.h index 416aed6c8692ceaf45da1d1eb36f82b3753c8729..88c1cbaba293fcb42c059b46f5e62e0bcd9de70c 100644 --- a/mace/ops/opencl/fully_connected.h +++ b/mace/ops/opencl/fully_connected.h @@ -15,8 +15,7 @@ #ifndef MACE_OPS_OPENCL_FULLY_CONNECTED_H_ #define MACE_OPS_OPENCL_FULLY_CONNECTED_H_ -#include "mace/ops/activation.h" - +#include "mace/ops/common/activation_type.h" #include "mace/public/mace.h" #include "mace/utils/math.h" diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 912a8d8d87e549290cf5d174187d288c2462fcb1..9729555a5ce246a1cb4277c61bf5d5de9f16bbd1 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/ops/opencl/helper.cc @@ -77,28 +77,6 @@ std::string DtToCLCMDDt(const DataType dt) { } } -std::string DtToUpCompatibleCLDt(const DataType dt) { - switch (dt) { - case DT_FLOAT: - case DT_HALF: - return "float"; - default: - LOG(FATAL) << "Unsupported data type"; - return ""; - } -} - -std::string DtToUpCompatibleCLCMDDt(const DataType dt) { - switch (dt) { - case DT_FLOAT: - case DT_HALF: - return "f"; - default: - LOG(FATAL) << "Not supported data type for opencl cmd data type"; - return ""; - } -} - std::vector Default3DLocalWS(OpenCLRuntime *runtime, const uint32_t *gws, const uint32_t kwg_size) { diff --git a/mace/ops/opencl/helper.h b/mace/ops/opencl/helper.h index a4a49b4e15a021f1fa55fbd39c514777f03005bd..a9e9866c31e85bd82efb1d1b2622d429f8639c5a 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/ops/opencl/helper.h @@ -100,17 +100,9 @@ std::vector FormatBufferShape( // CPU data type to OpenCL command data type std::string DtToCLCMDDt(const DataType dt); -// CPU data type to upward compatible OpenCL command data type -// e.g. half -> float -std::string DtToUpCompatibleCLCMDDt(const DataType dt); - // CPU data type to OpenCL data type std::string DtToCLDt(const DataType dt); -// CPU data type to upward compatible OpenCL data type -// e.g. 
half -> float -std::string DtToUpCompatibleCLDt(const DataType dt); - // CPU data type to OpenCL condition data type used in select // e.g. half -> float std::string DtToCLCondDt(const DataType dt); diff --git a/mace/ops/opencl/image/activation.cc b/mace/ops/opencl/image/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c8ed331820cb23801fb346d645ed0f7a138936d --- /dev/null +++ b/mace/ops/opencl/image/activation.cc @@ -0,0 +1,123 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/activation.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ActivationKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); + built_options.emplace("-Dactivation=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + switch (activation_) { + case RELU: { + tuning_key_prefix_ = "relu_opencl_kernel"; + built_options.emplace("-DUSE_RELU"); + break; + } + case RELUX: { + tuning_key_prefix_ = "relux_opencl_kernel"; + built_options.emplace("-DUSE_RELUX"); + break; + } + case PRELU: { + tuning_key_prefix_ = "prelu_opencl_kernel"; + built_options.emplace("-DUSE_PRELU"); + break; + } + case TANH: { + tuning_key_prefix_ = "tanh_opencl_kernel"; + built_options.emplace("-DUSE_TANH"); + break; + } + case SIGMOID: { + tuning_key_prefix_ = "sigmoid_opencl_kernel"; + built_options.emplace("-DUSE_SIGMOID"); + break; + } + case LEAKYRELU: { + tuning_key_prefix_ = "leakyrelu_opencl_kernel"; + built_options.emplace("-DUSE_LEAKYRELU"); + break; + } + default: { + LOG(FATAL) << "Unknown activation type: " << activation_; + } + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + int idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + if (activation_ == PRELU) { + MACE_CHECK_NOTNULL(alpha); + kernel_.setArg(idx++, *(alpha->opencl_image())); + } + kernel_.setArg(idx++, relux_max_limit_); + kernel_.setArg(idx++, 
leakyrelu_coefficient_); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), + output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace + diff --git a/mace/ops/opencl/image/activation.h b/mace/ops/opencl/image/activation.h index 6f7c573cec0c3016ac247e095d6148da158e3301..e98b5e9daefe0cf988b6cb39ee7e0cf4903ea89b 100644 --- a/mace/ops/opencl/image/activation.h +++ b/mace/ops/opencl/image/activation.h @@ -31,12 +31,11 @@ namespace ops { namespace opencl { namespace image { -template class ActivationKernel : public OpenCLActivationKernel { public: ActivationKernel(ActivationType type, - T relux_max_limit, - T leakyrelu_coefficient) + float relux_max_limit, + float leakyrelu_coefficient) : activation_(type), relux_max_limit_(relux_max_limit), leakyrelu_coefficient_(leakyrelu_coefficient) {} @@ -48,106 +47,14 @@ class ActivationKernel : public OpenCLActivationKernel { private: ActivationType activation_; - T relux_max_limit_; - T leakyrelu_coefficient_; + float relux_max_limit_; + float leakyrelu_coefficient_; cl::Kernel kernel_; uint32_t kwg_size_; std::vector input_shape_; std::string tuning_key_prefix_; }; -template -MaceStatus ActivationKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *alpha, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); - built_options.emplace("-Dactivation=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - switch (activation_) { - case RELU: - tuning_key_prefix_ = "relu_opencl_kernel"; - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - tuning_key_prefix_ = "relux_opencl_kernel"; - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - tuning_key_prefix_ = "prelu_opencl_kernel"; - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - tuning_key_prefix_ = "tanh_opencl_kernel"; - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - tuning_key_prefix_ = "sigmoid_opencl_kernel"; - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - tuning_key_prefix_ = "leakyrelu_opencl_kernel"; - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if 
(!IsVecEqual(input_shape_, input->shape())) { - int idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - if (activation_ == PRELU) { - MACE_CHECK_NOTNULL(alpha); - kernel_.setArg(idx++, *(alpha->opencl_image())); - } - kernel_.setArg(idx++, static_cast(relux_max_limit_)); - kernel_.setArg(idx++, static_cast(leakyrelu_coefficient_)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), - output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/addn.cc b/mace/ops/opencl/image/addn.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bb38e01b7188e406a0fb13e48d4116c5253a69d --- /dev/null +++ b/mace/ops/opencl/image/addn.cc @@ -0,0 +1,106 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
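+
+// AddN on the GPU image runtime: element-wise sum of 2 to 4 input tensors of
+// identical NHWC shape. The operand count is baked into the OpenCL program via
+// -DINPUT_NUM, and the kernel is now built with DT_FLOAT macros since the op
+// is no longer templated on T.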
+ +#include "mace/ops/opencl/image/addn.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus AddNKernel::Compute( + OpContext *context, + const std::vector &input_tensors, + Tensor *output_tensor) { + size_t size = input_tensors.size(); + MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); + + const index_t batch = input_tensors[0]->dim(0); + const index_t height = input_tensors[0]->dim(1); + const index_t width = input_tensors[0]->dim(2); + const index_t channels = input_tensors[0]->dim(3); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + for (size_t i = 1; i < size; ++i) { + MACE_CHECK_NOTNULL(input_tensors[i]); + MACE_CHECK(batch == input_tensors[i]->dim(0)); + MACE_CHECK(height == input_tensors[i]->dim(1)); + MACE_CHECK(width == input_tensors[i]->dim(2)); + MACE_CHECK(channels == input_tensors[i]->dim(3)); + } + + if (kernel_.get() == nullptr) { + if (input_tensors.size() > 4) { + MACE_NOT_IMPLEMENTED; + } + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); + built_options.emplace("-Daddn=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + std::vector output_shape = input_tensors[0]->shape(); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t width_pixels = channel_blocks * width; + const index_t batch_height_pixels = batch * height; + + const uint32_t gws[2] = {static_cast(width_pixels), + static_cast(batch_height_pixels)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + output_tensor->ResizeImage(output_shape, output_image_shape)); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + for (auto input : input_tensors) { + kernel_.setArg(idx++, *(input->opencl_image())); + } + kernel_.setArg(idx++, *(output_tensor->opencl_image())); + + input_shape_ = input_tensors[0]->shape(); + } + + const std::vector lws = {kwg_size_ / 16, 16, 0}; + std::string tuning_key = + Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), + output_tensor->dim(2), output_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h index 088dd322d0619205615292cbe0ca355444633b92..b163152bf15838c385b38690c75f8f92499b5ae2 100644 --- a/mace/ops/opencl/image/addn.h +++ b/mace/ops/opencl/image/addn.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class AddNKernel : public OpenCLAddNKernel { public: MaceStatus Compute( @@ -44,89 +43,6 @@ class AddNKernel : public OpenCLAddNKernel { std::vector input_shape_; }; -template -MaceStatus AddNKernel::Compute( - OpContext 
*context, - const std::vector &input_tensors, - Tensor *output_tensor) { - size_t size = input_tensors.size(); - MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); - - const index_t batch = input_tensors[0]->dim(0); - const index_t height = input_tensors[0]->dim(1); - const index_t width = input_tensors[0]->dim(2); - const index_t channels = input_tensors[0]->dim(3); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - for (size_t i = 1; i < size; ++i) { - MACE_CHECK_NOTNULL(input_tensors[i]); - MACE_CHECK(batch == input_tensors[i]->dim(0)); - MACE_CHECK(height == input_tensors[i]->dim(1)); - MACE_CHECK(width == input_tensors[i]->dim(2)); - MACE_CHECK(channels == input_tensors[i]->dim(3)); - } - - if (kernel_.get() == nullptr) { - if (input_tensors.size() > 4) { - MACE_NOT_IMPLEMENTED; - } - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); - built_options.emplace("-Daddn=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - std::vector output_shape = input_tensors[0]->shape(); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t width_pixels = channel_blocks * width; - const index_t batch_height_pixels = batch * height; - - const uint32_t gws[2] = {static_cast(width_pixels), - static_cast(batch_height_pixels)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - output_tensor->ResizeImage(output_shape, output_image_shape)); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - for (auto input : input_tensors) { - kernel_.setArg(idx++, *(input->opencl_image())); - } - kernel_.setArg(idx++, *(output_tensor->opencl_image())); - - input_shape_ = input_tensors[0]->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::string tuning_key = - Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), - output_tensor->dim(2), output_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/batch_norm.cc b/mace/ops/opencl/image/batch_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfb496e77904f274d92a1846d25eeb14c12cc4aa --- /dev/null +++ b/mace/ops/opencl/image/batch_norm.cc @@ -0,0 +1,120 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/batch_norm.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +BatchNormKernel::BatchNormKernel(const float epsilon, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient) + : epsilon_(epsilon), + activation_(activation), + relux_max_limit_(relux_max_limit), + leakyrelu_coefficient_(leakyrelu_coefficient) {} + +MaceStatus BatchNormKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + Tensor *output) { + bool not_folded = (mean != nullptr && var != nullptr); + + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); + built_options.emplace("-Dbatch_norm=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (!not_folded) { + built_options.emplace("-DFOLDED_CONSTANT"); + } + switch (activation_) { + case NOOP:break; + case RELU:built_options.emplace("-DUSE_RELU"); + break; + case RELUX:built_options.emplace("-DUSE_RELUX"); + break; + case TANH:built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID:built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU:built_options.emplace("-DUSE_LEAKYRELU"); + break; + default:LOG(FATAL) << "Unknown activation type: " << activation_; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(scale->opencl_image())); + kernel_.setArg(idx++, *(offset->opencl_image())); + if (not_folded) { + kernel_.setArg(idx++, *(mean->opencl_image())); + kernel_.setArg(idx++, *(var->opencl_image())); + kernel_.setArg(idx++, epsilon_); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit_); + kernel_.setArg(idx++, leakyrelu_coefficient_); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_norm_opencl_kernel", activation_, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + 
gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h index 73560343b41e907b90f5c5e81379361ac93a589c..b2201a96631fef8ddd3b1a1748550aa96897e646 100644 --- a/mace/ops/opencl/image/batch_norm.h +++ b/mace/ops/opencl/image/batch_norm.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/activation.h" +#include "mace/ops/common/activation_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class BatchNormKernel : public OpenCLBatchNormKernel { public: BatchNormKernel( @@ -57,111 +56,6 @@ class BatchNormKernel : public OpenCLBatchNormKernel { std::vector input_shape_; }; -template -BatchNormKernel::BatchNormKernel(const float epsilon, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient) - : epsilon_(epsilon), - activation_(activation), - relux_max_limit_(relux_max_limit), - leakyrelu_coefficient_(leakyrelu_coefficient) {} - -template -MaceStatus BatchNormKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - Tensor *output) { - bool not_folded = (mean != nullptr && var != nullptr); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); - built_options.emplace("-Dbatch_norm=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (!not_folded) { - built_options.emplace("-DFOLDED_CONSTANT"); - } - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(scale->opencl_image())); - kernel_.setArg(idx++, *(offset->opencl_image())); - if (not_folded) { - kernel_.setArg(idx++, *(mean->opencl_image())); - kernel_.setArg(idx++, *(var->opencl_image())); - kernel_.setArg(idx++, epsilon_); - } - 
kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit_); - kernel_.setArg(idx++, leakyrelu_coefficient_); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("batch_norm_opencl_kernel", activation_, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/batch_to_space.cc b/mace/ops/opencl/image/batch_to_space.cc new file mode 100644 index 0000000000000000000000000000000000000000..87f5f5a61d12dbc74897f3409ed1ea49bee610a2 --- /dev/null +++ b/mace/ops/opencl/image/batch_to_space.cc @@ -0,0 +1,100 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/batch_to_space.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus BatchToSpaceKernel::Compute( + OpContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + space_tensor->ResizeImage(output_shape, output_image_shape)); + + const uint32_t chan_blk = + static_cast(RoundUpDiv4(batch_tensor->dim(3))); + + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const char *kernel_name = "batch_to_space"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + auto dt = batch_tensor->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, batch_tensor->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, block_shape[0]); + 
kernel_.setArg(idx++, block_shape[1]); + kernel_.setArg(idx++, paddings[0]); + kernel_.setArg(idx++, paddings[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); + + input_shape_ = batch_tensor->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h index 47f79c45c8bbf963f277ca35c594ce081f3bf140..a0aced7c021cdee7dfe55b3800e9da324e7abf59 100644 --- a/mace/ops/opencl/image/batch_to_space.h +++ b/mace/ops/opencl/image/batch_to_space.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { public: MaceStatus Compute( @@ -47,81 +46,6 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { std::vector input_shape_; }; -template -MaceStatus BatchToSpaceKernel::Compute( - OpContext *context, - const Tensor *batch_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *space_tensor) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - space_tensor->ResizeImage(output_shape, output_image_shape)); - - const uint32_t chan_blk = - static_cast(RoundUpDiv4(batch_tensor->dim(3))); - - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const char *kernel_name = "batch_to_space"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, batch_tensor->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, block_shape[0]); - kernel_.setArg(idx++, block_shape[1]); - kernel_.setArg(idx++, paddings[0]); - kernel_.setArg(idx++, paddings[2]); - kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); 
- kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); - - input_shape_ = batch_tensor->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), - batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/bias_add.cc b/mace/ops/opencl/image/bias_add.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f62f592f41ede1df35532775f88bae761623447 --- /dev/null +++ b/mace/ops/opencl/image/bias_add.cc @@ -0,0 +1,101 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/bias_add.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus BiasAddKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); + built_options.emplace("-Dbias_add=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, + built_options, &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(bias->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), 
nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h index 6c534a4b1e9cde4fdac5a100c36a0daf2d4fd8ce..7c25662da81b183ca88588dc756b724b50ed33ac 100644 --- a/mace/ops/opencl/image/bias_add.h +++ b/mace/ops/opencl/image/bias_add.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class BiasAddKernel : public OpenCLBiasAddKernel { public: MaceStatus Compute( @@ -45,84 +44,6 @@ class BiasAddKernel : public OpenCLBiasAddKernel { std::vector input_shape_; }; -template -MaceStatus BiasAddKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - auto dt = DataTypeToEnum::value; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); - built_options.emplace("-Dbias_add=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, - built_options, &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(bias->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - 
MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/buffer_to_image.cc b/mace/ops/opencl/image/buffer_to_image.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb785e0cdbff3a5b0ef977e9894a7e93c8f0537a --- /dev/null +++ b/mace/ops/opencl/image/buffer_to_image.cc @@ -0,0 +1,164 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/buffer_to_image.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus BufferToImage::Compute( + OpContext *context, + const Tensor *input, + const OpenCLBufferType type, + const int wino_blk_size, + Tensor *output) { + auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); + std::vector image_shape; + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); + MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); + + uint32_t gws[2] = {static_cast(image_shape[0]), + static_cast(image_shape[1])}; + std::string kernel_name; + switch (type) { + case CONV2D_FILTER:kernel_name = "filter_buffer_to_image"; + break; + case DW_CONV2D_FILTER:kernel_name = "dw_filter_buffer_to_image"; + break; + case IN_OUT_CHANNEL:kernel_name = "in_out_buffer_to_image"; + break; + case ARGUMENT:kernel_name = "arg_buffer_to_image"; + break; + case IN_OUT_HEIGHT:kernel_name = "in_out_height_buffer_to_image"; + break; + case IN_OUT_WIDTH:kernel_name = "in_out_width_buffer_to_image"; + break; + case WEIGHT_HEIGHT:kernel_name = "weight_height_buffer_to_image"; + break; + case WEIGHT_WIDTH:kernel_name = "weight_width_buffer_to_image"; + break; + case WINOGRAD_FILTER: { + std::stringstream ss_tmp; + gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); + ss_tmp << "winograd_filter_buffer_to_image_" + << wino_blk_size << "x" << wino_blk_size; + kernel_name = ss_tmp.str(); + break; + } + } + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (input->dtype() == output->dtype()) { + auto input_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + 
DtToCLCMDDt(DT_FLOAT)); + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + if (type == CONV2D_FILTER) { + const index_t + inner_size = input->dim(1) * input->dim(2) * input->dim(3); + kernel_.setArg(idx++, static_cast(input->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(input->dim(0))); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(input->dim(0))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(output->opencl_image())); + input_shape_ = input->shape(); + } + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + const std::vector lws = {16, kwg_size / 16}; + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index 3d1366dfd69808e2220a20ac124003b0e04a0726..493f6579db7ced93681ad2b8b80b491edd934b8d 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class BufferToImage : public OpenCLBufferTransformKernel { public: MaceStatus Compute( @@ -45,156 +44,6 @@ class BufferToImage : public OpenCLBufferTransformKernel { std::vector input_shape_; }; -template -MaceStatus BufferToImage::Compute( - OpContext *context, - const Tensor *input, - const OpenCLBufferType type, - const int wino_blk_size, - Tensor *output) { - auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); - std::vector image_shape; - 
OpenCLUtil::CalImage2DShape(formatted_buffer_shape, - type, - &image_shape, - wino_blk_size); - MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); - - uint32_t gws[2] = {static_cast(image_shape[0]), - static_cast(image_shape[1])}; - std::string kernel_name; - switch (type) { - case CONV2D_FILTER: - kernel_name = "filter_buffer_to_image"; - break; - case DW_CONV2D_FILTER: - kernel_name = "dw_filter_buffer_to_image"; - break; - case IN_OUT_CHANNEL: - kernel_name = "in_out_buffer_to_image"; - break; - case ARGUMENT: - kernel_name = "arg_buffer_to_image"; - break; - case IN_OUT_HEIGHT: - kernel_name = "in_out_height_buffer_to_image"; - break; - case IN_OUT_WIDTH: - kernel_name = "in_out_width_buffer_to_image"; - break; - case WEIGHT_HEIGHT: - kernel_name = "weight_height_buffer_to_image"; - break; - case WEIGHT_WIDTH: - kernel_name = "weight_width_buffer_to_image"; - break; - case WINOGRAD_FILTER: { - std::stringstream ss_tmp; - gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); - ss_tmp << "winograd_filter_buffer_to_image_" - << wino_blk_size << "x" << wino_blk_size; - kernel_name = ss_tmp.str(); - break; - } - } - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (input->dtype() == output->dtype()) { - built_options.emplace( - "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel( - "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_buffer())); - MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, - "buffer offset not aligned"); - kernel_.setArg(idx++, - static_cast(input->buffer_offset() / - GetEnumTypeSize(input->dtype()))); - if (type == CONV2D_FILTER) { - const index_t - inner_size = input->dim(1) * input->dim(2) * input->dim(3); - kernel_.setArg(idx++, static_cast(input->dim(0))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(input->dim(3))); - kernel_.setArg(idx++, static_cast(inner_size)); - } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) { - kernel_.setArg(idx++, static_cast(input->dim(0))); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(input->dim(3))); - } else if (type == ARGUMENT) { - kernel_.setArg(idx++, static_cast(input->dim(0))); - } else { - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[1])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[2])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[3])); - } - kernel_.setArg(idx++, *(output->opencl_image())); - input_shape_ = 
input->shape(); - } - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {16, kwg_size / 16}; - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/channel_shuffle.cc b/mace/ops/opencl/image/channel_shuffle.cc new file mode 100644 index 0000000000000000000000000000000000000000..6cdbb1feea4a5e77834ce066b476bc3f0162aa5d --- /dev/null +++ b/mace/ops/opencl/image/channel_shuffle.cc @@ -0,0 +1,87 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/channel_shuffle.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ChannelShuffleKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_CHECK(input->dim(3) % groups_ == 0, + "input channels must be an integral multiple of group. 
", + input->dim(3)); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channels_per_group = channels / groups_; + const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); + + const uint32_t gws[3] = {static_cast(group_channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); + built_options.emplace("-Dchannel_shuffle=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("channel_shuffle", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, groups_); + kernel_.setArg(idx++, static_cast(channels_per_group)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h index 15111c7dd65e91cea946acfdc3841e400f9a17d7..371ecf22a6cf61e3e7c60b8af4abe981f3a1264e 100644 --- a/mace/ops/opencl/image/channel_shuffle.h +++ b/mace/ops/opencl/image/channel_shuffle.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { public: explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} @@ -46,70 +45,6 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { std::vector input_shape_; }; -template -MaceStatus ChannelShuffleKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - MACE_CHECK(input->dim(3) % groups_ == 0, - "input channels must be an integral multiple of group. 
", - input->dim(3)); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - const index_t batch = input->dim(0); - const index_t height = input->dim(1); - const index_t width = input->dim(2); - const index_t channels = input->dim(3); - const index_t channels_per_group = channels / groups_; - const index_t group_channel_blocks = RoundUpDiv4(channels_per_group); - - const uint32_t gws[3] = {static_cast(group_channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - - MACE_OUT_OF_RANGE_DEFINITION; - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); - built_options.emplace("-Dchannel_shuffle=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("channel_shuffle", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, groups_); - kernel_.setArg(idx++, static_cast(channels_per_group)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/concat.cc b/mace/ops/opencl/image/concat.cc index d6b0bc65802547561d989167055454020e6dd1d4..f4433b43ebb44e00f0711f734ffdd1b90b0b09df 100644 --- a/mace/ops/opencl/image/concat.cc +++ b/mace/ops/opencl/image/concat.cc @@ -50,7 +50,6 @@ MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size) { @@ -75,12 +74,14 @@ MaceStatus Concat2(OpContext *context, std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); if (input0->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + auto data_dt = input0->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); } + if (input0->dim(3) % 4 == 0) { built_options.emplace("-DDIVISIBLE_FOUR"); } @@ -119,7 +120,6 @@ MaceStatus Concat2(OpContext *context, MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, - 
const DataType dt, Tensor *output, uint32_t *kwg_size) { const index_t batch = output->dim(0); @@ -135,8 +135,8 @@ MaceStatus ConcatN(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, built_options, kernel)); *kwg_size = @@ -205,6 +205,51 @@ MaceStatus ConcatN(OpContext *context, } } // namespace concat + + +MaceStatus ConcatKernel::Compute( + OpContext *context, + const std::vector &input_list, + const int32_t axis, + Tensor *output) { + const int inputs_count = input_list.size(); + + const Tensor *input0 = input_list[0]; + + std::vector output_shape(input0->shape()); + for (int i = 1; i < inputs_count; ++i) { + const Tensor *input = input_list[i]; + MACE_CHECK(input->dim_size() == input0->dim_size(), + "Ranks of all input tensors must be same."); + for (int j = 0; j < input->dim_size(); ++j) { + if (j == axis) { + continue; + } + MACE_CHECK(input->dim(j) == input0->dim(j), + "Dimensions of inputs should equal except axis."); + } + output_shape[axis] += input->dim(axis); + } + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + switch (inputs_count) { + case 2: + return concat::Concat2( + context, &kernel_, input_list[0], input_list[1], + &input_shape_, output, &kwg_size_); + default: + return concat::ConcatN(context, + &kernel_, + input_list, + output, + &kwg_size_); + } +} + } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h index 3a7af7ed15b35b6f35fb4e844f76c61b1f1a6985..f1e51fd96e7312d30419eafada40796f000c55c2 100644 --- a/mace/ops/opencl/image/concat.h +++ b/mace/ops/opencl/image/concat.h @@ -32,7 +32,6 @@ MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -40,12 +39,10 @@ MaceStatus Concat2(OpContext *context, MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, - const DataType dt, Tensor *output, uint32_t *kwg_size); } // namespace concat -template class ConcatKernel : public OpenCLConcatKernel { public: ConcatKernel() {} @@ -61,47 +58,6 @@ class ConcatKernel : public OpenCLConcatKernel { std::vector input_shape_; }; -template -MaceStatus ConcatKernel::Compute( - OpContext *context, - const std::vector &input_list, - const int32_t axis, - Tensor *output) { - const int inputs_count = input_list.size(); - - const Tensor *input0 = input_list[0]; - - std::vector output_shape(input0->shape()); - for (int i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; - MACE_CHECK(input->dim_size() == input0->dim_size(), - "Ranks of all input tensors must be same."); - for (int j = 0; j < input->dim_size(); ++j) { - if (j == axis) { - continue; - } - MACE_CHECK(input->dim(j) == input0->dim(j), - "Dimensions of inputs should equal except axis."); - } - output_shape[axis] += input->dim(axis); - } - std::vector image_shape; - 
OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - switch (inputs_count) { - case 2: - return concat::Concat2( - context, &kernel_, input_list[0], input_list[1], - DataTypeToEnum::value, &input_shape_, output, &kwg_size_); - default: - return concat::ConcatN(context, &kernel_, input_list, - DataTypeToEnum::value, output, &kwg_size_); - } -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/conv_2d.cc b/mace/ops/opencl/image/conv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..20c101a2410eb11c1a29fbe7f9aa4cfefda9511f --- /dev/null +++ b/mace/ops/opencl/image/conv_2d.cc @@ -0,0 +1,185 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/conv_2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_blk_size) { + if (filter_shape[2] != 3 || filter_shape[3] != 3 || + strides[0] > 1 || strides[1] > 1 || + dilations[0] > 1 || dilations[1] > 1) { + return false; + } + index_t out_channels = filter_shape[0]; + index_t in_channels = filter_shape[1]; + auto opencl_image_max_size = runtime->GetMaxImage2DSize(); + auto check_opencl_limit = [&](int block_size) -> bool { + int sqr_block = (block_size + 2) * (block_size + 2); + uint64_t transformed_width = static_cast(output_shape[0] * + ((output_shape[1] + block_size - 1) / block_size) * + ((output_shape[2] + block_size - 1) / block_size)); + return (transformed_width < opencl_image_max_size[0] && + static_cast(sqr_block * in_channels) + < opencl_image_max_size[1] && + static_cast(sqr_block * out_channels) + < opencl_image_max_size[1]); + }; + // GPU only supports 4x4 and 2x2 gpu winograd convolution + if (*wino_blk_size == 4) { + // if block size == 4 exceed OpenCL image size limitation, fallback to 2 + if (!check_opencl_limit(4)) { + *wino_blk_size = 2; + } else { + return true; + } + } + return check_opencl_limit(2); +} + +MaceStatus Conv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const int wino_blk_size, + Tensor *output) { + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + if (strides[0] != strides[1] || + (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) { + LOG(WARNING) << "OpenCL conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides[0] << "x" << strides[1] + << ",dilations " << 
dilations[0] << "x" << dilations[1] + << " is not implemented yet."; + MACE_NOT_IMPLEMENTED; + } + + // Reshape output + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter->shape().data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter->shape().data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + std::function conv_func; + + if (wino_blk_size != 0) { + // use winograd covolution + conv_func = [&]() -> MaceStatus { + cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]}; + uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]}; + return WinogradConv2dK3x3S1(context, + kernels, + input, + filter, + bias, + paddings.data(), + activation, + relux_max_limit, + leakyrelu_coefficient, + wino_blk_size, + &input_shape_, + output, + kwg_size); + }; + } else if (kernel_h == 1 && kernel_w == 1) { + conv_func = [&]() -> MaceStatus { + return Conv2dK1x1(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + leakyrelu_coefficient, + &input_shape_, + output, + &kwg_size_[0]); + }; + } else if (kernel_h == 3 && kernel_w == 3) { + conv_func = [&]() -> MaceStatus { + return Conv2dK3x3(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + leakyrelu_coefficient, + &input_shape_, + output, + &kwg_size_[0]); + }; + } else { + conv_func = [&]() -> MaceStatus { + return Conv2d(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + leakyrelu_coefficient, + &input_shape_, + output, + &kwg_size_[0]); + }; + } + + return conv_func(); +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h index 5df0847b5e1d4160c4484ccb06d6118df4bb70b0..84fae55dff77afef2056f3f8e1628413a73e0bc2 100644 --- a/mace/ops/opencl/image/conv_2d.h +++ b/mace/ops/opencl/image/conv_2d.h @@ -39,7 +39,6 @@ extern MaceStatus Conv2dK1x1(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -55,7 +54,6 @@ extern MaceStatus Conv2dK3x3(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -71,7 +69,6 @@ extern MaceStatus Conv2d(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); @@ -85,13 +82,11 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, const int wino_blk_size, std::vector *prev_input_shape, Tensor *output, uint32_t 
*kwg_size[3]); -template class Conv2dKernel : public OpenCLConv2dKernel { public: bool CheckUseWinograd( @@ -123,172 +118,6 @@ class Conv2dKernel : public OpenCLConv2dKernel { std::vector input_shape_; }; -template -bool Conv2dKernel::CheckUseWinograd( - OpenCLRuntime *runtime, - const std::vector &filter_shape, - const std::vector &output_shape, - const int *strides, - const int *dilations, - int *wino_blk_size) { - if (filter_shape[2] != 3 || filter_shape[3] != 3 || - strides[0] > 1 || strides[1] > 1 || - dilations[0] > 1 || dilations[1] > 1) { - return false; - } - index_t out_channels = filter_shape[0]; - index_t in_channels = filter_shape[1]; - auto opencl_image_max_size = runtime->GetMaxImage2DSize(); - auto check_opencl_limit = [&](int block_size) -> bool { - int sqr_block = (block_size + 2) * (block_size + 2); - uint64_t transformed_width = static_cast(output_shape[0] * - ((output_shape[1] + block_size - 1) / block_size) * - ((output_shape[2] + block_size - 1) / block_size)); - return (transformed_width < opencl_image_max_size[0] && - static_cast(sqr_block * in_channels) - < opencl_image_max_size[1] && - static_cast(sqr_block * out_channels) - < opencl_image_max_size[1]); - }; - // GPU only supports 4x4 and 2x2 gpu winograd convolution - if (*wino_blk_size == 4) { - // if block size == 4 exceed OpenCL image size limitation, fallback to 2 - if (!check_opencl_limit(4)) { - *wino_blk_size = 2; - } else { - return true; - } - } - return check_opencl_limit(2); -} - -template -MaceStatus Conv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const int wino_blk_size, - Tensor *output) { - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (strides[0] != strides[1] || - (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) { - LOG(WARNING) << "OpenCL conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides[0] << "x" << strides[1] - << ",dilations " << dilations[0] << "x" << dilations[1] - << " is not implemented yet."; - MACE_NOT_IMPLEMENTED; - } - - // Reshape output - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter->shape().data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - std::function conv_func; - - if (wino_blk_size != 0) { - // use winograd covolution - conv_func = [&]() -> MaceStatus { - cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]}; - uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]}; - return WinogradConv2dK3x3S1(context, - kernels, - input, - filter, - bias, - paddings.data(), - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - wino_blk_size, - &input_shape_, - output, - kwg_size); - }; - } else if (kernel_h == 1 && 
kernel_w == 1) { - conv_func = [&]() -> MaceStatus { - return Conv2dK1x1(context, - &kernels_[0], - input, - filter, - bias, - strides[0], - paddings.data(), - dilations, - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - &input_shape_, - output, - &kwg_size_[0]); - }; - } else if (kernel_h == 3 && kernel_w == 3) { - conv_func = [&]() -> MaceStatus { - return Conv2dK3x3(context, - &kernels_[0], - input, - filter, - bias, - strides[0], - paddings.data(), - dilations, - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - &input_shape_, - output, - &kwg_size_[0]); - }; - } else { - conv_func = [&]() -> MaceStatus { - return Conv2d(context, - &kernels_[0], - input, - filter, - bias, - strides[0], - paddings.data(), - dilations, - activation, - relux_max_limit, - leakyrelu_coefficient, - DataTypeToEnum::value, - &input_shape_, - output, - &kwg_size_[0]); - }; - } - - return conv_func(); -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc index 374d262ae34a4938e40f94dd941e95735bcedd4e..460d01323dccd584b880f2cdc27b5d2e4c2735fe 100644 --- a/mace/ops/opencl/image/conv_2d_1x1.cc +++ b/mace/ops/opencl/image/conv_2d_1x1.cc @@ -66,21 +66,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dK1x1(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +MaceStatus Conv2dK1x1(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); const index_t batch = output->dim(0); @@ -106,31 +105,38 @@ extern MaceStatus Conv2dK1x1(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); built_options.emplace("-Dconv_2d_1x1=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); if (bias != nullptr) { built_options.emplace("-DBIAS"); } switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index 
125a973ae7de4409b31fa2a716c35409d5955d0e..a3bd170f64079a5b4533dd2a4fb104dbee752cfd 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -59,21 +59,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dK3x3(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +MaceStatus Conv2dK3x3(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -93,29 +92,36 @@ extern MaceStatus Conv2dK3x3(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 7f0250cbc4ebc73cfa52c6041c9da8c95b7e3892..e1979c03a715a8ec0a74bf26d35e3f34484d0c55 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -67,21 +67,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2d(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +MaceStatus Conv2d(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const 
index_t width = output->dim(2); @@ -101,29 +100,36 @@ extern MaceStatus Conv2d(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name, diff --git a/mace/ops/opencl/image/crop.cc b/mace/ops/opencl/image/crop.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad4e703d6f712e699ff6c73296e04559779e5d60 --- /dev/null +++ b/mace/ops/opencl/image/crop.cc @@ -0,0 +1,117 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
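+
+// OpenCL image implementation of the Crop op. It checks that both inputs are
+// 4-D, takes the output shape from the second input along dimensions with a
+// non-negative offset, requires the channel offset to be divisible by 4, and
+// then builds, caches and auto-tunes the 3-D "crop" kernel.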
+ +#include "mace/ops/opencl/image/crop.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus CropKernel::Compute( + OpContext *context, + const std::vector &input_list, + Tensor *output) { + const int32_t inputs_count = static_cast(input_list.size()); + MACE_CHECK(inputs_count >= 2) + << "Crop opencl kernel only support 2 elements input"; + const Tensor *input0 = input_list[0]; + const Tensor *input1 = input_list[1]; + const uint32_t in0_dims = static_cast(input0->dim_size()); + const uint32_t in1_dims = static_cast(input0->dim_size()); + MACE_CHECK(in0_dims == 4 && in1_dims == 4, + "Crop op only supports 4-dims inputs now."); + + std::vector offsets(4, 0); + + std::vector output_shape(input0->shape()); + for (index_t i = 0; i < in0_dims; ++i) { + if (offset_[i] >= 0) { + output_shape[i] = input1->dim(i); + offsets[i] = offset_[i]; + MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i)) + << "the crop for dimension " << i + << " is out of bound, first input size " + << input0->dim(i) << ", offset " << offsets[i] + << ", second input size " << input1->dim(i); + } + } + MACE_CHECK(offsets[3] % 4 == 0, + "MACE opencl only supports cropping channel" + " offset divisible by 4."); + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); + const index_t channel_blk = RoundUpDiv4(output->dim(3)); + const uint32_t gws[3] = { + static_cast(channel_blk), static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)) + }; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); + built_options.emplace("-Dcrop=" + kernel_name); + auto dt = input0->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input0->opencl_image())); + kernel_.setArg(idx++, static_cast(offsets[0])); + kernel_.setArg(idx++, static_cast(offsets[1])); + kernel_.setArg(idx++, static_cast(offsets[2])); + kernel_.setArg(idx++, static_cast(offset_chan_blk)); + kernel_.setArg(idx++, static_cast(input0->dim(1))); + kernel_.setArg(idx++, static_cast(input0->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input0->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("crop_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace 
mace diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index d121b76339239ffd3964a9b756e6bebfaa838d48..c2f1c53aa2383ab89669be2520a9af3c1f2a27c8 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class CropKernel : public OpenCLCropKernel { public: explicit CropKernel( @@ -48,98 +47,6 @@ class CropKernel : public OpenCLCropKernel { std::vector input_shape_; }; -template -MaceStatus CropKernel::Compute( - OpContext *context, - const std::vector &input_list, - Tensor *output) { - const int32_t inputs_count = static_cast(input_list.size()); - MACE_CHECK(inputs_count >= 2) - << "Crop opencl kernel only support 2 elements input"; - const Tensor *input0 = input_list[0]; - const Tensor *input1 = input_list[1]; - const uint32_t in0_dims = static_cast(input0->dim_size()); - const uint32_t in1_dims = static_cast(input0->dim_size()); - MACE_CHECK(in0_dims == 4 && in1_dims == 4, - "Crop op only supports 4-dims inputs now."); - - std::vector offsets(4, 0); - - std::vector output_shape(input0->shape()); - for (index_t i = 0; i < in0_dims; ++i) { - if (offset_[i] >= 0) { - output_shape[i] = input1->dim(i); - offsets[i] = offset_[i]; - MACE_CHECK(input0->dim(i) - offset_[i] >= input1->dim(i)) - << "the crop for dimension " << i - << " is out of bound, first input size " - << input0->dim(i) << ", offset " << offsets[i] - << ", second input size " << input1->dim(i); - } - } - MACE_CHECK(offsets[3] % 4 == 0, - "MACE opencl only supports cropping channel" - " offset divisible by 4."); - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); - const index_t channel_blk = RoundUpDiv4(output->dim(3)); - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)) - }; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); - built_options.emplace("-Dcrop=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input0->opencl_image())); - kernel_.setArg(idx++, static_cast(offsets[0])); - kernel_.setArg(idx++, static_cast(offsets[1])); - kernel_.setArg(idx++, static_cast(offsets[2])); - kernel_.setArg(idx++, static_cast(offset_chan_blk)); - kernel_.setArg(idx++, static_cast(input0->dim(1))); - kernel_.setArg(idx++, static_cast(input0->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - 
std::string tuning_key = - Concat("crop_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/deconv_2d.cc b/mace/ops/opencl/image/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..0509fcf005dc9abf20ad241cf45e8e3cd755a1c7 --- /dev/null +++ b/mace/ops/opencl/image/deconv_2d.cc @@ -0,0 +1,158 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/deconv_2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus Deconv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const std::vector &output_shape, + Tensor *output) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const int stride_h = strides[0]; + const int stride_w = strides[1]; + MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); + const int width_tile = 5; + const index_t n_strides = (width + stride_w - 1) / stride_w; + const index_t width_blocks = + ((n_strides + width_tile - 1) / width_tile) * stride_w; + const float stride_h_r = 1.f / static_cast(stride_h); + const float stride_w_r = 1.f / static_cast(stride_w); + const int padding_h = (padding_data[0] + 1) >> 1; + const int padding_w = (padding_data[1] + 1) >> 1; + + const int align_h = stride_h - 1 - padding_h; + const int align_w = stride_w - 1 - padding_w; + const int kernel_size = filter->dim(2) * filter->dim(3); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); + built_options.emplace("-Ddeconv_2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU: + built_options.emplace("-DUSE_LEAKYRELU"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(filter->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, leakyrelu_coefficient); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(input->dim(3))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(stride_h)); + kernel_.setArg(idx++, static_cast(stride_w)); + kernel_.setArg(idx++, stride_h_r); + kernel_.setArg(idx++, stride_w_r); + kernel_.setArg(idx++, static_cast(align_h)); + kernel_.setArg(idx++, static_cast(align_w)); + kernel_.setArg(idx++, static_cast(padding_h)); + kernel_.setArg(idx++, static_cast(padding_w)); + kernel_.setArg(idx++, static_cast(filter->dim(2))); + kernel_.setArg(idx++, static_cast(filter->dim(3))); + kernel_.setArg(idx++, static_cast(kernel_size)); + kernel_.setArg(idx++, static_cast(input_channel_blocks)); + kernel_.setArg(idx++, static_cast(channel_blocks)); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("deconv2d_opencl_kernel_", activation, output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index 058cc094ac1874bd1c72a588bc6215670daed74b..aa3b9d249b58cd3982866c71ef07b30ee24c75bc 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class Deconv2dKernel : public OpenCLDeconv2dKernel { public: MaceStatus Compute( @@ -52,140 +51,6 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { std::vector input_shape_; }; -template -MaceStatus Deconv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *padding_data, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const std::vector 
&output_shape, - Tensor *output) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - const DataType dt = DataTypeToEnum::value; - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - const index_t input_channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t input_channel_blocks = RoundUpDiv4(input_channels); - const int stride_h = strides[0]; - const int stride_w = strides[1]; - MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); - const int width_tile = 5; - const index_t n_strides = (width + stride_w - 1) / stride_w; - const index_t width_blocks = - ((n_strides + width_tile - 1) / width_tile) * stride_w; - const float stride_h_r = 1.f / static_cast(stride_h); - const float stride_w_r = 1.f / static_cast(stride_w); - const int padding_h = (padding_data[0] + 1) >> 1; - const int padding_w = (padding_data[1] + 1) >> 1; - - const int align_h = stride_h - 1 - padding_h; - const int align_w = stride_w - 1 - padding_w; - const int kernel_size = filter->dim(2) * filter->dim(3); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); - built_options.emplace("-Ddeconv_2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(filter->opencl_image())); - if (bias != nullptr) { - kernel_.setArg(idx++, *(bias->opencl_image())); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit); - kernel_.setArg(idx++, leakyrelu_coefficient); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(input->dim(3))); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, static_cast(stride_h)); - kernel_.setArg(idx++, static_cast(stride_w)); - kernel_.setArg(idx++, stride_h_r); - kernel_.setArg(idx++, stride_w_r); - kernel_.setArg(idx++, static_cast(align_h)); - kernel_.setArg(idx++, static_cast(align_w)); - kernel_.setArg(idx++, static_cast(padding_h)); - kernel_.setArg(idx++, static_cast(padding_w)); - kernel_.setArg(idx++, static_cast(filter->dim(2))); - kernel_.setArg(idx++, static_cast(filter->dim(3))); - kernel_.setArg(idx++, static_cast(kernel_size)); - kernel_.setArg(idx++, static_cast(input_channel_blocks)); - kernel_.setArg(idx++, static_cast(channel_blocks)); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("deconv2d_opencl_kernel_", activation, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depth_to_space.cc b/mace/ops/opencl/image/depth_to_space.cc new file mode 100644 index 0000000000000000000000000000000000000000..b885dddfc93316f3100e7478c3c54246171cafbf --- /dev/null +++ b/mace/ops/opencl/image/depth_to_space.cc @@ -0,0 +1,120 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/opencl/image/depth_to_space.h"
+
+namespace mace {
+namespace ops {
+namespace opencl {
+namespace image {
+
+
+MaceStatus DepthToSpaceKernel::Compute(
+    OpContext *context,
+    const Tensor *input,
+    Tensor *output) {
+  const index_t batch = input->dim(0);
+  const index_t input_height = input->dim(1);
+  const index_t input_width = input->dim(2);
+  const index_t input_depth = input->dim(3);
+
+  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
+             "input depth should be dividable by block_size * block_size ",
+             input_depth);
+
+  const index_t output_height = input_height * block_size_;
+  const index_t output_width = input_width * block_size_;
+  const index_t output_depth = input_depth / (block_size_ * block_size_);
+  MACE_CHECK(output_depth % 4 == 0 || output_depth < 4,
+             "output channel not support:") << output_depth;
+
+  std::vector<index_t> output_shape = {batch,
+                                       output_height,
+                                       output_width,
+                                       output_depth};
+  std::vector<size_t> image_shape;
+  OpenCLUtil::CalImage2DShape(output_shape,
+                              OpenCLBufferType::IN_OUT_CHANNEL,
+                              &image_shape);
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
+
+  uint32_t gws[3];
+  if (output_depth < 3) {
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
+    gws[1] = static_cast<uint32_t>(input_width);
+    gws[2] = static_cast<uint32_t>(input_height * batch);
+  } else {
+    gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
+    gws[1] = static_cast<uint32_t>(output_width);
+    gws[2] = static_cast<uint32_t>(output_height * batch);
+  }
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    const char *kernel_name = "depth_to_space";
+    if (output_depth < 4) {
+      built_options.emplace(MakeString("-DDEPTH", output_depth));
+      if (output_depth != 3) kernel_name = "depth_to_space_d1_d2";
+    }
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+    std::stringstream kernel_name_ss;
+    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+    built_options.emplace(kernel_name_ss.str());
+    auto dt = input->dtype();
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(output_depth));
+    kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input->shape();
+  }
+
+  std::string tuning_key = Concat("depth_to_space",
+                                  batch, output_height,
+                                  output_width, output_depth);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, context->future()));
+
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return 
MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index 990e06ccef6771b2d7ab8a4e8bb31446e7feeb40..ac68cbdb02adffad9b7bfc911363b1c95d2a7a86 100644 --- a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { public: explicit DepthToSpaceKernel(const int block_size) @@ -47,101 +46,6 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { std::vector input_shape_; }; -template -MaceStatus DepthToSpaceKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_depth = input->dim(3); - - MACE_CHECK(input_depth % (block_size_ * block_size_) == 0, - "input depth should be dividable by block_size * block_size ", - input_depth); - - const index_t output_height = input_height * block_size_; - const index_t output_width = input_width * block_size_; - const index_t output_depth = input_depth / (block_size_ * block_size_); - MACE_CHECK(output_depth % 4 == 0 || output_depth < 4, - "output channel not support:") << output_depth; - - std::vector output_shape = {batch, - output_height, - output_width, - output_depth}; - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - uint32_t gws[3]; - if (output_depth < 3) { - gws[0] = static_cast(RoundUpDiv4(input_depth)); - gws[1] = static_cast(input_width); - gws[2] = static_cast(input_height * batch); - } else { - gws[0] = static_cast(RoundUpDiv4(output_depth)); - gws[1] = static_cast(output_width); - gws[2] = static_cast(output_height * batch); - } - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - const char *kernel_name = "depth_to_space"; - if (output_depth < 4) { - built_options.emplace(MakeString("-DDEPTH", output_depth)); - if (output_depth != 3) kernel_name = "depth_to_space_d1_d2"; - } - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", - obfuscated_kernel_name, - built_options, - &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(input_height)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(output_height)); - kernel_.setArg(idx++, 
static_cast(output_width)); - kernel_.setArg(idx++, static_cast(output_depth)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - std::string tuning_key = Concat("depth_to_space", - batch, output_height, - output_width, output_depth); - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depthwise_conv2d.cc b/mace/ops/opencl/image/depthwise_conv2d.cc index 5b86c68414d46517c48382142193187f600efe7b..0101ea136a8b14d825c9d6ec89c074e0d005f01b 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.cc +++ b/mace/ops/opencl/image/depthwise_conv2d.cc @@ -74,7 +74,6 @@ MaceStatus DepthwiseConv2d(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size) { @@ -108,8 +107,8 @@ MaceStatus DepthwiseConv2d(OpContext *context, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace(MakeString("-DSTRIDE=", stride)); switch (activation) { @@ -192,6 +191,62 @@ MaceStatus DepthwiseConv2d(OpContext *context, } } // namespace depthwise + + +MaceStatus DepthwiseConv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + Tensor *output) { + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + if (strides[0] != strides[1]) { + LOG(WARNING) << "OpenCL depthwise conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides[0] << "x" << strides[1] + << " is not implemented yet, using slow version"; + MACE_NOT_IMPLEMENTED; + } + + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->dim(0) * filter->dim(1); + fake_filter_shape[1] = filter->dim(1); + fake_filter_shape[2] = filter->dim(2); + fake_filter_shape[3] = filter->dim(3); + + std::vector output_shape(4); + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), fake_filter_shape.data(), + padding_data.data(), dilations, strides, RoundType::FLOOR, + output_shape.data()); + } + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + return depthwise::DepthwiseConv2d( + context, &kernel_, input, filter, bias, strides[0], 
paddings.data(), + dilations, activation, relux_max_limit, leakyrelu_coefficient, + &input_shape_, output, &kwg_size_); +} + } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h index 13a64076f9200d9f180159b5bd4455aaf316db99..f4bc4f2a0c92adc2edca4c5eb820c2f00f63d680 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.h +++ b/mace/ops/opencl/image/depthwise_conv2d.h @@ -40,14 +40,11 @@ MaceStatus DepthwiseConv2d(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, std::vector *prev_input_shape, Tensor *output, uint32_t *kwg_size); } // namespace depthwise - -template class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: MaceStatus Compute( @@ -70,61 +67,6 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { std::vector input_shape_; }; -template -MaceStatus DepthwiseConv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - Tensor *output) { - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (strides[0] != strides[1]) { - LOG(WARNING) << "OpenCL depthwise conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides[0] << "x" << strides[1] - << " is not implemented yet, using slow version"; - MACE_NOT_IMPLEMENTED; - } - - // Create a fake conv_2d filter to calculate the paddings and output size - std::vector fake_filter_shape(4); - fake_filter_shape[0] = filter->dim(0) * filter->dim(1); - fake_filter_shape[1] = filter->dim(1); - fake_filter_shape[2] = filter->dim(2); - fake_filter_shape[3] = filter->dim(3); - - std::vector output_shape(4); - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), fake_filter_shape.data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), fake_filter_shape.data(), - padding_data.data(), dilations, strides, RoundType::FLOOR, - output_shape.data()); - } - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - return depthwise::DepthwiseConv2d( - context, &kernel_, input, filter, bias, strides[0], paddings.data(), - dilations, activation, relux_max_limit, leakyrelu_coefficient, - DataTypeToEnum::value, &input_shape_, output, &kwg_size_); -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/depthwise_deconv2d.cc b/mace/ops/opencl/image/depthwise_deconv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a8d7eb9919959cc63bcc01f127344fb72ee8af5 --- /dev/null +++ b/mace/ops/opencl/image/depthwise_deconv2d.cc @@ -0,0 +1,165 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/depthwise_deconv2d.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus DepthwiseDeconv2dKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const int group, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + const std::vector &output_shape, + Tensor *output) { + const index_t batch = output_shape[0]; + const index_t height = output_shape[1]; + const index_t width = output_shape[2]; + const index_t channels = output_shape[3]; + const index_t input_channels = input->dim(3); + const index_t multiplier = filter->dim(0); + + MACE_CHECK(group == channels && group == input_channels && multiplier == 1, + "opencl image deconv only supports depthwise type group."); + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + const index_t channel_blocks = RoundUpDiv4(channels); + const int stride_h = strides[0]; + const int stride_w = strides[1]; + MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); + const int width_tile = 5; + const index_t n_strides = (width + stride_w - 1) / stride_w; + const index_t width_blocks = + ((n_strides + width_tile - 1) / width_tile) * stride_w; + const float stride_h_r = 1.f / static_cast(stride_h); + const float stride_w_r = 1.f / static_cast(stride_w); + const int padding_h = (padding_data[0] + 1) >> 1; + const int padding_w = (padding_data[1] + 1) >> 1; + + const int align_h = stride_h - 1 - padding_h; + const int align_w = stride_w - 1 - padding_w; + const int kernel_size = filter->dim(2) * filter->dim(3); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d"); + built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU: + built_options.emplace("-DUSE_LEAKYRELU"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(filter->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, leakyrelu_coefficient); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(stride_h)); + kernel_.setArg(idx++, static_cast(stride_w)); + kernel_.setArg(idx++, stride_h_r); + kernel_.setArg(idx++, stride_w_r); + kernel_.setArg(idx++, static_cast(align_h)); + kernel_.setArg(idx++, static_cast(align_w)); + kernel_.setArg(idx++, static_cast(padding_h)); + kernel_.setArg(idx++, static_cast(padding_w)); + kernel_.setArg(idx++, static_cast(filter->dim(2))); + kernel_.setArg(idx++, static_cast(filter->dim(3))); + kernel_.setArg(idx++, static_cast(kernel_size)); + kernel_.setArg(idx++, static_cast(channel_blocks)); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("depthwise_deconv2d_kernel_", + activation, + output->dim(0), + output->dim(1), + output->dim(2), + output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h index 53d0536fe85b220f635d992d03c065e409a14df0..2055511678d8340da655b298c2a8a163279c95a3 100644 --- a/mace/ops/opencl/image/depthwise_deconv2d.h +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { public: MaceStatus Compute( @@ -53,147 +52,6 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel { std::vector input_shape_; }; -template -MaceStatus DepthwiseDeconv2dKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *padding_data, - const int group, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - const 
std::vector &output_shape, - Tensor *output) { - const index_t batch = output_shape[0]; - const index_t height = output_shape[1]; - const index_t width = output_shape[2]; - const index_t channels = output_shape[3]; - const index_t input_channels = input->dim(3); - const index_t multiplier = filter->dim(0); - - MACE_CHECK(group == channels && group == input_channels && multiplier == 1, - "opencl image deconv only supports depthwise type group."); - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - const DataType dt = DataTypeToEnum::value; - - const index_t channel_blocks = RoundUpDiv4(channels); - const int stride_h = strides[0]; - const int stride_w = strides[1]; - MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0."); - const int width_tile = 5; - const index_t n_strides = (width + stride_w - 1) / stride_w; - const index_t width_blocks = - ((n_strides + width_tile - 1) / width_tile) * stride_w; - const float stride_h_r = 1.f / static_cast(stride_h); - const float stride_w_r = 1.f / static_cast(stride_w); - const int padding_h = (padding_data[0] + 1) >> 1; - const int padding_w = (padding_data[1] + 1) >> 1; - - const int align_h = stride_h - 1 - padding_h; - const int align_w = stride_w - 1 - padding_w; - const int kernel_size = filter->dim(2) * filter->dim(3); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_deconv2d"); - built_options.emplace("-Ddepthwise_deconv2d=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("depthwise_deconv2d", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(filter->opencl_image())); - if (bias != nullptr) { - kernel_.setArg(idx++, *(bias->opencl_image())); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, relux_max_limit); - kernel_.setArg(idx++, leakyrelu_coefficient); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, static_cast(stride_h)); - kernel_.setArg(idx++, static_cast(stride_w)); - kernel_.setArg(idx++, stride_h_r); - kernel_.setArg(idx++, stride_w_r); - kernel_.setArg(idx++, static_cast(align_h)); - kernel_.setArg(idx++, static_cast(align_w)); - kernel_.setArg(idx++, static_cast(padding_h)); - kernel_.setArg(idx++, static_cast(padding_w)); - kernel_.setArg(idx++, static_cast(filter->dim(2))); - kernel_.setArg(idx++, static_cast(filter->dim(3))); - kernel_.setArg(idx++, static_cast(kernel_size)); - kernel_.setArg(idx++, static_cast(channel_blocks)); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("depthwise_deconv2d_kernel_", - activation, - output->dim(0), - output->dim(1), - output->dim(2), - output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/eltwise.cc b/mace/ops/opencl/image/eltwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..437cfce0255f8bcb337242612b4ec08a3c4bfe85 --- /dev/null +++ b/mace/ops/opencl/image/eltwise.cc @@ -0,0 +1,168 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/opencl/image/eltwise.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus EltwiseKernel::Compute( + OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) { + bool swapped = false; + std::string input1_type = ""; + if (input1 == nullptr) { + input1_type = "INPUT_SCALAR"; + } else { + MACE_CHECK((input0->dim_size() == input1->dim_size() + && input0->dim_size() == 4) || + input0->dim_size() == 1 || input1->dim_size() == 1) + << "Inputs of Eltwise op must be same shape or fulfill broadcast logic"; + MACE_CHECK(type_ != EltwiseType::EQUAL) + << "Eltwise op on GPU does not support EQUAL"; + // broadcast + if (input0->size() != input1->size() || + input0->dim_size() != input1->dim_size()) { + if (input0->size() < input1->size() + || input0->dim_size() < input1->dim_size()) { + std::swap(input0, input1); + swapped = true; + } + if (input1->dim_size() == 1 + || (input1->dim(0) == 1 && input1->dim(1) == 1 + && input1->dim(2) == 1)) { + // Tensor-Vector element wise + if (input0->dim(3) == input1->dim(input1->dim_size()-1)) { + input1_type = "INPUT_VECTOR"; + } else { + LOG(FATAL) << "Inputs not match the broadcast logic, " + << MakeString(input0->shape()) << " vs " + << MakeString(input1->shape()); + } + } else { // must be 4-D + if (input0->dim(0) == input1->dim(0) + && input1->dim(1) == 1 + && input1->dim(2) == 1 + && input0->dim(3) == input1->dim(3)) { + input1_type = "INPUT_BATCH_VECTOR"; + } else if (input0->dim(0) == input1->dim(0) + && input0->dim(1) == input1->dim(1) + && input0->dim(2) == input1->dim(2) + && input1->dim(3) == 1) { + // broadcast on channel dimension + input1_type = "INPUT_TENSOR_BC_CHAN"; + } else { + LOG(FATAL) << "Element-Wise op only support broadcast on" + " channel dimension:" + "Tensor-BatchVector(4D-[N,1,1,C]) " + "and Tensor-Tensor(4D-[N,H,W,1]). 
but got " + << MakeString(input0->shape()) << " vs " + << MakeString(input1->shape()); + } + } + } + } + + if (scalar_input_index_ == 0) { + swapped = !swapped; + } + + std::vector output_shape(4); + output_shape[0] = input0->dim(0); + output_shape[1] = input0->dim(1); + output_shape[2] = input0->dim(2); + output_shape[3] = input0->dim(3); + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t batch_height_pixels = batch * height; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(batch_height_pixels)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); + built_options.emplace("-Deltwise=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); + if (!input1_type.empty()) { + built_options.emplace("-D" + input1_type); + } + if (swapped) built_options.emplace("-DSWAPPED"); + if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR"); + if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input0->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input0->opencl_image())); + if (input1 == nullptr) { + kernel_.setArg(idx++, scalar_input_); + } else { + kernel_.setArg(idx++, *(input1->opencl_image())); + } + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + if (!coeff_.empty()) { + kernel_.setArg(idx++, coeff_[0]); + kernel_.setArg(idx++, coeff_[1]); + } + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input0->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index 9c8a1a3133e63d7e8c486ca292f86f0fa2b981db..5678f9c72ffbe2ec706e2b44bd73457f938cb585 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/eltwise.h" +#include "mace/ops/common/eltwise_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ 
-32,7 +32,6 @@ namespace ops { namespace opencl { namespace image { -template class EltwiseKernel : public OpenCLEltwiseKernel { public: explicit EltwiseKernel( @@ -60,150 +59,6 @@ class EltwiseKernel : public OpenCLEltwiseKernel { std::vector input_shape_; }; -template -MaceStatus EltwiseKernel::Compute( - OpContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output) { - bool swapped = false; - std::string input1_type = ""; - if (input1 == nullptr) { - input1_type = "INPUT_SCALAR"; - } else { - MACE_CHECK((input0->dim_size() == input1->dim_size() - && input0->dim_size() == 4) || - input0->dim_size() == 1 || input1->dim_size() == 1) - << "Inputs of Eltwise op must be same shape or fulfill broadcast logic"; - MACE_CHECK(type_ != EltwiseType::EQUAL) - << "Eltwise op on GPU does not support EQUAL"; - // broadcast - if (input0->size() != input1->size() || - input0->dim_size() != input1->dim_size()) { - if (input0->size() < input1->size() - || input0->dim_size() < input1->dim_size()) { - std::swap(input0, input1); - swapped = true; - } - if (input1->dim_size() == 1 - || (input1->dim(0) == 1 && input1->dim(1) == 1 - && input1->dim(2) == 1)) { - // Tensor-Vector element wise - if (input0->dim(3) == input1->dim(input1->dim_size()-1)) { - input1_type = "INPUT_VECTOR"; - } else { - LOG(FATAL) << "Inputs not match the broadcast logic, " - << MakeString(input0->shape()) << " vs " - << MakeString(input1->shape()); - } - } else { // must be 4-D - if (input0->dim(0) == input1->dim(0) - && input1->dim(1) == 1 - && input1->dim(2) == 1 - && input0->dim(3) == input1->dim(3)) { - input1_type = "INPUT_BATCH_VECTOR"; - } else if (input0->dim(0) == input1->dim(0) - && input0->dim(1) == input1->dim(1) - && input0->dim(2) == input1->dim(2) - && input1->dim(3) == 1) { - // broadcast on channel dimension - input1_type = "INPUT_TENSOR_BC_CHAN"; - } else { - LOG(FATAL) << "Element-Wise op only support broadcast on" - " channel dimension:" - "Tensor-BatchVector(4D-[N,1,1,C]) " - "and Tensor-Tensor(4D-[N,H,W,1]). 
but got " - << MakeString(input0->shape()) << " vs " - << MakeString(input1->shape()); - } - } - } - } - - if (scalar_input_index_ == 0) { - swapped = !swapped; - } - - std::vector output_shape(4); - output_shape[0] = input0->dim(0); - output_shape[1] = input0->dim(1); - output_shape[2] = input0->dim(2); - output_shape[3] = input0->dim(3); - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t batch_height_pixels = batch * height; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(batch_height_pixels)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); - built_options.emplace("-Deltwise=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); - if (!input1_type.empty()) { - built_options.emplace("-D" + input1_type); - } - if (swapped) built_options.emplace("-DSWAPPED"); - if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR"); - if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input0->opencl_image())); - if (input1 == nullptr) { - kernel_.setArg(idx++, scalar_input_); - } else { - kernel_.setArg(idx++, *(input1->opencl_image())); - } - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(channels)); - if (!coeff_.empty()) { - kernel_.setArg(idx++, coeff_[0]); - kernel_.setArg(idx++, coeff_[1]); - } - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/fully_connected.cc b/mace/ops/opencl/image/fully_connected.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ec83e91b771d49abc379d4aa312dd5caa90ac18 --- /dev/null +++ b/mace/ops/opencl/image/fully_connected.cc @@ -0,0 +1,162 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/fully_connected.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + + +MaceStatus FullyConnectedKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + const float leakyrelu_coefficient, + Tensor *output) { + std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + const index_t batch = output->dim(0); + const index_t output_size = output->dim(3); + const index_t output_blocks = RoundUpDiv4(output_size); + + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); + built_options.emplace("-Dfully_connected_width=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (bias != nullptr) { + built_options.emplace("-DBIAS"); + } + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + case LEAKYRELU: + built_options.emplace("-DUSE_LEAKYRELU"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, + built_options, &kernel_)); + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(kernel_)); + + gws_ = {4, (wave_size / 4), static_cast(batch * output_blocks)}; + + const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); + lws_ = {gws_[0], gws_[1], inter_local_blks}; + } else { + gws_ = {4, 8, static_cast(batch * output_blocks)}; + + const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); + lws_ = {gws_[0], gws_[1], inter_local_blks}; + } + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + const index_t batch = output->dim(0); + const index_t output_blocks = RoundUpDiv4(output->dim(3)); + gws_[2] = static_cast(batch * output_blocks); + + uint32_t idx = 0; + 
MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws_); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(weight->opencl_image())); + if (bias != nullptr) { + kernel_.setArg(idx++, *(bias->opencl_image())); + } + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)), + nullptr); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(input->dim(3)))); + kernel_.setArg(idx++, static_cast(output_blocks)); + kernel_.setArg(idx++, relux_max_limit); + kernel_.setArg(idx++, leakyrelu_coefficient); + + input_shape_ = input->shape(); + } + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]), + cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws_.size()); + for (size_t i = 0; i < lws_.size(); ++i) { + roundup_gws[i] = RoundUp(gws_[i], lws_[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); + } + MACE_OUT_OF_RANGE_VALIDATION; + MACE_CL_RET_STATUS(error); + + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h index 82386b7444396edb8084e3e28978cedc790e5ff7..9f1bae647f33c5906f31f312c5a094d64ef322e6 100644 --- a/mace/ops/opencl/image/fully_connected.h +++ b/mace/ops/opencl/image/fully_connected.h @@ -23,6 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" +#include "mace/ops/common/activation_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -30,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class FullyConnectedKernel : public OpenCLFullyConnectedKernel { public: MaceStatus Compute( @@ -50,144 +50,6 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel { std::vector input_shape_; }; -template -MaceStatus FullyConnectedKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - const ActivationType activation, - const float relux_max_limit, - const float leakyrelu_coefficient, - Tensor *output) { - std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const index_t batch = output->dim(0); - const index_t output_size = output->dim(3); - const index_t output_blocks = RoundUpDiv4(output_size); - - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); - 
built_options.emplace("-Dfully_connected_width=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (bias != nullptr) { - built_options.emplace("-DBIAS"); - } - switch (activation) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - case LEAKYRELU: - built_options.emplace("-DUSE_LEAKYRELU"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation; - } - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, - built_options, &kernel_)); - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - - gws_ = {4, (wave_size / 4), static_cast(batch * output_blocks)}; - - const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); - lws_ = {gws_[0], gws_[1], inter_local_blks}; - } else { - gws_ = {4, 8, static_cast(batch * output_blocks)}; - - const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]); - lws_ = {gws_[0], gws_[1], inter_local_blks}; - } - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - const index_t batch = output->dim(0); - const index_t output_blocks = RoundUpDiv4(output->dim(3)); - gws_[2] = static_cast(batch * output_blocks); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws_); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(weight->opencl_image())); - if (bias != nullptr) { - kernel_.setArg(idx++, *(bias->opencl_image())); - } - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)), - nullptr); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(input->dim(3)))); - kernel_.setArg(idx++, static_cast(output_blocks)); - kernel_.setArg(idx++, relux_max_limit); - kernel_.setArg(idx++, leakyrelu_coefficient); - - input_shape_ = input->shape(); - } - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]), - cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws_.size()); - for (size_t i = 0; i < lws_.size(); ++i) { - roundup_gws[i] = RoundUp(gws_[i], lws_[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event); - } - MACE_OUT_OF_RANGE_VALIDATION; - MACE_CL_RET_STATUS(error); - - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // 
namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/image_to_buffer.cc b/mace/ops/opencl/image/image_to_buffer.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a54ba740294c64ef4de270c31576abf7b9281dd --- /dev/null +++ b/mace/ops/opencl/image/image_to_buffer.cc @@ -0,0 +1,159 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/image_to_buffer.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ImageToBuffer::Compute(OpContext *context, + const Tensor *input, + const OpenCLBufferType type, + const int wino_blk_size, + Tensor *output) { + auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); + std::vector image_shape; + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); + MACE_RETURN_IF_ERROR(output->Resize(input->shape())); + + uint32_t gws[2] = {static_cast(image_shape[0]), + static_cast(image_shape[1])}; + std::string kernel_name; + switch (type) { + case CONV2D_FILTER:kernel_name = "filter_image_to_buffer"; + break; + case IN_OUT_CHANNEL:kernel_name = "in_out_image_to_buffer"; + break; + case ARGUMENT:kernel_name = "arg_image_to_buffer"; + break; + case IN_OUT_HEIGHT:kernel_name = "in_out_height_image_to_buffer"; + break; + case WINOGRAD_FILTER: { + std::stringstream ss_tmp; + gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); + ss_tmp << "winograd_filter_image_to_buffer_" + << wino_blk_size << "x" << wino_blk_size; + kernel_name = ss_tmp.str(); + break; + } + case WEIGHT_HEIGHT:kernel_name = "weight_height_image_to_buffer"; + break; + case WEIGHT_WIDTH:kernel_name = "weight_width_image_to_buffer"; + break; + case DW_CONV2D_FILTER: + case IN_OUT_WIDTH:LOG(FATAL) + << "IN_OUT_WIDTH only support buffer to image now"; + break; + } + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + if (output->dtype() == input->dtype()) { + auto data_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, 
*(output->opencl_buffer())); + if (type == CONV2D_FILTER) { + const index_t + inner_size = output->dim(1) * output->dim(2) * output->dim(3); + kernel_.setArg(idx++, static_cast(output->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(3))); + kernel_.setArg(idx++, static_cast(inner_size)); + } else if (type == ARGUMENT) { + kernel_.setArg(idx++, static_cast(output->dim(0))); + } else if (type == WEIGHT_HEIGHT) { + kernel_.setArg(idx++, static_cast(output->dim(0))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, static_cast(output->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(3))); + } else { + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[1])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[2])); + kernel_.setArg(idx++, + static_cast(formatted_buffer_shape[3])); + } + kernel_.setArg(idx++, *(input->opencl_image())); + input_shape_ = input->shape(); + } + + const uint32_t kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + const std::vector lws = {16, kwg_size / 16}; + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index b91b8ba1fe07fcee684318169cf77aa543297c44..85893f6b283da5b659d30466568c65c52d931954 100644 --- a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -28,7 +28,6 @@ namespace ops { namespace opencl { namespace image { -template class ImageToBuffer : public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, @@ -42,150 +41,6 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { std::vector input_shape_; }; -template -MaceStatus ImageToBuffer::Compute(OpContext *context, - const Tensor *input, - const OpenCLBufferType type, - const int wino_blk_size, - Tensor *output) { - auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); - std::vector image_shape; - OpenCLUtil::CalImage2DShape(formatted_buffer_shape, - type, - &image_shape, - wino_blk_size); - MACE_RETURN_IF_ERROR(output->Resize(input->shape())); - - uint32_t gws[2] = {static_cast(image_shape[0]), - static_cast(image_shape[1])}; - std::string kernel_name; - switch (type) { - case CONV2D_FILTER: - kernel_name = "filter_image_to_buffer"; - break; - case IN_OUT_CHANNEL: - kernel_name = "in_out_image_to_buffer"; - break; - case ARGUMENT: - kernel_name = "arg_image_to_buffer"; - break; - case IN_OUT_HEIGHT: - kernel_name = "in_out_height_image_to_buffer"; - break; - case WINOGRAD_FILTER: { - 
std::stringstream ss_tmp; - gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2); - ss_tmp << "winograd_filter_image_to_buffer_" - << wino_blk_size << "x" << wino_blk_size; - kernel_name = ss_tmp.str(); - break; - } - case WEIGHT_HEIGHT: - kernel_name = "weight_height_image_to_buffer"; - break; - case WEIGHT_WIDTH: - kernel_name = "weight_width_image_to_buffer"; - break; - case DW_CONV2D_FILTER: - case IN_OUT_WIDTH: - LOG(FATAL) << "IN_OUT_WIDTH only support buffer to image now"; - break; - } - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - if (output->dtype() == input->dtype()) { - built_options.emplace( - "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - } else { - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, - built_options, - &kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(output->opencl_buffer())); - if (type == CONV2D_FILTER) { - const index_t - inner_size = output->dim(1) * output->dim(2) * output->dim(3); - kernel_.setArg(idx++, static_cast(output->dim(0))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(3))); - kernel_.setArg(idx++, static_cast(inner_size)); - } else if (type == ARGUMENT) { - kernel_.setArg(idx++, static_cast(output->dim(0))); - } else if (type == WEIGHT_HEIGHT) { - kernel_.setArg(idx++, static_cast(output->dim(0))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, static_cast(output->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(3))); - } else { - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[1])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[2])); - kernel_.setArg(idx++, - static_cast(formatted_buffer_shape[3])); - } - kernel_.setArg(idx++, *(input->opencl_image())); - input_shape_ = input->shape(); - } - - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {16, kwg_size / 16}; - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, 
event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/lstm_cell.cc b/mace/ops/opencl/image/lstm_cell.cc new file mode 100644 index 0000000000000000000000000000000000000000..987d0b1b338eb20460fed030b94b7858efbd6211 --- /dev/null +++ b/mace/ops/opencl/image/lstm_cell.cc @@ -0,0 +1,104 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/lstm_cell.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus LSTMCellKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output) { + MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, + "LSTM hidden units should be a multiple of 4"); + + const index_t height = input->dim(0); + const index_t width = input->dim(1); + const index_t hidden_units = pre_output->dim(1); + const index_t w_blocks = hidden_units >> 2; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); + built_options.emplace("-Dlstmcell=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[2] = {static_cast(w_blocks), + static_cast(height)}; + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + std::vector output_shape_padded = {height, 1, 1, hidden_units}; + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape_padded, + OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), + output_image_shape)); + MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), + output_image_shape)); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(pre_output->opencl_image())); + kernel_.setArg(idx++, *(weight->opencl_image())); + kernel_.setArg(idx++, *(bias->opencl_image())); + kernel_.setArg(idx++, *(pre_cell->opencl_image())); + kernel_.setArg(idx++, forget_bias_); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(hidden_units)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(width))); + kernel_.setArg(idx++, 
*(cell->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = {kwg_size_ / 16, 16, 0}; + std::string tuning_key = + Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + MACE_OUT_OF_RANGE_VALIDATION; + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h index 1e45b2261edcdecbdbfe3b0c7a2a4dceed559306..006374f9d099df01f866231f1756c97ec4b16190 100644 --- a/mace/ops/opencl/image/lstm_cell.h +++ b/mace/ops/opencl/image/lstm_cell.h @@ -30,11 +30,10 @@ namespace ops { namespace opencl { namespace image { -template class LSTMCellKernel : public OpenCLLSTMCellKernel { public: explicit LSTMCellKernel( - const T forget_bias) + const float forget_bias) : forget_bias_(forget_bias) {} MaceStatus Compute( OpContext *context, @@ -47,93 +46,12 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { Tensor *output) override; private: - T forget_bias_; + float forget_bias_; cl::Kernel kernel_; uint32_t kwg_size_; std::vector input_shape_; }; -template -MaceStatus LSTMCellKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output) { - MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, - "LSTM hidden units should be a multiple of 4"); - - const index_t height = input->dim(0); - const index_t width = input->dim(1); - const index_t hidden_units = pre_output->dim(1); - const index_t w_blocks = hidden_units >> 2; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); - built_options.emplace("-Dlstmcell=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[2] = {static_cast(w_blocks), - static_cast(height)}; - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - std::vector output_shape_padded = {height, 1, 1, hidden_units}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape_padded, - OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), - output_image_shape)); - MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), - output_image_shape)); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(pre_output->opencl_image())); - kernel_.setArg(idx++, *(weight->opencl_image())); - kernel_.setArg(idx++, *(bias->opencl_image())); - kernel_.setArg(idx++, *(pre_cell->opencl_image())); - kernel_.setArg(idx++, static_cast(forget_bias_)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, 
static_cast(hidden_units)); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(width))); - kernel_.setArg(idx++, *(cell->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::string tuning_key = - Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - MACE_OUT_OF_RANGE_VALIDATION; - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/matmul.cc b/mace/ops/opencl/image/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..a16d845d09bd5d6778ba100b1f8b93b43ff07ddf --- /dev/null +++ b/mace/ops/opencl/image/matmul.cc @@ -0,0 +1,98 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/matmul.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus MatMulKernel::Compute( + OpContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b) { + MACE_CHECK(!transpose_a && !transpose_b, + "GPU does not support transpose matmul"); + + index_t rank = A->dim_size(); + index_t height = A->dim(rank - 2); + index_t K = A->dim(rank - 1); + index_t width = B->dim(rank - 1); + index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1, + std::multiplies()); + + std::vector c_shape = A->shape(); + c_shape[rank - 2] = height; + c_shape[rank - 1] = width; + std::vector c_image_shape; + std::vector padded_c_shape = {batch, height, width, 1}; + OpenCLUtil::CalImage2DShape(padded_c_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &c_image_shape); + MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); + + const index_t height_blocks = RoundUpDiv4(height); + const index_t width_blocks = RoundUpDiv4(width); + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * batch), + }; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_2D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(A->opencl_image())); + kernel_.setArg(idx++, *(B->opencl_image())); + kernel_.setArg(idx++, 
*(C->opencl_image())); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(K)); + kernel_.setArg(idx++, static_cast(height_blocks)); + kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); + + const std::vector lws = {kwg_size_ / 64, 64, 0}; + std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h index 1cd5e7b1688e269b5b2c0cd1bbb0c17539572545..afd4792cba2eb3a33ccbf88959481ebb0cb3f225 100644 --- a/mace/ops/opencl/image/matmul.h +++ b/mace/ops/opencl/image/matmul.h @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class MatMulKernel : public OpenCLMatMulKernel { public: MaceStatus Compute( @@ -47,81 +46,6 @@ class MatMulKernel : public OpenCLMatMulKernel { uint32_t kwg_size_; }; -template -MaceStatus MatMulKernel::Compute( - OpContext *context, - const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b) { - MACE_CHECK(!transpose_a && !transpose_b, - "GPU does not support transpose matmul"); - - index_t rank = A->dim_size(); - index_t height = A->dim(rank - 2); - index_t K = A->dim(rank - 1); - index_t width = B->dim(rank - 1); - index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1, - std::multiplies()); - - std::vector c_shape = A->shape(); - c_shape[rank - 2] = height; - c_shape[rank - 1] = width; - std::vector c_image_shape; - std::vector padded_c_shape = {batch, height, width, 1}; - OpenCLUtil::CalImage2DShape(padded_c_shape, - OpenCLBufferType::IN_OUT_HEIGHT, - &c_image_shape); - MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); - - const index_t height_blocks = RoundUpDiv4(height); - const index_t width_blocks = RoundUpDiv4(width); - const uint32_t gws[2] = { - static_cast(width_blocks), - static_cast(height_blocks * batch), - }; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - auto dt = DataTypeToEnum::value; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); - built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(A->opencl_image())); - kernel_.setArg(idx++, *(B->opencl_image())); - kernel_.setArg(idx++, *(C->opencl_image())); - kernel_.setArg(idx++, static_cast(height)); - kernel_.setArg(idx++, static_cast(width)); - kernel_.setArg(idx++, static_cast(K)); - kernel_.setArg(idx++, static_cast(height_blocks)); - kernel_.setArg(idx++, static_cast(RoundUpDiv4(K))); - - const std::vector lws = {kwg_size_ / 64, 64, 0}; - std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); - 
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/pad.cc b/mace/ops/opencl/image/pad.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d057a69f120b8f86d00c771688375fdd7194f04 --- /dev/null +++ b/mace/ops/opencl/image/pad.cc @@ -0,0 +1,124 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/pad.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus PadKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_CHECK(this->paddings_.size() == + static_cast((input->dim_size() * 2))); + MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && + (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) + << "Mace only support height/width dimension now"; + for (int i = 2; i <= 5; ++i) { + MACE_CHECK(paddings_[i] >= 0); + } + auto input_shape = input->shape(); + if (type_ == PadType::REFLECT) { + MACE_CHECK(paddings_[2] < input_shape[1] && + paddings_[3] < input_shape[1] && + paddings_[4] < input_shape[2] && + paddings_[5] < input_shape[2]); + } else if (type_ == PadType::SYMMETRIC) { + MACE_CHECK(paddings_[2] <= input_shape[1] && + paddings_[3] <= input_shape[1] && + paddings_[4] <= input_shape[2] && + paddings_[5] <= input_shape[2]); + } else { + MACE_CHECK(type_ == PadType::CONSTANT); + } + std::vector output_shape = { + input_shape[0] + this->paddings_[0] + this->paddings_[1], + input_shape[1] + this->paddings_[2] + this->paddings_[3], + input_shape[2] + this->paddings_[4] + this->paddings_[5], + input_shape[3] + this->paddings_[6] + this->paddings_[7]}; + + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); + built_options.emplace("-Dpad=" + kernel_name); + auto dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + + built_options.emplace(MakeString("-DPAD_TYPE=", type_)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const 
uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + int idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + if (type_ == PadType::CONSTANT) { + kernel_.setArg(idx++, this->constant_value_); + } + kernel_.setArg(idx++, static_cast(input_shape[1])); + kernel_.setArg(idx++, static_cast(input_shape[2])); + kernel_.setArg(idx++, static_cast(output_shape[1])); + kernel_.setArg(idx++, this->paddings_[2]); + kernel_.setArg(idx++, this->paddings_[4]); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h index a8a5212312ebbe74c3dd15b45346841dbd3c5c9b..f4b8278bbbfc4f67e1e16622baac4517ac441fb6 100644 --- a/mace/ops/opencl/image/pad.h +++ b/mace/ops/opencl/image/pad.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/pad.h" +#include "mace/ops/common/pad_type.h" #include "mace/ops/opencl/helper.h" namespace mace { @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class PadKernel : public OpenCLPadKernel { public: PadKernel(const PadType type, @@ -53,105 +52,6 @@ class PadKernel : public OpenCLPadKernel { std::vector input_shape_; }; -template -MaceStatus PadKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - MACE_CHECK(this->paddings_.size() == - static_cast((input->dim_size() * 2))); - MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && - (this->paddings_[6] == 0) && (this->paddings_[7] == 0)) - << "Mace only support height/width dimension now"; - for (int i = 2; i <= 5; ++i) { - MACE_CHECK(paddings_[i] >= 0); - } - auto input_shape = input->shape(); - if (type_ == PadType::REFLECT) { - MACE_CHECK(paddings_[2] < input_shape[1] && - paddings_[3] < input_shape[1] && - paddings_[4] < input_shape[2] && - paddings_[5] < input_shape[2]); - } else if (type_ == PadType::SYMMETRIC) { - MACE_CHECK(paddings_[2] <= input_shape[1] && - paddings_[3] <= input_shape[1] && - paddings_[4] <= input_shape[2] && - paddings_[5] <= input_shape[2]); - } else { - MACE_CHECK(type_ == PadType::CONSTANT); - } - std::vector output_shape = { - input_shape[0] + this->paddings_[0] + this->paddings_[1], - input_shape[1] + this->paddings_[2] + this->paddings_[3], - input_shape[2] + this->paddings_[4] + this->paddings_[5], - input_shape[3] + this->paddings_[6] + this->paddings_[7]}; - - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - const index_t batch = output->dim(0); - const index_t height = output->dim(1); - const index_t width = output->dim(2); - const index_t channels = output->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - - auto runtime = 
context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); - built_options.emplace("-Dpad=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - built_options.emplace(MakeString("-DPAD_TYPE=", type_)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - MACE_OUT_OF_RANGE_INIT(kernel_); - - if (!IsVecEqual(input_shape_, input->shape())) { - int idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - if (type_ == PadType::CONSTANT) { - kernel_.setArg(idx++, this->constant_value_); - } - kernel_.setArg(idx++, static_cast(input_shape[1])); - kernel_.setArg(idx++, static_cast(input_shape[2])); - kernel_.setArg(idx++, static_cast(output_shape[1])); - kernel_.setArg(idx++, this->paddings_[2]); - kernel_.setArg(idx++, this->paddings_[4]); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/pooling.cc b/mace/ops/opencl/image/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..8d48e4d8997f9348ccdcd9de057753b271815991 --- /dev/null +++ b/mace/ops/opencl/image/pooling.cc @@ -0,0 +1,127 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
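// Illustrative sketch (not part of the patch): how PadKernel::Compute above
// derives the NHWC output shape from the flat 8-element paddings vector (two
// entries per dimension: before/after for N, H, W, C). The OpenCL image
// kernel only pads H and W, so the N and C entries must be zero. Plain
// std::vector shapes are used here instead of MACE tensors.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> PaddedOutputShape(const std::vector<int64_t> &input_shape,
                                       const std::vector<int> &paddings) {
  assert(input_shape.size() == 4 && paddings.size() == 8);
  // Batch and channel padding are rejected by the GPU kernel.
  assert(paddings[0] == 0 && paddings[1] == 0 &&
         paddings[6] == 0 && paddings[7] == 0);
  std::vector<int64_t> out(4);
  for (int d = 0; d < 4; ++d) {
    out[d] = input_shape[d] + paddings[2 * d] + paddings[2 * d + 1];
  }
  return out;
}

int main() {
  // Pad a 1x16x16x8 image by 1 pixel on each side of H and W -> 1x18x18x8.
  auto out = PaddedOutputShape({1, 16, 16, 8}, {0, 0, 1, 1, 1, 1, 0, 0});
  for (auto d : out) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}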
+ +#include "mace/ops/opencl/image/pooling.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus PoolingKernel::Compute( + OpContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const RoundType round_type, + Tensor *output) { + MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) + << "Pooling opencl kernel not support dilation yet"; + + std::vector output_shape(4); + std::vector filter_shape = {input->dim(3), input->dim(3), + kernels[0], kernels[1]}; + + std::vector paddings(2); + if (padding_data.empty()) { + ops::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), dilations, strides, + padding_type, output_shape.data(), paddings.data()); + } else { + paddings = padding_data; + CalcOutputSize(input->shape().data(), filter_shape.data(), + padding_data.data(), dilations, strides, round_type, + output_shape.data()); + } + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); + built_options.emplace("-Dpooling=" + kernel_name); + + if (pooling_type == MAX && input->dtype() == output->dtype()) { + auto data_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(data_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(data_dt)); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + } + if (pooling_type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = { + static_cast(RoundUpDiv4(output->dim(3))), + static_cast(output->dim(2)), + static_cast(output->dim(0) * output->dim(1)), + }; + MACE_OUT_OF_RANGE_INIT(kernel_); + + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(input->dim(1))); + kernel_.setArg(idx++, static_cast(input->dim(2))); + kernel_.setArg(idx++, static_cast(output->dim(1))); + kernel_.setArg(idx++, paddings[0] / 2); + kernel_.setArg(idx++, paddings[1] / 2); + kernel_.setArg(idx++, strides[0]); + kernel_.setArg(idx++, strides[1]); + kernel_.setArg(idx++, kernels[0]); + kernel_.setArg(idx++, kernels[1]); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = pooling::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace 
ops +} // namespace mace diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h index 768a75caeb3f1fc00c32973f183cec7bf9c5979f..8d709368c8f9d2154cbd60eb07c1a9742fc2f506 100644 --- a/mace/ops/opencl/image/pooling.h +++ b/mace/ops/opencl/image/pooling.h @@ -57,7 +57,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace pooling -template class PoolingKernel : public OpenCLPoolingKernel { public: MaceStatus Compute( @@ -78,109 +77,6 @@ class PoolingKernel : public OpenCLPoolingKernel { std::vector input_shape_; }; -template -MaceStatus PoolingKernel::Compute( - OpContext *context, - const Tensor *input, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const RoundType round_type, - Tensor *output) { - MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) - << "Pooling opencl kernel not support dilation yet"; - - std::vector output_shape(4); - std::vector filter_shape = {input->dim(3), input->dim(3), - kernels[0], kernels[1]}; - - std::vector paddings(2); - if (padding_data.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), dilations, strides, - padding_type, output_shape.data(), paddings.data()); - } else { - paddings = padding_data; - CalcOutputSize(input->shape().data(), filter_shape.data(), - padding_data.data(), dilations, strides, round_type, - output_shape.data()); - } - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); - built_options.emplace("-Dpooling=" + kernel_name); - - if (pooling_type == MAX && input->dtype() == output->dtype()) { - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - } else { - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - } - if (pooling_type == AVG) { - built_options.emplace("-DPOOL_AVG"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = { - static_cast(RoundUpDiv4(output->dim(3))), - static_cast(output->dim(2)), - static_cast(output->dim(0) * output->dim(1)), - }; - MACE_OUT_OF_RANGE_INIT(kernel_); - - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(input->dim(1))); - kernel_.setArg(idx++, static_cast(input->dim(2))); - kernel_.setArg(idx++, static_cast(output->dim(1))); - kernel_.setArg(idx++, paddings[0] / 2); - kernel_.setArg(idx++, paddings[1] / 2); - kernel_.setArg(idx++, strides[0]); - kernel_.setArg(idx++, strides[1]); - kernel_.setArg(idx++, kernels[0]); - kernel_.setArg(idx++, kernels[1]); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = 
input->shape(); - } - - const std::vector lws = pooling::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/reduce.cc b/mace/ops/opencl/image/reduce.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee7e2ce1c0d99a9cab3e77c08826827b02805a0f --- /dev/null +++ b/mace/ops/opencl/image/reduce.cc @@ -0,0 +1,140 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/reduce.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ReduceKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + MACE_CHECK_NOTNULL(input); + index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channel_blocks = RoundUpDiv4(channels); + const uint32_t image_size = static_cast(in_height * in_width); + + std::vector gws(3); + std::vector lws(3); + std::vector output_shape{batch, 1, 1, channels}; + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce"); + built_options.emplace("-Dreduce=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_)); + if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { + built_options.emplace("-DNON_QUALCOMM_ADRENO"); + } + MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { + const uint32_t wave_size = + static_cast(runtime->GetKernelWaveSize(kernel_)); + gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; + } else { + // Ensure each kernel has at least 4 input elements. 
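// Exposition only (this restates the scheduling in the code that follows, it
// adds no behavior): on non-Adreno GPUs the H*W plane of each
// (batch, channel-block) pair is reduced by a single work-group of
// gws[0] * gws[1] = 4 * clamp(image_size / 16, 1, 16) items, so each item
// covers roughly four or more pixels for typical sizes; group_num is that
// work-group size, compute_size = ceil(image_size / group_num) is the
// per-item workload, last_index marks where the remainder of the division
// falls, and scale = 1 / (H * W) converts the accumulated sum into a mean
// when the reduce type requires it.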
+ gws = {4, image_size / 16, static_cast(batch * channel_blocks)}; + if (gws[1] == 0) { + gws[1] = 1; + } else if (gws[1] > 16) { + gws[1] = 16; + } + } + lws = {gws[0], gws[1], 1}; + const int group_num = lws[0] * lws[1] * lws[2]; + // Each kernel intends to compute compute_size elements. + const int compute_size = (image_size + group_num - 1) / group_num; + const int last_index = image_size % group_num; + const float scale = 1.f / (in_width * in_height); + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, (group_num * 4 * sizeof(float)), + nullptr); + kernel_.setArg(idx++, static_cast(group_num)); + kernel_.setArg(idx++, static_cast(compute_size)); + kernel_.setArg(idx++, static_cast(last_index)); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, scale); + kernel_.setArg(idx++, static_cast(channel_blocks)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index fa69a11621c5f395be237bed7867c356b576a844..992ac1b1491c1ccfeba27ad39b743ab568354797 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -24,20 +24,18 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/reduce.h" +#include "mace/ops/common/reduce_type.h" namespace mace { namespace ops { namespace opencl { namespace image { -template class ReduceKernel : public OpenCLReduceKernel { public: ReduceKernel(ReduceType type, - const std::vector &axis, - const bool keep_dims) - : reduce_type_(type), axis_(axis), keep_dims_(keep_dims) {} + const std::vector &axis) + : reduce_type_(type), axis_(axis) {} MaceStatus Compute( OpContext *context, @@ -47,129 +45,11 @@ class ReduceKernel : public OpenCLReduceKernel { private: ReduceType reduce_type_; const std::vector axis_; - bool keep_dims_; cl::Kernel kernel_; uint32_t kwg_size_; std::vector input_shape_; }; -template -MaceStatus ReduceKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - MACE_CHECK_NOTNULL(input); - index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - const index_t 
channel_blocks = RoundUpDiv4(channels); - const uint32_t image_size = static_cast(in_height * in_width); - - std::vector gws(3); - std::vector lws(3); - std::vector output_shape{batch, 1, 1, channels}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce"); - built_options.emplace("-Dreduce=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DREDUCE_TYPE=", reduce_type_)); - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; - } else { - // Ensure each kernel has at least 4 input elements. - gws = {4, image_size / 16, static_cast(batch * channel_blocks)}; - if (gws[1] == 0) { - gws[1] = 1; - } else if (gws[1] > 16) { - gws[1] = 16; - } - } - lws = {gws[0], gws[1], 1}; - const int group_num = lws[0] * lws[1] * lws[2]; - // Each kernel intends to compute compute_size elements. 
- const int compute_size = (image_size + group_num - 1) / group_num; - const int last_index = image_size % group_num; - const float scale = 1.f / (in_width * in_height); - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, (group_num * 4 * sizeof(float)), - nullptr); - kernel_.setArg(idx++, static_cast(group_num)); - kernel_.setArg(idx++, static_cast(compute_size)); - kernel_.setArg(idx++, static_cast(last_index)); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, scale); - kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/resize_bicubic.cc b/mace/ops/opencl/image/resize_bicubic.cc new file mode 100644 index 0000000000000000000000000000000000000000..e09b5640d55c9a672a39146e4bbc3c683d21f06c --- /dev/null +++ b/mace/ops/opencl/image/resize_bicubic.cc @@ -0,0 +1,110 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
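The resize_bicubic.cc added below (like the bilinear and nearest-neighbor kernels later in this diff) now builds with a fixed float OpenCL type and takes its sampling step from common::utils::CalculateResizeScale. A minimal sketch of the usual align_corners convention, assumed here to match that helper's behavior (the free function below is an illustration, not MACE's implementation):

#include <cstdint>

inline float ResizeScale(int64_t in_size, int64_t out_size, bool align_corners) {
  // With align_corners the first and last samples of input and output coincide,
  // so the step is (in - 1) / (out - 1); otherwise it is the plain size ratio.
  if (align_corners && out_size > 1) {
    return static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1);
  }
  return static_cast<float>(in_size) / static_cast<float>(out_size);
}

The -DTABLE_SIZE build option then fixes the resolution of the precomputed bicubic coefficient table used inside the OpenCL kernel (the header in this diff defines kTableSize as 1u << 10).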
+ +#include "mace/ops/opencl/image/resize_bicubic.h" + +#include "mace/ops/common/utils.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ResizeBicubicKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); + built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + built_options.emplace( + MakeString("-DTABLE_SIZE=", common::utils::kTableSize)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bicubic", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + common::utils::CalculateResizeScale( + in_height, out_height, align_corners_); + float width_scale = + common::utils::CalculateResizeScale( + in_width, out_width, align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h index 31957af017b7ab82413595ad22bec73454e13029..cb215f19aa6a22fb3f919b2048b85e084c35667e 100644 --- a/mace/ops/opencl/image/resize_bicubic.h +++ b/mace/ops/opencl/image/resize_bicubic.h @@ -25,13 +25,14 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/resize_bicubic.h" namespace mace { namespace ops { namespace opencl { namespace image 
{ namespace resize_bicubic { +constexpr int64_t kTableSize = (1u << 10); + inline std::vector LocalWS(OpenCLRuntime *runtime, const uint32_t *gws, const uint32_t kwg_size) { @@ -60,7 +61,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace resize_bicubic -template class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { public: ResizeBicubicKernel(bool align_corners, @@ -84,92 +84,6 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { std::vector input_shape_; }; -template -MaceStatus ResizeBicubicKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t out_height = out_height_; - const index_t out_width = out_width_; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - auto dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); - built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace( - MakeString("-DTABLE_SIZE=", - mace::ops::resize_bicubic::kTableSize)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_bicubic", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - mace::ops::resize_bicubic::CalculateResizeScale( - in_height, out_height, align_corners_); - float width_scale = - mace::ops::resize_bicubic::CalculateResizeScale( - in_width, out_width, align_corners_); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - - input_shape_ = input->shape(); - } - - const std::vector - lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/resize_bilinear.cc b/mace/ops/opencl/image/resize_bilinear.cc 
new file mode 100644 index 0000000000000000000000000000000000000000..91d82e821d2dc21da48990b22423962ee4decede --- /dev/null +++ b/mace/ops/opencl/image/resize_bilinear.cc @@ -0,0 +1,110 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/resize_bilinear.h" + +#include "mace/ops/common/utils.h" + + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ResizeBilinearKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t out_height = out_height_; + const index_t out_width = out_width_; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); + built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bilinear", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), + output->dim(2), output->dim(3)); + 
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h index 5b778122d59cb3dd6a768ceeb202413647b49691..68b1478dc81d620cc2bde198b02c221913b7939f 100644 --- a/mace/ops/opencl/image/resize_bilinear.h +++ b/mace/ops/opencl/image/resize_bilinear.h @@ -25,7 +25,6 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/resize_bilinear.h" namespace mace { namespace ops { @@ -65,12 +64,11 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace resize_bilinear -template class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { public: ResizeBilinearKernel(bool align_corners, - const index_t out_height, - const index_t out_width) + const index_t out_height, + const index_t out_width) : align_corners_(align_corners), out_height_(out_height), out_width_(out_width) {} @@ -89,90 +87,6 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { std::vector input_shape_; }; -template -MaceStatus ResizeBilinearKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - const index_t channel_blocks = RoundUpDiv4(channels); - const index_t out_height = out_height_; - const index_t out_width = out_width_; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); - built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_bilinear", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - mace::ops::resize_bilinear::CalculateResizeScale(in_height, - out_height, - align_corners_); - float width_scale = - mace::ops::resize_bilinear::CalculateResizeScale(in_width, - out_width, - align_corners_); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - 
kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - - input_shape_ = input->shape(); - } - - const std::vector - lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), - output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.cc b/mace/ops/opencl/image/resize_nearest_neighbor.cc new file mode 100644 index 0000000000000000000000000000000000000000..afb4b151d4ed0ea6ad17030025bf82123adf5d3d --- /dev/null +++ b/mace/ops/opencl/image/resize_nearest_neighbor.cc @@ -0,0 +1,110 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/resize_nearest_neighbor.h" + +#include "mace/ops/common/utils.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus ResizeNearestNeighborKernel::Compute( + OpContext *context, + const Tensor *input, + const Tensor *size, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + Tensor::MappingGuard size_mapper(size); + const index_t out_height = size->data()[0]; + const index_t out_width = size->data()[1]; + const index_t channel_blocks = RoundUpDiv4(channels); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL( + "resize_nearest_neighbor_nocache"); + built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_nearest_neighbor", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector output_shape{batch, out_height, out_width, channels}; + + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); + + float height_scale = + common::utils::CalculateResizeScale( + in_height, out_height, 
align_corners_); + float width_scale = + common::utils::CalculateResizeScale( + in_width, out_width, align_corners_); + + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, *(output->opencl_image())); + kernel_.setArg(idx++, height_scale); + kernel_.setArg(idx++, width_scale); + kernel_.setArg(idx++, static_cast(in_height)); + kernel_.setArg(idx++, static_cast(in_width)); + kernel_.setArg(idx++, static_cast(out_height)); + kernel_.setArg(idx++, static_cast(align_corners_)); + + input_shape_ = input->shape(); + } + + const std::vector + lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.h b/mace/ops/opencl/image/resize_nearest_neighbor.h index 8f5bca6b029599e7a42899453279c4f77758196b..98ef37b28944521123996fbb38f6688d90a277c0 100644 --- a/mace/ops/opencl/image/resize_nearest_neighbor.h +++ b/mace/ops/opencl/image/resize_nearest_neighbor.h @@ -25,7 +25,6 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/opencl/helper.h" -#include "mace/ops/resize_nearest_neighbor.h" namespace mace { namespace ops { @@ -65,7 +64,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } // namespace resize_nearest_neighbor -template class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { public: explicit ResizeNearestNeighborKernel(bool align_corners) @@ -84,91 +82,6 @@ class ResizeNearestNeighborKernel : public OpenCLResizeNearestNeighborKernel { std::vector input_shape_; }; -template -MaceStatus ResizeNearestNeighborKernel::Compute( - OpContext *context, - const Tensor *input, - const Tensor *size, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - Tensor::MappingGuard size_mapper(size); - const index_t out_height = size->data()[0]; - const index_t out_width = size->data()[1]; - const index_t channel_blocks = RoundUpDiv4(channels); - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL( - "resize_nearest_neighbor_nocache"); - built_options.emplace("-Dresize_nearest_neighbor_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - MACE_RETURN_IF_ERROR( - runtime->BuildKernel("resize_nearest_neighbor", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - MACE_CHECK(out_height > 0 && out_width 
> 0); - std::vector output_shape{batch, out_height, out_width, channels}; - - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - float height_scale = - mace::ops::resize_nearest_neighbor::CalculateResizeScale( - in_height, out_height, align_corners_); - float width_scale = - mace::ops::resize_nearest_neighbor::CalculateResizeScale( - in_width, out_width, align_corners_); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, *(output->opencl_image())); - kernel_.setArg(idx++, height_scale); - kernel_.setArg(idx++, width_scale); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, static_cast(out_height)); - kernel_.setArg(idx++, static_cast(align_corners_)); - - input_shape_ = input->shape(); - } - - const std::vector - lws = resize_nearest_neighbor::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("resize_nearest_neighbor_opencl_kernel", output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/softmax.cc b/mace/ops/opencl/image/softmax.cc new file mode 100644 index 0000000000000000000000000000000000000000..f37b76d6f5ebec1fe9f4ab8b533848fca1dfd3be --- /dev/null +++ b/mace/ops/opencl/image/softmax.cc @@ -0,0 +1,98 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
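The softmax kernel added below works, like the other image kernels in this diff, on an RGBA-packed image layout: four channels share one texel, so the grid is sized in channel blocks of four and the kernel is told how many lanes of the last block are padding (remain_channels). A standalone sketch of that arithmetic, reimplemented here for illustration only:

#include <cstdint>

inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

inline int64_t RemainChannels(int64_t channels) {
  // e.g. channels = 10 -> 3 blocks of four texel lanes, 2 of which are padding.
  return RoundUpDiv4(channels) * 4 - channels;
}

Passing remain_channels lets the kernel exclude the padded lanes when it takes the max and normalizes over the channel axis.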
+ +#include "mace/ops/opencl/image/softmax.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SoftmaxKernel::Compute( + OpContext *context, + const Tensor *logits, + Tensor *output) { + index_t batch = 0; + index_t height = 0; + index_t width = 0; + index_t channels = 0; + + if (logits->dim_size() == 2) { + batch = logits->dim(0); + height = 1; + width = 1; + channels = logits->dim(1); + + } else if (logits->dim_size() == 4) { + batch = logits->dim(0); + height = logits->dim(1); + width = logits->dim(2); + channels = logits->dim(3); + } else { + MACE_NOT_IMPLEMENTED; + } + + const index_t channel_blocks = RoundUpDiv4(channels); + const int remain_channels = channel_blocks * 4 - channels; + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); + built_options.emplace("-Dsoftmax=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); + if (use_log_) + built_options.emplace("-DUSE_LOG"); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, + built_options, &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, logits->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(logits->opencl_image())); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, remain_channels); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = logits->shape(); + } + + std::vector lws = softmax::LocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat("softmax_opencl_kernel", batch, height, width, channels); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h index 3aa84bb5091066bff8565d3428fca7ebe4badafd..505dff57c9a7caf718a4a7f98ab3d6ffe58a5565 100644 --- a/mace/ops/opencl/image/softmax.h +++ b/mace/ops/opencl/image/softmax.h @@ -56,7 +56,6 @@ inline std::vector LocalWS(OpenCLRuntime *runtime, } } // namespace softmax -template class SoftmaxKernel : public OpenCLSoftmaxKernel { public: explicit SoftmaxKernel(bool use_log) @@ -74,81 +73,6 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { std::vector input_shape_; }; -template -MaceStatus SoftmaxKernel::Compute( - OpContext *context, - const Tensor *logits, - Tensor *output) { - index_t batch = 0; - index_t height = 0; - index_t width = 0; - index_t channels = 0; - - if (logits->dim_size() == 2) { - batch = logits->dim(0); - height = 1; - width = 1; - channels = logits->dim(1); - - } else if (logits->dim_size() == 4) { - batch = logits->dim(0); - height = logits->dim(1); - width = logits->dim(2); - channels = logits->dim(3); - } else { - MACE_NOT_IMPLEMENTED; - } - - const index_t channel_blocks = RoundUpDiv4(channels); - const int remain_channels = channel_blocks * 
4 - channels; - - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); - built_options.emplace("-Dsoftmax=" + kernel_name); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (use_log_) - built_options.emplace("-DUSE_LOG"); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, - built_options, &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, logits->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(logits->opencl_image())); - kernel_.setArg(idx++, static_cast(channels)); - kernel_.setArg(idx++, remain_channels); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = logits->shape(); - } - - std::vector lws = softmax::LocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat("softmax_opencl_kernel", batch, height, width, channels); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/space_to_batch.cc b/mace/ops/opencl/image/space_to_batch.cc new file mode 100644 index 0000000000000000000000000000000000000000..771d8e32ec7fd2ac9e887ad59b94c04aefde8b8f --- /dev/null +++ b/mace/ops/opencl/image/space_to_batch.cc @@ -0,0 +1,98 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
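The space_to_batch kernel added below receives its output_shape from the op and only sets up the OpenCL launch; the shape relation it relies on is the usual one for this transform. A rough standalone sketch, assuming NHWC layout and a paddings vector ordered {top, bottom, left, right} (the function name below is hypothetical, not MACE's API):

#include <cstdint>
#include <vector>

inline std::vector<int64_t> SpaceToBatchShape(const std::vector<int64_t> &in,  // {N, H, W, C}
                                              const std::vector<int> &block,   // {block_h, block_w}
                                              const std::vector<int> &pad) {   // {top, bottom, left, right}
  const int64_t padded_h = in[1] + pad[0] + pad[1];
  const int64_t padded_w = in[2] + pad[2] + pad[3];
  // Every block_h x block_w tile of the padded plane becomes its own batch entry.
  return {in[0] * block[0] * block[1],
          padded_h / block[0],
          padded_w / block[1],
          in[3]};
}

The kernel itself only needs block_shape, the top/left padding (paddings[0] and paddings[2]) and the input/output spatial extents, which matches the argument list set in the code below.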
+ +#include "mace/ops/opencl/image/space_to_batch.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SpaceToBatchKernel::Compute( + OpContext *context, + const Tensor *space_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *batch_tensor) { + std::vector output_image_shape; + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); + MACE_RETURN_IF_ERROR( + batch_tensor->ResizeImage(output_shape, output_image_shape)); + const char *kernel_name = "space_to_batch"; + const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); + const uint32_t gws[3] = { + chan_blk, static_cast(batch_tensor->dim(2)), + static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + built_options.emplace(kernel_name_ss.str()); + auto input_dt = space_tensor->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", + obfuscated_kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, space_tensor->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + + kernel_.setArg(idx++, *(space_tensor->opencl_image())); + kernel_.setArg(idx++, *(batch_tensor->opencl_image())); + kernel_.setArg(idx++, block_shape[0]); + kernel_.setArg(idx++, block_shape[1]); + kernel_.setArg(idx++, paddings[0]); + kernel_.setArg(idx++, paddings[2]); + kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); + kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); + + input_shape_ = space_tensor->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = + Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h index 28f00df5fc7e549f6e58dd327de9544b68598fb1..6ad5d22833e2ff2104c974bd77f6da5c76af1ad3 100644 --- a/mace/ops/opencl/image/space_to_batch.h +++ b/mace/ops/opencl/image/space_to_batch.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { public: MaceStatus Compute( @@ -47,79 +46,6 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { std::vector input_shape_; }; -template -MaceStatus SpaceToBatchKernel::Compute( - 
OpContext *context, - const Tensor *space_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *batch_tensor) { - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR( - batch_tensor->ResizeImage(output_shape, output_image_shape)); - const char *kernel_name = "space_to_batch"; - const uint32_t chan_blk = RoundUpDiv4(batch_tensor->dim(3)); - const uint32_t gws[3] = { - chan_blk, static_cast(batch_tensor->dim(2)), - static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - built_options.emplace(kernel_name_ss.str()); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, space_tensor->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - - kernel_.setArg(idx++, *(space_tensor->opencl_image())); - kernel_.setArg(idx++, *(batch_tensor->opencl_image())); - kernel_.setArg(idx++, block_shape[0]); - kernel_.setArg(idx++, block_shape[1]); - kernel_.setArg(idx++, paddings[0]); - kernel_.setArg(idx++, paddings[2]); - kernel_.setArg(idx++, static_cast(space_tensor->dim(0))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(space_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(batch_tensor->dim(2))); - - input_shape_ = space_tensor->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = - Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), - batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/space_to_depth.cc b/mace/ops/opencl/image/space_to_depth.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b48769b7afecee3dcf73f888653fd1cede42cc4 --- /dev/null +++ b/mace/ops/opencl/image/space_to_depth.cc @@ -0,0 +1,111 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/space_to_depth.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SpaceToDepthKernel::Compute( + OpContext *context, + const Tensor *input, + Tensor *output) { + const index_t batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_depth = input->dim(3); + + MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0, + "input channel should be dividable by 4"); + MACE_CHECK( + (input_width % block_size_ == 0) && (input_height % block_size_ == 0), + "input width and height should be dividable by block_size"); + + const index_t output_height = input_height / block_size_; + const index_t output_width = input_width / block_size_; + const index_t output_depth = input_depth * block_size_ * block_size_; + + const index_t output_depth_blocks = RoundUpDiv4(output_depth); + + std::vector output_shape = {batch, output_height, output_width, + output_depth}; + + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + const char *kernel_name = "space_to_depth"; + std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); + std::stringstream kernel_name_ss; + kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; + if (input_depth < 4) { + built_options.emplace(MakeString("-DDEPTH", input_depth)); + } + built_options.emplace(kernel_name_ss.str()); + auto input_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth", + obfuscated_kernel_name, + built_options, + &kernel_)); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + + const uint32_t gws[3] = {static_cast(output_depth_blocks), + static_cast(output_width), + static_cast(output_height * batch)}; + MACE_OUT_OF_RANGE_INIT(kernel_); + if (!IsVecEqual(input_shape_, input->shape())) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(input_height)); + kernel_.setArg(idx++, static_cast(input_width)); + kernel_.setArg(idx++, static_cast(input_depth)); + kernel_.setArg(idx++, static_cast(block_size_)); + kernel_.setArg(idx++, static_cast(output_height)); + kernel_.setArg(idx++, static_cast(output_width)); + kernel_.setArg(idx++, *(output->opencl_image())); + + input_shape_ = input->shape(); + } + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + std::string tuning_key = Concat("space_to_depth", input->dim(0), + input->dim(1), input->dim(2), input->dim(3)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, context->future())); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/space_to_depth.h 
b/mace/ops/opencl/image/space_to_depth.h index e58b7b8d0660cc6c91d965557a17cb1c206f072e..324977ea45c518a4a7a46520f0b5626c82716ea2 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { public: explicit SpaceToDepthKernel(const int block_size) @@ -47,93 +46,6 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { std::vector input_shape_; }; -template -MaceStatus SpaceToDepthKernel::Compute( - OpContext *context, - const Tensor *input, - Tensor *output) { - const index_t batch = input->dim(0); - const index_t input_height = input->dim(1); - const index_t input_width = input->dim(2); - const index_t input_depth = input->dim(3); - - MACE_CHECK(input_depth < 4 || (input_depth % 4) == 0, - "input channel should be dividable by 4"); - MACE_CHECK( - (input_width % block_size_ == 0) && (input_height % block_size_ == 0), - "input width and height should be dividable by block_size"); - - const index_t output_height = input_height / block_size_; - const index_t output_width = input_width / block_size_; - const index_t output_depth = input_depth * block_size_ * block_size_; - - const index_t output_depth_blocks = RoundUpDiv4(output_depth); - - std::vector output_shape = {batch, output_height, output_width, - output_depth}; - - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - const char *kernel_name = "space_to_depth"; - std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); - std::stringstream kernel_name_ss; - kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; - if (input_depth < 4) { - built_options.emplace(MakeString("-DDEPTH", input_depth)); - } - built_options.emplace(kernel_name_ss.str()); - auto dt = DataTypeToEnum::value; - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth", - obfuscated_kernel_name, - built_options, - &kernel_)); - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - const uint32_t gws[3] = {static_cast(output_depth_blocks), - static_cast(output_width), - static_cast(output_height * batch)}; - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(input_height)); - kernel_.setArg(idx++, static_cast(input_width)); - kernel_.setArg(idx++, static_cast(input_depth)); - kernel_.setArg(idx++, static_cast(block_size_)); - kernel_.setArg(idx++, static_cast(output_height)); - kernel_.setArg(idx++, static_cast(output_width)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input->shape(); - } - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - std::string tuning_key = Concat("space_to_depth", input->dim(0), - input->dim(1), input->dim(2), input->dim(3)); - 
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/split.cc b/mace/ops/opencl/image/split.cc new file mode 100644 index 0000000000000000000000000000000000000000..1df73c47e7339ba6e2a174d5271b03ca3f07056b --- /dev/null +++ b/mace/ops/opencl/image/split.cc @@ -0,0 +1,123 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/opencl/image/split.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +MaceStatus SplitKernel::Compute( + OpContext *context, + const Tensor *input, + const std::vector &output_list) { + MACE_UNUSED(axis_); + const index_t input_channels = input->dim(3); + const size_t outputs_count = output_list.size(); + const index_t output_channels = input_channels / outputs_count; + std::vector output_shape( + {input->dim(0), input->dim(1), input->dim(2), output_channels}); + + std::vector image_shape; + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + for (size_t i = 0; i < outputs_count; ++i) { + MACE_RETURN_IF_ERROR( + output_list[i]->ResizeImage(output_shape, image_shape)); + } + + auto runtime = context->device()->gpu_runtime()->opencl_runtime(); + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel_.get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); + built_options.emplace("-Dsplit=" + kernel_name); + auto input_dt = input->dtype(); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input_dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(input_dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("split", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); + } + const index_t channel_blk = RoundUpDiv4(output_channels); + + const uint32_t gws[3] = { + static_cast(channel_blk), static_cast(input->dim(2)), + static_cast(input->dim(0) * input->dim(1)), + }; + MACE_OUT_OF_RANGE_INIT(kernel_); + + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + cl::Event event; + CallStats call_stats{INT64_MAX, 0}; + for (size_t i = 0; i < outputs_count; ++i) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(kernel_); + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_image())); + kernel_.setArg(idx++, static_cast(channel_blk * i)); + kernel_.setArg(idx++, *(output_list[i]->opencl_image())); + + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for 
(size_t j = 0; j < 3; ++j) { + roundup_gws[j] = RoundUp(gws[j], lws[j]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + if (context->future() != nullptr && runtime->is_profiling_enabled()) { + event.wait(); + CallStats tmp_stats; + runtime->GetCallStats(event, &tmp_stats); + call_stats.start_micros = + std::min(tmp_stats.start_micros, call_stats.start_micros); + call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; + } + } + if (context->future() != nullptr) { + context->future()->wait_fn = [call_stats](CallStats *stats) { + if (stats != nullptr) { + stats->start_micros = call_stats.start_micros; + stats->end_micros = stats->start_micros + call_stats.end_micros; + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 12755910a75cd812725b02dd76d35c052a6f6826..956ff6573a60ed2050d5b526f58734cdc8fdff43 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -31,7 +31,6 @@ namespace ops { namespace opencl { namespace image { -template class SplitKernel : public OpenCLSplitKernel { public: explicit SplitKernel(const int32_t axis) : axis_(axis) {} @@ -46,104 +45,6 @@ class SplitKernel : public OpenCLSplitKernel { uint32_t kwg_size_; }; -template -MaceStatus SplitKernel::Compute( - OpContext *context, - const Tensor *input, - const std::vector &output_list) { - const index_t input_channels = input->dim(3); - const size_t outputs_count = output_list.size(); - const index_t output_channels = input_channels / outputs_count; - std::vector output_shape( - {input->dim(0), input->dim(1), input->dim(2), output_channels}); - - std::vector image_shape; - OpenCLUtil::CalImage2DShape(output_shape, - OpenCLBufferType::IN_OUT_CHANNEL, - &image_shape); - for (size_t i = 0; i < outputs_count; ++i) { - MACE_RETURN_IF_ERROR( - output_list[i]->ResizeImage(output_shape, image_shape)); - } - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); - built_options.emplace("-Dsplit=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("split", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - const index_t channel_blk = RoundUpDiv4(output_channels); - - const uint32_t gws[3] = { - static_cast(channel_blk), static_cast(input->dim(2)), - static_cast(input->dim(0) * input->dim(1)), - }; - MACE_OUT_OF_RANGE_INIT(kernel_); - - const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); - cl::Event event; - CallStats call_stats{INT64_MAX, 0}; - for (size_t i = 0; i < outputs_count; ++i) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input->opencl_image())); - kernel_.setArg(idx++, static_cast(channel_blk * i)); - kernel_.setArg(idx++, 
*(output_list[i]->opencl_image())); - - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t j = 0; j < 3; ++j) { - roundup_gws[j] = RoundUp(gws[j], lws[j]); - } - - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - if (context->future() != nullptr && runtime->is_profiling_enabled()) { - event.wait(); - CallStats tmp_stats; - runtime->GetCallStats(event, &tmp_stats); - call_stats.start_micros = - std::min(tmp_stats.start_micros, call_stats.start_micros); - call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; - } - } - if (context->future() != nullptr) { - context->future()->wait_fn = [call_stats](CallStats *stats) { - if (stats != nullptr) { - stats->start_micros = call_stats.start_micros; - stats->end_micros = stats->start_micros + call_stats.end_micros; - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/sqrdiff_mean.cc b/mace/ops/opencl/image/sqrdiff_mean.cc new file mode 100644 index 0000000000000000000000000000000000000000..442a319159f40349c84b6807ad25da529527ca78 --- /dev/null +++ b/mace/ops/opencl/image/sqrdiff_mean.cc @@ -0,0 +1,140 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "mace/ops/opencl/image/sqrdiff_mean.h"
+
+namespace mace {
+namespace ops {
+namespace opencl {
+namespace image {
+
+MaceStatus SqrDiffMeanKernel::Compute(
+    OpContext *context,
+    const Tensor *input0,
+    const Tensor *input1,
+    Tensor *output) {
+  MACE_CHECK_NOTNULL(input0);
+  MACE_CHECK_NOTNULL(input1);
+  MACE_CHECK(input0->dim(0) == input1->dim(0) &&
+             input0->dim(3) == input1->dim(3));
+  MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4,
+             "SqrDiffMean gpu only support 4-dim input");
+  index_t batch = input0->dim(0);
+  const index_t in_height = input0->dim(1);
+  const index_t in_width = input0->dim(2);
+  const index_t channels = input0->dim(3);
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
+
+  std::vector<uint32_t> gws(3);
+  std::vector<uint32_t> lws(3);
+  std::vector<index_t> output_shape{batch, 1, 1, channels};
+  std::vector<size_t> output_image_shape;
+  OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL,
+                              &output_image_shape);
+  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
+
+  auto runtime = context->device()->gpu_runtime()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean");
+    built_options.emplace("-Dsqrdiff_mean=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
+    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
+      built_options.emplace("-DNON_QUALCOMM_ADRENO");
+    }
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+
+  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
+    const uint32_t wave_size =
+        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
+    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
+  } else {
+    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
+  }
+  lws = {gws[0], gws[1], 1};
+  const int group_size = lws[0] * lws[1] * lws[2];
+  const int partial_len = (image_size + group_size - 1) / group_size;
+  const int remain_index = image_size % group_size;
+  const float img_size_reciprocal = 1.f / (in_width * in_height);
+
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, input0->shape())) {
+    uint32_t idx = 0;
+    MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(input0->opencl_image()));
+    kernel_.setArg(idx++, *(input1->opencl_image()));
+    kernel_.setArg(idx++, (group_size * 4 * sizeof(float)),
+                   nullptr);
+    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
+    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
+    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch));
+    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
+    kernel_.setArg(idx++, img_size_reciprocal);
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
+    kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input0->shape();
+  }
+
+  cl::Event event;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
else { + std::vector roundup_gws(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundup_gws[i] = RoundUp(gws[i], lws[i]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CL_RET_STATUS(error); + MACE_OUT_OF_RANGE_VALIDATION; + + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index bde87a80896e0f76d4dbdf82acb130c8d9e80460..bd2d1e7f394e7ed98eb8bd4e948da32e615349be 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -30,7 +30,6 @@ namespace ops { namespace opencl { namespace image { -template class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { public: MaceStatus Compute( @@ -45,123 +44,6 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { std::vector input_shape_; }; -template -MaceStatus SqrDiffMeanKernel::Compute( - OpContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output) { - MACE_CHECK_NOTNULL(input0); - MACE_CHECK_NOTNULL(input1); - MACE_CHECK(input0->dim(0) == input1->dim(0) && - input0->dim(3) == input1->dim(3)); - MACE_CHECK(input0->dim_size() == 4 && input1->dim_size() == 4, - "SqrDiffMean gpu only support 4-dim input"); - index_t batch = input0->dim(0); - const index_t in_height = input0->dim(1); - const index_t in_width = input0->dim(2); - const index_t channels = input0->dim(3); - const index_t channel_blocks = RoundUpDiv4(channels); - const uint32_t image_size = static_cast(in_height * in_width); - - std::vector gws(3); - std::vector lws(3); - std::vector output_shape{batch, 1, 1, channels}; - std::vector output_image_shape; - OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, - &output_image_shape); - MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - - auto runtime = context->device()->gpu_runtime()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - const DataType dt = DataTypeToEnum::value; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("sqrdiff_mean"); - built_options.emplace("-Dsqrdiff_mean=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) { - built_options.emplace("-DNON_QUALCOMM_ADRENO"); - } - MACE_RETURN_IF_ERROR(runtime->BuildKernel("sqrdiff_mean", - kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { - const uint32_t wave_size = - static_cast(runtime->GetKernelWaveSize(kernel_)); - gws = {4, (wave_size / 4), static_cast(batch * channel_blocks)}; - } else { - gws = {4, 16, static_cast(batch * channel_blocks)}; - } - lws = {gws[0], gws[1], 1}; - const int group_size = lws[0] * lws[1] * lws[2]; - const int partial_len = (image_size + group_size - 1) / group_size; - const int 
remain_index = image_size % group_size; - const float img_size_reciprocal = 1.f / (in_width * in_height); - - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input0->shape())) { - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_3D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input0->opencl_image())); - kernel_.setArg(idx++, *(input1->opencl_image())); - kernel_.setArg(idx++, (group_size * 4 * sizeof(float)), - nullptr); - kernel_.setArg(idx++, static_cast(group_size)); - kernel_.setArg(idx++, static_cast(partial_len)); - kernel_.setArg(idx++, static_cast(remain_index)); - kernel_.setArg(idx++, static_cast(batch)); - kernel_.setArg(idx++, static_cast(in_height)); - kernel_.setArg(idx++, static_cast(in_width)); - kernel_.setArg(idx++, img_size_reciprocal); - kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, *(output->opencl_image())); - - input_shape_ = input0->shape(); - } - - cl::Event event; - cl_int error; - if (runtime->IsNonUniformWorkgroupsSupported()) { - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } else { - std::vector roundup_gws(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundup_gws[i] = RoundUp(gws[i], lws[i]); - } - error = runtime->command_queue().enqueueNDRangeKernel( - kernel_, cl::NullRange, - cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); - } - MACE_CL_RET_STATUS(error); - MACE_OUT_OF_RANGE_VALIDATION; - - if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } - - return MaceStatus::MACE_SUCCESS; -} - } // namespace image } // namespace opencl } // namespace ops diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 40b83fa62e757b1f13a1e06c6f91b6db1e29ab1b..1ea2634a022e7614bcc600e3e34827e7a4aa8338 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -29,7 +29,6 @@ namespace { MaceStatus WinogradInputTransform(OpContext *context, cl::Kernel *kernel, const Tensor *input_tensor, - const DataType dt, const int *paddings, const index_t round_h, const index_t round_w, @@ -62,8 +61,8 @@ MaceStatus WinogradInputTransform(OpContext *context, MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); return MaceStatus::MACE_SUCCESS; } - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, built_options, @@ -93,7 +92,6 @@ MaceStatus WinogradInputTransform(OpContext *context, kernel->setArg(idx++, static_cast(paddings[1] / 2)); } - const std::vector lws = {*kwg_size / 8, 8, 0}; std::string tuning_key = Concat("winograd_transform_kernel", output_tensor->dim(0), @@ -110,7 +108,6 @@ MaceStatus WinogradOutputTransform(OpContext *context, cl::Kernel *kernel, const Tensor *input_tensor, const Tensor *bias, - const DataType dt, const index_t round_h, const index_t round_w, const int wino_blk_size, @@ -145,32 +142,40 @@ MaceStatus WinogradOutputTransform(OpContext 
*context, return MaceStatus::MACE_SUCCESS; } - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); switch (activation) { - case NOOP: + case NOOP: { break; - case RELU: + } + case RELU: { built_options.emplace("-DUSE_RELU"); break; - case RELUX: + } + case RELUX: { built_options.emplace("-DUSE_RELUX"); break; - case PRELU: + } + case PRELU: { built_options.emplace("-DUSE_PRELU"); break; - case TANH: + } + case TANH: { built_options.emplace("-DUSE_TANH"); break; - case SIGMOID: + } + case SIGMOID: { built_options.emplace("-DUSE_SIGMOID"); break; - case LEAKYRELU: + } + case LEAKYRELU: { built_options.emplace("-DUSE_LEAKYRELU"); break; - default: + } + default: { LOG(FATAL) << "Unknown activation type: " << activation; + } } MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", @@ -229,7 +234,6 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, const ActivationType activation, const float relux_max_limit, const float leakyrelu_coefficient, - const DataType dt, const int wino_blk_size, std::vector *prev_input_shape, Tensor *output, @@ -265,13 +269,14 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, OpenCLBufferType::IN_OUT_HEIGHT, &t_input_image_shape); ScratchImage transformed_input_image(scratch_manager); - std::unique_ptr transformed_input = make_unique( - transformed_input_image.Scratch(context->device()->allocator(), - t_input_image_shape, dt), dt); + auto input_dt = input->dtype(); + auto image = transformed_input_image.Scratch(context->device()->allocator(), + t_input_image_shape, input_dt); + auto transformed_input = make_unique(image, input_dt); MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, t_input_image_shape)); MACE_RETURN_IF_ERROR(WinogradInputTransform( - context, kernels[0], input, dt, paddings, + context, kernels[0], input, paddings, round_h, round_w, wino_blk_size, input_changed, transformed_input.get(), kwg_size[0], &t_input_future)); @@ -290,9 +295,10 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, &mm_output_image_shape); ScratchImage mm_output_image(scratch_manager); + auto output_dt = input->dtype(); std::unique_ptr mm_output = make_unique( mm_output_image.Scratch(context->device()->allocator(), - mm_output_image_shape, dt), dt); + mm_output_image_shape, output_dt), output_dt); MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, mm_output_image_shape)); @@ -311,8 +317,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, MACE_NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); - built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT)); MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, built_options, kernels[1])); @@ -334,7 +340,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, const std::vector lws = {*kwg_size[1] / 64, 64, 0}; std::string tuning_key = Concat("matmul_opencl_kernel", mm_output_shape[0], - mm_output_shape[1], mm_output_shape[2]); + mm_output_shape[1], mm_output_shape[2]); 
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernels[1], tuning_key, gws, lws, &mm_future)); @@ -344,7 +350,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, // t_output (blk_sqr, out_chan, out_width) -> output(NHWC) MACE_RETURN_IF_ERROR(WinogradOutputTransform( context, kernels[2], mm_output.get(), bias, - dt, round_h, round_w, wino_blk_size, activation, relux_max_limit, + round_h, round_w, wino_blk_size, activation, relux_max_limit, leakyrelu_coefficient, input_changed, output, kwg_size[2], &t_output_future)) diff --git a/mace/ops/opencl/lstm_cell.cc b/mace/ops/opencl/lstm_cell.cc index 563a53bcbd8ebedf5d694ecfd5d9a4252fd735ad..ce45c84401f89d42762c8a2c2bccbb57c35c08e1 100644 --- a/mace/ops/opencl/lstm_cell.cc +++ b/mace/ops/opencl/lstm_cell.cc @@ -25,21 +25,20 @@ namespace mace { namespace ops { -template +template class LSTMCellOp; #ifdef MACE_ENABLE_OPENCL -template -class LSTMCellOp : public Operation { +template<> +class LSTMCellOp : public Operation { public: explicit LSTMCellOp(OpConstructContext *context) : Operation(context) { - T forget_bias = static_cast( - Operation::GetOptionalArg("scalar_input", - 0.0)); + float forget_bias = Operation::GetOptionalArg("scalar_input", + 0.0); MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(forget_bias); + kernel_ = make_unique(forget_bias); } else { MACE_NOT_IMPLEMENTED; } @@ -47,30 +46,26 @@ class LSTMCellOp : public Operation { const Tensor *pre_output = context->workspace()->GetTensor( operator_def_->input(1)); if (pre_output->is_weight()) { - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 1, - OpenCLBufferType::IN_OUT_CHANNEL, - mem_type) == MaceStatus::MACE_SUCCESS); + auto status = TransformFilter(context, operator_def_.get(), + 1, OpenCLBufferType::IN_OUT_CHANNEL, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); } - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 2, - OpenCLBufferType::IN_OUT_CHANNEL, - mem_type) == MaceStatus::MACE_SUCCESS); - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 3, - OpenCLBufferType::ARGUMENT, - mem_type) == MaceStatus::MACE_SUCCESS); - const Tensor *pre_cell = context->workspace()->GetTensor( - operator_def_->input(4)); + auto status = TransformFilter(context, operator_def_.get(), + 2, OpenCLBufferType::IN_OUT_CHANNEL, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + status = TransformFilter(context, operator_def_.get(), + 3, OpenCLBufferType::ARGUMENT, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + const Tensor *pre_cell = + context->workspace()->GetTensor(operator_def_->input(4)); if (pre_cell->is_weight()) { - MACE_CHECK(TransformFilter(context, - operator_def_.get(), - 4, - OpenCLBufferType::IN_OUT_CHANNEL, - mem_type) == MaceStatus::MACE_SUCCESS); + status = TransformFilter(context, operator_def_.get(), + 4, OpenCLBufferType::IN_OUT_CHANNEL, + mem_type); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); } } @@ -92,14 +87,10 @@ class LSTMCellOp : public Operation { MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); }; -#endif +#endif // MACE_ENABLE_OPENCL void RegisterLSTMCell(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, - DeviceType::GPU, half); + MACE_REGISTER_GPU_OP(op_registry, "LSTMCell", LSTMCellOp); } } // namespace ops diff --git 
a/mace/ops/opencl/pooling.h b/mace/ops/opencl/pooling.h index 78628593f98209b7ab2ec3898e24bf370f573268..9d652cdcf05e76da2db2bb5ade66523b7d9e1ab1 100644 --- a/mace/ops/opencl/pooling.h +++ b/mace/ops/opencl/pooling.h @@ -17,7 +17,7 @@ #include -#include "mace/ops/pooling.h" +#include "mace/ops/common/pooling_type.h" #include "mace/ops/common/conv_pool_2d_util.h" namespace mace { diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 24130d7ae381222fb6219b4d335afc4a9e0c5723..49784c10db2c999b07faffe927aa6d6ebb061746 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -16,7 +16,7 @@ #include #include "mace/core/operator.h" -#include "mace/ops/pad.h" +#include "mace/ops/common/pad_type.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pad.h" #endif // MACE_ENABLE_OPENCL @@ -26,10 +26,10 @@ namespace mace { namespace ops { -template +template class PadOp; -template +template class PadOp : public Operation { public: explicit PadOp(OpConstructContext *context) @@ -88,12 +88,12 @@ class PadOp : public Operation { for (index_t c = 0; c < channel; ++c) { for (index_t h = 0; h < height; ++h) { const index_t in_offset = (((b * channel + c) * height) + - h) * width; + h) * width; const index_t out_offset = - (((b + this->paddings_[0]) * output->dim(1) - + (c + this->paddings_[2])) * output->dim(2) - + (h + this->paddings_[4])) * output->dim(3) - + this->paddings_[6]; + (((b + this->paddings_[0]) * output->dim(1) + + (c + this->paddings_[2])) * output->dim(2) + + (h + this->paddings_[4])) * output->dim(3) + + this->paddings_[6]; memcpy(output_ptr + out_offset, input_ptr + in_offset, width * sizeof(T)); @@ -101,11 +101,11 @@ class PadOp : public Operation { } } } else if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) { - const index_t o_batch = output->dim(0); + const index_t o_batch = output->dim(0); const index_t o_channel = output->dim(1); - const index_t o_height = output->dim(2); - const index_t o_width = output->dim(3); - const int l_add = type_ == PadType::REFLECT ? 0 : -1; + const index_t o_height = output->dim(2); + const index_t o_width = output->dim(3); + const int l_add = type_ == PadType::REFLECT ? 0 : -1; const int r_add = type_ == PadType::REFLECT ? 
-2 : -1; for (index_t h = 0; h < o_height; ++h) { @@ -116,10 +116,10 @@ class PadOp : public Operation { for (index_t c = 0; c < o_channel; ++c) { index_t c_in = get_src_idx(c, channel, paddings_[2], l_add, r_add); - const index_t in_offset = (((b_in * channel + c_in) * height) + - h_in) * width; - index_t out_offset = (((b * o_channel + c) * o_height) + - h) * o_width; + const index_t in_offset = + (((b_in * channel + c_in) * height) + h_in) * width; + index_t out_offset = + (((b * o_channel + c) * o_height) + h) * o_width; for (index_t i = 0, j = paddings_[6] + l_add; i < paddings_[6]; ++i, --j) { @@ -169,8 +169,8 @@ class PadOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class PadOp : public Operation { +template<> +class PadOp : public Operation { public: explicit PadOp(OpConstructContext *context) : Operation(context) { @@ -180,7 +180,7 @@ class PadOp : public Operation { float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( type, paddings, constant_value); } else { MACE_NOT_IMPLEMENTED; @@ -198,18 +198,11 @@ class PadOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterPad(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Pad", PadOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Pad", PadOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Pad", PadOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Pad", PadOp); } } // namespace ops diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index ef72ca9335657e09a50cff1ae523b2d4708f647f..4d4247f2b7236a0a3270c7d30a413c2885ca8256 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -16,8 +16,6 @@ #include #endif -#include "mace/ops/pooling.h" - #include #include #include @@ -28,6 +26,7 @@ #include "mace/core/tensor.h" #include "mace/ops/conv_pool_2d_base.h" #include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/common/pooling_type.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/pooling.h" #include "mace/ops/opencl/buffer/pooling.h" @@ -486,15 +485,15 @@ class PoolingOp : public PoolingOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class PoolingOp : public PoolingOpBase { +template<> +class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) : PoolingOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { - kernel_ = make_unique>(); + kernel_ = make_unique(); } } MaceStatus Run(OpContext *context) override { @@ -520,13 +519,7 @@ void RegisterPooling(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Pooling", PoolingOp); } } // namespace ops diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 27b34a91a32c214f22074e2f8605fdb29dd0d6f7..28083312872d269d49b9b509525aa5ee6021b6b0 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/reduce.h" - #include #include #include #include +#include "mace/ops/common/reduce_type.h" #include "mace/core/future.h" #include "mace/core/operator.h" #include "mace/core/runtime/cpu/cpu_runtime.h" @@ -868,15 +867,14 @@ void ReduceOp::Reduce4Dims( #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class ReduceOp : public ReduceOpBase { +template<> +class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(reduce_type_, - axis_, - keep_dims_); + kernel_ = make_unique(reduce_type_, + axis_); } else { MACE_NOT_IMPLEMENTED; } @@ -901,13 +899,7 @@ void RegisterReduce(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Reduce", ReduceOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Reduce") @@ -915,26 +907,26 @@ void RegisterReduce(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } bool keep_dims = ProtoArgHelper::GetOptionalArg( *op, "keepdims", false); if (!keep_dims) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } auto axis = ProtoArgHelper::GetRepeatedArgs( *op, "axis"); if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } auto tensor_shape_info = context->tensor_shape_info(); if (tensor_shape_info->count(op->input(0)) == 0 || tensor_shape_info->at(op->input(0)).size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 349f6423470b4db78df0f65e24b1dc1ae00bef58..5e48ad392e9c46269187b632f5d19c1c058ef081 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/resize_bicubic.h" - #include #include #include #include #include "mace/core/operator.h" +#include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bicubic.h" #endif // MACE_ENABLE_OPENCL @@ -33,12 +32,12 @@ inline const std::shared_ptr InitCoeffsTable() { // convolution algorithm. 
// https://en.wikipedia.org/wiki/Bicubic_interpolation auto coeffs_tab = std::shared_ptr( - new float[(resize_bicubic::kTableSize + 1) * 2], + new float[(common::utils::kTableSize + 1) * 2], std::default_delete()); float *coeffs_tab_ptr = coeffs_tab.get(); static const float A = -0.75f; - for (int i = 0; i <= resize_bicubic::kTableSize; ++i) { - float x = i * 1.0f / resize_bicubic::kTableSize; + for (int i = 0; i <= common::utils::kTableSize; ++i) { + float x = i * 1.0f / common::utils::kTableSize; coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; x += 1.0; coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; @@ -61,12 +60,12 @@ inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, std::vector *indices) { auto in_loc = static_cast(scale * out_loc); const float delta = scale * out_loc - in_loc; - const int64_t offset = lrintf(delta * resize_bicubic::kTableSize); + const int64_t offset = lrintf(delta * common::utils::kTableSize); const float *coeffs_tab = GetCoeffsTable(); *weights = {coeffs_tab[offset * 2 + 1], coeffs_tab[offset * 2], - coeffs_tab[(resize_bicubic::kTableSize - offset) * 2], - coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]}; + coeffs_tab[(common::utils::kTableSize - offset) * 2], + coeffs_tab[(common::utils::kTableSize - offset) * 2 + 1]}; *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; } @@ -173,13 +172,13 @@ class ResizeBicubicOp : public Operation { } float height_scale = - resize_bicubic::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_bicubic::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); ResizeImage(context, input_data, @@ -202,8 +201,8 @@ class ResizeBicubicOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class ResizeBicubicOp : public Operation { +template<> +class ResizeBicubicOp : public Operation { public: explicit ResizeBicubicOp(OpConstructContext *context) : Operation(context) { @@ -213,7 +212,7 @@ class ResizeBicubicOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; @@ -237,13 +236,7 @@ void RegisterResizeBicubic(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ResizeBicubic", ResizeBicubicOp); } } // namespace ops diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 09df62d880cad6a1f9ece73e5312a2b56df46340..e209864f15f1d18da6e6f96353f68e257252812e 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/resize_bilinear.h" - #include #include #include @@ -21,6 +19,7 @@ #include "mace/core/operator.h" #include "mace/utils/memory.h" #include "mace/core/quantize.h" +#include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_bilinear.h" #endif // MACE_ENABLE_OPENCL @@ -223,13 +222,13 @@ class ResizeBilinearOp : public Operation { } float height_scale = - resize_bilinear::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_bilinear::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); std::vector ys(out_height + 1); std::vector xs(out_width + 1); @@ -299,13 +298,13 @@ class ResizeBilinearOp : public Operation { } float height_scale = - resize_bilinear::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_bilinear::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); std::vector ys(out_height + 1); std::vector xs(out_width + 1); @@ -336,8 +335,8 @@ class ResizeBilinearOp : public Operation { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class ResizeBilinearOp : public Operation { +template<> +class ResizeBilinearOp : public Operation { public: explicit ResizeBilinearOp(OpConstructContext *context) : Operation(context) { @@ -347,7 +346,7 @@ class ResizeBilinearOp : public Operation { "size", {-1, -1}); MACE_CHECK(size.size() == 2); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( align_corners, size[0], size[1]); } else { MACE_NOT_IMPLEMENTED; @@ -376,13 +375,7 @@ void RegisterResizeBilinear(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ResizeBilinear", ResizeBilinearOp); } } // namespace ops diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 9e98e75e16313fc7d3093260feaa0207d40bcbd0..89ed473c44e43c5dd4c6415fe2badfd9f738c844 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/ops/resize_nearest_neighbor.h" - #include #include #include #include "mace/core/operator.h" +#include "mace/ops/common/utils.h" #ifdef MACE_ENABLE_OPENCL #include "mace/ops/opencl/image/resize_nearest_neighbor.h" #endif // MACE_ENABLE_OPENCL @@ -115,13 +114,13 @@ class ResizeNearestNeighborOp : public Operation { } float height_scale = - resize_nearest_neighbor::CalculateResizeScale(in_height, - out_height, - align_corners_); + common::utils::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - resize_nearest_neighbor::CalculateResizeScale(in_width, - out_width, - align_corners_); + common::utils::CalculateResizeScale(in_width, + out_width, + align_corners_); ResizeImageNCHW(context, input_data, batch, @@ -142,15 +141,15 @@ class ResizeNearestNeighborOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class ResizeNearestNeighborOp : public Operation { +template<> +class ResizeNearestNeighborOp : public Operation { public: explicit ResizeNearestNeighborOp(OpConstructContext *context) : Operation(context) { bool align_corners = Operation::GetOptionalArg( "align_corners", false); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>( + kernel_ = make_unique( align_corners); } else { MACE_NOT_IMPLEMENTED; @@ -176,13 +175,8 @@ void RegisterResizeNearestNeighbor(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", ResizeNearestNeighborOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", - ResizeNearestNeighborOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "ResizeNearestNeighbor", - ResizeNearestNeighborOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "ResizeNearestNeighbor", + ResizeNearestNeighborOp); } } // namespace ops diff --git a/mace/ops/resize_nearest_neighbor.h b/mace/ops/resize_nearest_neighbor.h deleted file mode 100644 index 0f27a219daf17329328321bd9132fad6ab5b462c..0000000000000000000000000000000000000000 --- a/mace/ops/resize_nearest_neighbor.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_ -#define MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_ - -#include "mace/core/types.h" - -namespace mace { -namespace ops { -namespace resize_nearest_neighbor { -inline float CalculateResizeScale(index_t in_size, - index_t out_size, - bool align_corners) { - return (align_corners && out_size > 1) - ? 
(in_size - 1) / static_cast(out_size - 1) - : in_size / static_cast(out_size); -} -} // namespace resize_nearest_neighbor -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_RESIZE_NEAREST_NEIGHBOR_H_ diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index e32410989fe8c14cf936330769fd700eb0fe31b5..82a684b1a4056dcfa13be8b4c45aeb63e59781f2 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -35,10 +35,10 @@ namespace mace { namespace ops { -template +template class SoftmaxOp; -template <> +template<> class SoftmaxOp : public Operation { public: explicit SoftmaxOp(OpConstructContext *context) @@ -139,12 +139,12 @@ class SoftmaxOp : public Operation { sum = std::max(sum, std::numeric_limits::min()); if (use_log_) { for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; + output_ptr[c] /= sum; output_ptr[c] = std::log(output_ptr[c]); } } else { for (index_t c = 0; c < class_count; ++c) { - output_ptr[c] /= sum; + output_ptr[c] /= sum; } } } @@ -407,17 +407,17 @@ class SoftmaxOp : public Operation { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class SoftmaxOp : public Operation { +template<> +class SoftmaxOp : public Operation { public: explicit SoftmaxOp(OpConstructContext *context) : Operation(context) { bool use_log = ( Operation::GetOptionalArg("use_log", false)); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(use_log); + kernel_ = make_unique(use_log); } else { - kernel_ = make_unique>(use_log); + kernel_ = make_unique(use_log); } } MaceStatus Run(OpContext *context) override { @@ -433,7 +433,6 @@ class SoftmaxOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterSoftmax(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, DeviceType::CPU, float); @@ -443,13 +442,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Softmax", SoftmaxOp); MACE_REGISTER_OP_CONDITION( op_registry, @@ -458,13 +451,13 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } if (op->output_shape(0).dims_size() != 2 && op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index 50de3fc74b1104ccac8576e29a90911789dc91fd..156c2132289a487cb0db14d0bce05da85a31442d 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -86,10 +86,10 @@ class SpaceToBatchOpBase : public Operation { } }; -template +template class SpaceToBatchNDOp; -template <> +template<> class SpaceToBatchNDOp : public SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) @@ -302,13 +302,13 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL -template -class SpaceToBatchNDOp : public SpaceToBatchOpBase { +template<> +class SpaceToBatchNDOp : public 
SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -337,13 +337,7 @@ void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { SpaceToBatchNDOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "SpaceToBatchND", - SpaceToBatchNDOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "SpaceToBatchND", - SpaceToBatchNDOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "SpaceToBatchND", SpaceToBatchNDOp); } } // namespace ops diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 9584ddb8d7d43f3cea7c5b0612e7bca24346070d..d9b5473629da962985261bc955dc591ef4b3a0f7 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -24,7 +24,7 @@ namespace mace { namespace ops { -template +template class SpaceToDepthOp : public Operation { public: explicit SpaceToDepthOp(OpConstructContext *context) @@ -88,14 +88,14 @@ class SpaceToDepthOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class SpaceToDepthOp : public Operation { +template<> +class SpaceToDepthOp : public Operation { public: explicit SpaceToDepthOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(block_size); + kernel_ = make_unique(block_size); } else { MACE_NOT_IMPLEMENTED; } @@ -116,13 +116,7 @@ void RegisterSpaceToDepth(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "SpaceToDepth", SpaceToDepthOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "SpaceToDepth", - SpaceToDepthOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "SpaceToDepth", - SpaceToDepthOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "SpaceToDepth", SpaceToDepthOp); } } // namespace ops diff --git a/mace/ops/split.cc b/mace/ops/split.cc index b08d72c533d480a65cbff0c6fefb6a3b940322d6..ffe7172f841bb76be8e4428cdf9a30ac29ee27bd 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -100,14 +100,14 @@ class SplitOp : public Operation { }; #ifdef MACE_ENABLE_OPENCL -template -class SplitOp : public Operation { +template<> +class SplitOp : public Operation { public: explicit SplitOp(OpConstructContext *context) : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(axis); + kernel_ = make_unique(axis); } else { MACE_NOT_IMPLEMENTED; } @@ -132,13 +132,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Split", SplitOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Split", SplitOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "Split", SplitOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Split", SplitOp); MACE_REGISTER_OP_CONDITION( op_registry, diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index cd2fb1742f4a31992922deb357f4cfa788c032f8..2d85ed98448ba37e60572df7f87c6184ebbeddfb 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -24,7 +24,7 @@ namespace mace { namespace 
ops { -template +template class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) @@ -76,15 +76,14 @@ class SqrDiffMeanOp : public Operation { } }; - #ifdef MACE_ENABLE_OPENCL -template -class SqrDiffMeanOp : public Operation { +template<> +class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { - kernel_ = make_unique>(); + kernel_ = make_unique(); } else { MACE_NOT_IMPLEMENTED; } @@ -101,18 +100,11 @@ class SqrDiffMeanOp : public Operation { }; #endif // MACE_ENABLE_OPENCL - void RegisterSqrDiffMean(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, DeviceType::CPU, float); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp); } } // namespace ops diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 660a8e8f3dbfd8b54e701b5ff7714dc0c942aa3f..0c08cfd589b6d5c5f080432bffb62162706f15bc 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -20,18 +20,21 @@ namespace mace { namespace ops { -template -class SqueezeOp : public Operation { +class SqueezeOpRaw : public Operation { public: - explicit SqueezeOp(OpConstructContext *context) + explicit SqueezeOpRaw(OpConstructContext *context, + DeviceType device_type, + DataType data_type) : Operation(context), axis_(Operation::GetRepeatedArgs("axis", {})), - checked_(false) {} + checked_(false), + data_type_(data_type), + device_type_(device_type) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - if (!checked_ && D == DeviceType::CPU - && DataTypeToEnum::value != DT_UINT8) { + if (!checked_ && device_type_ == DeviceType::CPU + && data_type_ != DT_UINT8) { auto has_df = Operation::GetOptionalArg( "has_data_format", 0); if (has_df && this->Input(0)->dim_size() == 4) { @@ -62,6 +65,16 @@ class SqueezeOp : public Operation { private: std::vector axis_; bool checked_; + DataType data_type_; + DeviceType device_type_; +}; + +template +class SqueezeOp : public SqueezeOpRaw { + public: + explicit SqueezeOp(OpConstructContext *context) + : SqueezeOpRaw(context, D, DataTypeToEnum::value) { + } }; void RegisterSqueeze(OpRegistryBase *op_registry) { @@ -69,10 +82,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { #ifdef MACE_ENABLE_QUANTIZE MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL + MACE_REGISTER_GPU_OP(op_registry, "Squeeze", SqueezeOp); MACE_REGISTER_OP_CONDITION( op_registry, OpConditionBuilder("Squeeze") @@ -80,13 +90,13 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU, DeviceType::GPU }; + return {DeviceType::CPU, DeviceType::GPU}; } if (op->output_shape(0).dims_size() != 2 && op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; + return {DeviceType::CPU}; } - return { DeviceType::CPU, DeviceType::GPU }; 
+ return {DeviceType::CPU, DeviceType::GPU}; })); } diff --git a/mace/python/tools/encrypt_opencl_codegen.py b/mace/python/tools/encrypt_opencl_codegen.py index 6fa3db4589bd883fe00433456808ef3b3c50c27e..2ef43a2d9ea2a9938e89250be2591079e9b8e5a4 100644 --- a/mace/python/tools/encrypt_opencl_codegen.py +++ b/mace/python/tools/encrypt_opencl_codegen.py @@ -37,55 +37,73 @@ def encrypt_code(code_str): return encrypted_arr +def create_output_dir(dir_path): + if os.path.exists(dir_path): + if os.path.isdir(dir_path): + try: + shutil.rmtree(dir_path) + except OSError: + raise RuntimeError( + "Cannot delete directory %s due to permission " + "error, inspect and remove manually" % dir_path) + else: + raise RuntimeError( + "Cannot delete non-directory %s, inspect ", + "and remove manually" % dir_path) + os.makedirs(dir_path) + + +def write_cl_encrypted_kernel_to_file( + encrypted_code_maps, template_path, output_path): + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + cl_encrypted_kernel = env.get_template(template_path).render( + tag='codegen', + maps=encrypted_code_maps, + data_type='unsigned char', + variable_name='kEncryptedProgramMap') + with open(output_path, "w") as w_file: + w_file.write(cl_encrypted_kernel) + + +def get_module_key(file_name): + module_key = None + if file_name[-3:] == ".cl": + module_key = file_name[:-3] + elif file_name[-2:] == ".h": + module_key = file_name + + return module_key + + def encrypt_opencl_codegen(cl_kernel_dir, output_path): if not os.path.exists(cl_kernel_dir): print("Input cl_kernel_dir " + cl_kernel_dir + " doesn't exist!") - header_code = "" - for file_name in os.listdir(cl_kernel_dir): - file_path = os.path.join(cl_kernel_dir, file_name) - if file_path[-2:] == ".h": - with open(file_path, "r") as f: - header_code += f.read() - encrypted_code_maps = {} for file_name in os.listdir(cl_kernel_dir): file_path = os.path.join(cl_kernel_dir, file_name) - if file_path[-3:] == ".cl": + module_key = get_module_key(file_name) + if len(module_key) > 0: with open(file_path, "r") as f: code_str = "" + headers = [] for line in f.readlines(): if "#include " in line: - code_str += header_code + headers.append(get_module_key("common.h")) else: code_str += line encrypted_code_arr = encrypt_code(code_str) - encrypted_code_maps[file_name[:-3]] = encrypted_code_arr - - env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - cpp_cl_encrypted_kernel = env.get_template( - 'str2vec_maps.cc.jinja2').render( - maps=encrypted_code_maps, - data_type='unsigned char', - variable_name='kEncryptedProgramMap') - - output_dir = os.path.dirname(output_path) - if os.path.exists(output_dir): - if os.path.isdir(output_dir): - try: - shutil.rmtree(output_dir) - except OSError: - raise RuntimeError( - "Cannot delete directory %s due to permission " - "error, inspect and remove manually" % output_dir) - else: - raise RuntimeError( - "Cannot delete non-directory %s, inspect ", - "and remove manually" % output_dir) - os.makedirs(output_dir) - - with open(output_path, "w") as w_file: - w_file.write(cpp_cl_encrypted_kernel) + encrypted_code = {} + encrypted_code['headers'] = headers + encrypted_code['code'] = encrypted_code_arr + encrypted_code_maps[module_key] = encrypted_code + + create_output_dir(os.path.dirname(output_path)) + write_cl_encrypted_kernel_to_file( + encrypted_code_maps, 'str2vec_maps.cc.jinja2', output_path) + output_path_h = output_path.replace('.cc', '.h') + write_cl_encrypted_kernel_to_file( + encrypted_code_maps, 'str2vec_maps.h.jinja2', 
diff --git a/mace/python/tools/str2vec_maps.cc.jinja2 b/mace/python/tools/str2vec_maps.cc.jinja2
index 513114941e8267528ed33eddd5b7f7ebb64a57ab..d88347172d0dd4f50d382a7c7598723db151f2e1 100644
--- a/mace/python/tools/str2vec_maps.cc.jinja2
+++ b/mace/python/tools/str2vec_maps.cc.jinja2
@@ -14,24 +14,32 @@
 
 // This is a generated file. DO NOT EDIT!
 
+#include "mace/codegen/opencl/encrypt_opencl_kernel.h"
+
 #include <map>
 #include <string>
-#include <vector>
 
 namespace mace {
+namespace {{tag}} {
 
-extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
-{
-  {% for key, value in maps.items() %}
+const std::map<std::string, ClProgramInfo> {{variable_name}} = {
+{% for key, encrypted_code in maps.items() %}
   {
-    "{{key}}",
-    {
-      {%- for ele in value -%}
-      {{ele}},
-      {%- endfor -%}
+    "{{key}}", {
+      {
+        {%- for header in encrypted_code['headers'] -%}
+        "{{header}}",
+        {%- endfor -%}
+      },
+      {
+        {%- for ele in encrypted_code['code'] -%}
+        {{ele}},
+        {%- endfor -%}
+      }
     }
   },  // {{key}}
 {% endfor %}
 };
 
+}  // {{tag}}
 }  // namespace mace
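Because the generated table no longer carries pre-expanded header text, whatever builds an OpenCL program is expected to resolve the recorded header modules through the same map before compiling. A minimal consumer-side sketch of that idea follows; it is not MACE's actual runtime code, and Decrypt() is a hypothetical stand-in for the inverse of the Python encrypt_code() above:

#include <string>
#include <vector>

#include "mace/codegen/opencl/encrypt_opencl_kernel.h"

// Hypothetical helper: inverse of encrypt_code() in the codegen script.
std::string Decrypt(const std::vector<unsigned char> &buf);

// Sketch only: prepend the decrypted sources of a program's header
// dependencies before handing the program to the OpenCL compiler.
std::string BuildProgramSource(const std::string &module_key) {
  const auto &program = mace::codegen::kEncryptedProgramMap.at(module_key);
  std::string source;
  for (const std::string &header : program.headers_) {
    const auto &dep = mace::codegen::kEncryptedProgramMap.at(header);
    source += Decrypt(dep.encrypted_code_);
  }
  source += Decrypt(program.encrypted_code_);
  return source;
}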
diff --git a/mace/ops/resize_bilinear.h b/mace/python/tools/str2vec_maps.h.jinja2
similarity index 54%
rename from mace/ops/resize_bilinear.h
rename to mace/python/tools/str2vec_maps.h.jinja2
index b5f50d29336b9af9cb4b756a15999074a566ed5b..9e89e416ebe1b67545538346431245c735104dc9 100644
--- a/mace/ops/resize_bilinear.h
+++ b/mace/python/tools/str2vec_maps.h.jinja2
@@ -12,23 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_RESIZE_BILINEAR_H_
-#define MACE_OPS_RESIZE_BILINEAR_H_
+// This is a generated file. DO NOT EDIT!
 
-#include "mace/core/types.h"
+#include <map>
+#include <string>
+#include <vector>
 
 namespace mace {
-namespace ops {
-namespace resize_bilinear {
 
-inline float CalculateResizeScale(index_t in_size,
-                                  index_t out_size,
-                                  bool align_corners) {
-  return (align_corners && out_size > 1)
-             ? (in_size - 1) / static_cast<float>(out_size - 1)
-             : in_size / static_cast<float>(out_size);
-}
-}  // namespace resize_bilinear
-}  // namespace ops
-}  // namespace mace
+namespace {{tag}} {
+
+struct ClProgramInfo {
+  const std::vector<std::string> headers_;
+  const std::vector<{{data_type}}> encrypted_code_;
+};
 
-#endif  // MACE_OPS_RESIZE_BILINEAR_H_
+extern const std::map<std::string, ClProgramInfo> {{variable_name}};
+
+}  // {{tag}}
+}  // namespace mace
diff --git a/repository/opencl-kernel/opencl_kernel_configure.bzl b/repository/opencl-kernel/opencl_kernel_configure.bzl
index 63191cda20032c191992ea3624c13c121c585121..545af54d3dabab5ef7c9e34ccc2fbd9186c9f7c1 100644
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
@@ -22,7 +22,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
     unused_var = repository_ctx.path(Label("//:.git/refs/heads/master"))
 
     ret = repository_ctx.execute(
-        ["test", "-f", "%s/mace/ops/opencl/cl/common.h" % mace_root_path],
+        ["test", "-f", "%s/mace/ops/opencl/cl/common.cl" % mace_root_path],
     )
     if ret.return_code == 0:
         unused_var = repository_ctx.path(Label("//:mace/ops/opencl/cl/activation.cl"))
@@ -71,7 +71,7 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
         python_bin_path,
         "%s/mace/python/tools/encrypt_opencl_codegen.py" % mace_root_path,
         "--cl_kernel_dir=%s/mace/ops/opencl/cl" % mace_root_path,
-        "--output_path=%s/encrypt_opencl_kernel" % generated_files_path,
+        "--output_path=%s/encrypt_opencl_kernel.cc" % generated_files_path,
     ], quiet = False)
 
 encrypt_opencl_kernel_repository = repository_rule(
diff --git a/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc b/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc
index 6e5f7017e822dadb8d8c1044dc8875631fa6a28d..07685255c407a59e57f2edd2d01570bddf2e54bd 100644
--- a/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc
+++ b/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc
@@ -42,7 +42,7 @@ void FilterBufferToImage(int iters,
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
   auto transform_func = [&]() {
-    OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+    OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
         .Transform(&context,
                    net.ws()->GetTensor("Input"),
                    OpenCLBufferType::IN_OUT_CHANNEL,
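This hunk, and the test hunks that follow, drop the element-type template argument when constructing OpenCLBufferTransformer, in line with GPU kernels now being registered for a single data type. The transformer's own declaration is not part of this patch; the presumed shape of the change, as a sketch only:

// Sketch of the presumed interface change (the real declaration lives in the
// OpenCL op sources, which this patch does not touch):
//
//   before:  template <typename T>
//            class OpenCLBufferTransformer { ... };  // one type per element type
//
//   after:   class OpenCLBufferTransformer { ... };  // element type resolved at
//                                                    // runtime from the tensors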
#include "mace/benchmark_utils/test_benchmark.h" +#include "mace/ops/common/pad_type.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/pad.h" namespace mace { namespace ops { diff --git a/test/ccbenchmark/mace/ops/pooling_benchmark.cc b/test/ccbenchmark/mace/ops/pooling_benchmark.cc index 6b66a9fa7032ca29fc16fe888c9f532997ee37de..314cc6f90a98d9e732510869c0488bf50b3d478f 100644 --- a/test/ccbenchmark/mace/ops/pooling_benchmark.cc +++ b/test/ccbenchmark/mace/ops/pooling_benchmark.cc @@ -14,7 +14,7 @@ #include "mace/benchmark_utils/test_benchmark.h" #include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/pooling.h" +#include "mace/ops/common/pooling_type.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/test/ccunit/mace/ops/buffer_to_image_test.cc b/test/ccunit/mace/ops/buffer_to_image_test.cc index cb52eafe19bf27f926c36653889942a232edb2c5..644283d405f2a712c58707b83e3070893e2d2ba2 100644 --- a/test/ccunit/mace/ops/buffer_to_image_test.cc +++ b/test/ccunit/mace/ops/buffer_to_image_test.cc @@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, Tensor *b2i_output = net.ws()->CreateTensor( "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER, 0, i2b_output); @@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, Tensor *b2i_output = net.ws()->CreateTensor( "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DT_FLOAT); - OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER, 0, i2b_output); @@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); // Transform - OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); - OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER, 0, i2b_output); diff --git a/test/ccunit/mace/ops/buffer_transform_test.cc b/test/ccunit/mace/ops/buffer_transform_test.cc index 
diff --git a/test/ccunit/mace/ops/buffer_to_image_test.cc b/test/ccunit/mace/ops/buffer_to_image_test.cc
index cb52eafe19bf27f926c36653889942a232edb2c5..644283d405f2a712c58707b83e3070893e2d2ba2 100644
--- a/test/ccunit/mace/ops/buffer_to_image_test.cc
+++ b/test/ccunit/mace/ops/buffer_to_image_test.cc
@@ -35,14 +35,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
   Tensor *b2i_output = net.ws()->CreateTensor(
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
       .Transform(&context, net.ws()->GetTensor("Input"),
                  type, MemoryType::GPU_IMAGE, 0, b2i_output);
 
   // Inverse Transform
   Tensor *i2b_output = net.ws()->CreateTensor(
       "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
-  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
       .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER,
                  0, i2b_output);
 
@@ -176,14 +176,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
   Tensor *b2i_output = net.ws()->CreateTensor(
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
       .Transform(&context, net.ws()->GetTensor("Input"), type,
                  MemoryType::GPU_IMAGE, 0, b2i_output);
 
   // Inverse Transform
   Tensor *i2b_output = net.ws()->CreateTensor(
       "I2BOutput", context.device()->allocator(), DT_FLOAT);
-  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
       .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER,
                  0, i2b_output);
 
@@ -216,14 +216,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
       "B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
   // Transform
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
       .Transform(&context, net.ws()->GetTensor("Input"), type,
                  MemoryType::GPU_IMAGE, 0, b2i_output);
 
   // Inverse Transform
   Tensor *i2b_output = net.ws()->CreateTensor(
       "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
-  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
       .Transform(&context, b2i_output, type, MemoryType::GPU_BUFFER,
                  0, i2b_output);
 
diff --git a/test/ccunit/mace/ops/buffer_transform_test.cc b/test/ccunit/mace/ops/buffer_transform_test.cc
index a9af4bc9943fceb62d61e9ec7b13a58188230e83..f29a2e012249d5214ddedeaf9320aec80e71120c 100644
--- a/test/ccunit/mace/ops/buffer_transform_test.cc
+++ b/test/ccunit/mace/ops/buffer_transform_test.cc
@@ -45,8 +45,8 @@ void TestBidirectionTransform(const OpenCLBufferType type,
       "BtOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
-                             MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
+                          MemoryType::GPU_BUFFER)
       .Transform(&context, net.ws()->GetTensor("Input"),
                  type, MemoryType::GPU_BUFFER, 0, bt_output);
 
@@ -54,8 +54,8 @@ void TestBidirectionTransform(const OpenCLBufferType type,
   Tensor *output = net.ws()->CreateTensor(
       "Output", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
-                             MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
+                          MemoryType::GPU_BUFFER)
       .Transform(&context, bt_output, type, MemoryType::GPU_BUFFER,
                  0, output);
 
@@ -90,8 +90,8 @@ void TestArgumentTransform(const index_t input_size) {
   Tensor *output = net.ws()->CreateTensor(
       "Output", context.device()->allocator(), DataTypeToEnum<T>::value);
 
-  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
-                             MemoryType::GPU_BUFFER)
+  OpenCLBufferTransformer(MemoryType::GPU_BUFFER,
+                          MemoryType::GPU_BUFFER)
       .Transform(&context, net.ws()->GetTensor("Input"),
                  OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
                  0, output);
diff --git a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
index 8909f35113c5a77d78cf614970d9d027019f111c..8a17c2d2c5e5d9ed0431005404b630efdfd2c974 100644
--- a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
+++ b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
@@ -53,10 +53,10 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
                          DtToCLCMDDt(DataTypeToEnum::value));
   } else {
     built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpCompatibleCLDt(DataTypeToEnum::value));
+                          DtToCLDt(DataTypeToEnum::value));
     built_options.emplace(
         "-DCMD_DATA_TYPE=" +
-        DtToUpCompatibleCLCMDDt(DataTypeToEnum::value));
+        DtToCLCMDDt(DataTypeToEnum::value));
   }
 
   cl::Kernel kernel;
diff --git a/test/ccunit/mace/ops/pad_test.cc b/test/ccunit/mace/ops/pad_test.cc
index 977305597ae742866d2c1d63c48f571cfaa884e7..3d785ac7603b75d9a2e11ca65faeefb1cc40abbc 100644
--- a/test/ccunit/mace/ops/pad_test.cc
+++ b/test/ccunit/mace/ops/pad_test.cc
@@ -16,8 +16,8 @@
 #include
 #include
 
+#include "mace/ops/common/pad_type.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/ops/pad.h"
 
 namespace mace {
 namespace ops {
diff --git a/test/ccunit/mace/ops/pooling_test.cc b/test/ccunit/mace/ops/pooling_test.cc
index 037cf8cf76e1926f941a92ea5eb1197b11e74b99..caa525c67b592dc44084f63093b3a20ad3aeb4c7 100644
--- a/test/ccunit/mace/ops/pooling_test.cc
+++ b/test/ccunit/mace/ops/pooling_test.cc
@@ -14,8 +14,8 @@
 
 #include
 
-#include "mace/ops/pooling.h"
 #include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/common/pooling_type.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
diff --git a/test/ccunit/mace/ops/reduce_test.cc b/test/ccunit/mace/ops/reduce_test.cc
index 21a2dc13c3d63c8da97b47690b576d3d2499c6bf..753bf419debf706329b7c53898d2a561d0ff61ac 100644
--- a/test/ccunit/mace/ops/reduce_test.cc
+++ b/test/ccunit/mace/ops/reduce_test.cc
@@ -14,7 +14,7 @@
 
 #include
 
-#include "mace/ops/reduce.h"
+#include "mace/ops/common/reduce_type.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {