Commit e2a40a03 authored by 刘琦

Merge branch 'gpu-buffer' into 'master'

Refactor OpenCL kernels to support buffer memory.

See merge request !803
@@ -8,6 +8,7 @@ stages:
   - ops_test
   - api_test
   - python_tools_tests
+  - model_tests
   - build_android_demo
   - ops_benchmark
   - extra_tests
@@ -113,6 +114,18 @@ python_tools_tests:
       python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+
+model_tests:
+  stage: model_tests
+  script:
+    - pwd
+    - rm -rf mace-models
+    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
+    - >
+      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
     - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
     - >
       python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
......
@@ -14,6 +14,8 @@
 #include "mace/core/device.h"
 
+#include "mace/core/buffer.h"
+
 namespace mace {
 
 CPUDevice::CPUDevice(const int num_threads,
@@ -21,7 +23,8 @@ CPUDevice::CPUDevice(const int num_threads,
                      const bool use_gemmlowp)
     : cpu_runtime_(new CPURuntime(num_threads,
                                   policy,
-                                  use_gemmlowp)) {}
+                                  use_gemmlowp)),
+      scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
 
 CPUDevice::~CPUDevice() = default;
@@ -31,6 +34,7 @@ CPURuntime *CPUDevice::cpu_runtime() {
 #ifdef MACE_ENABLE_OPENCL
 OpenCLRuntime *CPUDevice::opencl_runtime() {
+  LOG(FATAL) << "CPU device should not call OpenCL Runtime";
   return nullptr;
 }
 #endif
@@ -43,4 +47,8 @@ DeviceType CPUDevice::device_type() const {
   return DeviceType::CPU;
 }
 
+ScratchBuffer *CPUDevice::scratch_buffer() {
+  return scratch_buffer_.get();
+}
+
 }  // namespace mace
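
With this hunk every Device owns its own ScratchBuffer, replacing the single host-side buffer the Workspace used to hold (see the workspace.cc changes below). A minimal sketch of how a kernel consumes it, assembled from the conv_2d.h changes later in this diff; the size variables are illustrative:

    // Per-device scratch arena, as used by the CPU conv2d functor below.
    ScratchBuffer *scratch = context_->device()->scratch_buffer();
    scratch->Rewind();                      // reuse the arena from offset 0
    scratch->GrowSize(total_scratch_size);  // grow once to the worst-case size
    // Scratch(n) hands out a BufferSlice that a Tensor can wrap without owning:
    Tensor transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
    Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);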
@@ -26,6 +26,8 @@
 namespace mace {
 
+class ScratchBuffer;
+
 class Device {
  public:
   virtual ~Device() {}
@@ -37,6 +39,7 @@ class Device {
   virtual Allocator *allocator() = 0;
   virtual DeviceType device_type() const = 0;
+  virtual ScratchBuffer *scratch_buffer() = 0;
 };
 
 class CPUDevice : public Device {
@@ -53,9 +56,11 @@ class CPUDevice : public Device {
   Allocator *allocator() override;
   DeviceType device_type() const override;
+  ScratchBuffer *scratch_buffer() override;
 
  private:
   std::unique_ptr<CPURuntime> cpu_runtime_;
+  std::unique_ptr<ScratchBuffer> scratch_buffer_;
 };
 
 }  // namespace mace
......
@@ -15,7 +15,9 @@
 #ifndef MACE_CORE_FUTURE_H_
 #define MACE_CORE_FUTURE_H_
 
+#include <algorithm>
 #include <functional>
+#include <vector>
 
 #include "mace/utils/logging.h"
@@ -25,9 +27,7 @@ struct CallStats;
 
 // Wait the call to finish and get the stats if param is not nullptr
 struct StatsFuture {
-  std::function<void(CallStats *)> wait_fn = [](CallStats *) {
-    LOG(FATAL) << "wait_fn must be properly set";
-  };
+  std::function<void(CallStats *)> wait_fn;
 };
 
 inline void SetFutureDefaultWaitFn(StatsFuture *future) {
@@ -41,6 +41,29 @@ inline void SetFutureDefaultWaitFn(StatsFuture *future) {
   }
 }
 
+inline void MergeMultipleFutureWaitFn(
+    const std::vector<StatsFuture> &org_futures,
+    StatsFuture *dst_future) {
+  if (dst_future != nullptr) {
+    dst_future->wait_fn = [org_futures](CallStats *stats) {
+      if (stats != nullptr) {
+        stats->start_micros = INT64_MAX;
+        stats->end_micros = 0;
+        for (auto &org_future : org_futures) {
+          CallStats tmp_stats;
+          if (org_future.wait_fn != nullptr) {
+            org_future.wait_fn(&tmp_stats);
+            stats->start_micros = std::min(stats->start_micros,
+                                           tmp_stats.start_micros);
+            stats->end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+          }
+        }
+        stats->end_micros += stats->start_micros;
+      }
+    };
+  }
+}
+
 }  // namespace mace
 
 #endif  // MACE_CORE_FUTURE_H_
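
MergeMultipleFutureWaitFn lets an op that issues several OpenCL commands hand back a single future; note the merged stats report the earliest start plus the accumulated per-call durations, not a wall-clock end time. A hedged usage sketch (the LaunchKernel loop is hypothetical, not from this diff):

    std::vector<StatsFuture> futures(num_launches);
    for (size_t i = 0; i < num_launches; ++i) {
      LaunchKernel(i, &futures[i]);  // hypothetical: each launch fills wait_fn
    }
    StatsFuture merged;
    MergeMultipleFutureWaitFn(futures, &merged);
    CallStats stats;
    // wait_fn now defaults to empty instead of a fatal stub,
    // so callers must check before invoking:
    if (merged.wait_fn != nullptr) merged.wait_fn(&stats);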
@@ -14,6 +14,8 @@
 #include "mace/core/runtime/opencl/gpu_device.h"
 
+#include "mace/core/buffer.h"
+
 namespace mace {
 
 GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
@@ -27,7 +29,8 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
       CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
       runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                                  opencl_binary_storage, tuner)),
-      allocator_(new OpenCLAllocator(runtime_.get())) {}
+      allocator_(new OpenCLAllocator(runtime_.get())),
+      scratch_buffer_(new ScratchBuffer(allocator_.get())) {}
 
 GPUDevice::~GPUDevice() = default;
@@ -43,4 +46,8 @@ DeviceType GPUDevice::device_type() const {
   return DeviceType::GPU;
 }
 
+ScratchBuffer *GPUDevice::scratch_buffer() {
+  return scratch_buffer_.get();
+}
+
 }  // namespace mace
@@ -37,9 +37,11 @@ class GPUDevice : public CPUDevice {
   OpenCLRuntime *opencl_runtime() override;
   Allocator *allocator() override;
   DeviceType device_type() const override;
+  ScratchBuffer *scratch_buffer() override;
 
  private:
   std::unique_ptr<OpenCLRuntime> runtime_;
   std::unique_ptr<OpenCLAllocator> allocator_;
+  std::unique_ptr<ScratchBuffer> scratch_buffer_;
 };
 
 }  // namespace mace
......
@@ -31,8 +31,6 @@
 namespace mace {
 
-std::string kOpenCLParameterPath;  // NOLINT(runtime/string)
-
 extern const std::map<std::string, std::vector<unsigned char>>
     kEncryptedProgramMap;
@@ -286,7 +284,8 @@ OpenCLRuntime::OpenCLRuntime(
       is_opencl_avaliable_(false),
       is_profiling_enabled_(false),
       opencl_version_(CL_VER_UNKNOWN),
-      gpu_type_(UNKNOWN) {
+      gpu_type_(UNKNOWN),
+      mem_type_(MemoryType::GPU_IMAGE) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
@@ -471,6 +470,14 @@ uint32_t OpenCLRuntime::device_compute_units() const {
   return device_compute_units_;
 }
 
+bool OpenCLRuntime::UseImageMemory() {
+  return this->mem_type_ == MemoryType::GPU_IMAGE;
+}
+
+void OpenCLRuntime::set_mem_type(MemoryType type) {
+  this->mem_type_ = type;
+}
+
 bool OpenCLRuntime::BuildProgramFromCache(
     const std::string &built_program_key,
     const std::string &build_options_str,
......
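With mem_type_ defaulting to GPU_IMAGE, existing image-based models keep working unchanged; the converter's new --cl_mem_type=buffer flag (exercised by the model_tests CI job above) flips it at initialization. A sketch of the intended call pattern; GPU_BUFFER is an assumed MemoryType value from mace.pb.h, since only GPU_IMAGE is visible in this diff:

    OpenCLRuntime *runtime = device->opencl_runtime();
    runtime->set_mem_type(MemoryType::GPU_BUFFER);  // assumed enum value
    if (runtime->UseImageMemory()) {
      // allocate cl::Image2D-backed tensors and pick the image kernels
    } else {
      // allocate plain cl::Buffer-backed tensors and pick the buffer kernels
    }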
@@ -25,6 +25,7 @@
 #include "mace/core/file_storage.h"
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/proto/mace.pb.h"
 #include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/tuner.h"
@@ -82,6 +83,9 @@ class OpenCLRuntime {
   uint32_t device_compute_units() const;
   Tuner<uint32_t> *tuner();
   bool is_opencl_avaliable();
+  // TODO(liuqi): remove this function in the future, make decision at runtime.
+  bool UseImageMemory();
+  void set_mem_type(MemoryType type);
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
@@ -129,6 +133,7 @@ class OpenCLRuntime {
   bool is_profiling_enabled_;
   OpenCLVersion opencl_version_;
   GPUType gpu_type_;
+  MemoryType mem_type_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
   std::shared_ptr<cl::Context> context_;
......
@@ -101,13 +101,14 @@ enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
 class Tensor {
  public:
   Tensor(Allocator *alloc, DataType type,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
       : allocator_(alloc),
         dtype_(type),
         buffer_(nullptr),
         is_buffer_owner_(true),
         unused_(false),
-        name_(""),
+        name_(name),
         is_weight_(is_weight),
         scale_(0.f),
         zero_point_(0),
@@ -115,12 +116,13 @@ class Tensor {
         maxval_(0.f) {}
 
   Tensor(BufferBase *buffer, DataType dtype,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
       : dtype_(dtype),
         buffer_(buffer),
         is_buffer_owner_(false),
         unused_(false),
-        name_(""),
+        name_(name),
         is_weight_(is_weight),
         scale_(0.f),
         zero_point_(0),
@@ -129,12 +131,13 @@ class Tensor {
   Tensor(const BufferSlice &buffer_slice,
          DataType dtype,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
       : dtype_(dtype),
         buffer_slice_(buffer_slice),
         is_buffer_owner_(false),
         unused_(false),
-        name_(""),
+        name_(name),
         is_weight_(is_weight),
         scale_(0.f),
         zero_point_(0),
@@ -152,6 +155,8 @@ class Tensor {
     }
   }
 
+  inline std::string name() const { return name_; }
+
   inline DataType dtype() const { return dtype_; }
   inline void SetDtype(DataType dtype) { dtype_ = dtype; }
@@ -188,11 +193,15 @@ class Tensor {
     shape_configured_ = shape_configured;
   }
 
+  inline const std::vector<index_t> &buffer_shape() const {
+    return buffer_shape_;
+  }
+
   inline index_t dim_size() const { return shape_.size(); }
 
   inline index_t dim(unsigned int index) const {
-    MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
-               shape_.size());
+    MACE_CHECK(index < shape_.size(),
               name_, ": Dim out of range: ", index, " >= ", shape_.size());
     return shape_[index];
   }
@@ -214,12 +223,12 @@ class Tensor {
 #ifdef MACE_ENABLE_OPENCL
   inline cl::Image *opencl_image() const {
-    MACE_CHECK(has_opencl_image(), "do not have image");
+    MACE_CHECK(has_opencl_image(), name_, " do not have image");
     return static_cast<cl::Image *>(buffer_->buffer());
   }
 
   inline cl::Buffer *opencl_buffer() const {
-    MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
+    MACE_CHECK(has_opencl_buffer(), name_, " do not have opencl buffer");
     return static_cast<cl::Buffer *>(buffer_->buffer());
   }
 #endif
@@ -268,12 +277,14 @@ class Tensor {
   inline MaceStatus Resize(const std::vector<index_t> &shape) {
     shape_ = shape;
+    buffer_shape_ = shape;
     image_shape_.clear();
     if (buffer_ != nullptr) {
-      MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
+      MACE_CHECK(!has_opencl_image(),
+                 name_, ": Cannot resize image, use ResizeImage.");
       if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
-        LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
-                     << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
+        LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size()
+                     << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
         return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
       }
       return MaceStatus::MACE_SUCCESS;
@@ -296,19 +307,22 @@ class Tensor {
     allocator_ = other.allocator_;
     dtype_ = other.dtype_;
     shape_ = other.shape_;
+    buffer_shape_ = other.buffer_shape_;
     image_shape_ = other.image_shape_;
   }
 
   inline MaceStatus ResizeImage(const std::vector<index_t> &shape,
                                 const std::vector<size_t> &image_shape) {
     shape_ = shape;
+    buffer_shape_ = shape;
     image_shape_ = image_shape;
     if (buffer_ == nullptr) {
       MACE_CHECK(is_buffer_owner_);
       buffer_ = new Image(allocator_);
       return buffer_->Allocate(image_shape, dtype_);
     } else {
-      MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
+      MACE_CHECK(has_opencl_image(),
+                 name_, ": Cannot ResizeImage buffer, use Resize.");
       Image *image = dynamic_cast<Image *>(buffer_);
       MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
                      image_shape[1] <= image->image_shape()[1],
@@ -366,8 +380,6 @@ class Tensor {
   inline BufferBase *UnderlyingBuffer() const { return buffer_; }
 
-  inline void SetSourceOpName(const std::string name) { name_ = name; }
-
   inline void DebugPrint() const {
     using namespace numerical_chars;  // NOLINT(build/namespaces)
     std::stringstream os;
@@ -459,9 +471,12 @@ class Tensor {
  private:
   Allocator *allocator_;
   DataType dtype_;
+  // the shape of buffer (logical)
   std::vector<index_t> shape_;
   std::vector<index_t> shape_configured_;
   std::vector<size_t> image_shape_;
+  // the shape of buffer (physical storage)
+  std::vector<index_t> buffer_shape_;
   BufferBase *buffer_;
   BufferSlice buffer_slice_;
   bool is_buffer_owner_;
......
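Two threads run through this tensor.h change: every constructor now takes the tensor's name, so MACE_CHECK failures identify the offending tensor, and buffer_shape_ records the physical storage shape set by the last Resize/ResizeImage, separate from the logical shape_. A small sketch of the named-constructor effect:

    // The name now flows into every check message on this tensor.
    Tensor t(GetCPUAllocator(), DT_FLOAT, /*is_weight=*/false, "conv1_output");
    t.Resize({1, 3, 32, 32});  // sets both shape_ and buffer_shape_
    t.dim(4);  // fails with "conv1_output: Dim out of range: 4 >= 4"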
@@ -44,8 +44,7 @@ bool HasQuantizeOp(const NetDef &net_def) {
 }
 }  // namespace
 
-Workspace::Workspace() :
-    host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
+Workspace::Workspace() = default;
 
 Tensor *Workspace::CreateTensor(const std::string &name,
                                 Allocator *alloc,
@@ -54,8 +53,8 @@ Tensor *Workspace::CreateTensor(const std::string &name,
     VLOG(3) << "Tensor " << name << " already exists. Skipping.";
   } else {
     VLOG(3) << "Creating Tensor " << name;
-    tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type));
-    tensor_map_[name]->SetSourceOpName(name);
+    tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
+                                                           false, name));
   }
   return GetTensor(name);
 }
@@ -171,7 +170,10 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
             tensor_buffer_.get(), const_tensor.offset(),
             const_tensor.data_size() *
                 GetEnumTypeSize(const_tensor.data_type())),
-        const_tensor.data_type(), true));
+        const_tensor.data_type(),
+        true,
+        const_tensor.name()));
 
     tensor->Reshape(dims);
     tensor->SetScale(const_tensor.scale());
     tensor->SetZeroPoint(const_tensor.zero_point());
@@ -275,7 +277,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
         std::unique_ptr<BufferBase> tensor_buf(
             new Buffer(device->allocator()));
         MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)));
+            mem_block.x() * GetEnumTypeSize(dtype)
+            + MACE_EXTRA_BUFFER_PAD_SIZE));
         preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                           std::move(tensor_buf));
       }
@@ -301,10 +304,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
       }
       std::unique_ptr<Tensor> tensor
          (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
-                      output_type));
-      tensor->SetSourceOpName(op.name());
-      if (device_type == DeviceType::GPU) {
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+                      output_type, false, op.output(i)));
+      if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
+        VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
                 << " Mem: " << mem_ids[i]
                 << " Image shape: "
                 << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
@@ -312,8 +314,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                 << ", "
                 << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
                        ->image_shape()[1];
-      } else if (device_type == DeviceType::CPU) {
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+      } else {
+        VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
                 << " Mem: " << mem_ids[i]
                 << ", Buffer size: " << tensor->UnderlyingBuffer()->size();
       }
@@ -356,14 +358,6 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   return MaceStatus::MACE_SUCCESS;
 }
 
-ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
-  if (device_type == CPU) {
-    return host_scratch_buffer_.get();
-  } else {
-    return nullptr;
-  }
-}
-
 void Workspace::RemoveUnusedBuffer() {
   auto iter = tensor_map_.begin();
   auto end_iter = tensor_map_.end();
......
@@ -52,8 +52,6 @@ class Workspace {
                              Device *device,
                              const unsigned char *model_data);
 
-  ScratchBuffer *GetScratchBuffer(DeviceType device_type);
-
   void RemoveUnusedBuffer();
 
   void RemoveAndReloadBuffer(const NetDef &net_def,
@@ -64,15 +62,12 @@ class Workspace {
   MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
                                       Device *device);
 
-  Device *device_;
   TensorMap tensor_map_;
 
   std::unique_ptr<BufferBase> tensor_buffer_;
 
   PreallocatedPooledAllocator preallocated_allocator_;
 
-  std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
-
   bool fused_buffer_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(Workspace);
......
@@ -32,6 +32,8 @@ cc_library(
     ) + if_opencl_enabled(glob(
         [
             "opencl/*.cc",
+            "opencl/image/*.cc",
+            "opencl/buffer/*.cc",
         ],
         exclude = [
            "opencl/*_test.cc",
@@ -43,14 +45,16 @@ cc_library(
            "arm/*.h",
        ],
        exclude = [
-            "buffer_to_image.h",
-            "image_to_buffer.h",
+            "buffer_transform.h",
+            "buffer_inverse_transform.h",
            "lstmcell.h",
        ],
    ) + if_opencl_enabled(glob([
        "opencl/*.h",
-        "buffer_to_image.h",
-        "image_to_buffer.h",
+        "opencl/image/*.h",
+        "opencl/buffer/*.h",
+        "buffer_transform.h",
+        "buffer_inverse_transform.h",
        "lstmcell.h",
    ])),
    copts = [
......
@@ -26,10 +26,6 @@
 #include "mace/core/types.h"
 #include "mace/kernels/kernel.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -164,15 +160,22 @@ class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLActivationKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *alpha,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel);
+};
+
 template <typename T>
 class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
  public:
   ActivationFunctor(OpKernelContext *context,
                     ActivationType type,
-                    T relux_max_limit)
-      : OpKernel(context),
-        activation_(type),
-        relux_max_limit_(static_cast<T>(relux_max_limit)) {}
+                    T relux_max_limit);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *alpha,
@@ -180,13 +183,7 @@ class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
                         StatsFuture *future);
 
  private:
-  ActivationType activation_;
-  T relux_max_limit_;
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::string tuning_key_prefix_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLActivationKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
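This is the recurring pattern of the whole refactor: each GPU functor drops its raw cl::Kernel state and delegates to an abstract OpenCL*Kernel, so an image-memory and a buffer-memory implementation (under kernels/opencl/image/ and kernels/opencl/buffer/, per the BUILD change above) can sit behind one interface. A self-contained sketch of the shape of that pattern, with stand-in types where this diff does not show MACE's own:

    #include <memory>

    struct Ctx;     // stand-ins for OpKernelContext, Tensor, MaceStatus
    struct Tensor;
    enum class Status { OK };

    // Abstract kernel interface, mirroring OpenCLActivationKernel above.
    class ActKernel {
     public:
      virtual ~ActKernel() = default;
      virtual Status Compute(Ctx *ctx, const Tensor *in, Tensor *out) = 0;
    };

    class ImageActKernel : public ActKernel {   // cl::Image2D-backed storage
     public:
      Status Compute(Ctx *, const Tensor *, Tensor *) override {
        return Status::OK;
      }
    };

    class BufferActKernel : public ActKernel {  // plain cl::Buffer storage
     public:
      Status Compute(Ctx *, const Tensor *, Tensor *) override {
        return Status::OK;
      }
    };

    // The functor constructor, now defined out of line in a .cc file, picks
    // the implementation once, presumably via OpenCLRuntime::UseImageMemory().
    class ActFunctor {
     public:
      explicit ActFunctor(bool use_image)
          : kernel_(use_image ? static_cast<ActKernel *>(new ImageActKernel)
                              : new BufferActKernel) {}
      Status operator()(Ctx *ctx, const Tensor *in, Tensor *out) {
        return kernel_->Compute(ctx, in, out);
      }
     private:
      std::unique_ptr<ActKernel> kernel_;
    };

The same interface-plus-unique_ptr shape repeats below for AddN, BatchNorm, BatchToSpace, BiasAdd, the buffer transforms, ChannelShuffle, Concat, and Conv2d.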
@@ -26,10 +26,6 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -96,17 +92,23 @@ struct AddNFunctor : OpKernel {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLAddNKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_tensors,
+      Tensor *output_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel);
+};
+
 template <typename T>
 struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
-  explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
+  explicit AddNFunctor(OpKernelContext *context);
 
   MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
                         Tensor *output_tensor,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLAddNKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -26,41 +26,22 @@
 #include "mace/kernels/activation.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
 
-struct BatchNormFunctorBase : OpKernel {
-  BatchNormFunctorBase(OpKernelContext *context,
-                       bool folded_constant,
-                       const ActivationType activation,
-                       const float relux_max_limit)
-      : OpKernel(context),
-        folded_constant_(folded_constant),
-        activation_(activation),
-        relux_max_limit_(relux_max_limit) {}
-
-  const bool folded_constant_;
-  const ActivationType activation_;
-  const float relux_max_limit_;
-};
-
 template<DeviceType D, typename T>
 struct BatchNormFunctor;
 
 template<>
-struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
   BatchNormFunctor(OpKernelContext *context,
                    const bool folded_constant,
                    const ActivationType activation,
                    const float relux_max_limit)
-      : BatchNormFunctorBase(context,
-                             folded_constant,
-                             activation,
-                             relux_max_limit) {}
+      : OpKernel(context),
+        folded_constant_(folded_constant),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit) {}
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *scale,
@@ -133,19 +114,33 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
     return MACE_SUCCESS;
   }
 
+  const bool folded_constant_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBatchNormKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *scale,
+      const Tensor *offset,
+      const Tensor *mean,
+      const Tensor *var,
+      const float epsilon,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel);
+};
+
 template<typename T>
-struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::GPU, T> : OpKernel {
   BatchNormFunctor(OpKernelContext *context,
                    const bool folded_constant,
                    const ActivationType activation,
-                   const float relux_max_limit)
-      : BatchNormFunctorBase(context,
-                             folded_constant,
-                             activation,
-                             relux_max_limit) {}
+                   const float relux_max_limit);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *scale,
                         const Tensor *offset,
@@ -154,10 +149,7 @@ struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
                         const float epsilon,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBatchNormKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -51,7 +47,8 @@ struct BatchToSpaceFunctorBase : OpKernel {
   void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
                                         const DataFormat data_format,
                                         index_t *output_shape) {
-    MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
+    MACE_CHECK(input_tensor->dim_size() == 4,
+               "Input(", input_tensor->name(), ") shape should be 4D");
     index_t batch = input_tensor->dim(0);
     index_t channels = 0;
     index_t height = 0;
@@ -96,8 +93,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
                      const std::vector<int> &block_shape)
      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
 
-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -191,8 +188,8 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
                      const std::vector<int> &block_shape)
      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
 
-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -272,21 +269,29 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBatchToSpaceKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *batch_tensor,
+      const std::vector<int> &paddings,
+      const std::vector<int> &block_shape,
+      const std::vector<index_t> &output_shape,
+      Tensor *space_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel);
+};
+
 template <typename T>
 struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
   BatchToSpaceFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape)
-      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
+                      const std::vector<int> &block_shape);
 
-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> space_shape_;
+  std::unique_ptr<OpenCLBatchToSpaceKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -96,18 +92,26 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBiasAddKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *bias,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel);
+};
+
 template <typename T>
 struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
-  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
-      : BiasAddFunctorBase(context, data_format) {}
+  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *bias,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBiasAddKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
-#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
+#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
+#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
 
 #include <memory>
 #include <vector>
@@ -26,18 +26,19 @@
 namespace mace {
 namespace kernels {
 
-struct BufferToImageFunctorBase : OpKernel {
-  explicit BufferToImageFunctorBase(OpKernelContext *context,
-                                    const int wino_blk_size)
-      : OpKernel(context), wino_blk_size_(wino_blk_size) {}
+struct BufferInverseTransformFunctorBase : OpKernel {
+  BufferInverseTransformFunctorBase(OpKernelContext *context,
+                                    const int wino_blk_size)
+      : OpKernel(context),
+        wino_blk_size_(wino_blk_size) {}
   const int wino_blk_size_;
 };
 
 template <DeviceType D, typename T>
-struct BufferToImageFunctor : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(OpKernelContext *context,
-                                const int wino_blk_size)
-      : BufferToImageFunctorBase(context, wino_blk_size) {}
+struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase {
+  explicit BufferInverseTransformFunctor(OpKernelContext *context,
+                                         const int wino_blk_size)
+      : BufferInverseTransformFunctorBase(context, wino_blk_size) {}
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
@@ -51,22 +52,31 @@ struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase {
   }
 };
 
+class OpenCLBufferInverseTransformKernel {
+ public:
+  virtual MaceStatus Compute(OpKernelContext *context,
+                             const Tensor *input,
+                             const BufferType type,
+                             const int wino_blk_size,
+                             Tensor *output,
+                             StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
+};
+
 template <typename T>
-struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(OpKernelContext *context,
-                                const int wino_blk_size)
-      : BufferToImageFunctorBase(context, wino_blk_size) {}
+struct BufferInverseTransformFunctor<DeviceType::GPU, T>
+    : BufferInverseTransformFunctorBase {
+  explicit BufferInverseTransformFunctor(OpKernelContext *context,
+                                         const int wino_blk_size);
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
 };
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_BUFFER_TO_IMAGE_H_
+#endif  // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_KERNELS_IMAGE_TO_BUFFER_H_
-#define MACE_KERNELS_IMAGE_TO_BUFFER_H_
+#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_
+#define MACE_KERNELS_BUFFER_TRANSFORM_H_
 
 #include <memory>
 #include <vector>
@@ -26,18 +26,19 @@
 namespace mace {
 namespace kernels {
 
-struct ImageToBufferFunctorBase : OpKernel {
-  ImageToBufferFunctorBase(OpKernelContext *context,
-                           const int wino_blk_size)
-      : OpKernel(context),
-        wino_blk_size_(wino_blk_size) {}
+struct BufferTransformFunctorBase : OpKernel {
+  explicit BufferTransformFunctorBase(OpKernelContext *context,
+                                      const int wino_blk_size)
+      : OpKernel(context), wino_blk_size_(wino_blk_size) {}
   const int wino_blk_size_;
 };
 
 template <DeviceType D, typename T>
-struct ImageToBufferFunctor : ImageToBufferFunctorBase {
-  ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
-      : ImageToBufferFunctorBase(context, wino_blk_size) {}
+struct BufferTransformFunctor : BufferTransformFunctorBase {
+  BufferTransformFunctor(OpKernelContext *context,
+                         const int wino_blk_size)
+      : BufferTransformFunctorBase(context, wino_blk_size) {}
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
@@ -51,22 +52,30 @@ struct BufferTransformFunctor : BufferTransformFunctorBase {
   }
 };
 
+class OpenCLBufferTransformKernel {
+ public:
+  virtual MaceStatus Compute(OpKernelContext *context,
+                             const Tensor *input,
+                             const BufferType type,
+                             const int wino_blk_size,
+                             Tensor *output,
+                             StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel)
+};
+
 template <typename T>
-struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
-  ImageToBufferFunctor(OpKernelContext *context,
-                       const int wino_blk_size)
-      : ImageToBufferFunctorBase(context, wino_blk_size) {}
+struct BufferTransformFunctor<DeviceType::GPU, T> : BufferTransformFunctorBase {
+  BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size);
   MaceStatus operator()(const Tensor *input,
                         const BufferType type,
                         Tensor *output,
                        StatsFuture *future);
 
-  cl::Kernel kernel_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
 };
 
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_IMAGE_TO_BUFFER_H_
+#endif  // MACE_KERNELS_BUFFER_TRANSFORM_H_
@@ -71,20 +71,24 @@ struct ChannelShuffleFunctor : OpKernel {
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLChannelShuffleKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel);
+};
+
 template<typename T>
 struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
-  ChannelShuffleFunctor(OpKernelContext *context, const int groups)
-      : OpKernel(context), groups_(groups) {}
+  ChannelShuffleFunctor(OpKernelContext *context, const int groups);
 
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  const int groups_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLChannelShuffleKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,24 +24,13 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
 
-struct ConcatFunctorBase : OpKernel {
-  ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
-      : OpKernel(context), axis_(axis) {}
-
-  int32_t axis_;
-};
-
 template <DeviceType D, typename T>
-struct ConcatFunctor : ConcatFunctorBase {
+struct ConcatFunctor : OpKernel {
   ConcatFunctor(OpKernelContext *context, const int32_t axis)
-      : ConcatFunctorBase(context, axis) {}
+      : OpKernel(context), axis_(axis) {}
 
   MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                         Tensor *output,
@@ -98,21 +87,29 @@ struct ConcatFunctor : OpKernel {
     return MACE_SUCCESS;
   }
 
+  int32_t axis_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLConcatKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_list,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel);
+};
+
 template <typename T>
-struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
-  ConcatFunctor(OpKernelContext *context, const int32_t axis)
-      : ConcatFunctorBase(context, axis) {}
+struct ConcatFunctor<DeviceType::GPU, T> : OpKernel {
+  ConcatFunctor(OpKernelContext *context, const int32_t axis);
 
   MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLConcatKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -35,10 +35,6 @@
 #include "mace/kernels/quantize.h"
 #include "mace/utils/utils.h"
 
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {
@@ -78,8 +74,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                 const int *dilations,
                 const ActivationType activation,
                 const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
+                const bool is_filter_transformed)
       : Conv2dFunctorBase(context,
                           strides,
                           padding_type,
@@ -88,8 +83,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                           activation,
                           relux_max_limit),
         transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
-        is_filter_transformed_(is_filter_transformed),
-        scratch_(scratch) {}
+        is_filter_transformed_(is_filter_transformed) {}
 
   void Conv2dGeneral(const float *input,
                      const float *filter,
@@ -494,14 +488,15 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     }
 
     // Init scratch buffer
-    scratch_->Rewind();
-    scratch_->GrowSize(total_scratch_size);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
 
     Tensor
-        transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
+        transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
     Tensor
-        transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
-    Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
-    Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+        transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT);
+    Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);
+    Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT);
     const index_t extra_input_shape[4] =
         {batch, input_channels, extra_input_height, extra_input_width};
     const index_t extra_output_shape[4] =
@@ -559,7 +554,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                            transformed_output_data,
                            pad_output,
                            &sgemm_,
-                           scratch_);
+                           scratch);
      };
    } else if (use_neon_3x3_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
@@ -588,7 +583,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                    channels,
                    pad_output,
                    &sgemm_,
-                    scratch_);
+                    scratch);
      };
    } else if (use_neon_5x5_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
@@ -735,7 +730,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
 
   Tensor transformed_filter_;
   bool is_filter_transformed_;
-  ScratchBuffer *scratch_;
   SGemm sgemm_;
 };
@@ -748,16 +742,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
                 const int *dilations,
                 const ActivationType activation,
                 const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
+                const bool is_filter_transformed)
      : Conv2dFunctorBase(context,
                          strides,
                          padding_type,
                          paddings,
                          dilations,
                          activation,
-                          relux_max_limit),
-        scratch_(scratch) {
+                          relux_max_limit) {
    MACE_UNUSED(is_filter_transformed);
  }
@@ -926,13 +918,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    bool im2col_required =
        filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1;
    total_scratch_size += (im2col_required ? im2col_size : 0);
-    scratch_->Rewind();
-    scratch_->GrowSize(total_scratch_size);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
 
    std::unique_ptr<Tensor> zero_bias;
    const int32_t *bias_data = nullptr;
    if (bias == nullptr) {
-      zero_bias.reset(new Tensor(scratch_->Scratch(zero_bias_size), DT_INT32));
+      zero_bias.reset(new Tensor(scratch->Scratch(zero_bias_size), DT_INT32));
      zero_bias->Reshape({channels});
      zero_bias->Clear();
      bias_data = zero_bias->data<int32_t>();
@@ -944,7 +937,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    auto gemm_input_data = input_data;
    if (im2col_required) {
      // prepare im2col
-      im2col.reset(new Tensor(scratch_->Scratch(im2col_size), DT_UINT8));
+      im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8));
      uint8_t *im2col_data = im2col->mutable_data<uint8_t>();
      Im2col(input_data, input->shape(), filter_h, filter_w, stride_h,
             stride_w, static_cast<uint8_t>(input->zero_point()),
@@ -976,12 +969,28 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    return MACE_SUCCESS;
  }
 
-  ScratchBuffer *scratch_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
-template<typename T>
+class OpenCLConv2dKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel);
+};
+
+template <typename T>
 struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
   Conv2dFunctor(OpKernelContext *context,
                 const int *strides,
@@ -990,18 +999,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                 const int *dilations,
                 const ActivationType activation,
                 const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
-      : Conv2dFunctorBase(context,
-                          strides,
-                          padding_type,
-                          paddings,
-                          dilations,
-                          activation,
-                          relux_max_limit) {
-    MACE_UNUSED(is_filter_transformed);
-    MACE_UNUSED(scratch);
-  }
+                const bool is_filter_transformed);
 
   MaceStatus operator()(const Tensor *input,
                         const Tensor *filter,
@@ -1009,10 +1007,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                         Tensor *output,
                         StatsFuture *future);
 
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLConv2dKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
...@@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape, ...@@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape,
} }
} }
void CalcNCHWInputShape(const index_t *output_shape,
const index_t *filter_shape,
const int *strides,
const int *dilations,
index_t *input_shape) {
MACE_CHECK_NOTNULL(input_shape);
input_shape[0] = output_shape[0];
input_shape[1] = filter_shape[1];
input_shape[2] = (output_shape[2] - 1) * strides[0] +
(filter_shape[2] - 1) * dilations[0] + 1;
input_shape[3] = (output_shape[3] - 1) * strides[1] +
(filter_shape[3] - 1) * dilations[1] + 1;
}
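CalcNCHWInputShape inverts the standard convolution output-size relation: each spatial extent is recovered as (out - 1) * stride + (filter - 1) * dilation + 1, and the channel count comes from the filter's input-channel dimension. A quick worked check with hypothetical values:

// Hypothetical shapes: output NCHW {1, 32, 8, 8}, filter OIHW {32, 16, 3, 3}.
index_t output_shape[4] = {1, 32, 8, 8};
index_t filter_shape[4] = {32, 16, 3, 3};
int strides[2] = {2, 2};
int dilations[2] = {1, 1};
index_t input_shape[4];
CalcNCHWInputShape(output_shape, filter_shape, strides, dilations, input_shape);
// input_shape == {1, 16, 17, 17}:
//   N = 1, C = filter_shape[1] = 16,
//   H = W = (8 - 1) * 2 + (3 - 1) * 1 + 1 = 17.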
void CalcOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, // NHWC
const index_t *filter_shape, // OIHW const index_t *filter_shape, // OIHW
const int *padding_size, const int *padding_size,
...@@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW ...@@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW
void CalPaddingSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides, const int *strides,
const int *dilations,
Padding padding, Padding padding,
int *padding_size) { int *padding_size) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
......
...@@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape, ...@@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape,
const RoundType round_type, const RoundType round_type,
index_t *output_shape); index_t *output_shape);
void CalcNCHWInputShape(const index_t *output_shape,
const index_t *filter_shape,
const int *strides,
const int *dilations,
index_t *input_shape);
void CalPaddingSize(const index_t *input_shape, // NCHW void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW const index_t *filter_shape, // OIHW
const int *dilations, const int *dilations,
...@@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW ...@@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
Padding padding, Padding padding,
int *padding_size); int *padding_size);
MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input, MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input,
const int pad_top, const int pad_bottom, const int pad_top, const int pad_bottom,
const int pad_left, const int pad_right, const int pad_left, const int pad_right,
......
...@@ -24,32 +24,18 @@ ...@@ -24,32 +24,18 @@
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct CropFunctorBase : OpKernel { template <DeviceType D, typename T>
CropFunctorBase(OpKernelContext *context, struct CropFunctor : OpKernel {
CropFunctor(OpKernelContext *context,
const int axis, const int axis,
const std::vector<int> &offset) const std::vector<int> &offset)
: OpKernel(context), : OpKernel(context),
axis_(axis), axis_(axis),
offset_(offset) {} offset_(offset) {}
const int axis_;
std::vector<int> offset_;
};
template <DeviceType D, typename T>
struct CropFunctor : CropFunctorBase {
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
...@@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase { ...@@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase {
return MACE_SUCCESS; return MACE_SUCCESS;
} }
const int axis_;
std::vector<int> offset_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLCropKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel);
};
template <typename T> template <typename T>
struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase { struct CropFunctor<DeviceType::GPU, T> : OpKernel {
CropFunctor(OpKernelContext *context, CropFunctor(OpKernelContext *context,
const int axis, const int axis,
const std::vector<int> &offset) const std::vector<int> &offset);
: CropFunctorBase(context, axis, offset) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLCropKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -28,10 +28,6 @@ ...@@ -28,10 +28,6 @@
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLDeconv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel);
};
template <typename T> template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(OpKernelContext *context, Deconv2dFunctor(OpKernelContext *context,
...@@ -325,14 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { ...@@ -325,14 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit);
: Deconv2dFunctorBase(context,
strides,
padding_type,
paddings,
output_shape,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -341,10 +346,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { ...@@ -341,10 +346,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel { ...@@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLDepthToSpaceKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel);
};
template<typename T> template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel { struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
DepthToSpaceOpFunctor(OpKernelContext *context, DepthToSpaceOpFunctor(OpKernelContext *context,
const int block_size) const int block_size);
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
const int block_size_; std::unique_ptr<OpenCLDepthToSpaceKernel> kernel_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> ...@@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLDepthwiseConv2dKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel);
};
template<typename T> template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T> struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase { : DepthwiseConv2dFunctorBase {
...@@ -510,14 +528,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T> ...@@ -510,14 +528,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit);
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
...@@ -525,10 +536,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T> ...@@ -525,10 +536,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLDepthwiseConv2dKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -27,10 +27,6 @@ ...@@ -27,10 +27,6 @@
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#include "mace/utils/quantize.h" #include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -805,11 +801,12 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -805,11 +801,12 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
struct EltwiseFunctorBase : OpKernel { template <DeviceType D, typename T>
EltwiseFunctorBase(OpKernelContext *context, struct EltwiseFunctor : OpKernel {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type, const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: OpKernel(context), : OpKernel(context),
...@@ -819,28 +816,6 @@ struct EltwiseFunctorBase : OpKernel { ...@@ -819,28 +816,6 @@ struct EltwiseFunctorBase : OpKernel {
scalar_input_index_(scalar_input_index), scalar_input_index_(scalar_input_index),
data_format_(data_format) {} data_format_(data_format) {}
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
};
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
template <typename DstType> template <typename DstType>
MaceStatus DoEltwise(const Tensor *input0, MaceStatus DoEltwise(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
...@@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase {
} }
} }
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_; Tensor scalar_tensor_;
}; };
template <> template <>
struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::CPU, uint8_t> : OpKernel {
EltwiseFunctor(OpKernelContext *context, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type, const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: EltwiseFunctorBase(context, : OpKernel(context),
type, type_(type),
coeff, coeff_(coeff),
scalar_input, scalar_input_(scalar_input),
scalar_input_index, scalar_input_index_(scalar_input_index),
data_format) {} data_format_(data_format) {}
MaceStatus operator()(const Tensor *input0, MaceStatus operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
...@@ -1093,33 +1073,41 @@ struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase { ...@@ -1093,33 +1073,41 @@ struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {
return MACE_SUCCESS; return MACE_SUCCESS;
} }
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLEltwiseKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel);
};
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::GPU, T> : OpKernel {
EltwiseFunctor(OpKernelContext *context, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type, const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input,
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format);
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
data_format) {}
MaceStatus operator()(const Tensor *input0, MaceStatus operator()(const Tensor *input0,
const Tensor *input1, const Tensor *input1,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLEltwiseKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -151,12 +151,24 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { ...@@ -151,12 +151,24 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLFullyConnectedKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel);
};
template <typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(OpKernelContext *context, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit);
: FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
...@@ -164,11 +176,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase { ...@@ -164,11 +176,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLFullyConnectedKernel> kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -35,11 +35,23 @@ namespace kernels { ...@@ -35,11 +35,23 @@ namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct LSTMCellFunctor; struct LSTMCellFunctor;
class OpenCLLSTMCellKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel);
};
template <typename T> template <typename T>
struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel { struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel {
LSTMCellFunctor(OpKernelContext *context, T forget_bias) LSTMCellFunctor(OpKernelContext *context, T forget_bias);
: OpKernel(context),
forget_bias_(static_cast<T>(forget_bias)) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *pre_output, const Tensor *pre_output,
const Tensor *weight, const Tensor *weight,
...@@ -49,11 +61,7 @@ struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{ ...@@ -49,11 +61,7 @@ struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
T forget_bias_; std::unique_ptr<OpenCLLSTMCellKernel> kernel_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -34,10 +34,6 @@ ...@@ -34,10 +34,6 @@
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h" #include "mace/kernels/sgemm.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel { ...@@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel {
const index_t height_b = B->dim(rank - 2); const index_t height_b = B->dim(rank - 2);
const index_t width_b = B->dim(rank - 1); const index_t width_b = B->dim(rank - 1);
auto scratch_buffer = context_->workspace()->GetScratchBuffer(D); auto scratch_buffer = context_->device()->scratch_buffer();
scratch_buffer->Rewind(); scratch_buffer->Rewind();
index_t scratch_size = C->raw_max_size(); index_t scratch_size = C->raw_max_size();
if (!A->is_weight()) { if (!A->is_weight()) {
...@@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel { ...@@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel {
A->is_weight(), A->is_weight(),
B->is_weight(), B->is_weight(),
c_ptr_base, c_ptr_base,
scratch_buffer); context_->device()->scratch_buffer());
return MACE_SUCCESS; return MACE_SUCCESS;
} }
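The first MatMul hunk moves scratch allocation from the workspace to the device-owned buffer introduced by this change. Based only on the calls visible here, the usage pattern is: Rewind() resets the slab's offset at the start of an op, and each subsequent Scratch(size) hands back a non-overlapping slice that a Tensor can wrap. A hedged sketch (tmp and tmp_size are caller-supplied placeholders, and the slab is assumed to have been grown to fit beforehand):

ScratchBuffer *scratch = context_->device()->scratch_buffer();
scratch->Rewind();  // restart slice allocation for this invocation
// Wrap a slice in a Tensor, as the quantized conv code above does;
// successive Scratch() calls advance an internal offset, so slices
// stay disjoint until the next Rewind().
std::unique_ptr<Tensor> tmp(
    new Tensor(scratch->Scratch(tmp_size), DT_UINT8));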
...@@ -218,9 +214,21 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel { ...@@ -218,9 +214,21 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLMatMulKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel);
};
template <typename T> template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> : OpKernel { struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} explicit MatMulFunctor(OpKernelContext *context);
MaceStatus operator()(const Tensor *A, MaceStatus operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
...@@ -229,9 +237,7 @@ struct MatMulFunctor<DeviceType::GPU, T> : OpKernel { ...@@ -229,9 +237,7 @@ struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
bool transpose_b, bool transpose_b,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLMatMulKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -13,96 +13,31 @@ ...@@ -13,96 +13,31 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/image/activation.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
ActivationFunctor<DeviceType::GPU, T>::ActivationFunctor(
OpKernelContext *context,
ActivationType type,
T relux_max_limit) : OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = input->dim(0); return kernel_->Compute(context_, input, alpha, output, future);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct ActivationFunctor<DeviceType::GPU, float>; template struct ActivationFunctor<DeviceType::GPU, float>;
......
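The deleted activation body also documents how the 3-D global work size is derived on the image path: channels are packed four per image pixel, so the first gws dimension is RoundUpDiv4(channels). A worked example with hypothetical dimensions:

// Hypothetical NHWC input: {batch = 1, height = 32, width = 32, channels = 19}.
const index_t channel_blocks = RoundUpDiv4(19);  // ceil(19 / 4) = 5
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),  // 5
                         32,                                     // width
                         32 * 1};                                // height * batch
// One work-item per 4-channel pixel: 5 * 32 * 32 = 5120 items in total.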
...@@ -13,97 +13,32 @@ ...@@ -13,97 +13,32 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/addn.h" #include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/image/addn.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
AddNFunctor<DeviceType::GPU, T>::AddNFunctor(OpKernelContext *context)
: OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(
new opencl::image::AddNKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor *> &input_tensors, const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
size_t size = input_tensors.size(); return kernel_->Compute(context_, input_tensors, output_tensor, future);
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context_->device()->opencl_runtime();
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_2D_GWS_ARGS(kernel_);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct AddNFunctor<DeviceType::GPU, float>; template struct AddNFunctor<DeviceType::GPU, float>;
template struct AddNFunctor<DeviceType::GPU, half>; template struct AddNFunctor<DeviceType::GPU, half>;
} // namespace kernels } // namespace kernels
......
...@@ -13,14 +13,26 @@ ...@@ -13,14 +13,26 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/batch_norm.h" #include "mace/kernels/batch_norm.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/image/batch_norm.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
BatchNormFunctor<DeviceType::GPU, T>::BatchNormFunctor(
OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: OpKernel(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchNormKernel<T>(
folded_constant, activation, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, const Tensor *input,
...@@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
const float epsilon, const float epsilon,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); return kernel_->Compute(context_, input, scale, offset, mean,
var, epsilon, output, future);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct BatchNormFunctor<DeviceType::GPU, float>; template struct BatchNormFunctor<DeviceType::GPU, float>;
......
...@@ -16,84 +16,31 @@ ...@@ -16,84 +16,31 @@
#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ #define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h" #include "mace/kernels/batch_to_space.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/image/batch_to_space.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
BatchToSpaceFunctor<DeviceType::GPU, T>::BatchToSpaceFunctor(
OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape)
: BatchToSpaceFunctorBase(context, paddings, block_shape) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()( MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()(
Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) {
std::vector<index_t> output_shape(4, 0); std::vector<index_t> output_shape(4, 0);
CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
output_shape.data()); output_shape.data());
return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_,
std::vector<size_t> output_image_shape; output_shape, space_tensor, future);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape_[0]);
kernel_.setArg(idx++, block_shape_[1]);
kernel_.setArg(idx++, paddings_[0]);
kernel_.setArg(idx++, paddings_[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
} }
template struct BatchToSpaceFunctor<DeviceType::GPU, float>; template struct BatchToSpaceFunctor<DeviceType::GPU, float>;
......
...@@ -13,13 +13,23 @@ ...@@ -13,13 +13,23 @@
// limitations under the License. // limitations under the License.
#include "mace/kernels/bias_add.h" #include "mace/kernels/bias_add.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/image/bias_add.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <typename T>
BiasAddFunctor<DeviceType::GPU, T>::BiasAddFunctor(
OpKernelContext *context,
const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
}
template <typename T> template <typename T>
MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC,
"gpu only support biasadd for 4-dimensional NHWC format tensor"); "gpu only support biasadd for 4-dimensional NHWC format tensor");
return kernel_->Compute(context_, input, bias, output, future);
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG;
SET_3D_GWS_ARGS(kernel_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
OUT_OF_RANGE_VALIDATION(kernel_error_);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
} }
template struct BiasAddFunctor<DeviceType::GPU, float>; template struct BiasAddFunctor<DeviceType::GPU, float>;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
template <typename T>
class BufferInverseTransform: public OpenCLBufferInverseTransformKernel {
public:
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
};
template <typename T>
MaceStatus BufferInverseTransform<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(type);
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
if (input->dtype() != output->dtype()) {
return BufferTypeTransform(context, &kernel_, input, dt, output, future);
} else {
SetFutureDefaultWaitFn(future);
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/buffer_transform.h"
#include <vector>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus TransformConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t out_chan = input->dim(0);
const index_t in_chan = input->dim(1);
const index_t filter_height = input->dim(2);
const index_t filter_width = input->dim(3);
std::vector<index_t> transformed_shape = {
filter_height, filter_width,
RoundUpDiv4(out_chan),
RoundUp<index_t>(in_chan, 4),
4,
};
uint32_t gws[3];
gws[0] = static_cast<uint32_t>(transformed_shape[3]);
gws[1] = static_cast<uint32_t>(transformed_shape[2]);
gws[2] = static_cast<uint32_t>(filter_height * filter_width);
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_NON_UNIFORM_WG_CONFIG;
MACE_OUT_OF_RANGE_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(out_chan));
kernel->setArg(idx++, static_cast<int32_t>(in_chan));
kernel->setArg(idx++, static_cast<int32_t>(filter_height));
kernel->setArg(idx++, static_cast<int32_t>(filter_width));
kernel->setArg(idx++, static_cast<int32_t>(
in_chan * filter_height * filter_width));
std::string tuning_key =
Concat("transform_conv_filter",
transformed_shape[0],
transformed_shape[1],
transformed_shape[2],
transformed_shape[3]);
std::vector<uint32_t> lws = {4, 4, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
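TransformConv2DFilter repacks an OIHW filter into a five-dimensional buffer layout whose innermost axis groups four output channels, with the input-channel axis padded to a multiple of four. A worked example with hypothetical dimensions:

// Hypothetical OIHW filter: out_chan = 32, in_chan = 19, h = w = 3.
// transformed_shape = {h, w, RoundUpDiv4(out_chan), RoundUp(in_chan, 4), 4}
//                   = {3, 3, 8, 20, 4}
// That is 3 * 3 * 8 * 20 * 4 = 5760 elements versus 32 * 19 * 3 * 3 = 5472
// originally; the difference is zero padding along the input-channel axis.
// The transform kernel is then launched over
//   gws = {transformed_shape[3], transformed_shape[2], h * w} = {20, 8, 9}.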
MaceStatus TransformDWConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t multiplier = input->dim(0);
const index_t in_chan = input->dim(1);
const index_t filter_height = input->dim(2);
const index_t filter_width = input->dim(3);
std::vector<index_t> transformed_shape = {
multiplier, RoundUpDiv4(in_chan),
filter_height, filter_width, 4,
};
uint32_t gws[3];
gws[0] = static_cast<uint32_t>(filter_width);
gws[1] = static_cast<uint32_t>(filter_height);
gws[2] = static_cast<uint32_t>(transformed_shape[1]);
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(in_chan));
kernel->setArg(idx++, static_cast<int32_t>(filter_height * filter_width));
// Key the depthwise transform separately so tuned work-group sizes are
// not shared with the dense conv-filter transform above.
std::string tuning_key =
    Concat("transform_dw_conv_filter",
transformed_shape[0],
transformed_shape[1],
transformed_shape[2],
transformed_shape[3]);
std::vector<uint32_t> lws = {4, 4, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
MaceStatus TransformArgument(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t size = input->dim(0);
std::vector<index_t> transformed_shape = {RoundUp<index_t>(size, 4)};
uint32_t gws = static_cast<uint32_t>(RoundUpDiv4(transformed_shape[0]));
MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
output->Reshape(input->shape());
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
built_options.emplace("-Dtransform_arg=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
kernel->setArg(idx++, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(size));
const uint32_t lws =
static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws),
cl::NDRange(lws), nullptr, &event);
} else {
uint32_t roundup_gws = RoundUp(gws, lws);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(roundup_gws),
cl::NDRange(lws), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
#include <vector>
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformDWConv2DFilter(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
MaceStatus TransformArgument(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future);
template <typename T>
class BufferTransform: public OpenCLBufferTransformKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BufferTransform<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(type);
MACE_UNUSED(wino_blk_size);
const DataType dt = DataTypeToEnum<T>::value;
switch (type) {
case CONV2D_FILTER:
return TransformConv2DFilter(context, &kernel_, input,
dt, output, future);
case DW_CONV2D_FILTER:
return TransformDWConv2DFilter(context, &kernel_, input,
dt, output, future);
case ARGUMENT:
return TransformArgument(context, &kernel_, input, dt, output, future);
default:
if (input->dtype() != dt) {
return BufferTypeTransform(context, &kernel_, input,
dt, output, future);
} else {
SetFutureDefaultWaitFn(future);
output->ReuseTensorBuffer(*input);
return MaceStatus::MACE_SUCCESS;
}
}
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
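BufferTransform<T>::Compute dispatches on BufferType: conv and depthwise filters and 1-D arguments get dedicated packing kernels, while everything else is either a plain data-type conversion or a zero-copy buffer reuse. A hedged usage sketch; the Tensor constructor form and the caller-provided context, filter, and future are assumptions:

// Sketch: packing a conv filter for the buffer path.
BufferTransform<half> transform;
Tensor packed(context->device()->allocator(), DT_HALF);  // assumed ctor form
MACE_RETURN_IF_ERROR(transform.Compute(context, filter, CONV2D_FILTER,
                                       /*wino_blk_size=*/0, &packed, future));
// For a BufferType without a dedicated packer, Compute falls back to
// BufferTypeTransform when dtypes differ; otherwise the output simply
// reuses the input's buffer and no copy is enqueued.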
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus BufferTypeTransform(
OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const DataType dt,
Tensor *output,
StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION
const uint32_t gws =
static_cast<uint32_t>(RoundUpDiv4(output->size()));
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
built_options.emplace("-Dtransform_data_type=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
kernel->setArg(idx++, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
"buffer offset not aligned");
kernel->setArg(idx++,
static_cast<uint32_t>(input->buffer_offset() /
GetEnumTypeSize(input->dtype())));
kernel->setArg(idx++, *(output->opencl_buffer()));
const uint32_t lws =
static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws),
cl::NDRange(lws), nullptr, &event);
} else {
uint32_t roundup_gws = RoundUp(gws, lws);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(roundup_gws),
cl::NDRange(lws), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
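The enqueue branch above is the standard workaround for devices without non-uniform work-group support: pad the global size up to a multiple of the local size and let the kernel's bound check (the out-of-range machinery) idle the overshoot. A worked example with hypothetical sizes:

uint32_t gws = 100;       // e.g. RoundUpDiv4 of a 400-element output
const uint32_t lws = 64;  // e.g. RoundUpDiv4 of a 256 max work-group size
if (!runtime->IsNonUniformWorkgroupsSupported()) {
  gws = RoundUp(gws, lws);  // 128: two full groups; items 100..127 do no work
}
// enqueueNDRangeKernel(..., cl::NDRange(gws), cl::NDRange(lws), ...)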
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
#define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
#include "mace/kernels/conv_2d.h"
#include <functional>
#include <memory>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
extern MaceStatus Conv2d1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
extern MaceStatus Conv2dGeneral(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
} // namespace conv2d
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
Conv2dKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
StatsFuture pad_future, conv_future;
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
bool use_1x1 = filter_h == 1 && filter_w == 1;
std::vector<index_t> padded_output_shape = output_shape;
  index_t tile_w;
  const index_t tile_c = 4;
  if (use_1x1) {
    tile_w = 2;
  } else {
    tile_w = 4;
  }
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
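  // Width is padded to a multiple of tile_w so each work-item can write a
  // full tile, and channels to a multiple of tile_c so the kernels can use
  // 4-wide vector loads/stores.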
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
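    // MACE_EXTRA_BUFFER_PAD_SIZE leaves guard bytes so vectorized reads near
    // the end of the buffer stay in range.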
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
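  // Worked example (illustrative): a 3x3 stride-1 convolution of a 1x32x32x3
  // input with SAME padding yields paddings = {2, 2}; with tile_w = 4 and
  // tile_c = 4 the padded input allocated above is 1x34x34x4.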
if (use_1x1) {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
MergeMultipleFutureWaitFn({pad_future, conv_future}, future);
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
MaceStatus Conv2d1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1_buffer",
kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {static_cast<uint32_t>(
RoundUpDiv4(channel) *
RoundUpDiv<index_t>(width, 2)),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
kernel->setArg(idx++,
static_cast<int32_t>(filter->buffer_shape()[3]));
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channel));
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::string tuning_key =
Concat("conv2d_1x1_buffer", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
std::vector<uint32_t> lws = {16, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace conv2d
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace conv2d {
MaceStatus Conv2dGeneral(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
const index_t in_channel = padded_input->dim(3);
const index_t filter_height = filter->dim(2);
const index_t filter_width = filter->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
built_options.emplace("-Dconv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_buffer",
kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {static_cast<uint32_t>(
RoundUpDiv4(channel) * RoundUpDiv4(width)),
static_cast<uint32_t>(height * batch)};
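  // 2-D dispatch: dim0 covers channel-blocks (channel / 4) times width-tiles
  // (width / 4); dim1 covers batch * height.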
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
auto filter_buffer_shape = filter->buffer_shape();
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
    MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(filter_height));
kernel->setArg(idx++, static_cast<int32_t>(filter_width));
kernel->setArg(idx++,
static_cast<int32_t>(filter_buffer_shape[3]));
kernel->setArg(idx++, static_cast<int32_t>(
filter_buffer_shape[2] * filter_buffer_shape[3]
* filter_buffer_shape[4]));
kernel->setArg(idx++, static_cast<int32_t>(height));
kernel->setArg(idx++, static_cast<int32_t>(width));
kernel->setArg(idx++, static_cast<int32_t>(channel));
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, static_cast<int32_t>(
dilations[0] * in_width * in_channel));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::string tuning_key =
Concat("conv2d_general_buffer", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter_height, filter_width);
std::vector<uint32_t> lws = {16, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace conv2d
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const index_t in_height = padded_input->dim(1);
const index_t in_width = padded_input->dim(2);
const index_t in_channel = padded_input->dim(3);
const index_t filter_height = filter->dim(2);
const index_t filter_width = filter->dim(3);
auto runtime = context->device()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("depthwise_conv2d_buffer", kernel_name,
built_options, kernel));
}
const uint32_t gws[2] = {
static_cast<uint32_t>(RoundUpDiv4(channel) * RoundUpDiv4(width)),
static_cast<uint32_t>(height * batch)
};
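  // Same 2-D dispatch as the general conv kernel: channel-blocks * width-tiles
  // by batch * height.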
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
kernel->setArg(idx++, *(filter->opencl_buffer()));
if (bias != nullptr) {
kernel->setArg(idx++, *(bias->opencl_buffer()));
}
kernel->setArg(idx++, static_cast<uint32_t>(in_height));
kernel->setArg(idx++, static_cast<uint32_t>(in_width));
kernel->setArg(idx++, static_cast<uint32_t>(in_channel));
kernel->setArg(idx++, static_cast<uint32_t>(filter_height));
kernel->setArg(idx++, static_cast<uint32_t>(filter_width));
kernel->setArg(idx++, static_cast<uint32_t>(filter_height * filter_width));
kernel->setArg(idx++, static_cast<uint32_t>(height));
kernel->setArg(idx++, static_cast<uint32_t>(width));
kernel->setArg(idx++, static_cast<uint32_t>(channel));
kernel->setArg(idx++, static_cast<uint32_t>(strides[0]));
kernel->setArg(idx++, static_cast<uint32_t>(strides[1]));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[0] * in_width * in_channel));
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
std::vector<uint32_t> lws = {16, 4, 0};
std::string tuning_key =
Concat("depthwise_conv2d_buffer_kernel", in_height, in_width, in_channel,
filter_height, filter_width, channel);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace depthwise
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
#include "mace/kernels/depthwise_conv2d.h"
#include <functional>
#include <memory>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *padded_input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int *strides,
const int *dilations,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const bool input_changed,
Tensor *output,
StatsFuture *future);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
DepthwiseConv2dKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
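  // fake_filter_shape[0] = multiplier * in_channels, which equals the
  // depthwise output channel count.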
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// calculate padded input shape
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
int pad_top = paddings[0] >> 1;
int pad_left = paddings[1] >> 1;
MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
MACE_CHECK(filter->dim(0) * input_channels == channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
std::vector<index_t> padded_output_shape = output_shape;
index_t tile_w = 4, tile_c = 4;
padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[1] = input_height + paddings[0];
padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
(filter_w - 1) * dilations[1] + 1;
padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[1] != input_height ||
padded_input_shape[2] != input_width ||
padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, pad_top, pad_left,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
MACE_RETURN_IF_ERROR(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future);
return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
#define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
#include "mace/kernels/pooling.h"
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/buffer/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
PoolingKernel() : old_scratch_size_(0) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) override;
private:
index_t old_scratch_size_;
cl::Kernel kernels_[2];
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) {
  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
      << "The pooling OpenCL kernel does not support dilation yet";
StatsFuture pad_future, pooling_future;
index_t input_channels = input->dim(3);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
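  // Reuse the conv shape inference by treating the pooling window as a filter
  // with out_channels == in_channels.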
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::CEIL,
output_shape.data());
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
// Mark whether input changed or not
bool input_changed = !IsVecEqual(input_shape_, input->shape());
input_shape_ = input->shape();
auto runtime = context->device()->opencl_runtime();
  // calculate padded input shape
std::vector<index_t> padded_input_shape = input->shape();
padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
const Tensor *padded_input_ptr = input;
// pad input
std::unique_ptr<Tensor> padded_input;
if (padded_input_shape[3] != input_channels) {
index_t total_scratch_size = 0;
index_t padded_input_size = 0;
padded_input_size =
std::accumulate(padded_input_shape.begin(),
padded_input_shape.end(),
1,
std::multiplies<index_t>())
* GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
total_scratch_size += padded_input_size;
// Init scratch buffer
ScratchBuffer *scratch = context->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(total_scratch_size);
    if (old_scratch_size_ != scratch->size()) {
      input_changed = true;
      old_scratch_size_ = scratch->size();
    }
padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
input->dtype()));
padded_input->Resize(padded_input_shape);
PadInput(context, &kernels_[0], input, 0, 0,
input_changed, padded_input.get(), &pad_future);
padded_input_ptr = padded_input.get();
}
cl::Kernel *kernel = &kernels_[1];
  MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
} else {
built_options.emplace("-DIN_DATA_TYPE=" +
DtToCLDt(input->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
}
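    // MAX pooling with matching input/output types can compute directly in
    // that type; other cases compute in an up-compatible type (presumably
    // float for half) to preserve precision.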
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
kernel_name,
built_options,
kernel));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
kernel->setArg(idx++, paddings[0] / 2);
kernel->setArg(idx++, paddings[1] / 2);
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, kernels[0]);
kernel->setArg(idx++, kernels[1]);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
const std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, &pooling_future));
  MACE_OUT_OF_RANGE_VALIDATION;
MergeMultipleFutureWaitFn({pad_future, pooling_future}, future);
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
#define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
#include "mace/kernels/softmax.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
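  // remain_channels counts the padding lanes in the last channel block so the
  // kernel can exclude them from the reduction.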
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
  MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_buffer()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_buffer()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = {4, 4, 4, 0};
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/buffer/utils.h"
#include <set>
#include <string>
#include <vector>
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus PadInput(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const int pad_top,
const int pad_left,
const bool input_changed,
Tensor *padded_input,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t in_channel = input->dim(3);
const index_t padded_height = padded_input->dim(1);
const index_t padded_width = padded_input->dim(2);
const index_t padded_channel = padded_input->dim(3);
const uint32_t gws[2] = {
static_cast<uint32_t>(padded_width * RoundUpDiv4(padded_channel)),
static_cast<uint32_t>(padded_height * batch)
};
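  // dim0 walks padded width * padded channel-blocks; dim1 walks padded height
  // * batch, matching the index decomposition in the pad_input kernel below.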
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
    MACE_NON_UNIFORM_WG_CONFIG;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad_input");
built_options.emplace("-Dpad_input=" + kernel_name);
built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input->dtype()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_transform",
kernel_name,
built_options,
kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (input_changed) {
uint32_t idx = 0;
MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, padded_input->size());
    MACE_SET_2D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_buffer()));
kernel->setArg(idx++, static_cast<int32_t>(in_height));
kernel->setArg(idx++, static_cast<int32_t>(in_width));
kernel->setArg(idx++, static_cast<int32_t>(in_channel));
kernel->setArg(idx++, static_cast<int32_t>(padded_height));
kernel->setArg(idx++, static_cast<int32_t>(padded_width));
kernel->setArg(idx++, static_cast<int32_t>(padded_channel));
kernel->setArg(idx++, pad_top);
kernel->setArg(idx++, pad_left);
kernel->setArg(idx++, *(padded_input->opencl_buffer()));
}
std::string tuning_key =
Concat("pad_input", batch, in_height, in_width, in_channel,
padded_height, padded_width, padded_channel);
std::vector<uint32_t> lws = {8, 4, 0};
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
  MACE_OUT_OF_RANGE_VALIDATION;
  return MaceStatus::MACE_SUCCESS;
}
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
#define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
#include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace buffer {
MaceStatus PadInput(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const int pad_top,
const int pad_left,
const bool input_changed,
Tensor *padded_input,
StatsFuture *future);
} // namespace buffer
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/image/image_to_buffer.h"
#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
namespace mace {
namespace kernels {
template<typename T>
BufferInverseTransformFunctor<
DeviceType::GPU, T>::BufferInverseTransformFunctor(
OpKernelContext *context,
const int wino_blk_size)
: BufferInverseTransformFunctorBase(context, wino_blk_size) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ImageToBuffer<T>);
} else {
kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
}
}
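// UseImageMemory() decides once, at construction, whether the image or the
// buffer implementation backs this functor; callers never branch on the
// memory type themselves.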
template <typename T>
MaceStatus BufferInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, type,
wino_blk_size_, output, future);
}
template struct BufferInverseTransformFunctor<DeviceType::GPU, float>;
template struct BufferInverseTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/buffer_transform.h"
#include "mace/kernels/opencl/image/buffer_to_image.h"
#include "mace/kernels/opencl/buffer/buffer_transform.h"
namespace mace {
namespace kernels {
template<typename T>
BufferTransformFunctor<DeviceType::GPU, T>::BufferTransformFunctor(
OpKernelContext *context,
const int wino_blk_size)
: BufferTransformFunctorBase(context, wino_blk_size) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BufferToImage<T>);
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
template <typename T>
MaceStatus BufferTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, type,
wino_blk_size_, output, future);
}
template struct BufferTransformFunctor<DeviceType::GPU, float>;
template struct BufferTransformFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
...@@ -13,73 +13,26 @@
// limitations under the License.

#include "mace/kernels/channel_shuffle.h"
#include "mace/kernels/opencl/image/channel_shuffle.h"

namespace mace {
namespace kernels {

template <typename T>
ChannelShuffleFunctor<DeviceType::GPU, T>::ChannelShuffleFunctor(
    OpKernelContext *context,
    const int groups) : OpKernel(context) {
  if (context->device()->opencl_runtime()->UseImageMemory()) {
    kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
  } else {
    MACE_NOT_IMPLEMENTED;
  }
}

template <typename T>
MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input, Tensor *output, StatsFuture *future) {
  return kernel_->Compute(context_, input, output, future);
}

template struct ChannelShuffleFunctor<DeviceType::GPU, float>;
...
#include <common.h>

__kernel void activation(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
#ifdef USE_PRELU
...
#include <common.h>

__kernel void addn(OUT_OF_RANGE_PARAMS
                   GLOBAL_WORK_GROUP_SIZE_DIM2
                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,
...
#include <common.h>

// Supported data types: half/float
__kernel void batch_norm(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __read_only image2d_t scale,
...
#include <common.h>

__kernel void batch_to_space(OUT_OF_RANGE_PARAMS
                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,
...
#include <common.h>

// Supported data types: half/float
__kernel void bias_add(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __read_only image2d_t bias,
...
#include <common.h>

__kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* OIHW */
                                     __private const int input_offset,
...@@ -52,7 +52,7 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* OIHW */
                                     __private const int out_channel,
...@@ -102,7 +102,7 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
}

// TODO(liuqi): Support multiplier > 1
__kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
                                        GLOBAL_WORK_GROUP_SIZE_DIM2
                                        __global const DATA_TYPE *input, /* MIHW */
                                        __private const int input_offset,
...@@ -154,7 +154,7 @@ __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
...@@ -196,7 +196,7 @@ __kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
...@@ -236,7 +236,7 @@ __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
  }
}

__kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global const DATA_TYPE *input,
                                  __private const int input_offset,
...@@ -272,7 +272,7 @@ __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global DATA_TYPE *output,
                                  __private const int count,
...@@ -306,7 +306,7 @@ __kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
}

__kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
...@@ -349,7 +349,7 @@ __kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
...@@ -387,7 +387,7 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
  output[offset] = values.w;
}

__kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
...@@ -430,7 +430,7 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, // OIHW
                                            __private const int input_offset,
...@@ -475,7 +475,7 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //OIHW
                                            __private const int out_channels,
...@@ -517,7 +517,7 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
}

__kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, // OIHW
                                           __private const int input_offset,
...@@ -565,7 +565,7 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
}

__kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global DATA_TYPE *output, // OIHW
                                           __private const int in_channels,
...@@ -609,7 +609,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
...@@ -714,7 +714,7 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
...@@ -757,7 +757,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
...@@ -891,7 +891,7 @@ PROCESS(7);
#undef PROCESS
}

__kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
...@@ -933,7 +933,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
}

// only support 3x3 now
__kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
...@@ -1040,7 +1040,7 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
#undef PROCESS
}

__kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
...
#include <common.h>
__kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *input,
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int padded_height,
__private const int padded_width,
__private const int padded_chan,
__private const int pad_top,
__private const int pad_left,
__global DATA_TYPE *output) {
const int padded_wc_blk_idx = get_global_id(0);
const int padded_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (padded_wc_blk_idx >= global_size_dim0 ||
padded_hb_idx >= global_size_dim1) {
return;
}
#endif
const int padded_chan_blk = (padded_chan + 3) >> 2;
const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
const int batch_idx = padded_hb_idx / padded_height;
const int padded_height_idx = padded_hb_idx % padded_height;
const int padded_chan_idx = padded_chan_blk_idx << 2;
const int in_height_idx = padded_height_idx - pad_top;
const int in_width_idx = padded_width_idx - pad_left;
const int padded_offset = mad24(mad24(mad24(batch_idx, padded_height, padded_height_idx),
padded_width, padded_width_idx), padded_chan, padded_chan_idx);
const int in_offset = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan, padded_chan_idx);
DATA_TYPE4 value = 0;
if (0 <= in_height_idx && in_height_idx < in_height &&
0 <= in_width_idx && in_width_idx < in_width) {
const int remain_chan = in_chan - padded_chan_idx;
if (remain_chan < 4) {
switch (remain_chan) {
case 3:
value.z = CONVERT(input[in_offset + 2]);
case 2:
value.y = CONVERT(input[in_offset + 1]);
case 1:
value.x = CONVERT(input[in_offset]);
}
} else {
value = CONVERT4(vload4(0, input + in_offset));
}
}
vstore4(value, 0, output + padded_offset);
CHECK_OUT_OF_RANGE_FOR_BUFFER(padded_offset + 3);
}
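// Layout note: pad_input reads tightly packed NHWC input and writes NHWC
// output whose channel count is rounded up to a multiple of 4, with zero
// padding on the spatial borders, so downstream kernels can vload4 along
// channels without bounds checks. Illustrative numbers (not from the diff):
// with in_chan = 6 and padded_chan_idx = 4, remain_chan = 2, so the switch
// enters case 2 and falls through to case 1, filling only value.x/value.y;
// the other two lanes stay zero in the padded output.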
// OIHW -> [H, W, (O+3) / 4, I, 4]
__kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input, // OIHW
__private const int input_offset,
__global DATA_TYPE *output,
__private const int out_chan,
__private const int in_chan,
__private const int height,
__private const int width,
__private const int inner_size) {
const int in_chan_idx = get_global_id(0);
const int out_chan_blk_idx = get_global_id(1);
const int hw_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (in_chan_idx >= global_size_dim0 ||
out_chan_blk_idx >= global_size_dim1 ||
hw_idx >= global_size_dim2) {
return;
}
#endif
const int t_in_chan = global_size_dim0;
const int out_chan_blk = global_size_dim1;
const int h_idx = hw_idx / width;
const int w_idx = hw_idx % width;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
height, h_idx), width, w_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(h_idx, width, w_idx),
out_chan_blk, out_chan_blk_idx), t_in_chan, in_chan_idx) << 2);
DATA_TYPE4 value = 0;
if (in_chan_idx < in_chan) {
if (out_chan_idx + 3 < out_chan) {
value.x = CONVERT(input[in_offset]);
value.y = CONVERT(input[in_offset + inner_size]);
value.z = CONVERT(input[in_offset + 2 * inner_size]);
value.w = CONVERT(input[in_offset + 3 * inner_size]);
} else {
const int diff = out_chan - out_chan_idx;
switch(diff) {
case 3:
value.z = CONVERT(input[in_offset + 2 * inner_size]);
case 2:
value.y = CONVERT(input[in_offset + inner_size]);
case 1:
value.x = CONVERT(input[in_offset]);
}
}
}
VSTORE4(value, output, out_offset);
}
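// Offset check for the OIHW -> [H, W, (O+3)/4, I, 4] packing above, with
// illustrative numbers (assuming the launch uses t_in_chan == in_chan):
// for a 3x3 filter, out_chan = 10, in_chan = 8 and (h_idx, w_idx,
// out_chan_blk_idx, in_chan_idx) = (1, 2, 2, 5), out_offset =
// (((1*3 + 2)*3 + 2)*8 + 5) * 4 = 564, and diff = 10 - 8 = 2 makes the
// switch fill only the first two output-channel lanes.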
// MIHW -> [M, (I+3) / 4, H, W, 4]
__kernel void transform_dw_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input, // MIHW
__private const int input_offset,
__global DATA_TYPE *output,
__private const int in_chan,
__private const int in_hw) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int in_chan_blk_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 ||
height_idx >= global_size_dim1 ||
in_chan_blk_idx >= global_size_dim2) {
return;
}
#endif
const int width = global_size_dim0;
const int height = global_size_dim1;
const int in_chan_idx = in_chan_blk_idx << 2;
const int in_offset = mad24(in_chan_idx, in_hw,
mad24(height_idx, width, width_idx)) + input_offset;
const int out_offset = mad24(in_chan_blk_idx, in_hw,
mad24(height_idx, width, width_idx)) << 2;
DATA_TYPE4 value = 0;
if (in_chan_idx + 3 < in_chan) {
value.x = CONVERT(input[in_offset]);
value.y = CONVERT(input[in_offset + in_hw]);
value.z = CONVERT(input[in_offset + (in_hw << 1)]);
value.w = CONVERT(input[in_offset + in_hw + (in_hw << 1)]);
} else {
const int diff = in_chan - in_chan_idx;
switch(diff) {
case 3:
value.z = CONVERT(input[in_offset + (in_hw << 1)]);
case 2:
value.y = CONVERT(input[in_offset + in_hw]);
case 1:
value.x = CONVERT(input[in_offset]);
}
}
VSTORE4(value, output, out_offset);
}
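// In other words: depthwise filter element (c, h, w) of MIHW (multiplier
// M = 1 here) moves to channel block c/4, spatial position (h, w), lane
// c%4, which is exactly the in_offset/out_offset arithmetic above with
// in_hw = height * width.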
__kernel void transform_arg(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__global IN_DATA_TYPE *input,
__private const int input_offset,
__global DATA_TYPE *output,
__private int size) {
const int blk_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (blk_idx >= global_size_dim0) {
return;
}
#endif
const int idx = blk_idx << 2;
const int diff = size - idx;
const int in_idx = idx + input_offset;
DATA_TYPE4 value = 0;
if (diff < 4) {
switch (diff) {
case 3:
value.z = CONVERT(input[in_idx + 2]);
case 2:
value.y = CONVERT(input[in_idx + 1]);
case 1:
value.x = CONVERT(input[in_idx]);
}
} else {
value = CONVERT4(vload4(0, input + in_idx));
}
VSTORE4(value, output, idx);
}
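// Each work-item converts one group of four argument values; the switch on
// diff = size - idx handles a trailing, partially filled group, leaving the
// unused lanes zero so VSTORE4 can still write a full vector.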
__kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__global IN_DATA_TYPE *input,
__private const int input_offset,
__global DATA_TYPE *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
vstore4(input_value, out_idx, output);
}
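// vload4/vstore4 take a group index, so work-item i moves elements
// [4*i, 4*i + 3]. Unlike transform_arg there is no tail switch here, which
// presumably relies on the buffers being allocated in whole 4-element
// groups.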
 #include <common.h>

 // assume channels_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(KERNEL_ERROR_PARAMS
+__kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
                               __private const int groups,
......
@@ -24,19 +24,13 @@
 #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
 #define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
+#define OUT_DATA_TYPE4 VEC_DATA_TYPE(OUT_DATA_TYPE, 4)

-#ifdef OUT_OF_RANGE_CHECK
-#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
-  check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error);
-#else
-#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
-#endif
-
-#define READ_IMAGET(image, sampler, coord) \
-  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
-
-#define WRITE_IMAGET(image, coord, value) \
-  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
+#define CONVERT_STR(value, type) convert_##type((value))
+#define CONVERT_TO(value, type) CONVERT_STR(value, type)
+#define CONVERT(value) CONVERT_TO(value, DATA_TYPE)
+#define CONVERT4(value) CONVERT_TO(value, DATA_TYPE4)

 #define GLOBAL_WORK_GROUP_SIZE_DIM2 \
     __private const int global_size_dim0, \
@@ -47,16 +41,37 @@
     __private const int global_size_dim1, \
     __private const int global_size_dim2,

+// oorc for 'Out Of Range Check'
 #ifdef OUT_OF_RANGE_CHECK
-#define KERNEL_ERROR_PARAMS \
-    __global char *kernel_error,
+#define OUT_OF_RANGE_PARAMS \
+    __global int *oorc_flag,
+#define BUFFER_OUT_OF_RANGE_PARAMS \
+    __global int *oorc_flag, \
+    __private const int oorc_output_length,
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
+  check_out_of_range_for_image2d(image, (coord).x, (coord).y, oorc_flag);
+#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx) \
+  check_out_of_range_for_buffer(oorc_output_length, (idx), oorc_flag);
 #else
-#define KERNEL_ERROR_PARAMS
+#define OUT_OF_RANGE_PARAMS
+#define BUFFER_OUT_OF_RANGE_PARAMS
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
+#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx)
 #endif

+#define READ_IMAGET(image, sampler, coord) \
+  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
+
+#define WRITE_IMAGET(image, coord, value) \
+  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
+
+#define VSTORE4(data, output, offset) \
+  CHECK_OUT_OF_RANGE_FOR_BUFFER((offset) + 3) \
+  vstore4(data, 0, output + (offset));
+
 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
@@ -66,6 +81,7 @@ inline float4 do_sigmoid(float4 in) {
   return native_recip(1.0f + native_exp(-in));
 }

+#ifdef DATA_TYPE
 inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #ifdef USE_PRELU
                                 DATA_TYPE4 prelu_alpha,
@@ -89,17 +105,25 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #endif
   return out;
 }
+#endif

 inline void check_out_of_range_for_image2d(__write_only image2d_t image,
                                            __private const int x,
                                            __private const int y,
-                                           global char *kernel_error) {
-#ifdef OUT_OF_RANGE_CHECK
+                                           __global int *oorc_flag) {
   int2 image_dim = get_image_dim(image);
   if (x >= image_dim.x || y >= image_dim.y) {
-    *kernel_error = 1;
+    *oorc_flag = 1;
   }
-#endif
 }

+inline void check_out_of_range_for_buffer(__private const int length,
+                                          __private const int idx,
+                                          __global int *oorc_flag) {
+  if (idx >= length) {
+    *oorc_flag = idx - length + 1;
+  }
+}
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
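// With OUT_OF_RANGE_CHECK defined, the parameter macros splice the check
// arguments into kernel signatures, e.g.
//   __kernel void foo(OUT_OF_RANGE_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 ...)
// expands to
//   __kernel void foo(__global int *oorc_flag,
//                     __private const int global_size_dim0,
//                     __private const int global_size_dim1, ...).
// BUFFER_OUT_OF_RANGE_PARAMS additionally passes oorc_output_length so that
// check_out_of_range_for_buffer can record how far a write would run past
// the end: the flag stores idx - length + 1, the overflow amount, rather
// than just 1 as in the image2d case.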
@@ -22,7 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }

 // Supported data type: half/float
-__kernel void concat_channel(KERNEL_ERROR_PARAMS
+__kernel void concat_channel(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input0,
                              __read_only image2d_t input1,
@@ -84,7 +84,7 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS
 }

 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
+__kernel void concat_channel_multi(OUT_OF_RANGE_PARAMS
                                    GLOBAL_WORK_GROUP_SIZE_DIM3
                                    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
......
 #include <common.h>

-__kernel void conv_2d(KERNEL_ERROR_PARAMS
+__kernel void conv_2d(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
......
 #include <common.h>

-__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
+__kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
......
#include <common.h>
__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_in_chan,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 1;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
#endif
int in_offset = mul24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan);
int filter_offset = mul24(out_chan_blk_idx, filter_in_chan) << 2;
DATA_TYPE4 in0, in1;
DATA_TYPE4 w0, w1, w2, w3;
for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) {
w0 = CONVERT4(vload4(0, filter + filter_offset));
w1 = CONVERT4(vload4(0, filter + filter_offset + 4));
w2 = CONVERT4(vload4(0, filter + filter_offset + 8));
w3 = CONVERT4(vload4(0, filter + filter_offset + 12));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
out0 = mad((DATA_TYPE4)(in0.x), w0, out0);
out0 = mad((DATA_TYPE4)(in0.y), w1, out0);
out0 = mad((DATA_TYPE4)(in0.z), w2, out0);
out0 = mad((DATA_TYPE4)(in0.w), w3, out0);
out1 = mad((DATA_TYPE4)(in1.x), w0, out1);
out1 = mad((DATA_TYPE4)(in1.y), w1, out1);
out1 = mad((DATA_TYPE4)(in1.z), w2, out1);
out1 = mad((DATA_TYPE4)(in1.w), w3, out1);
filter_offset += 16;
in_offset += 4;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
#undef WRITE_OUTPUT
}
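// Tiling scheme for this 1x1 (pointwise) kernel: each work-item produces a
// 2 (width) x 4 (output channel) register tile. Each inner-loop iteration
// loads 4 input channels for both pixels (in0, in1) and a 4x4 filter slice
// (w0..w3, packed by transform_conv_filter, which for a 1x1 filter reduces
// to [(O+3)/4, I, 4]), then issues eight 4-lane mads before advancing
// filter_offset by 16 and in_offset by 4.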
 #include <common.h>

-__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
+__kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
......
#include <common.h>
__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_height,
__private const int filter_width,
__private const int filter_in_chan,
__private const int filter_chan_size,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const int dilated_h_offset,
__private const int dilated_w_offset,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 2;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
const int in_offset_base = mul24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan);
int filter_offset_base = mul24(out_chan_blk_idx, filter_in_chan) << 2;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 w0, w1, w2, w3;
for (int filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
int in_height_offset = mad24(filter_h_idx, dilated_h_offset, in_offset_base);
for (int filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
int filter_offset = filter_offset_base;
int in_offset = mad24(filter_w_idx, dilated_w_offset, in_height_offset);
for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) {
w0 = CONVERT4(vload4(0, filter + filter_offset));
w1 = CONVERT4(vload4(0, filter + filter_offset + 4));
w2 = CONVERT4(vload4(0, filter + filter_offset + 8));
w3 = CONVERT4(vload4(0, filter + filter_offset + 12));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
in2 = CONVERT4(vload4(0, padded_input + in_offset + (strided_chan << 1)));
in3 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan + (strided_chan << 1)));
out0 = mad((DATA_TYPE4)(in0.x), w0, out0);
out0 = mad((DATA_TYPE4)(in0.y), w1, out0);
out0 = mad((DATA_TYPE4)(in0.z), w2, out0);
out0 = mad((DATA_TYPE4)(in0.w), w3, out0);
out1 = mad((DATA_TYPE4)(in1.x), w0, out1);
out1 = mad((DATA_TYPE4)(in1.y), w1, out1);
out1 = mad((DATA_TYPE4)(in1.z), w2, out1);
out1 = mad((DATA_TYPE4)(in1.w), w3, out1);
out2 = mad((DATA_TYPE4)(in2.x), w0, out2);
out2 = mad((DATA_TYPE4)(in2.y), w1, out2);
out2 = mad((DATA_TYPE4)(in2.z), w2, out2);
out2 = mad((DATA_TYPE4)(in2.w), w3, out2);
out3 = mad((DATA_TYPE4)(in3.x), w0, out3);
out3 = mad((DATA_TYPE4)(in3.y), w1, out3);
out3 = mad((DATA_TYPE4)(in3.z), w2, out3);
out3 = mad((DATA_TYPE4)(in3.w), w3, out3);
filter_offset += 16;
in_offset += 4;
}
filter_offset_base += filter_chan_size;
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
if (out_width_idx + 2 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(2);
if (out_width_idx + 3 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(3);
#undef WRITE_OUTPUT
}
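// Same scheme as the 1x1 kernel, widened to a 4 (width) x 4 (output
// channel) tile and generalized over the filter window. dilated_h_offset
// and dilated_w_offset are host-precomputed element strides, presumably
// dilation_h * in_width * in_chan and dilation_w * in_chan, so the spatial
// loops need no multiplies; filter_chan_size presumably equals the number
// of filter elements per spatial tap, advancing filter_offset_base one tap
// at a time through the [H, W, (O+3)/4, I, 4] layout.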
 #include <common.h>

-__kernel void crop(KERNEL_ERROR_PARAMS
+__kernel void crop(OUT_OF_RANGE_PARAMS
                    GLOBAL_WORK_GROUP_SIZE_DIM3
                    __read_only image2d_t input,
                    __private const int offset_b,
......
 #include <common.h>

-__kernel void deconv_2d(KERNEL_ERROR_PARAMS
+__kernel void deconv_2d(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __read_only image2d_t weights,
......
 #include <common.h>

-__kernel void depth_to_space(KERNEL_ERROR_PARAMS
+__kernel void depth_to_space(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
......
 #include <common.h>

 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
+__kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS
                                GLOBAL_WORK_GROUP_SIZE_DIM3
                                __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
@@ -136,7 +136,7 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }

-__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
+__kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS
                                   GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
......
#include <common.h>
#define BLOCK_SIZE 4
__kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global IN_DATA_TYPE *padded_input,
__global IN_DATA_TYPE *filter,
#ifdef BIAS
__global IN_DATA_TYPE *bias,
#endif
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int filter_height,
__private const int filter_width,
__private const int filter_hw,
__private const int out_height,
__private const int out_width,
__private const int out_chan,
__private const int stride_h,
__private const int stride_w,
__private const int dilated_h_offset,
__private const int dilated_w_offset,
__private const float relux_max_limit,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_wc_blk_idx >= global_size_dim0 ||
out_hb_idx >= global_size_dim1) {
return;
}
#endif
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int out_width_idx = out_width_blk_idx << 2;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_chan_idx = out_chan_idx;
const int in_height_idx = mul24(out_height_idx, stride_h);
const int in_width_idx = mul24(out_width_idx, stride_w);
const int strided_chan = mul24(in_chan, stride_w);
#ifdef BIAS
DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
#else
DATA_TYPE4 out0 = 0;
DATA_TYPE4 out1 = 0;
DATA_TYPE4 out2 = 0;
DATA_TYPE4 out3 = 0;
#endif
const int in_offset_base = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
in_width, in_width_idx), in_chan, in_chan_idx);
int filter_offset = mul24(out_chan_blk_idx, filter_hw) << 2;
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 w;
for (int filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
int in_offset = mad24(filter_h_idx, dilated_h_offset, in_offset_base);
for (int filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
w = CONVERT4(vload4(0, filter + filter_offset));
in0 = CONVERT4(vload4(0, padded_input + in_offset));
in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
in2 = CONVERT4(vload4(0, padded_input + in_offset + (strided_chan << 1)));
in3 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan + (strided_chan << 1)));
out0 = mad(in0, w, out0);
out1 = mad(in1, w, out1);
out2 = mad(in2, w, out2);
out3 = mad(in3, w, out3);
filter_offset += 4;
in_offset += dilated_w_offset;
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, out_chan_idx);
#define WRITE_OUTPUT(i) \
if (out_chan_idx + 4 > out_chan) { \
const int diff = out_chan - out_chan_idx; \
switch(diff) { \
case 3: \
output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE); \
case 2: \
output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE); \
case 1: \
output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE); \
} \
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
} else { \
VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset); \
}
WRITE_OUTPUT(0);
if (out_width_idx + 1 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(1);
if (out_width_idx + 2 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(2);
if (out_width_idx + 3 >= out_width) return;
out_offset += out_chan;
WRITE_OUTPUT(3);
#undef WRITE_OUTPUT
}
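// Depthwise variant of the tile above: output channels coincide with input
// channels (multiplier 1), so each tap needs a single 4-lane weight vector
// and one element-wise mad per output pixel instead of a 4x4 outer product;
// the filter was packed per channel block by transform_dw_conv_filter,
// hence filter_offset advances by 4 per tap.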
 #include <common.h>

-__kernel void eltwise(KERNEL_ERROR_PARAMS
+__kernel void eltwise(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input0,
 #if INPUT_TYPE == 1
......
 #include <common.h>

 // output = weight * input + bias
-__kernel void fully_connected(KERNEL_ERROR_PARAMS
+__kernel void fully_connected(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM2
                               __read_only image2d_t input,
                               __read_only image2d_t weight,
@@ -64,7 +64,7 @@ __kernel void fully_connected(KERNEL_ERROR_PARAMS
 }

 // output = weight * input + bias
-__kernel void fully_connected_width(KERNEL_ERROR_PARAMS
+__kernel void fully_connected_width(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM3
                                     __read_only image2d_t input,
                                     __read_only image2d_t weight,
......
 #include <common.h>

-__kernel void lstmcell(KERNEL_ERROR_PARAMS
+__kernel void lstmcell(OUT_OF_RANGE_PARAMS
                        GLOBAL_WORK_GROUP_SIZE_DIM2
                        __read_only image2d_t input,
                        __read_only image2d_t pre_output,
......
 #include <common.h>

 // C = A * B
-__kernel void matmul(KERNEL_ERROR_PARAMS
+__kernel void matmul(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM2
                      __read_only image2d_t A,
                      __read_only image2d_t B,
......
 #include <common.h>

-__kernel void pad(KERNEL_ERROR_PARAMS
+__kernel void pad(OUT_OF_RANGE_PARAMS
                   GLOBAL_WORK_GROUP_SIZE_DIM3
                   __read_only image2d_t input,
                   __write_only image2d_t output,
......
@@ -16,7 +16,7 @@ inline int calculate_avg_block_size(const int pool_size_h,
 }

 // Supported data type: half/float
-__kernel void pooling(KERNEL_ERROR_PARAMS
+__kernel void pooling(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int in_height,
......
#include <common.h>
#define MIN_VALUE -FLT_MAX
inline int calculate_avg_block_size(const int pool_size_h,
const int pool_size_w,
const int pos_h,
const int pos_w,
const int h_size,
const int w_size) {
const int h_start = max(0, pos_h);
const int w_start = max(0, pos_w);
const int h_end = min(pos_h + pool_size_h, h_size);
const int w_end = min(pos_w + pool_size_w, w_size);
return mul24((h_end - h_start), (w_end - w_start));
}
// Supported data type: half/float
__kernel void pooling(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input,
__private const int in_height,
__private const int in_width,
__private const int in_chan,
__private const int out_height,
__private const int out_chan,
__private const int pad_top,
__private const int pad_left,
__private const int stride_h,
__private const int stride_w,
__private const int kernel_h,
__private const int kernel_w,
__global OUT_DATA_TYPE *output) {
const int out_chan_blk_idx = get_global_id(0);
const int out_width_idx = get_global_id(1);
const int out_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_chan_blk_idx >= global_size_dim0 ||
out_width_idx >= global_size_dim1 ||
out_hb_idx >= global_size_dim2) {
return;
}
#endif
const int out_width = global_size_dim1;
const int in_wc_size = mul24(in_width, in_chan);
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx % out_height;
const int chan_idx = out_chan_blk_idx << 2;
const int in_height_start = mul24(out_height_idx, stride_h) - pad_top;
const int in_width_start = mul24(out_width_idx, stride_w) - pad_left;
int in_offset_base = mad24(mad24(mad24(batch_idx, in_height, in_height_start),
in_width, in_width_start), in_chan, chan_idx);
#ifdef POOL_AVG
DATA_TYPE4 res = 0;
for (int height = 0; height < kernel_h; ++height) {
int in_height_idx = in_height_start + height;
if (0 <= in_height_idx && in_height_idx < in_height) {
int in_offset = mad24(height, in_wc_size, in_offset_base);
for (int width = 0; width < kernel_w; ++width) {
int in_width_idx = in_width_start + width;
if (0 <= in_width_idx && in_width_idx < in_width) {
DATA_TYPE4 in = CONVERT4(vload4(0, input + in_offset));
res = res + in;
}
in_offset += in_chan;
}
}
}
const int block_size = calculate_avg_block_size(kernel_h,
kernel_w,
in_height_start,
in_width_start,
in_height,
in_width);
res /= block_size;
#else
DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE);
for (int height = 0; height < kernel_h; ++height) {
int in_height_idx = in_height_start + height;
if (0 <= in_height_idx && in_height_idx < in_height) {
int in_offset = mad24(height, in_wc_size, in_offset_base);
for (int width = 0; width < kernel_w; ++width) {
int in_width_idx = in_width_start + width;
if (0 <= in_width_idx && in_width_idx < in_width) {
DATA_TYPE4 in = CONVERT4(vload4(0, input + in_offset));
res = fmax(res, in);
}
in_offset += in_chan;
}
}
}
#endif
const int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
out_width, out_width_idx), out_chan, chan_idx);
int remain_chan = out_chan - chan_idx;
if (remain_chan < 4) {
switch(remain_chan) {
case 3:
output[out_offset + 2] = res.z;
case 2:
output[out_offset + 1] = res.y;
case 1:
output[out_offset] = res.x;
}
CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + remain_chan - 1);
} else {
VSTORE4(CONVERT_TO(res, OUT_DATA_TYPE4), output, out_offset);
}
}
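// For average pooling the divisor comes from calculate_avg_block_size,
// which clips the window to the input borders, so padded zeros are excluded
// from the mean: e.g. a 3x3 window with pos_h = -1, pos_w = 0 on a large
// input covers only 2 * 3 = 6 valid elements, and res is divided by 6, not
// 9.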
 #include <common.h>

-__kernel void reduce_mean(KERNEL_ERROR_PARAMS
+__kernel void reduce_mean(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input,
                           __local DATA_TYPE4 *group_sum,
......
@@ -10,7 +10,7 @@ inline float coeff_odd(float i) {
   return ((-0.75f * x + 3.75f) * x - 6.0f) * x + 3.0f;
 }

-__kernel void resize_bicubic_nocache(KERNEL_ERROR_PARAMS
+__kernel void resize_bicubic_nocache(OUT_OF_RANGE_PARAMS
                                      GLOBAL_WORK_GROUP_SIZE_DIM3
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
......
 #include <common.h>

-__kernel void resize_bilinear_nocache(KERNEL_ERROR_PARAMS
+__kernel void resize_bilinear_nocache(OUT_OF_RANGE_PARAMS
                                       GLOBAL_WORK_GROUP_SIZE_DIM3
                                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
......
 #include <common.h>

-__kernel void softmax(KERNEL_ERROR_PARAMS
+__kernel void softmax(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int channels,
......
#include <common.h>
__kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global IN_DATA_TYPE *input,
__private const int height,
__private const int channels,
__private const int remain_channels,
__global OUT_DATA_TYPE *output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
#endif
const int chan_blks = global_size_dim0 - 1;
const int width = global_size_dim1;
const int batch_idx = hb_idx / height;
const int height_idx = hb_idx % height;
const int chan_idx = chan_blk_idx << 2;
const int offset_base = mul24(mad24(mad24(batch_idx, height, height_idx),
width, width_idx), channels);
int in_offset = offset_base;
DATA_TYPE max_value = -FLT_MAX;
DATA_TYPE sum = 0;
DATA_TYPE4 data;
for (short i = 0; i < chan_blks; ++i) {
data = CONVERT4(vload4(0, input + in_offset));
max_value = max(max_value, data.x);
max_value = max(max_value, data.y);
max_value = max(max_value, data.z);
max_value = max(max_value, data.w);
in_offset += 4;
}
switch(remain_channels) {
case 0:
max_value = max(max_value, CONVERT(input[in_offset + 3]));
case 1:
max_value = max(max_value, CONVERT(input[in_offset + 2]));
case 2:
max_value = max(max_value, CONVERT(input[in_offset + 1]));
case 3:
max_value = max(max_value, CONVERT(input[in_offset]));
}
in_offset = offset_base;
for (short i = 0; i < chan_blks; ++i) {
data = CONVERT4(vload4(0, input + in_offset));
data = native_exp(data - max_value);
sum += data.x;
sum += data.y;
sum += data.z;
sum += data.w;
in_offset += 4;
}
switch(remain_channels) {
case 0:
sum += native_exp(CONVERT(input[in_offset + 3]) - max_value);
case 1:
sum += native_exp(CONVERT(input[in_offset + 2]) - max_value);
case 2:
sum += native_exp(CONVERT(input[in_offset + 1]) - max_value);
case 3:
sum += native_exp(CONVERT(input[in_offset]) - max_value);
}
int remain_chan = channels - chan_idx;
int offset = offset_base + chan_idx;
if (remain_chan < 4) {
switch(remain_chan) {
case 3:
output[offset + 2] = native_exp(CONVERT(input[offset + 2]) - max_value) / sum;
case 2:
output[offset + 1] = native_exp(CONVERT(input[offset + 1]) - max_value) / sum;
case 1:
output[offset] = native_exp(CONVERT(input[offset]) - max_value) / sum;
}
} else {
data = CONVERT4(vload4(0, input + offset));
data = native_exp(data - max_value) / sum;
VSTORE4(CONVERT_TO(data, OUT_DATA_TYPE4), output, offset);
}
}
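// remain_channels counts the unused lanes of the last 4-channel block,
// i.e. (4 - channels % 4) % 4: the loops cover chan_blks =
// global_size_dim0 - 1 full blocks, and the switch falls through so that
// remain_channels = 0 reads all four tail elements while remain_channels =
// 3 reads only input[in_offset]. Note that every work-item recomputes
// max_value and sum over the full channel vector; only the final
// normalization is parallelized across channel blocks.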
 #include <common.h>

-__kernel void space_to_batch(KERNEL_ERROR_PARAMS
+__kernel void space_to_batch(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
......
 #include <common.h>

-__kernel void space_to_depth(KERNEL_ERROR_PARAMS
+__kernel void space_to_depth(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
......
 #include <common.h>

-__kernel void split(KERNEL_ERROR_PARAMS
+__kernel void split(OUT_OF_RANGE_PARAMS
                     GLOBAL_WORK_GROUP_SIZE_DIM3
                     __read_only image2d_t input,
                     __private const int chan_blk_offset,
......
 #include <common.h>

-__kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
+__kernel void winograd_transform_2x2(OUT_OF_RANGE_PARAMS
                                      GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
@@ -118,7 +118,7 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS
   }
 }

-__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
+__kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __read_only image2d_t input,
 #ifdef BIAS
@@ -231,7 +231,7 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
 }

-__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_transform_4x4(OUT_OF_RANGE_PARAMS
                                      GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
@@ -390,7 +390,7 @@ __kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
   }
 }

-__kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __read_only image2d_t input,
 #ifdef BIAS
......
@@ -13,191 +13,21 @@
 // limitations under the License.

 #include "mace/kernels/concat.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/concat.h"

 namespace mace {
 namespace kernels {

-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t
-        cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-    const uint32_t lws_size = lws[0] * lws[1];
-    lws[2] =
-        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
-  }
-  return lws;
-}
-}  // namespace
-
-static MaceStatus Concat2(OpKernelContext *context,
-                          cl::Kernel *kernel,
-                          const Tensor *input0,
-                          const Tensor *input1,
-                          const DataType dt,
-                          std::vector<index_t> *prev_input_shape,
-                          Tensor *output,
-                          StatsFuture *future,
-                          uint32_t *kwg_size,
-                          std::unique_ptr<BufferBase> *kernel_error) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channel = output->dim(3);
-
-  const int channel_blk = RoundUpDiv4(channel);
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
-      static_cast<uint32_t>(batch * height),
-  };
-
-  auto runtime = context->device()->opencl_runtime();
-
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
-    built_options.emplace("-Dconcat_channel=" + kernel_name);
-    if (input0->dtype() == output->dtype()) {
-      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    } else {
-      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    }
-    if (input0->dim(3) % 4 == 0) {
-      built_options.emplace("-DDIVISIBLE_FOUR");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
-                                              built_options, kernel));
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  }
-
-  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, gws);
-    kernel->setArg(idx++,
-                   *(static_cast<const cl::Image2D *>(input0->opencl_image())));
-    kernel->setArg(idx++,
-                   *(static_cast<const cl::Image2D *>(input1->opencl_image())));
-    kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
-    kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(output->opencl_image())));
-
-    *prev_input_shape = input0->shape();
-  }
-
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
-  std::string tuning_key =
-      Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
-                                           gws, lws, future));
-
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  return MACE_SUCCESS;
-}
-
-static MaceStatus ConcatN(OpKernelContext *context,
-                          cl::Kernel *kernel,
-                          const std::vector<const Tensor *> &input_list,
-                          const DataType dt,
-                          Tensor *output,
-                          StatsFuture *future,
-                          uint32_t *kwg_size,
-                          std::unique_ptr<BufferBase> *kernel_error) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-
-  auto runtime = context->device()->opencl_runtime();
-
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
-    built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
-                                              built_options, kernel));
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  }
-
-  const int inputs_count = input_list.size();
-  index_t chan_blk_offset = 0;
-  cl::Event event;
-  CallStats call_stats{INT64_MAX, 0};
-  for (int i = 0; i < inputs_count; ++i) {
-    const Tensor *input = input_list[i];
-    index_t input_channel_blk = input->dim(3) / 4;
-    const uint32_t gws[3] = {
-        static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
-        static_cast<uint32_t>(batch * height),
-    };
-    const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
-
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
-    kernel->setArg(idx++, *(output->opencl_image()));
-
-    chan_blk_offset += input_channel_blk;
-    cl_int error;
-    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    } else {
-      std::vector<uint32_t> roundup_gws(lws.size());
-      for (size_t j = 0; j < 3; ++j) {
-        roundup_gws[j] = RoundUp(gws[j], lws[j]);
-      }
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          *kernel, cl::NullRange,
-          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    }
-    MACE_CL_RET_STATUS(error);
-    OUT_OF_RANGE_VALIDATION(*kernel_error);
-    if (future != nullptr && runtime->is_profiling_enabled()) {
-      event.wait();
-      CallStats tmp_stats;
-      runtime->GetCallStats(event, &tmp_stats);
-      call_stats.start_micros =
-          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
-      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
-    }
-  }
-  if (future != nullptr) {
-    future->wait_fn = [call_stats](CallStats *stats) {
-      if (stats != nullptr) {
-        stats->start_micros = call_stats.start_micros;
-        stats->end_micros = stats->start_micros + call_stats.end_micros;
-      }
-    };
-  }
-
-  return MACE_SUCCESS;
-}
+template <typename T>
+ConcatFunctor<DeviceType::GPU, T>::ConcatFunctor(
+    OpKernelContext *context,
+    const int32_t axis)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ConcatKernel<T>(axis));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}

 template <typename T>
@@ -205,52 +35,7 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &input_list,
     Tensor *output,
     StatsFuture *future) {
-  const int inputs_count = input_list.size();
-  MACE_CHECK(inputs_count >= 2 && axis_ == 3)
-      << "Concat opencl kernel only support >=2 elements with axis == 3";
-
-  const Tensor *input0 = input_list[0];
-  bool divisible_four = input0->dim(axis_) % 4 == 0;
-
-  std::vector<index_t> output_shape(input0->shape());
-  for (int i = 1; i < inputs_count; ++i) {
-    const Tensor *input = input_list[i];
-    MACE_CHECK(input->dim_size() == input0->dim_size(),
-               "Ranks of all input tensors must be same.");
-    divisible_four &= input->dim(axis_) % 4 == 0;
-    for (int j = 0; j < input->dim_size(); ++j) {
-      if (j == axis_) {
-        continue;
-      }
-      MACE_CHECK(input->dim(j) == input0->dim(j),
-                 "Dimensions of inputs should equal except axis.");
-    }
-    output_shape[axis_] += input->dim(axis_);
-  }
-  MACE_CHECK(
-      inputs_count == 2 || divisible_four,
-      "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-
-  switch (inputs_count) {
-    case 2:
-      return Concat2(context_,
-                     &kernel_, input_list[0], input_list[1],
-                     DataTypeToEnum<T>::value, &input_shape_, output, future,
-                     &kwg_size_, &kernel_error_);
-    default:
-      if (divisible_four) {
-        return ConcatN(context_,
-                       &kernel_, input_list, DataTypeToEnum<T>::value, output,
-                       future, &kwg_size_, &kernel_error_);
-      } else {
-        MACE_NOT_IMPLEMENTED;
-      }
-  }
-
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_list, output, future);
 }

 template struct ConcatFunctor<DeviceType::GPU, float>;
......
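The refactor visible in this file replaces the inline Concat2/ConcatN
dispatch with a kernel object selected once at construction time. A minimal
sketch of the pattern, assuming an interface named OpenCLConcatKernel with a
Compute method (names inferred from the call sites above, not taken from the
actual headers):

    // Hypothetical interface; the real declaration lives in
    // mace/kernels/opencl/image/concat.h and is not shown in this diff.
    template <typename T>
    class OpenCLConcatKernel {
     public:
      virtual ~OpenCLConcatKernel() = default;
      virtual MaceStatus Compute(OpKernelContext *context,
                                 const std::vector<const Tensor *> &inputs,
                                 Tensor *output,
                                 StatsFuture *future) = 0;
    };

With this shape, the functor's constructor picks an image (or, eventually,
buffer) implementation based on the runtime's memory type, and operator()
simply forwards; the conv_2d.cc and crop.cc diffs below apply the same
pattern.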
@@ -13,61 +13,37 @@
 // limitations under the License.

 #include "mace/kernels/conv_2d.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/kernels/opencl/image/conv_2d.h"
+#include "mace/kernels/opencl/buffer/conv_2d.h"

 namespace mace {
 namespace kernels {

-extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
-                                   cl::Kernel *kernel,
-                                   const Tensor *input,
-                                   const Tensor *filter,
-                                   const Tensor *bias,
-                                   const int stride,
-                                   const int *padding,
-                                   const int *dilations,
-                                   const ActivationType activation,
-                                   const float relux_max_limit,
-                                   const DataType dt,
-                                   std::vector<index_t> *prev_input_shape,
-                                   Tensor *output,
-                                   StatsFuture *future,
-                                   uint32_t *kwg_size,
-                                   std::unique_ptr<BufferBase> *kernel_error);
-
-extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
-                                   cl::Kernel *kernel,
-                                   const Tensor *input,
-                                   const Tensor *filter,
-                                   const Tensor *bias,
-                                   const int stride,
-                                   const int *padding,
-                                   const int *dilations,
-                                   const ActivationType activation,
-                                   const float relux_max_limit,
-                                   const DataType dt,
-                                   std::vector<index_t> *prev_input_shape,
-                                   Tensor *output,
-                                   StatsFuture *future,
-                                   uint32_t *kwg_size,
-                                   std::unique_ptr<BufferBase> *kernel_error);
-
-extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
-                               cl::Kernel *kernel,
-                               const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int stride,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const DataType dt,
-                               std::vector<index_t> *prev_input_shape,
-                               Tensor *output,
-                               StatsFuture *future,
-                               uint32_t *kwg_size,
-                               std::unique_ptr<BufferBase> *kernel_error);
+template<typename T>
+Conv2dFunctor<DeviceType::GPU, T>::Conv2dFunctor(
+    OpKernelContext *context,
+    const int *strides,
+    const Padding &padding_type,
+    const std::vector<int> &paddings,
+    const int *dilations,
+    const ActivationType activation,
+    const float relux_max_limit,
+    const bool is_filter_transformed)
+    : Conv2dFunctorBase(context,
+                        strides,
+                        padding_type,
+                        paddings,
+                        dilations,
+                        activation,
+                        relux_max_limit) {
+  MACE_UNUSED(is_filter_transformed);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::Conv2dKernel<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
+  }
+}

 template <typename T>
 MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
@@ -75,61 +51,11 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          const Tensor *bias,
                                                          Tensor *output,
                                                          StatsFuture *future) {
-  typedef MaceStatus (*Conv2dOpenclFunction)(
-      OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
-      const Tensor *filter, const Tensor *bias, const int stride,
-      const int *padding, const int *dilations,
-      const ActivationType activation,
-      const float relux_max_limit, const DataType dt,
-      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
-      uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
-  // Selection matrix: kernel_size x stride_size
-  static const Conv2dOpenclFunction selector[3] = {
-      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3};
-
-  index_t kernel_h = filter->dim(2);
-  index_t kernel_w = filter->dim(3);
-  if (strides_[0] != strides_[1] ||
-      (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
-    LOG(WARNING) << "OpenCL conv2d kernel with "
-                 << "filter" << kernel_h << "x" << kernel_w << ","
-                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << ",dilations " << dilations_[0] << "x" << dilations_[1]
-                 << " is not implemented yet.";
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  std::vector<index_t> output_shape(4);
-  std::vector<int> paddings(2);
-  if (paddings_.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
-        input->shape().data(), filter->shape().data(), dilations_, strides_,
-        padding_type_, output_shape.data(), paddings.data());
-  } else {
-    paddings = paddings_;
-    CalcOutputSize(input->shape().data(), filter->shape().data(),
-                   paddings_.data(), dilations_, strides_, RoundType::FLOOR,
-                   output_shape.data());
-  }
-
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-
-  if (kernel_h == kernel_w && kernel_h <= 3 &&
-      selector[kernel_h - 1] != nullptr) {
-    auto conv2d_func = selector[kernel_h - 1];
-    return conv2d_func(context_,
-        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
-        output, future, &kwg_size_, &kernel_error_);
-  } else {
-    return Conv2dOpencl(context_,
-        &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-        activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
-        output, future, &kwg_size_, &kernel_error_);
-  }
+  // Compute
+  return kernel_->Compute(context_, input, filter, bias,
+                          strides_, padding_type_, paddings_,
+                          dilations_, activation_, relux_max_limit_,
+                          output, future);
 }

 template struct Conv2dFunctor<DeviceType::GPU, float>;
...
@@ -13,170 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/crop.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/crop.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t
-        cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-    const uint32_t lws_size = lws[0] * lws[1];
-    lws[2] =
-        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+CropFunctor<DeviceType::GPU, T>::CropFunctor(OpKernelContext *context,
+                                             const int axis,
+                                             const std::vector<int> &offset)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::CropKernel<T>(axis, offset));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &input_list,
     Tensor *output,
     StatsFuture *future) {
-  MACE_UNUSED(future);
-  const int32_t inputs_count = static_cast<int32_t>(input_list.size());
-  MACE_CHECK(inputs_count >= 2)
-      << "Crop opencl kernel only support 2 elements input";
-  const Tensor *input0 = input_list[0];
-  const Tensor *input1 = input_list[1];
-  const uint32_t in0_dims = static_cast<uint32_t >(input0->dim_size());
-  const uint32_t in1_dims = static_cast<uint32_t >(input0->dim_size());
-  MACE_CHECK(in0_dims == 4 && in1_dims == 4,
-             "Crop op only supports 4-dims inputs now.");
-  std::vector<int32_t> offsets(4, 0);
-  std::vector<index_t> output_shape(input0->shape());
-  switch (axis_) {
-    case 0:
-      if (offset_.size() == 1) {
-        offsets[0] = offset_[0];
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[0];
-        offsets[3] = offset_[0];
-      } else if (offset_.size() == 4) {
-        offsets[0] = offset_[0];
-        offsets[1] = offset_[2];
-        offsets[2] = offset_[3];
-        offsets[3] = offset_[1];
-      }
-      for (int i = 0; i < 4; ++i) {
-        output_shape[i] = input1->dim(i);
-      }
-      break;
-    case 1:
-      if (offset_.size() == 1) {
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[0];
-        offsets[3] = offset_[0];
-      } else if (offset_.size() == 3) {
-        offsets[1] = offset_[1];
-        offsets[2] = offset_[2];
-        offsets[3] = offset_[0];
-      }
-      for (int i = 1; i < 4; ++i) {
-        output_shape[i] = input1->dim(i);
-      }
-      break;
-    case 2:
-      if (offset_.size() == 1) {
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[0];
-      } else if (offset_.size() == 2) {
-        offsets[1] = offset_[0];
-        offsets[2] = offset_[1];
-      }
-      output_shape[1] = input1->dim(1);
-      output_shape[2] = input1->dim(2);
-      break;
-    case 3:
-      if (offset_.size() == 1) {
-        offsets[2] = offset_[0];
-      }
-      output_shape[2] = input1->dim(2);
-      break;
-    default:
-      MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary.");
-      break;
-  }
-  MACE_CHECK(offsets[3] % 4 == 0,
-             "MACE opencl only supports cropping channel offset divisible by 4.");
-  for (index_t i = 0; i < 4; ++i) {
-    MACE_CHECK(input0->dim(i) - offsets[i] >= input1->dim(i))
-        << "the crop for dimension" << i << "is out of bound with size"
-        << input1->dim(i) << "and offset" << offsets[i];
-  }
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
-  const index_t channel_blk = RoundUpDiv4(output->dim(3));
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
-      static_cast<uint32_t>(output->dim(0) * output->dim(1))
-  };
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
-    built_options.emplace("-Dcrop=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input0->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input0->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int>(offsets[0]));
-    kernel_.setArg(idx++, static_cast<int>(offsets[1]));
-    kernel_.setArg(idx++, static_cast<int>(offsets[2]));
-    kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
-    kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
-    kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
-    kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
-    kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input0->shape();
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_list, output, future);
 }
 template struct CropFunctor<DeviceType::GPU, float>;
...
@@ -13,140 +13,34 @@
 // limitations under the License.
 #include "mace/kernels/deconv_2d.h"
-#include "mace/kernels/opencl/helper.h"
+#include "mace/kernels/opencl/image/deconv_2d.h"
 namespace mace {
 namespace kernels {
-namespace {
-MaceStatus Deconv2dOpencl(OpKernelContext *context,
-                          cl::Kernel *kernel,
-                          const Tensor *input,
-                          const Tensor *filter,
-                          const Tensor *bias,
-                          const int *strides,
-                          const int *paddings,
-                          const ActivationType activation,
-                          const float relux_max_limit,
-                          const DataType dt,
-                          std::vector<index_t> *prev_input_shape,
-                          Tensor *output,
-                          StatsFuture *future,
-                          uint32_t *kwg_size,
-                          std::unique_ptr<BufferBase> *kernel_error) {
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-  const index_t input_channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
-  const int stride_h = strides[0];
-  const int stride_w = strides[1];
-  MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
-#define MACE_WIDTH_BLK 5
-  const index_t n_strides = (width + stride_w - 1) / stride_w;
-  const index_t width_blocks =
-      ((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w;
-  const float stride_h_r = 1.f / static_cast<float>(stride_h);
-  const float stride_w_r = 1.f / static_cast<float>(stride_w);
-  const int padding_h = (paddings[0] + 1) >> 1;
-  const int padding_w = (paddings[1] + 1) >> 1;
-  const int align_h = stride_h - 1 - padding_h;
-  const int align_w = stride_w - 1 - padding_w;
-  const int kernel_size = filter->dim(2) * filter->dim(3);
-  auto runtime = context->device()->opencl_runtime();
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
-    built_options.emplace("-Ddeconv_2d=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    switch (activation) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation;
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
-                                              built_options, kernel));
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width_blocks),
-                           static_cast<uint32_t>(height * batch)};
-  if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, *(filter->opencl_image()));
-    if (bias != nullptr) {
-      kernel->setArg(idx++, *(bias->opencl_image()));
-    }
-    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, static_cast<int32_t>(input->dim(1)));
-    kernel->setArg(idx++, static_cast<int32_t>(input->dim(2)));
-    kernel->setArg(idx++, static_cast<int32_t>(input->dim(3)));
-    kernel->setArg(idx++, static_cast<int32_t>(height));
-    kernel->setArg(idx++, static_cast<int32_t>(width));
-    kernel->setArg(idx++, static_cast<int32_t>(channels));
-    kernel->setArg(idx++, static_cast<int32_t>(stride_h));
-    kernel->setArg(idx++, static_cast<int32_t>(stride_w));
-    kernel->setArg(idx++, stride_h_r);
-    kernel->setArg(idx++, stride_w_r);
-    kernel->setArg(idx++, static_cast<int32_t>(align_h));
-    kernel->setArg(idx++, static_cast<int32_t>(align_w));
-    kernel->setArg(idx++, static_cast<int32_t>(padding_h));
-    kernel->setArg(idx++, static_cast<int32_t>(padding_w));
-    kernel->setArg(idx++, static_cast<int32_t>(filter->dim(2)));
-    kernel->setArg(idx++, static_cast<int32_t>(filter->dim(3)));
-    kernel->setArg(idx++, static_cast<int32_t>(kernel_size));
-    kernel->setArg(idx++, static_cast<int32_t>(input_channel_blocks));
-    kernel->setArg(idx++, static_cast<int32_t>(channel_blocks));
-    *prev_input_shape = input->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
-  std::string tuning_key =
-      Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  return MACE_SUCCESS;
-}
-}  // namespace
+template <typename T>
+Deconv2dFunctor<DeviceType::GPU, T>::Deconv2dFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &strides,
+    const Padding &padding_type,
+    const std::vector<int> &paddings,
+    const std::vector<index_t> &output_shape,
+    const ActivationType activation,
+    const float relux_max_limit)
+    : Deconv2dFunctorBase(context,
+                          strides,
+                          padding_type,
+                          paddings,
+                          output_shape,
+                          activation,
+                          relux_max_limit) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::Deconv2dKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
@@ -188,16 +82,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                              output_shape.data(),
                              paddings.data());
   }
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
-                        strides_.data(), paddings.data(), activation_,
-                        relux_max_limit_, DataTypeToEnum<T>::value,
-                        &input_shape_, output, future,
-                        &kwg_size_, &kernel_error_);
+  return kernel_->Compute(context_, input, filter, bias,
+                          strides_.data(), paddings.data(), activation_,
+                          relux_max_limit_, output_shape, output, future);
 }
 template struct Deconv2dFunctor<DeviceType::GPU, float>;
...
@@ -13,98 +13,26 @@
 // limitations under the License.
 #include "mace/kernels/depth_to_space.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/depth_to_space.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+DepthToSpaceOpFunctor<DeviceType::GPU, T>::DepthToSpaceOpFunctor(
+    OpKernelContext *context,
+    const int block_size)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::DepthToSpaceKernel<T>(block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t input_height = input->dim(1);
-  const index_t input_width = input->dim(2);
-  const index_t input_depth = input->dim(3);
-  MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
-             "input depth should be dividable by block_size * block_size",
-             input_depth);
-  MACE_CHECK((input_depth % 4) == 0,
-             "input channel should be dividable by 4");
-  const index_t output_height = input_height * block_size_;
-  const index_t output_width = input_width * block_size_;
-  const index_t output_depth = input_depth / (block_size_ * block_size_);
-  MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
-      << output_depth;
-  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
-  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
-  std::vector<index_t> output_shape = {batch,
-                                       output_height,
-                                       output_width,
-                                       output_depth};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(RoundUpDiv4(output_depth)),
-      static_cast<uint32_t>(output_width),
-      static_cast<uint32_t>(output_height * batch)
-  };
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    const char *kernel_name = "depth_to_space";
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  std::string tuning_key = Concat("depth_to_space_opencl_kernel",
-                                  batch, output_height,
-                                  output_width, output_depth);
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct DepthToSpaceOpFunctor<DeviceType::GPU, float>;
...
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
#include "mace/kernels/opencl/image/depthwise_conv2d.h"
namespace mace {
namespace kernels {
template <typename T>
DepthwiseConv2dFunctor<DeviceType::GPU, T>::DepthwiseConv2dFunctor(
OpKernelContext *context,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
} else {
kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
}
}
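// Unlike most GPU functors touched by this refactor, which still fall
// through to MACE_NOT_IMPLEMENTED on the buffer path, depthwise conv2d
// already has a working buffer-based kernel, so both memory types are
// dispatched here.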
template <typename T>
MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
const Tensor *filter, /* MIHW */
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
return kernel_->Compute(context_, input, filter, bias,
strides_, padding_type_, paddings_,
dilations_, activation_, relux_max_limit_,
output, future);
}
template struct DepthwiseConv2dFunctor<DeviceType::GPU, float>;
template struct DepthwiseConv2dFunctor<DeviceType::GPU, half>;
} // namespace kernels
} // namespace mace
...
@@ -13,125 +13,33 @@
 // limitations under the License.
 #include "mace/kernels/eltwise.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/eltwise.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+EltwiseFunctor<DeviceType::GPU, T>::EltwiseFunctor(
+    OpKernelContext *context,
+    const EltwiseType type,
+    const std::vector<float> &coeff,
+    const float scalar_input,
+    const int32_t scalar_input_index,
+    const DataFormat data_format) : OpKernel(context) {
+  MACE_UNUSED(data_format);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::EltwiseKernel<T>(
+        type, coeff, scalar_input, scalar_input_index));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                                                           const Tensor *input1,
                                                           Tensor *output,
                                                           StatsFuture *future) {
-  MACE_UNUSED(future);
-  bool swapped = false;
-  if (input1 != nullptr) {
-    MACE_CHECK(input0->dim_size() == input1->dim_size() ||
-               input0->dim_size() == 1 || input1->dim_size() == 1)
-        << "Inputs of Eltwise op must be same shape";
-    if (input0->size() != input1->size()) {
-      if (input0->size() < input1->size()) {
-        std::swap(input0, input1);
-        swapped = true;
-      }
-      if (input1->dim_size() == 1) {
-        MACE_CHECK(input0->dim(3) == input1->dim(0))
-            << "Element-Wise op only support channel dimension broadcast";
-      } else {
-        MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
-                   input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
-                   input1->dim(2) == 1)
-            << "Element-Wise op only support channel dimension broadcast";
-      }
-    }
-  }
-  if (scalar_input_index_ == 0) {
-    swapped = !swapped;
-  }
-  std::vector<index_t> output_shape(4);
-  output_shape[0] = input0->dim(0);
-  output_shape[1] = input0->dim(1);
-  output_shape[2] = input0->dim(2);
-  output_shape[3] = input0->dim(3);
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t batch_height_pixels = batch * height;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
-    built_options.emplace("-Deltwise=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (input1 == nullptr) {
-      built_options.emplace("-DINPUT_TYPE=1");
-    } else if (input0->size() != input1->size()) {
-      if (input1->dim(0) == 1 || input1->dim_size() == 1)
-        built_options.emplace("-DINPUT_TYPE=3");
-      else
-        built_options.emplace("-DINPUT_TYPE=2");
-      if (swapped) built_options.emplace("-DSWAPPED");
-    }
-    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input0->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input0->opencl_image()));
-    if (input1 == nullptr) {
-      kernel_.setArg(idx++, scalar_input_);
-    } else {
-      kernel_.setArg(idx++, *(input1->opencl_image()));
-    }
-    kernel_.setArg(idx++, static_cast<int32_t>(height));
-    kernel_.setArg(idx++, static_cast<int32_t>(width));
-    kernel_.setArg(idx++, static_cast<int32_t>(channels));
-    if (!coeff_.empty()) {
-      kernel_.setArg(idx++, coeff_[0]);
-      kernel_.setArg(idx++, coeff_[1]);
-    }
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input0->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input0, input1, output, future);
 }
 template struct EltwiseFunctor<DeviceType::GPU, float>;
...
@@ -13,239 +13,23 @@
 // limitations under the License.
 #include "mace/kernels/fully_connected.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/fully_connected.h"
 namespace mace {
 namespace kernels {
-namespace {
-template <typename T>
-MaceStatus FCWXKernel(OpKernelContext *context,
-                      cl::Kernel *kernel,
-                      const Tensor *input,
-                      const Tensor *weight,
-                      const Tensor *bias,
-                      std::vector<index_t> *prev_input_shape,
-                      Tensor *output,
-                      const ActivationType activation,
-                      std::vector<uint32_t> *gws,
-                      std::vector<uint32_t> *lws,
-                      const float relux_max_limit,
-                      StatsFuture *future,
-                      std::unique_ptr<BufferBase> *kernel_error) {
-  MACE_CHECK_NOTNULL(gws);
-  MACE_CHECK_NOTNULL(lws);
-  auto runtime = context->device()->opencl_runtime();
-  if (kernel->get() == nullptr) {
-    const index_t batch = output->dim(0);
-    const index_t output_size = output->dim(3);
-    const index_t output_blocks = RoundUpDiv4(output_size);
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
-    built_options.emplace("-Dfully_connected_width=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (bias != nullptr) {
-      built_options.emplace("-DBIAS");
-    }
-    switch (activation) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation;
-    }
-    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_QUALCOMM_ADRENO");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
-                                              built_options, kernel));
-    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
-      const uint32_t wave_size =
-          static_cast<uint32_t>(runtime->GetKernelWaveSize(*kernel));
-      *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
-      const uint32_t kwg_size =
-          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
-      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
-    } else {
-      *gws = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
-      const uint32_t kwg_size =
-          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
-      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
-    }
-  }
-  if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    const index_t batch = output->dim(0);
-    const index_t output_blocks = RoundUpDiv4(output->dim(3));
-    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_3D_GWS_ARGS_PTR(kernel, *gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, *(weight->opencl_image()));
-    if (bias != nullptr) {
-      kernel->setArg(idx++, *(bias->opencl_image()));
-    }
-    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)),
-                   nullptr);
-    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
-    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
-    kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
-    kernel->setArg(idx++, static_cast<int>(output_blocks));
-    kernel->setArg(idx++, relux_max_limit);
-    *prev_input_shape = input->shape();
-  }
-  cl::Event event;
-  cl_int error;
-  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
-        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
-  } else {
-    std::vector<uint32_t> roundup_gws(lws->size());
-    for (size_t i = 0; i < lws->size(); ++i) {
-      roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]);
-    }
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        *kernel, cl::NullRange,
-        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
-  }
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  MACE_CL_RET_STATUS(error);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
-  return MACE_SUCCESS;
-}
-template <typename T>
-MaceStatus FCWTXKernel(OpKernelContext *context,
-                       cl::Kernel *kernel,
-                       const Tensor *input,
-                       const Tensor *weight,
-                       const Tensor *bias,
-                       std::vector<index_t> *prev_input_shape,
-                       Tensor *output,
-                       const ActivationType activation,
-                       std::vector<uint32_t> *gws,
-                       std::vector<uint32_t> *lws,
-                       const float relux_max_limit,
-                       StatsFuture *future,
-                       std::unique_ptr<BufferBase> *kernel_error) {
-  MACE_CHECK_NOTNULL(gws);
-  MACE_CHECK_NOTNULL(lws);
-  auto runtime = context->device()->opencl_runtime();
-  if (kernel->get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error, context);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
-    built_options.emplace("-Dfully_connected=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (bias != nullptr) {
-      built_options.emplace("-DBIAS");
-    }
-    switch (activation) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation;
-    }
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("fully_connected", kernel_name,
-                             built_options, kernel));
-    uint32_t kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    *lws = {16, kwg_size / 16, 0};
-  }
-  if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    const index_t batch = output->dim(0);
-    const index_t output_blocks = RoundUpDiv4(output->dim(3));
-    *gws = {
-        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
-    };
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG_PTR;
-    SET_2D_GWS_ARGS_PTR(kernel, *gws);
-    kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(idx++, *(weight->opencl_image()));
-    if (bias != nullptr) {
-      kernel->setArg(idx++, *(bias->opencl_image()));
-    }
-    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
-    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
-    kernel->setArg(idx++, static_cast<int>(input->dim(3)));
-    // FIXME handle flexable data type: half not supported
-    kernel->setArg(idx++, relux_max_limit);
-    *prev_input_shape = input->shape();
-  }
-  std::string tuning_key =
-      Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
-             output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
-                                           gws->data(), *lws, future));
-  OUT_OF_RANGE_VALIDATION(*kernel_error);
-  return MACE_SUCCESS;
-}
-}  // namespace
+template <typename T>
+FullyConnectedFunctor<DeviceType::GPU, T>::FullyConnectedFunctor(
+    OpKernelContext *context,
+    const ActivationType activation,
+    const float relux_max_limit)
+    : FullyConnectedBase(context, activation, relux_max_limit) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::FullyConnectedKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
@@ -253,16 +37,9 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *bias,
     Tensor *output,
     StatsFuture *future) {
-  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  return FCWXKernel<T>(context_,
-                       &kernel_, input, weight, bias, &input_shape_, output,
-                       activation_, &gws_, &lws_, relux_max_limit_, future,
-                       &kernel_error_);
+  return kernel_->Compute(
+      context_, input, weight, bias, activation_, relux_max_limit_,
+      output, future);
 }
 template struct FullyConnectedFunctor<DeviceType::GPU, float>;
...
@@ -30,60 +30,61 @@
 namespace mace {
 namespace kernels {
+// oorc for 'Out Of Range Check'
+#define MACE_OUT_OF_RANGE_DEFINITION \
+  std::shared_ptr<BufferBase> oorc_flag;
-#define OUT_OF_RANGE_CONFIG(kernel_error, context) \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
-    (kernel_error) = std::move(std::unique_ptr<Buffer>( \
-        new Buffer((context)->device()->allocator()))); \
-    MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
-    (kernel_error)->Map(nullptr); \
-    *((kernel_error)->mutable_data<char>()) = 0; \
-    (kernel_error)->UnMap(); \
-  }
+#define MACE_OUT_OF_RANGE_CONFIG \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
+  }
+#define MACE_OUT_OF_RANGE_INIT(kernel) \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    oorc_flag = std::move(std::unique_ptr<Buffer>( \
+        new Buffer((context)->device()->allocator()))); \
+    MACE_RETURN_IF_ERROR((oorc_flag)->Allocate(sizeof(int))); \
+    oorc_flag->Map(nullptr); \
+    *(oorc_flag->mutable_data<int>()) = 0; \
+    oorc_flag->UnMap(); \
+    (kernel).setArg(0, \
+        *(static_cast<cl::Buffer *>(oorc_flag->buffer())));\
+  }
-#define OUT_OF_RANGE_SET_ARG \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    kernel_.setArg(idx++, \
-        *(static_cast<cl::Buffer *>(kernel_error_->buffer()))); \
-  }
+#define MACE_OUT_OF_RANGE_SET_ARGS(kernel) \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    (kernel).setArg(idx++, \
+        *(static_cast<cl::Buffer *>(oorc_flag->buffer())));\
+  }
-#define OUT_OF_RANGE_SET_ARG_PTR \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    kernel->setArg(idx++, \
-        *(static_cast<cl::Buffer *>((*kernel_error)->buffer()))); \
-  }
+#define MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel, size) \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    (kernel).setArg(idx++, \
+        *(static_cast<cl::Buffer *>(oorc_flag->buffer()))); \
+    (kernel).setArg(idx++, static_cast<int>(size)); \
+  }
-#define OUT_OF_RANGE_VALIDATION(kernel_error) \
-  if (runtime->IsOutOfRangeCheckEnabled()) { \
-    (kernel_error)->Map(nullptr); \
-    char *kerror_code = (kernel_error)->mutable_data<char>(); \
-    MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code);\
-    (kernel_error)->UnMap(); \
-  }
+#define MACE_OUT_OF_RANGE_VALIDATION \
+  if (runtime->IsOutOfRangeCheckEnabled()) { \
+    oorc_flag->Map(nullptr); \
+    int *kerror_code = oorc_flag->mutable_data<int>(); \
+    MACE_CHECK(*kerror_code == 0, "Kernel error code: ", *kerror_code); \
+    oorc_flag->UnMap(); \
+  }
-#define NON_UNIFORM_WG_CONFIG \
-  if (runtime->IsNonUniformWorkgroupsSupported()) { \
-    built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \
-  }
+#define MACE_NON_UNIFORM_WG_CONFIG \
+  if (runtime->IsNonUniformWorkgroupsSupported()) { \
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); \
+  }
-#define SET_3D_GWS_ARGS(kernel) \
-  kernel.setArg(idx++, gws[0]); \
-  kernel.setArg(idx++, gws[1]); \
-  kernel.setArg(idx++, gws[2]);
-#define SET_2D_GWS_ARGS(kernel) \
-  kernel.setArg(idx++, gws[0]); \
-  kernel.setArg(idx++, gws[1]);
-#define SET_3D_GWS_ARGS_PTR(kernel, gws) \
-  kernel->setArg(idx++, (gws)[0]); \
-  kernel->setArg(idx++, (gws)[1]); \
-  kernel->setArg(idx++, (gws)[2]);
-#define SET_2D_GWS_ARGS_PTR(kernel, gws) \
-  kernel->setArg(idx++, (gws)[0]); \
-  kernel->setArg(idx++, (gws)[1]);
+#define MACE_SET_3D_GWS_ARGS(kernel, gws) \
+  (kernel).setArg(idx++, (gws)[0]); \
+  (kernel).setArg(idx++, (gws)[1]); \
+  (kernel).setArg(idx++, (gws)[2]);
+#define MACE_SET_2D_GWS_ARGS(kernel, gws) \
+  (kernel).setArg(idx++, (gws)[0]); \
+  (kernel).setArg(idx++, (gws)[1]);
 // Max execution time of an OpenCL kernel during tuning, to prevent the UI
 // from getting stuck.
 const float kMaxKernelExecTime = 1000.0;  // microseconds
@@ -114,6 +115,10 @@ std::string DtToCLDt(const DataType dt);
 // e.g. half -> float
 std::string DtToUpCompatibleCLDt(const DataType dt);
+// CPU data type to OpenCL condition data type used in select
+// e.g. half -> float
+std::string DtToCLCondDt(const DataType dt);
 // Tuning or Run OpenCL kernel with 3D work group size
 MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
                                const cl::Kernel &kernel,
@@ -167,6 +172,7 @@ std::string Concat(Args... args) {
 std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
                                        const uint32_t *gws,
                                        const uint32_t kwg_size);
+
 } // namespace kernels
 } // namespace mace
 #endif // MACE_KERNELS_OPENCL_HELPER_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
#define MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
#include "mace/kernels/activation.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) override;
private:
ActivationType activation_;
T relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
std::string tuning_key_prefix_;
};
template <typename T>
MaceStatus ActivationKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->opencl_runtime();
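  // The MACE_OUT_OF_RANGE_* macros used below follow a fixed protocol (see
  // mace/kernels/opencl/helper.h): DEFINITION declares the oorc_flag buffer,
  // CONFIG adds -DOUT_OF_RANGE_CHECK to the build options, INIT allocates and
  // zeroes the flag and binds it as kernel arg 0, SET_ARGS re-binds it when
  // the remaining args are (re)set, and VALIDATION maps the flag back after
  // the run and checks that it is still 0.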
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
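  // Work is dispatched over 4-channel blocks: e.g. for a 1x224x224x32 NHWC
  // input, gws = {RoundUpDiv4(32), 224, 1 * 224} = {8, 224, 224}.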
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
#define MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
#include "mace/kernels/addn.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class AddNKernel : public OpenCLAddNKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus AddNKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
const index_t batch = input_tensors[0]->dim(0);
const index_t height = input_tensors[0]->dim(1);
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
MACE_CHECK(width == input_tensors[i]->dim(2));
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
built_options.emplace("-Daddn=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
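  // INPUT_NUM is baked into the program, so a separate kernel binary is
  // compiled per input count; that is why more than four inputs is rejected
  // above rather than looped over.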
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)};
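  // AddN uses a 2D dispatch with channel blocks folded into the x dimension:
  // e.g. 1x32x32x8 inputs give gws = {RoundUpDiv4(8) * 32, 1 * 32} = {64, 32}.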
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
output_tensor->ResizeImage(output_shape, output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_ADDN_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
#define MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
#include "mace/kernels/batch_norm.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class BatchNormKernel : public OpenCLBatchNormKernel {
public:
BatchNormKernel(
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit);
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) override;
private:
const bool folded_constant_;
const ActivationType activation_;
const float relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
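// When folded_constant_ is set, scale and offset are assumed to already have
// mean, var and epsilon folded in, so the kernel is built with
// -DFOLDED_CONSTANT and the mean/var images are never bound (they may be
// null, as the check at the top of Compute() below allows).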
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
built_options.emplace("-Dbatch_norm=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
if (!folded_constant_) {
kernel_.setArg(idx++, *(mean->opencl_image()));
kernel_.setArg(idx++, *(var->opencl_image()));
kernel_.setArg(idx++, epsilon);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BatchToSpaceKernel<T>::Compute(
OpKernelContext *context,
const Tensor *batch_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *space_tensor,
StatsFuture *future) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
space_tensor->ResizeImage(output_shape, output_image_shape));
const uint32_t chan_blk =
static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
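  // The dispatch is sized by the batch tensor (the kernel's input): one work
  // item per 4-channel block per input pixel, scattering into space_tensor.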
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const char *kernel_name = "batch_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
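    // When symbol obfuscation is enabled, MACE_OBFUSCATE_SYMBOL maps the
    // kernel name to an opaque identifier; the "-D<name>=<obfuscated>" define
    // renames the kernel entry point in the .cl source to match.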
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, batch_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = batch_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
#define MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
#include "mace/kernels/bias_add.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class BiasAddKernel : public OpenCLBiasAddKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus BiasAddKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
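  // On devices without non-uniform work-group support, each gws dimension is
  // rounded up to a multiple of the corresponding lws value; the extra work
  // items are expected to be masked out inside the kernel, which is why gws
  // was passed as kernel arguments via MACE_SET_3D_GWS_ARGS above.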
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
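  // Completion is deferred: the StatsFuture lets the caller decide when to
  // block on the CL event, and profiling stats are read from the event at
  // that point.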
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_
@@ -12,29 +12,55 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "mace/kernels/buffer_to_image.h"
+#ifndef MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#define MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
+#include <set>
+#include <string>
+#include <vector>
+#include "mace/kernels/buffer_transform.h"
 #include "mace/kernels/opencl/helper.h"
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 template <typename T>
-MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
-const Tensor *buffer,
+class BufferToImage : public OpenCLBufferTransformKernel {
+public:
+MaceStatus Compute(
+OpKernelContext *context,
+const Tensor *input,
 const BufferType type,
-Tensor *image,
+const int wino_blk_size,
+Tensor *output,
+StatsFuture *future) override;
+private:
+cl::Kernel kernel_;
+std::vector<index_t> input_shape_;
+};
+template <typename T>
+MaceStatus BufferToImage<T>::Compute(
+OpKernelContext *context,
+const Tensor *input,
+const BufferType type,
+const int wino_blk_size,
+Tensor *output,
 StatsFuture *future) {
-auto formatted_buffer_shape = FormatBufferShape(buffer->shape(), type);
+auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
 std::vector<size_t> image_shape;
-CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size_);
+CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size);
 if (type == WINOGRAD_FILTER) {
 std::vector<index_t> new_shape =
-{(wino_blk_size_ + 2) * (wino_blk_size_ + 2),
-buffer->dim(0), buffer->dim(1)};
-MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape));
+{(wino_blk_size + 2) * (wino_blk_size + 2),
+input->dim(0), input->dim(1)};
+MACE_RETURN_IF_ERROR(output->ResizeImage(new_shape, image_shape));
 } else {
-MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape));
+MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape));
 }
 uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
@@ -67,25 +93,26 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 break;
 case WINOGRAD_FILTER: {
 std::stringstream ss_tmp;
-gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
+gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
 ss_tmp << "winograd_filter_buffer_to_image_"
-<< wino_blk_size_ << "x" << wino_blk_size_;
+<< wino_blk_size << "x" << wino_blk_size;
 kernel_name = ss_tmp.str();
 break;
 }
 }
-auto runtime = context_->device()->opencl_runtime();
+auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel_.get() == nullptr) {
 std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::stringstream kernel_name_ss;
 kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
 built_options.emplace(kernel_name_ss.str());
-if (buffer->dtype() == image->dtype()) {
+if (input->dtype() == output->dtype()) {
 built_options.emplace(
 "-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
 built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -100,30 +127,31 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 "buffer_to_image", obfuscated_kernel_name, built_options, &kernel_));
 }
-if (!IsVecEqual(input_shape_, buffer->shape())) {
+MACE_OUT_OF_RANGE_INIT(kernel_);
+if (!IsVecEqual(input_shape_, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG;
-SET_2D_GWS_ARGS(kernel_);
-kernel_.setArg(idx++, *(buffer->opencl_buffer()));
-MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
+MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
+MACE_SET_2D_GWS_ARGS(kernel_, gws);
+kernel_.setArg(idx++, *(input->opencl_buffer()));
+MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
 "buffer offset not aligned");
 kernel_.setArg(idx++,
-static_cast<uint32_t>(buffer->buffer_offset() /
-GetEnumTypeSize(buffer->dtype())));
+static_cast<uint32_t>(input->buffer_offset() /
+GetEnumTypeSize(input->dtype())));
 if (type == CONV2D_FILTER) {
 const index_t
-inner_size = buffer->dim(1) * buffer->dim(2) * buffer->dim(3);
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+inner_size = input->dim(1) * input->dim(2) * input->dim(3);
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
 kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
 } else if (type == DW_CONV2D_FILTER || type == WEIGHT_HEIGHT) {
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
 } else if (type == ARGUMENT) {
-kernel_.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
+kernel_.setArg(idx++, static_cast<uint32_t>(input->dim(0)));
 } else {
 kernel_.setArg(idx++,
 static_cast<uint32_t>(formatted_buffer_shape[1]));
@@ -132,8 +160,8 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 kernel_.setArg(idx++,
 static_cast<uint32_t>(formatted_buffer_shape[3]));
 }
-kernel_.setArg(idx++, *(image->opencl_image()));
-input_shape_ = buffer->shape();
+kernel_.setArg(idx++, *(output->opencl_image()));
+input_shape_ = input->shape();
 }
 const uint32_t kwg_size =
@@ -157,7 +185,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 cl::NDRange(lws[0], lws[1]), nullptr, &event);
 }
 MACE_CL_RET_STATUS(error);
-OUT_OF_RANGE_VALIDATION(kernel_error_);
+MACE_OUT_OF_RANGE_VALIDATION;
 if (future != nullptr) {
 future->wait_fn = [runtime, event](CallStats *stats) {
 event.wait();
@@ -168,13 +196,14 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
 }
 // Mark the buffer unused.
-const_cast<Tensor *>(buffer)->MarkUnused();
+const_cast<Tensor *>(input)->MarkUnused();
 return MACE_SUCCESS;
 }
-template struct BufferToImageFunctor<DeviceType::GPU, float>;
-template struct BufferToImageFunctor<DeviceType::GPU, half>;
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
+#endif // MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_
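The WINOGRAD_FILTER branch in the diff above reshapes an OIHW filter into an image of logical shape [(m + 2)^2, out_channels, in_channels], where m is the Winograd block size: for F(m x m, 3 x 3), each transformed filter tile is (m + 2) x (m + 2). A small check with invented sizes:
#include <cstdio>
int main() {
  const int wino_blk_size = 2;                      // hypothetical m
  const int out_ch = 64, in_ch = 32;                // hypothetical 3x3 filter
  const int tile_area = (wino_blk_size + 2) * (wino_blk_size + 2);  // 16
  printf("winograd filter image shape: [%d, %d, %d]\n",
         tile_area, out_ch, in_ch);
  return 0;
}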
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
#include "mace/kernels/channel_shuffle.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ChannelShuffleKernel : public OpenCLChannelShuffleKernel {
public:
explicit ChannelShuffleKernel(const int groups) : groups_(groups) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const int groups_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ChannelShuffleKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels_per_group = channels / groups_;
MACE_CHECK(channels_per_group % 4 == 0,
"channels per group must be a multiple of 4");
MACE_CHECK(groups_ % 4 == 0, "groups must be a multiple of 4");
const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_
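A CPU reference of the shuffle the kernel above computes may help: with channels laid out as groups x channels_per_group, output channel o reads input channel (o % groups) * channels_per_group + o / groups. This sketch is illustrative, not MACE code:
#include <cstdio>
int main() {
  const int groups = 4, channels_per_group = 4;  // 16 channels, both % 4 == 0
  for (int o = 0; o < groups * channels_per_group; ++o) {
    const int src = (o % groups) * channels_per_group + o / groups;
    printf("out[%2d] <- in[%2d]\n", o, src);
  }
  return 0;
}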
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/image/concat.h"
#include <algorithm>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace concat {
namespace {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] =
std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
}
return lws;
}
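// Worked example of the heuristic above, with invented numbers: suppose
// kwg_size = 256, gws = {8, 32, 64}, and cache_size / kBaseGPUMemCacheSize
// gives base = 4. Then:
//   lws[1] = min(32, 256)              = 32
//   lws[0] = min(4, 256 / 32)          = 4
//   lws[2] = max(min(4, 256 / 128), 1) = 2
// so the 4 x 32 x 2 work-group exactly fills the 256-thread limit.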
} // namespace
MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const int channel_blk = RoundUpDiv4(channel);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input1->opencl_image())));
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape();
}
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
const int inputs_count = input_list.size();
index_t chan_blk_offset = 0;
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
MACE_OUT_OF_RANGE_INIT(*kernel);
for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(output->opencl_image()));
chan_blk_offset += input_channel_blk;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (future != nullptr) {
future->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
return MACE_SUCCESS;
}
} // namespace concat
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
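The ConcatN loop above advances through the output image one input at a time; each input with channel count c contributes c / 4 channel blocks, and chan_blk_offset tells the kernel where those blocks land. A host-side sketch with invented inputs:
#include <cstdio>
#include <vector>
int main() {
  const std::vector<int> input_channels = {8, 16, 4};  // each % 4 == 0
  int chan_blk_offset = 0;
  for (size_t i = 0; i < input_channels.size(); ++i) {
    const int blocks = input_channels[i] / 4;
    printf("input %zu: %d block(s) written at block offset %d\n",
           i, blocks, chan_blk_offset);
    chan_blk_offset += blocks;
  }
  return 0;
}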
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
#define MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
#include "mace/kernels/concat.h"
#include <memory>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace concat {
MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
} // namespace concat
template <typename T>
class ConcatKernel : public OpenCLConcatKernel {
public:
explicit ConcatKernel(const int32_t axis) : axis_(axis) {}
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) override;
private:
int32_t axis_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ConcatKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count >= 2 && axis_ == 3)
<< "Concat OpenCL kernel only supports two or more inputs with axis == 3";
const Tensor *input0 = input_list[0];
bool divisible_four = input0->dim(axis_) % 4 == 0;
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be the same.");
divisible_four &= input->dim(axis_) % 4 == 0;
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis_) {
continue;
}
MACE_CHECK(input->dim(j) == input0->dim(j),
"Dimensions of inputs should be equal except along the concat axis.");
}
output_shape[axis_] += input->dim(axis_);
}
MACE_CHECK(
inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
switch (inputs_count) {
case 2:
return concat::Concat2(
context, &kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, future, &kwg_size_);
default:
if (divisible_four) {
return concat::ConcatN(context, &kernel_, input_list,
DataTypeToEnum<T>::value, output, future,
&kwg_size_);
} else {
MACE_NOT_IMPLEMENTED;
}
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_
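The dispatch rule in ConcatKernel::Compute above can be restated in a few lines: two inputs always take the Concat2 path, while three or more fall back to ConcatN only if every input's channel count is a multiple of 4. A sketch with invented shapes:
#include <cstdio>
#include <vector>
const char *PickPath(const std::vector<int> &channels) {
  bool divisible_four = true;
  for (int c : channels) divisible_four &= (c % 4 == 0);
  if (channels.size() == 2) return "Concat2";
  return divisible_four ? "ConcatN" : "not implemented";
}
int main() {
  printf("%s\n", PickPath({3, 5}));      // Concat2
  printf("%s\n", PickPath({8, 4, 12}));  // ConcatN
  printf("%s\n", PickPath({3, 5, 7}));   // not implemented
  return 0;
}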
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
#define MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
#include "mace/kernels/conv_2d.h"
#include <memory>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
extern MaceStatus Conv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
template <typename T>
class Conv2dKernel : public OpenCLConv2dKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Conv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
typedef MaceStatus (*Conv2dOpenclFunction)(
OpKernelContext *context,
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size);
// Selection table indexed by kernel size - 1 (1x1 and 3x3 implemented)
static const Conv2dOpenclFunction selector[3] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3};
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1] ||
(dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< ",dilations " << dilations[0] << "x" << dilations[1]
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
// Reshape output
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter->shape().data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
if (kernel_h == kernel_w && kernel_h <= 3 &&
selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1];
return conv2d_func(context,
&kernel_, input, filter, bias, strides[0], paddings.data(), dilations,
activation, relux_max_limit, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_);
} else {
return Conv2dOpencl(
context, &kernel_, input, filter, bias,
strides[0], paddings.data(), dilations,
activation, relux_max_limit, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_);
}
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_
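For reference, the output size that CalcNHWCPaddingAndOutputSize / CalcOutputSize above are expected to produce follows the standard floor-rounded convolution formula; this is textbook arithmetic, not MACE's exact helper:
#include <cstdio>
// out = floor((in + pad_total - (dilation * (k - 1) + 1)) / stride) + 1
int ConvOutSize(int in, int k, int stride, int dilation, int pad_total) {
  const int effective_k = dilation * (k - 1) + 1;
  return (in + pad_total - effective_k) / stride + 1;
}
int main() {
  // Hypothetical 3x3 stride-2 convolution, 224-wide input, 1+1 padding.
  printf("out width = %d\n", ConvOutSize(224, 3, 2, 1, 2));  // 112
  return 0;
}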
@@ -19,6 +19,8 @@
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
@@ -78,8 +80,7 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 std::vector<index_t> *prev_input_shape,
 Tensor *output,
 StatsFuture *future,
-uint32_t *kwg_size,
-std::unique_ptr<BufferBase> *kernel_error) {
+uint32_t *kwg_size) {
 MACE_UNUSED(padding);
 MACE_UNUSED(dilations);
 const index_t batch = output->dim(0);
@@ -96,13 +97,13 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel->get() == nullptr) {
 MACE_CHECK(input_batch == batch);
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(*kernel_error, context);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
 built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
 built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
@@ -139,12 +140,13 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
 static_cast<uint32_t>(width_blocks),
 static_cast<uint32_t>(height * batch)};
+MACE_OUT_OF_RANGE_INIT(*kernel);
 // Support different input size
 if (!IsVecEqual(*prev_input_shape, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG_PTR;
-SET_3D_GWS_ARGS_PTR(kernel, gws);
+MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
+MACE_SET_3D_GWS_ARGS(*kernel, gws);
 kernel->setArg(idx++, *(input->opencl_image()));
 kernel->setArg(idx++, *(filter->opencl_image()));
 if (bias != nullptr) {
@@ -169,9 +171,11 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
 output->dim(2), output->dim(3));
 MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
 gws, lws, future));
-OUT_OF_RANGE_VALIDATION(*kernel_error);
+MACE_OUT_OF_RANGE_VALIDATION;
 return MACE_SUCCESS;
 }
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
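Conceptually (a standard identity, not anything MACE-specific), the 1x1 convolution handled above is a matrix product over flattened spatial positions: [batch * height * width, in_ch] times [in_ch, out_ch]. A naive reference for checking small cases:
#include <cstdio>
#include <vector>
int main() {
  const int hw = 2, in_ch = 3, out_ch = 2;               // tiny invented sizes
  std::vector<float> x(hw * in_ch, 1.f), w(in_ch * out_ch, 0.5f);
  std::vector<float> y(hw * out_ch, 0.f);
  for (int p = 0; p < hw; ++p)          // each spatial position
    for (int o = 0; o < out_ch; ++o)    // each output channel
      for (int i = 0; i < in_ch; ++i)   // reduce over input channels
        y[p * out_ch + o] += x[p * in_ch + i] * w[i * out_ch + o];
  printf("y[0] = %.1f\n", y[0]);        // 3 * 1.0 * 0.5 = 1.5
  return 0;
}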
@@ -21,6 +21,9 @@
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
@@ -71,8 +74,7 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 std::vector<index_t> *prev_input_shape,
 Tensor *output,
 StatsFuture *future,
-uint32_t *kwg_size,
-std::unique_ptr<BufferBase> *kernel_error) {
+uint32_t *kwg_size) {
 const index_t batch = output->dim(0);
 const index_t height = output->dim(1);
 const index_t width = output->dim(2);
@@ -84,11 +86,12 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
 auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel->get() == nullptr) {
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(*kernel_error, context);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
 built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
 built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
@@ -123,12 +126,13 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
 static_cast<uint32_t>(width_blocks),
 static_cast<uint32_t>(height * batch)};
+MACE_OUT_OF_RANGE_INIT(*kernel);
 // Support different input size
 if (!IsVecEqual(*prev_input_shape, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG_PTR;
-SET_3D_GWS_ARGS_PTR(kernel, gws);
+MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
+MACE_SET_3D_GWS_ARGS(*kernel, gws);
 kernel->setArg(idx++, *(input->opencl_image()));
 kernel->setArg(idx++, *(filter->opencl_image()));
 if (bias != nullptr) {
@@ -149,16 +153,17 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
 *prev_input_shape = input->shape();
 }
 std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
 std::string tuning_key =
 Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
 output->dim(2), output->dim(3));
 MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
 gws, lws, future));
-OUT_OF_RANGE_VALIDATION(*kernel_error);
+MACE_OUT_OF_RANGE_VALIDATION;
 return MACE_SUCCESS;
 }
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
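The width_blocks = RoundUpDiv<index_t, 5>(width) above reflects that the 3x3 kernel computes five output columns per work-item. With an invented width:
#include <cstdio>
int main() {
  const int width = 112;
  const int width_blocks = (width + 4) / 5;  // RoundUpDiv<index_t, 5>
  printf("%d columns -> %d work-items per row (last one covers %d)\n",
         width, width_blocks, width - (width_blocks - 1) * 5);
  return 0;
}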
@@ -21,6 +21,9 @@
 namespace mace {
 namespace kernels {
+namespace opencl {
+namespace image {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
@@ -79,8 +82,7 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 std::vector<index_t> *prev_input_shape,
 Tensor *output,
 StatsFuture *future,
-uint32_t *kwg_size,
-std::unique_ptr<BufferBase> *kernel_error) {
+uint32_t *kwg_size) {
 const index_t batch = output->dim(0);
 const index_t height = output->dim(1);
 const index_t width = output->dim(2);
@@ -92,11 +94,12 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 const index_t width_blocks = RoundUpDiv4(width);
 auto runtime = context->device()->opencl_runtime();
+MACE_OUT_OF_RANGE_DEFINITION;
 if (kernel->get() == nullptr) {
 std::set<std::string> built_options;
-OUT_OF_RANGE_CONFIG(*kernel_error, context);
-NON_UNIFORM_WG_CONFIG;
+MACE_OUT_OF_RANGE_CONFIG;
+MACE_NON_UNIFORM_WG_CONFIG;
 std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
 built_options.emplace("-Dconv_2d=" + kernel_name);
 built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
@@ -131,12 +134,13 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
 static_cast<uint32_t>(width_blocks),
 static_cast<uint32_t>(height * batch)};
+MACE_OUT_OF_RANGE_INIT(*kernel);
 // Support different input size
 if (!IsVecEqual(*prev_input_shape, input->shape())) {
 uint32_t idx = 0;
-OUT_OF_RANGE_SET_ARG_PTR;
-SET_3D_GWS_ARGS_PTR(kernel, gws);
+MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
+MACE_SET_3D_GWS_ARGS(*kernel, gws);
 kernel->setArg(idx++, *(input->opencl_image()));
 kernel->setArg(idx++, *(filter->opencl_image()));
 if (bias != nullptr) {
@@ -168,9 +172,11 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context,
 MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
 gws, lws, future));
-OUT_OF_RANGE_VALIDATION(*kernel_error);
+MACE_OUT_OF_RANGE_VALIDATION;
 return MACE_SUCCESS;
 }
+} // namespace image
+} // namespace opencl
 } // namespace kernels
 } // namespace mace
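One plausible reading of the kernel_cache_size comment repeated in these files: it estimates the bytes touched per work-item as (input float4 loads + weight float4 loads + output float4 stores) * 4 floats per vector * 4 bytes per float, which the LocalWS heuristics then weigh against the device's global memory cache. For the general conv kernel above:
#include <cstdio>
int main() {
  const int inputs = 4, weights = 4, outputs = 4;  // float4 values per item
  printf("kernel_cache_size = %d bytes\n",
         (inputs + weights + outputs) * 4 * 4);    // 192, as in the source
  return 0;
}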
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_CROP_H_
#define MACE_KERNELS_OPENCL_IMAGE_CROP_H_
#include "mace/kernels/crop.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class CropKernel : public OpenCLCropKernel {
public:
explicit CropKernel(
const int axis,
const std::vector<int> &offset)
: axis_(axis), offset_(offset) {}
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) override;
private:
const int axis_;
std::vector<int> offset_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus CropKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int32_t inputs_count = static_cast<int32_t>(input_list.size());
MACE_CHECK(inputs_count >= 2)
<< "Crop OpenCL kernel requires at least two inputs";
const Tensor *input0 = input_list[0];
const Tensor *input1 = input_list[1];
const uint32_t in0_dims = static_cast<uint32_t>(input0->dim_size());
const uint32_t in1_dims = static_cast<uint32_t>(input1->dim_size());
MACE_CHECK(in0_dims == 4 && in1_dims == 4,
"Crop op only supports 4-dims inputs now.");
std::vector<int32_t> offsets(4, 0);
std::vector<index_t> output_shape(input0->shape());
switch (axis_) {
case 0:
if (offset_.size() == 1) {
offsets[0] = offset_[0];
offsets[1] = offset_[0];
offsets[2] = offset_[0];
offsets[3] = offset_[0];
} else if (offset_.size() == 4) {
offsets[0] = offset_[0];
offsets[1] = offset_[2];
offsets[2] = offset_[3];
offsets[3] = offset_[1];
}
for (int i = 0; i < 4; ++i) {
output_shape[i] = input1->dim(i);
}
break;
case 1:
if (offset_.size() == 1) {
offsets[1] = offset_[0];
offsets[2] = offset_[0];
offsets[3] = offset_[0];
} else if (offset_.size() == 3) {
offsets[1] = offset_[1];
offsets[2] = offset_[2];
offsets[3] = offset_[0];
}
for (int i = 1; i < 4; ++i) {
output_shape[i] = input1->dim(i);
}
break;
case 2:
if (offset_.size() == 1) {
offsets[1] = offset_[0];
offsets[2] = offset_[0];
} else if (offset_.size() == 2) {
offsets[1] = offset_[0];
offsets[2] = offset_[1];
}
output_shape[1] = input1->dim(1);
output_shape[2] = input1->dim(2);
break;
case 3:
if (offset_.size() == 1) {
offsets[2] = offset_[0];
}
output_shape[2] = input1->dim(2);
break;
default:
MACE_CHECK(axis_ >= 0 && axis_ < 4, "axis is out of boundary.");
break;
}
MACE_CHECK(offsets[3] % 4 == 0,
"MACE OpenCL only supports channel crop offsets"
" divisible by 4.");
for (index_t i = 0; i < 4; ++i) {
MACE_CHECK(input0->dim(i) - offsets[i] >= input1->dim(i))
<< "the crop for dimension " << i << " is out of bound with size "
<< input1->dim(i) << " and offset " << offsets[i];
}
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t offset_chan_blk = RoundUpDiv4(offsets[3]);
const index_t channel_blk = RoundUpDiv4(output->dim(3));
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(offsets[0]));
kernel_.setArg(idx++, static_cast<int>(offsets[1]));
kernel_.setArg(idx++, static_cast<int>(offsets[2]));
kernel_.setArg(idx++, static_cast<int>(offset_chan_blk));
kernel_.setArg(idx++, static_cast<int>(input0->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input0->dim(2)));
kernel_.setArg(idx++, static_cast<int>(output->dim(1)));
kernel_.setArg(idx++, static_cast<int>(output->dim(2)));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_CROP_H_
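The offset remapping in CropKernel::Compute above appears to translate Caffe-style NCHW crop offsets into MACE's NHWC layout: a 4-element offset {n, c, h, w} is stored as {n, h, w, c}. A sketch with invented values:
#include <cstdio>
int main() {
  const int offset_nchw[4] = {0, 8, 2, 3};  // hypothetical axis == 0 case
  int offsets[4];
  offsets[0] = offset_nchw[0];              // batch
  offsets[1] = offset_nchw[2];              // height
  offsets[2] = offset_nchw[3];              // width
  offsets[3] = offset_nchw[1];              // channel, must be % 4 == 0
  printf("NHWC offsets = {%d, %d, %d, %d}\n",
         offsets[0], offsets[1], offsets[2], offsets[3]);
  return 0;
}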
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
#define MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
#include "mace/kernels/deconv_2d.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class Deconv2dKernel : public OpenCLDeconv2dKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus Deconv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const std::vector<index_t> &output_shape,
Tensor *output,
StatsFuture *future) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const DataType dt = DataTypeToEnum<T>::value;
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t input_channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const int stride_h = strides[0];
const int stride_w = strides[1];
MACE_CHECK(stride_w > 0 && stride_h > 0, "strides should be > 0.");
#define MACE_WIDTH_BLK 5
const index_t n_strides = (width + stride_w - 1) / stride_w;
const index_t width_blocks =
((n_strides + MACE_WIDTH_BLK - 1) / MACE_WIDTH_BLK) * stride_w;
const float stride_h_r = 1.f / static_cast<float>(stride_h);
const float stride_w_r = 1.f / static_cast<float>(stride_w);
const int padding_h = (padding_data[0] + 1) >> 1;
const int padding_w = (padding_data[1] + 1) >> 1;
const int align_h = stride_h - 1 - padding_h;
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
kernel_.setArg(idx++, static_cast<int32_t>(stride_h));
kernel_.setArg(idx++, static_cast<int32_t>(stride_w));
kernel_.setArg(idx++, stride_h_r);
kernel_.setArg(idx++, stride_w_r);
kernel_.setArg(idx++, static_cast<int32_t>(align_h));
kernel_.setArg(idx++, static_cast<int32_t>(align_w));
kernel_.setArg(idx++, static_cast<int32_t>(padding_h));
kernel_.setArg(idx++, static_cast<int32_t>(padding_w));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(filter->dim(3)));
kernel_.setArg(idx++, static_cast<int32_t>(kernel_size));
kernel_.setArg(idx++, static_cast<int32_t>(input_channel_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_
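Worked numbers (invented) for the deconvolution width blocking above: n_strides = ceil(width / stride_w), then width_blocks rounds n_strides up to whole groups of MACE_WIDTH_BLK (5) work-items and rescales by stride_w; the reciprocals stride_h_r / stride_w_r let the kernel divide by multiplication:
#include <cstdio>
int main() {
  const int width = 17, stride_w = 2;
  const int n_strides = (width + stride_w - 1) / stride_w;        // 9
  const int width_blocks = ((n_strides + 5 - 1) / 5) * stride_w;  // 4
  const float stride_w_r = 1.f / static_cast<float>(stride_w);    // 0.5
  printf("n_strides=%d width_blocks=%d stride_w_r=%.2f\n",
         n_strides, width_blocks, stride_w_r);
  return 0;
}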
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
#include "mace/kernels/depth_to_space.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel {
public:
explicit DepthToSpaceKernel(const int block_size)
: block_size_(block_size) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthToSpaceKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be divisible by block_size * block_size",
input_depth);
MACE_CHECK((input_depth % 4) == 0,
"input channel should be divisible by 4");
const index_t output_height = input_height * block_size_;
const index_t output_width = input_width * block_size_;
const index_t output_depth = input_depth / (block_size_ * block_size_);
MACE_CHECK(output_depth % 4 == 0, "output channel not support:")
<< output_depth;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch,
output_height,
output_width,
output_depth};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output_depth)),
static_cast<uint32_t>(output_width),
static_cast<uint32_t>(output_height * batch)
};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = kernel_name = "depth_to_space";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
std::string tuning_key = Concat("depth_to_space_opencl_kernel",
batch, output_height,
output_width, output_depth);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_
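A quick shape check (invented sizes) for the constraints enforced above: the input depth must be divisible by block_size^2, and the resulting output depth by 4, because the image layout packs channels in blocks of 4:
#include <cstdio>
int main() {
  const int block_size = 2;
  const int h = 16, w = 16, c = 16;                 // hypothetical NHWC input
  const int out_c = c / (block_size * block_size);  // 4, divisible by 4
  printf("(%d, %d, %d) -> (%d, %d, %d)\n",
         h, w, c, h * block_size, w * block_size, out_c);
  return 0;
}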
@@ -12,14 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/opencl/image/depthwise_conv2d.h"
#include <algorithm>
#include <set>
#include <string>
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace depthwise {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
@@ -60,7 +63,7 @@ std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
} // namespace
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
@@ -74,8 +77,7 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
@@ -93,11 +95,12 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1");
@@ -135,6 +138,7 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
MACE_OUT_OF_RANGE_INIT(*kernel);
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
@@ -147,8 +151,8 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
input_channels);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(*kernel);
MACE_SET_3D_GWS_ARGS(*kernel, gws);
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
@@ -179,60 +183,12 @@ static MaceStatus DepthwiseConv2d(OpKernelContext *context,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace depthwise
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
#include "mace/kernels/depthwise_conv2d.h"
#include <memory>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace depthwise {
MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
const int stride,
const int *paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future,
uint32_t *kwg_size);
} // namespace depthwise
template <typename T>
class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus DepthwiseConv2dKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides[0] != strides[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides[0] << "x" << strides[1]
<< " is not implemented yet, using slow version";
MACE_NOT_IMPLEMENTED;
}
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
fake_filter_shape[1] = filter->dim(1);
fake_filter_shape[2] = filter->dim(2);
fake_filter_shape[3] = filter->dim(3);
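// Illustrative example (shapes assumed): a 3x3 depthwise filter with
// multiplier 1 over 32 input channels has MIHW shape {1, 32, 3, 3}; the fake
// conv2d filter becomes {1 * 32, 32, 3, 3} = {32, 32, 3, 3}, i.e.
// out_channels = multiplier * in_channels, which is what the shared
// padding/output-size helpers below expect.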
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::FLOOR,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, DataTypeToEnum<T>::value,
&input_shape_, output, future, &kwg_size_);
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
#define MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
#include "mace/kernels/eltwise.h"
#include <memory>
#include <utility>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class EltwiseKernel : public OpenCLEltwiseKernel {
public:
explicit EltwiseKernel(
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input,
const int32_t scalar_input_index)
: type_(type),
coeff_(coeff),
scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) override;
private:
EltwiseType type_;
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus EltwiseKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
bool swapped = false;
if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size() ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
std::swap(input0, input1);
swapped = true;
}
if (input1->dim_size() == 1) {
MACE_CHECK(input0->dim(3) == input1->dim(0))
<< "Element-Wise op only support channel dimension broadcast";
} else {
MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1) &&
input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
input1->dim(2) == 1)
<< "Element-Wise op only support channel dimension broadcast";
}
}
}
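// Illustrative examples of the broadcast rules above (shapes assumed; NHWC
// input0 of {1, 32, 32, 64}): input1 {64} (a rank-1 channel vector) and
// input1 {1, 1, 1, 64} are accepted; input1 {1, 32, 1, 64} is rejected,
// because only the channel dimension (with H = W = 1) may broadcast.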
if (scalar_input_index_ == 0) {
swapped = !swapped;
}
std::vector<index_t> output_shape(4);
output_shape[0] = input0->dim(0);
output_shape[1] = input0->dim(1);
output_shape[2] = input0->dim(2);
output_shape[3] = input0->dim(3);
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t batch_height_pixels = batch * height;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
built_options.emplace("-Deltwise=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (input1 == nullptr) {
built_options.emplace("-DINPUT_TYPE=1");
} else if (input0->size() != input1->size()) {
if (input1->dim(0) == 1 || input1->dim_size() == 1)
built_options.emplace("-DINPUT_TYPE=3");
else
built_options.emplace("-DINPUT_TYPE=2");
if (swapped) built_options.emplace("-DSWAPPED");
}
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input0->opencl_image()));
if (input1 == nullptr) {
kernel_.setArg(idx++, scalar_input_);
} else {
kernel_.setArg(idx++, *(input1->opencl_image()));
}
kernel_.setArg(idx++, static_cast<int32_t>(height));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(channels));
if (!coeff_.empty()) {
kernel_.setArg(idx++, coeff_[0]);
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
#define MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
#include "mace/kernels/fully_connected.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus FullyConnectedKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const index_t batch = output->dim(0);
const index_t output_size = output->dim(3);
const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
built_options.emplace("-Dfully_connected_width=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (bias != nullptr) {
built_options.emplace("-DBIAS");
}
switch (activation) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, &kernel_));
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws_ = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
} else {
gws_ = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
const uint32_t inter_local_blks = kwg_size / (gws_[0] * gws_[1]);
lws_ = {gws_[0], gws_[1], inter_local_blks};
}
}
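// Illustrative sizing (assuming kwg_size = 256 on a non-Adreno GPU):
// gws_ = {4, 8, batch * output_blocks} and inter_local_blks = 256 / (4 * 8) = 8,
// so lws_ = {4, 8, 8}; the local reduction scratch passed via setArg below is
// then 4 * 8 * 8 * sizeof(float) = 1024 bytes per work group.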
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws_[2] = static_cast<uint32_t>(batch * output_blocks);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws_);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
if (bias != nullptr) {
kernel_.setArg(idx++, *(bias->opencl_image()));
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, (lws_[0] * lws_[1] * lws_[2] * sizeof(float)),
nullptr);
kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel_.setArg(idx++, static_cast<int>(output_blocks));
kernel_.setArg(idx++, relux_max_limit);
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws_[0], gws_[1], gws_[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws_.size());
for (size_t i = 0; i < lws_.size(); ++i) {
roundup_gws[i] = RoundUp(gws_[i], lws_[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws_[0], lws_[1], lws_[2]), nullptr, &event);
}
MACE_OUT_OF_RANGE_VALIDATION;
MACE_CL_RET_STATUS(error);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_
@@ -12,24 +12,47 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
#define MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/buffer_inverse_transform.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ImageToBuffer : public OpenCLBufferInverseTransformKernel {
public:
MaceStatus Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ImageToBuffer<T>::Compute(OpKernelContext *context,
const Tensor *input,
const BufferType type,
const int wino_blk_size,
Tensor *output,
StatsFuture *future) {
auto formatted_buffer_shape = FormatBufferShape(input->shape(), type);
std::vector<size_t> image_shape;
CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size);
MACE_RETURN_IF_ERROR(output->Resize(input->shape()));
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
@@ -49,9 +72,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break;
case WINOGRAD_FILTER: {
std::stringstream ss_tmp;
gws[1] /= (wino_blk_size + 2) * (wino_blk_size + 2);
ss_tmp << "winograd_filter_image_to_buffer_"
<< wino_blk_size << "x" << wino_blk_size;
kernel_name = ss_tmp.str();
break;
}
@@ -67,17 +90,18 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break;
}
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
if (output->dtype() == input->dtype()) {
built_options.emplace(
"-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -94,25 +118,26 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
&kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(output->opencl_buffer()));
if (type == CONV2D_FILTER) {
const index_t
inner_size = output->dim(1) * output->dim(2) * output->dim(3);
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(inner_size));
} else if (type == ARGUMENT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
} else if (type == WEIGHT_HEIGHT) {
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(0)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(output->dim(3)));
} else {
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[1]));
@@ -121,8 +146,8 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
kernel_.setArg(idx++,
static_cast<uint32_t>(formatted_buffer_shape[3]));
}
kernel_.setArg(idx++, *(input->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t kwg_size =
@@ -146,7 +171,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
@@ -159,8 +184,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_IMAGE_TO_BUFFER_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
#define MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/lstmcell.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class LSTMCellKernel : public OpenCLLSTMCellKernel {
public:
explicit LSTMCellKernel(
const T forget_bias)
: forget_bias_(forget_bias) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) override;
private:
T forget_bias_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus LSTMCellKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
const Tensor *bias,
const Tensor *pre_cell,
Tensor *cell,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
"LSTM hidden units should be a multiple of 4");
const index_t height = input->dim(0);
const index_t width = input->dim(1);
const index_t hidden_units = pre_output->dim(1);
const index_t w_blocks = hidden_units >> 2;
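// Illustrative example (shapes assumed): input {16, 128} with pre_output
// {16, 256} gives height = 16, width = 128 and hidden_units = 256 (a multiple
// of 4, as checked above), so w_blocks = 64 and the 2-D global work size
// computed below becomes {64, 16}: one work item per 4 hidden units per row.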
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
built_options.emplace("-Dlstmcell=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
static_cast<uint32_t>(height)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
output_image_shape));
MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
output_image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(pre_output->opencl_image()));
kernel_.setArg(idx++, *(weight->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(pre_cell->opencl_image()));
kernel_.setArg(idx++, static_cast<float>(forget_bias_));
kernel_.setArg(idx++, static_cast<int32_t>(width));
kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
kernel_.setArg(idx++, *(cell->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
#define MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
#include "mace/kernels/matmul.h"
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class MatMulKernel : public OpenCLMatMulKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
};
template <typename T>
MaceStatus MatMulKernel<T>::Compute(
OpKernelContext *context,
const Tensor *A,
const Tensor *B,
Tensor *C,
bool transpose_a,
bool transpose_b,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK(!transpose_a && !transpose_b,
"GPU does not support transpose matmul");
index_t rank = A->dim_size();
index_t height = A->dim(rank - 2);
index_t K = A->dim(rank - 1);
index_t width = B->dim(rank - 1);
index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
std::multiplies<index_t>());
std::vector<index_t> c_shape = A->shape();
c_shape[rank - 2] = height;
c_shape[rank - 1] = width;
std::vector<size_t> c_image_shape;
std::vector<index_t> padded_c_shape = {batch, height, width, 1};
CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
const index_t height_blocks = RoundUpDiv4(height);
const index_t width_blocks = RoundUpDiv4(width);
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height_blocks * batch),
};
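// Illustrative example (shapes assumed): A {2, 64, 128} x B {2, 128, 96}
// gives batch = 2, height = 64, K = 128, width = 96, so height_blocks = 16,
// width_blocks = 24 and gws = {24, 16 * 2} = {24, 32}; each work item then
// covers a 4x4 tile of C.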
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
built_options.emplace("-Dmatmul=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(K));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_PAD_H_
#define MACE_KERNELS_OPENCL_IMAGE_PAD_H_
#include "mace/kernels/pad.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class PadKernel : public OpenCLPadKernel {
public:
PadKernel(const std::vector<int> &paddings,
const float constant_value)
: paddings_(paddings), constant_value_(constant_value) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
std::vector<int> paddings_;
float constant_value_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PadKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(this->paddings_.size() ==
static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
(this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
auto input_shape = input->shape();
std::vector<index_t> output_shape = {
input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
input_shape[2] + this->paddings_[4] + this->paddings_[5],
input_shape[3] + this->paddings_[6] + this->paddings_[7]};
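// Illustrative example: paddings_ is laid out as
// {N_before, N_after, H_before, H_after, W_before, W_after, C_before, C_after},
// so input {1, 32, 32, 3} with paddings_ {0, 0, 1, 1, 2, 2, 0, 0} yields
// output_shape {1, 34, 36, 3}; the checks above force the batch and channel
// paddings to zero.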
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channels = output->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, this->constant_value_);
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
kernel_.setArg(idx++, this->paddings_[2]);
kernel_.setArg(idx++, this->paddings_[4]);
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_PAD_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
#define MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
#include "mace/kernels/pooling.h"
#include <algorithm>
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace pooling {
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
lws[0] = gws[0];
}
lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
1);
}
return lws;
}
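// Illustrative trace of LocalWS (assuming kwg_size = 256 and base = 8): for
// gws = {16, 56, 56}, lws[1] = min(56, 256) = 56,
// lws[2] = min(min(56, 8), 256 / 56) = 4, and
// lws[0] = max(min(16 / 4, 256 / (56 * 4)), 1) = 1, giving lws = {1, 56, 4}
// with 224 work items per group.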
} // namespace pooling
template <typename T>
class PoolingKernel : public OpenCLPoolingKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus PoolingKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding &padding_type,
const std::vector<int> &padding_data,
const int *dilations,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
kernels[0], kernels[1]};
std::vector<int> paddings(2);
if (padding_data.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations, strides,
padding_type, output_shape.data(), paddings.data());
} else {
paddings = padding_data;
CalcOutputSize(input->shape().data(), filter_shape.data(),
padding_data.data(), dilations, strides, RoundType::CEIL,
output_shape.data());
}
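// Illustrative example (shapes assumed): input {1, 112, 112, 64} with 3x3
// kernels, stride 2 and SAME padding gives output_shape {1, 56, 56, 64}; the
// filter_shape above only mimics a conv2d filter (out_channels = in_channels)
// so the shared padding/output-size helpers can be reused.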
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
if (pooling_type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
} else {
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
}
if (pooling_type == AVG) {
built_options.emplace("-DPOOL_AVG");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {
static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
static_cast<uint32_t>(output->dim(2)),
static_cast<uint32_t>(output->dim(0) * output->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides[0]);
kernel_.setArg(idx++, strides[1]);
kernel_.setArg(idx++, kernels[0]);
kernel_.setArg(idx++, kernels[1]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = pooling::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_POOLING_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
#define MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
#include "mace/kernels/reduce_mean.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class ReduceMeanKernel : public OpenCLReduceMeanKernel {
public:
ReduceMeanKernel(const std::vector<int> axis,
const bool keep_dims)
: axis_(axis), keep_dims_(keep_dims) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const std::vector<int> axis_;
bool keep_dims_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ReduceMeanKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
// MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
MACE_CHECK(input->dim_size() == 4,
"reduce mean gpu only support 4-dim input");
MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
"reduce mean gpu only support 1,2-axis reduce");
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_QUALCOMM_ADRENO");
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
const uint32_t wave_size =
static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
} else {
gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
}
lws = {gws[0], gws[1], 1};
const int group_size = lws[0] * lws[1] * lws[2];
const int partial_len = (image_size + group_size - 1) / group_size;
const int remain_index = image_size % group_size;
const float in_width_reciprocal = 1.f / in_width;
const float img_size_reciprocal = 1.f / (in_width * in_height);
const float channel_blk_reciprocal = 1.f / channel_blocks;
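// Illustrative sizing (non-Adreno path, values assumed): lws = {4, 16, 1}
// gives group_size = 64; for a 7x7 feature map, image_size = 49, so
// partial_len = (49 + 63) / 64 = 1 and remain_index = 49: each of the first
// 49 work items reduces one pixel and the rest contribute nothing before the
// results are combined within the group.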
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
nullptr);
kernel_.setArg(idx++, static_cast<int32_t>(group_size));
kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
kernel_.setArg(idx++, static_cast<int32_t>(batch));
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, img_size_reciprocal);
kernel_.setArg(idx++, in_width_reciprocal);
kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
kernel_.setArg(idx++, channel_blk_reciprocal);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
#include "mace/kernels/resize_bicubic.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace resize_bicubic {
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = gws[0] / 8;
if (lws[0] == 0) {
lws[0] = gws[0];
}
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = gws[2] / 8;
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
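// Illustrative trace of LocalWS (assuming kwg_size = 256 and base = 8): for
// gws = {16, 224, 224}, lws[1] = min(224, 256) = 224 >= base, so
// lws[0] = min(min(16, 8), 256 / 224) = 1 and
// lws[2] = max(min(224 / 8, 256 / 224), 1) = 1, giving lws = {1, 224, 1}.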
} // namespace resize_bicubic
template <typename T>
class ResizeBicubicKernel : public OpenCLResizeBicubicKernel {
public:
ResizeBicubicKernel(bool align_corners,
const index_t out_height,
const index_t out_width)
: align_corners_(align_corners),
out_height_(out_height),
out_width_(out_width) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
bool align_corners_;
index_t out_height_;
index_t out_width_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeBicubicKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
auto dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bicubic",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bicubic::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
#define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
#include "mace/kernels/resize_bilinear.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace resize_bilinear {
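// Local work-group size heuristic: `base` scales with the device's global
// memory cache size, and each dimension is clamped so the product of
// lws[0..2] never exceeds the kernel's maximum work-group size.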
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
    uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = gws[0] / 8;
if (lws[0] == 0) {
lws[0] = gws[0];
}
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = gws[2] / 8;
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
return lws;
}
} // namespace resize_bilinear
template <typename T>
class ResizeBilinearKernel : public OpenCLResizeBilinearKernel {
public:
ResizeBilinearKernel(bool align_corners,
const index_t out_height,
const index_t out_width)
: align_corners_(align_corners),
out_height_(out_height),
out_width_(out_width) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
bool align_corners_;
index_t out_height_;
index_t out_width_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus ResizeBilinearKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t out_height = out_height_;
const index_t out_width = out_width_;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
kernel_.setArg(idx++, width_scale);
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const std::vector<uint32_t>
lws = resize_bilinear::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
#define MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
#include "mace/kernels/softmax.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
namespace softmax {
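// Local work-group heuristic for softmax: as in the resize kernels, a
// cache-size-dependent base factor bounds lws[0], and the remaining
// dimensions are fitted to the kernel's maximum work-group size.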
inline std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
    uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) {
lws[0] = gws[0];
} else {
lws[0] = gws[0] / base;
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
lws[2] = std::max<uint32_t>(std::min<uint32_t>(
gws[2], kwg_size / (lws[0] * lws[1])), 1);
}
return lws;
}
} // namespace softmax
template <typename T>
class SoftmaxKernel : public OpenCLSoftmaxKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SoftmaxKernel<T>::Compute(
OpKernelContext *context,
const Tensor *logits,
Tensor *output,
StatsFuture *future) {
index_t batch = 0;
index_t height = 0;
index_t width = 0;
index_t channels = 0;
if (logits->dim_size() == 2) {
batch = logits->dim(0);
height = 1;
width = 1;
channels = logits->dim(1);
} else if (logits->dim_size() == 4) {
batch = logits->dim(0);
height = logits->dim(1);
width = logits->dim(2);
channels = logits->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
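  // Channels are packed four per image texel, so the kernel iterates over
  // channel blocks; remain_channels counts the padded lanes in the last block.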
const index_t channel_blocks = RoundUpDiv4(channels);
const int remain_channels = channel_blocks * 4 - channels;
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = softmax::LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
#include "mace/kernels/space_to_batch.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel {
public:
MaceStatus Compute(
OpKernelContext *context,
const Tensor *space_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) override;
private:
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SpaceToBatchKernel<T>::Compute(
OpKernelContext *context,
const Tensor *space_tensor,
const std::vector<int> &paddings,
const std::vector<int> &block_shape,
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
MACE_RETURN_IF_ERROR(
batch_tensor->ResizeImage(output_shape, output_image_shape));
const char *kernel_name = "space_to_batch";
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
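  // Global work size over the output: [channel blocks, width, batch * height].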
const uint32_t gws[3] = {
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, space_tensor->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, block_shape[0]);
kernel_.setArg(idx++, block_shape[1]);
kernel_.setArg(idx++, paddings[0]);
kernel_.setArg(idx++, paddings[2]);
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
input_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
#define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
#include "mace/kernels/space_to_depth.h"
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel {
public:
explicit SpaceToDepthKernel(const int block_size)
: block_size_(block_size) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) override;
private:
const int block_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus SpaceToDepthKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t input_depth = input->dim(3);
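  // The OpenCL image layout packs four channels per texel and this kernel
  // does not handle partial blocks, hence the divisibility checks below.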
  MACE_CHECK((input_depth % 4) == 0,
             "input channel should be divisible by 4");
  MACE_CHECK(
      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
      "input width and height should be divisible by block_size");
const index_t output_height = input_height / block_size_;
const index_t output_width = input_width / block_size_;
const index_t output_depth = input_depth * block_size_ * block_size_;
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
std::vector<index_t> output_shape = {batch, output_height, output_width,
output_depth};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
const char *kernel_name = "space_to_depth";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
static_cast<uint32_t>(input_width),
static_cast<uint32_t>(input_height * batch)};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
kernel_.setArg(idx++, static_cast<int32_t>(input_width));
kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
kernel_.setArg(idx++, static_cast<int32_t>(output_width));
kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
#define MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
#include "mace/kernels/split.h"
#include <algorithm>
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class SplitKernel : public OpenCLSplitKernel {
public:
explicit SplitKernel(const int32_t axis) : axis_(axis) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) override;
private:
int32_t axis_;
cl::Kernel kernel_;
uint32_t kwg_size_;
};
template <typename T>
MaceStatus SplitKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) {
const index_t input_channels = input->dim(3);
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
MACE_CHECK(output_channels % 4 == 0)
<< "output channels of split op must be divisible by 4";
std::vector<index_t> output_shape(
{input->dim(0), input->dim(1), input->dim(2), output_channels});
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i = 0; i < outputs_count; ++i) {
MACE_RETURN_IF_ERROR(
output_list[i]->ResizeImage(output_shape, image_shape));
}
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
const index_t channel_blk = RoundUpDiv4(output_channels);
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
MACE_OUT_OF_RANGE_INIT(kernel_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
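  // Enqueue the kernel once per output tensor; when profiling, keep the
  // earliest start time and accumulate the execution time of each launch.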
for (size_t i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_3D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
} else {
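      // Devices without non-uniform work-group support need the global size
      // rounded up to a multiple of the local size.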
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
kernel_, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
MACE_OUT_OF_RANGE_VALIDATION;
if (future != nullptr && runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
}
}
if (future != nullptr) {
future->wait_fn = [runtime, call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
}
};
}
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
#define MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
#include "mace/kernels/winograd_transform.h"
#include <memory>
#include <vector>
#include <set>
#include <string>
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
namespace opencl {
namespace image {
template <typename T>
class WinogradTransformKernel : public OpenCLWinogradTransformKernel {
public:
WinogradTransformKernel(
const Padding &padding_type,
const std::vector<int> &paddings,
const int block_size)
: strides_({1, 1}),
dilations_({1, 1}),
padding_type_(padding_type),
paddings_(paddings),
wino_blk_size_(block_size) {}
MaceStatus Compute(
OpKernelContext *context,
const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) override;
private:
const std::vector<int> strides_; // [stride_h, stride_w]
const std::vector<int> dilations_; // [dilation_h, dilation_w]
Padding padding_type_;
std::vector<int> paddings_;
const int wino_blk_size_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus WinogradTransformKernel<T>::Compute(
OpKernelContext *context,
const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
built_options.emplace("-Dwinograd_transform_4x4="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 2) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
built_options.emplace("-Dwinograd_transform_2x2="
+ obfuscated_kernel_name);
} else {
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {1, input_tensor->dim(3), 3, 3};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
strides_.data(), padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
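  // Number of wino_blk_size x wino_blk_size output tiles along each axis.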
const index_t round_h =
(output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w =
(output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
const uint32_t gws[2] = {
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))
};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {blk_sqr, input_tensor->dim(3), out_width};
std::vector<index_t> padded_output_shape = {
output_shape[0], output_shape[1], output_shape[2], 1
};
std::vector<size_t> image_shape;
CalImage2DShape(padded_output_shape,
BufferType::IN_OUT_HEIGHT,
&image_shape);
// remove unused last dimension
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key = Concat("winograd_transform_kernel",
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
template <typename T>
class WinogradInverseTransformKernel
: public OpenCLWinogradInverseTransformKernel {
public:
WinogradInverseTransformKernel(
const ActivationType activation,
const float relux_max_limit,
const int block_size)
: wino_blk_size_(block_size),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
MaceStatus Compute(
OpKernelContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output_tensor,
StatsFuture *future) override;
private:
const int wino_blk_size_;
const ActivationType activation_;
const float relux_max_limit_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
};
template <typename T>
MaceStatus WinogradInverseTransformKernel<T>::Compute(
OpKernelContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output_tensor,
StatsFuture *future) {
auto runtime = context->device()->opencl_runtime();
MACE_OUT_OF_RANGE_DEFINITION;
const Tensor *input_tensor = inputs[0];
const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
MACE_OUT_OF_RANGE_CONFIG;
MACE_NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
built_options.emplace("-Dwinograd_inverse_transform_4x4="
+ obfuscated_kernel_name);
} else if (wino_blk_size_ == 2) {
obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
built_options.emplace("-Dwinograd_inverse_transform_2x2="
+ obfuscated_kernel_name);
} else {
MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
return MACE_SUCCESS;
}
built_options.emplace("-DDATA_TYPE=" +
DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
switch (activation_) {
case NOOP:
break;
case RELU:
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
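  // The output spatial shape is passed as a second input tensor; map it to
  // host memory to read batch, height, and width.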
Tensor::MappingGuard output_shape_guard(inputs[1]);
const int32_t *output_shape_data = inputs[1]->data<int32_t>();
const index_t batch = output_shape_data[0];
const index_t height = output_shape_data[1];
const index_t width = output_shape_data[2];
const uint32_t gws[2] = {
static_cast<uint32_t>(input_tensor->dim(2)),
static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
MACE_OUT_OF_RANGE_INIT(kernel_);
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
std::vector<index_t> output_shape = {batch, height, width,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
const float round_w_r = 1.f / static_cast<float>(round_w);
uint32_t idx = 0;
MACE_OUT_OF_RANGE_SET_ARGS(kernel_);
MACE_SET_2D_GWS_ARGS(kernel_, gws);
kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
if (bias != nullptr) {
kernel_.setArg(idx++,
*(static_cast<const cl::Image2D *>(bias->opencl_image())));
}
kernel_.setArg(
idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, round_hw_r);
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, round_w_r);
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
MACE_OUT_OF_RANGE_VALIDATION;
return MACE_SUCCESS;
}
} // namespace image
} // namespace opencl
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_
...
@@ -13,14 +13,23 @@
 // limitations under the License.
 #include "mace/kernels/lstmcell.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/lstm_cell.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+LSTMCellFunctor<DeviceType::GPU, T>::LSTMCellFunctor(
+    OpKernelContext *context,
+    T forget_bias)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::LSTMCellKernel<T>(forget_bias));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
@@ -31,76 +40,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
     Tensor *cell,
     Tensor *output,
     StatsFuture *future) {
-  MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0,
-             "LSTM hidden units should be a multiple of 4");
-  const index_t height = input->dim(0);
-  const index_t width = input->dim(1);
-  const index_t hidden_units = pre_output->dim(1);
-  const index_t w_blocks = hidden_units >> 2;
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
-    built_options.emplace("-Dlstmcell=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("lstmcell", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const uint32_t gws[2] = {static_cast<uint32_t>(w_blocks),
-                           static_cast<uint32_t>(height)};
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    std::vector<index_t> output_shape_padded = {height, 1, 1, hidden_units};
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(),
-                                             output_image_shape));
-    MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(),
-                                           output_image_shape));
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(pre_output->opencl_image()));
-    kernel_.setArg(idx++, *(weight->opencl_image()));
-    kernel_.setArg(idx++, *(bias->opencl_image()));
-    kernel_.setArg(idx++, *(pre_cell->opencl_image()));
-    kernel_.setArg(idx++, static_cast<float>(forget_bias_));
-    kernel_.setArg(idx++, static_cast<int32_t>(width));
-    kernel_.setArg(idx++, static_cast<int32_t>(hidden_units));
-    kernel_.setArg(idx++, static_cast<int32_t>(RoundUpDiv4(width)));
-    kernel_.setArg(idx++, *(cell->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
-  std::string tuning_key =
-      Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, pre_output, weight, bias,
+                          pre_cell, cell, output, future);
 }
 template struct LSTMCellFunctor<DeviceType::GPU, float>;
 template struct LSTMCellFunctor<DeviceType::GPU, half>;
 }  // namespace kernels
...
...
@@ -13,13 +13,21 @@
 // limitations under the License.
 #include "mace/kernels/matmul.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/matmul.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+MatMulFunctor<DeviceType::GPU, T>::MatMulFunctor(OpKernelContext *context)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::MatMulKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                          const Tensor *B,
@@ -27,68 +35,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                          bool transpose_a,
                                                          bool transpose_b,
                                                          StatsFuture *future) {
-  MACE_UNUSED(future);
-  MACE_CHECK(!transpose_a && !transpose_b,
-             "GPU does not support transpose matmul");
-  index_t rank = A->dim_size();
-  index_t height = A->dim(rank - 2);
-  index_t K = A->dim(rank - 1);
-  index_t width = B->dim(rank - 1);
-  index_t batch = std::accumulate(A->shape().begin(), A->shape().end() - 2, 1,
-                                  std::multiplies<index_t>());
-  std::vector<index_t> c_shape = A->shape();
-  c_shape[rank - 2] = height;
-  c_shape[rank - 1] = width;
-  std::vector<size_t> c_image_shape;
-  std::vector<index_t> padded_c_shape = {batch, height, width, 1};
-  CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
-  MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape));
-  const index_t height_blocks = RoundUpDiv4(height);
-  const index_t width_blocks = RoundUpDiv4(width);
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_blocks),
-      static_cast<uint32_t>(height_blocks * batch),
-  };
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
-    built_options.emplace("-Dmatmul=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  uint32_t idx = 0;
-  OUT_OF_RANGE_SET_ARG;
-  SET_2D_GWS_ARGS(kernel_);
-  kernel_.setArg(idx++, *(A->opencl_image()));
-  kernel_.setArg(idx++, *(B->opencl_image()));
-  kernel_.setArg(idx++, *(C->opencl_image()));
-  kernel_.setArg(idx++, static_cast<int>(height));
-  kernel_.setArg(idx++, static_cast<int>(width));
-  kernel_.setArg(idx++, static_cast<int>(K));
-  kernel_.setArg(idx++, static_cast<int>(height_blocks));
-  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(K)));
-  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
-  std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, A, B, C, transpose_a, transpose_b, future);
 }
 template struct MatMulFunctor<DeviceType::GPU, float>;
...
...
@@ -31,7 +31,7 @@ bool BufferToImageOpImpl(OpKernelContext *context,
                         Tensor *buffer,
                         Tensor *image,
                         const std::vector<size_t> &image_shape) {
-  std::unique_ptr<BufferBase> kernel_error;
+  std::unique_ptr<BufferBase> oorc_flag;
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};
@@ -43,8 +43,8 @@ bool BufferToImageOpImpl(OpKernelContext *context,
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
-  OUT_OF_RANGE_CONFIG(kernel_error, context);
-  NON_UNIFORM_WG_CONFIG;
+  MACE_OUT_OF_RANGE_CONFIG;
+  MACE_NON_UNIFORM_WG_CONFIG;
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" +
                           DtToCLDt(DataTypeToEnum<float>::value));
@@ -67,12 +67,13 @@ bool BufferToImageOpImpl(OpKernelContext *context,
     return false;
   }
+  MACE_OUT_OF_RANGE_INIT(kernel);
   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel.setArg(idx++,
-                  *(static_cast<cl::Buffer *>(kernel_error->buffer())));
+                  *(static_cast<cl::Buffer *>(oorc_flag->buffer())));
   }
-  SET_2D_GWS_ARGS(kernel);
+  MACE_SET_2D_GWS_ARGS(kernel, gws);
   kernel.setArg(idx++, *(buffer->opencl_buffer()));
   MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
              "buffer offset not aligned");
@@ -110,9 +111,9 @@ bool BufferToImageOpImpl(OpKernelContext *context,
   runtime->command_queue().finish();
   bool is_out_of_range = false;
   if (runtime->IsOutOfRangeCheckEnabled()) {
-    kernel_error->Map(nullptr);
-    is_out_of_range = *(kernel_error->mutable_data<char>()) == 1 ? true : false;
-    kernel_error->UnMap();
+    oorc_flag->Map(nullptr);
+    is_out_of_range = *(oorc_flag->mutable_data<char>()) == 1 ? true : false;
+    oorc_flag->UnMap();
   }
   return is_out_of_range;
 }
...
...
@@ -13,86 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/pad.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/pad.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+PadFunctor<DeviceType::GPU, T>::PadFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &paddings,
+    const float constant_value)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::PadKernel<T>(paddings, constant_value));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                       Tensor *output,
                                                       StatsFuture *future) {
-  MACE_CHECK(this->paddings_.size() ==
-             static_cast<size_t>((input->dim_size() * 2)));
-  MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) &&
-             (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
-      << "Mace only support height/width dimension now";
-  auto input_shape = input->shape();
-  std::vector<index_t> output_shape = {
-      input_shape[0] + this->paddings_[0] + this->paddings_[1],
-      input_shape[1] + this->paddings_[2] + this->paddings_[3],
-      input_shape[2] + this->paddings_[4] + this->paddings_[5],
-      input_shape[3] + this->paddings_[6] + this->paddings_[7]};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
-    built_options.emplace("-Dpad=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    int idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, this->constant_value_);
-    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[1]));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_shape[2]));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_shape[1]));
-    kernel_.setArg(idx++, this->paddings_[2]);
-    kernel_.setArg(idx++, this->paddings_[4]);
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
-                                  output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct PadFunctor<DeviceType::GPU, float>;
...
...
@@ -13,153 +13,45 @@
 // limitations under the License.
 #include "mace/kernels/pooling.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/buffer/pooling.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/pooling.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t
-        cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    lws[2] =
-        std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
-    const uint32_t lws_size = lws[1] * lws[2];
-    lws[0] = gws[0] / 4;
-    if (lws[0] == 0) {
-      lws[0] = gws[0];
-    }
-    lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
-                                1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+PoolingFunctor<DeviceType::GPU, T>::PoolingFunctor(
+    OpKernelContext *context,
+    const PoolingType pooling_type,
+    const int *kernels,
+    const int *strides,
+    const Padding padding_type,
+    const std::vector<int> &paddings,
+    const int *dilations)
+    : PoolingFunctorBase(context,
+                         pooling_type,
+                         kernels,
+                         strides,
+                         padding_type,
+                         paddings,
+                         dilations) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::PoolingKernel<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::PoolingKernel<T>);
+  }
+}
 template <typename T>
-MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
-                                                          Tensor *output,
-                                                          StatsFuture *future) {
-  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
-      << "Pooling opencl kernel not support dilation yet";
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    const DataType dt = DataTypeToEnum<T>::value;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
-    built_options.emplace("-Dpooling=" + kernel_name);
-    if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
-      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-      built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
-    } else {
-      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-      built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    }
-    if (pooling_type_ == AVG) {
-      built_options.emplace("-DPOOL_AVG");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  std::vector<uint32_t> gws;
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    std::vector<index_t> output_shape(4);
-    std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
-                                         kernels_[0], kernels_[1]};
-    std::vector<int> paddings(2);
-    if (paddings_.empty()) {
-      kernels::CalcNHWCPaddingAndOutputSize(
-          input->shape().data(), filter_shape.data(), dilations_, strides_,
-          padding_type_, output_shape.data(), paddings.data());
-    } else {
-      paddings = paddings_;
-      CalcOutputSize(input->shape().data(), filter_shape.data(),
-                     paddings_.data(), dilations_, strides_, RoundType::CEIL,
-                     output_shape.data());
-    }
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-    index_t batch = output->dim(0);
-    index_t out_height = output->dim(1);
-    index_t out_width = output->dim(2);
-    index_t channels = output->dim(3);
-    index_t channel_blocks = (channels + 3) / 4;
-    gws = {
-        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
-        static_cast<uint32_t>(batch * out_height),
-    };
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-    kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
-    kernel_.setArg(idx++, paddings[0] / 2);
-    kernel_.setArg(idx++, paddings[1] / 2);
-    kernel_.setArg(idx++, strides_[0]);
-    kernel_.setArg(idx++, strides_[1]);
-    kernel_.setArg(idx++, kernels_[0]);
-    kernel_.setArg(idx++, kernels_[1]);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  } else {
-    index_t batch = output->dim(0);
-    index_t out_height = output->dim(1);
-    index_t out_width = output->dim(2);
-    index_t channels = output->dim(3);
-    index_t channel_blocks = (channels + 3) / 4;
-    gws = {
-        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
-        static_cast<uint32_t>(batch * out_height),
-    };
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
-  std::string tuning_key =
-      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws.data(), lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
+    Tensor *output,
+    StatsFuture *future) {
+  return kernel_->Compute(context_, input, pooling_type_, kernels_, strides_,
+                          padding_type_, paddings_, dilations_,
+                          output, future);
 }
 template struct PoolingFunctor<DeviceType::GPU, float>;
...
@@ -13,127 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/reduce_mean.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/reduce_mean.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+ReduceMeanFunctor<DeviceType::GPU, T>::ReduceMeanFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &axis,
+    const bool keep_dims) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ReduceMeanKernel<T>(axis, keep_dims));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     Tensor *output,
     StatsFuture *future) {
-  MACE_CHECK_NOTNULL(input);
-  // MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
-  MACE_CHECK(input->dim_size() == 4,
-             "reduce mean gpu only support 4-dim input");
-  MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
-             "reduce mean gpu only support 1,2-axis reduce");
-  index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);
-  auto runtime = context_->device()->opencl_runtime();
-  std::vector<uint32_t> gws(3);
-  std::vector<uint32_t> lws(3);
-  std::vector<index_t> output_shape{batch, 1, 1, channels};
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-  if (kernel_.get() == nullptr) {
-    const DataType dt = DataTypeToEnum<T>::value;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
-    built_options.emplace("-Dreduce_mean=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
-      built_options.emplace("-DNON_QUALCOMM_ADRENO");
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(kernel_));
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * channel_blocks)};
-  } else {
-    gws = {4, 16, static_cast<uint32_t>(batch * channel_blocks)};
-  }
-  lws = {gws[0], gws[1], 1};
-  const int group_size = lws[0] * lws[1] * lws[2];
-  const int partial_len = (image_size + group_size - 1) / group_size;
-  const int remain_index = image_size % group_size;
-  const float in_width_reciprocal = 1.f / in_width;
-  const float img_size_reciprocal = 1.f / (in_width * in_height);
-  const float channel_blk_reciprocal = 1.f / channel_blocks;
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, (group_size * 4 * sizeof(T)),
-                   nullptr);
-    kernel_.setArg(idx++, static_cast<int32_t>(group_size));
-    kernel_.setArg(idx++, static_cast<int32_t>(partial_len));
-    kernel_.setArg(idx++, static_cast<int32_t>(remain_index));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, img_size_reciprocal);
-    kernel_.setArg(idx++, in_width_reciprocal);
-    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
-    kernel_.setArg(idx++, channel_blk_reciprocal);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  cl::Event event;
-  cl_int error;
-  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  } else {
-    std::vector<uint32_t> roundup_gws(lws.size());
-    for (size_t i = 0; i < lws.size(); ++i) {
-      roundup_gws[i] = RoundUp(gws[i], lws[i]);
-    }
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange,
-        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  }
-  MACE_CL_RET_STATUS(error);
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct ReduceMeanFunctor<DeviceType::GPU, float>;
......
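The reduce_mean change above is representative of every kernel file in this merge request: the functor no longer owns a cl::Kernel directly but a std::unique_ptr to an abstract OpenCL*Kernel interface, chosen once at construction from OpenCLRuntime::UseImageMemory(). A minimal standalone sketch of that dispatch pattern, using hypothetical names rather than MACE's actual classes:

#include <iostream>
#include <memory>

class Kernel {                       // abstract strategy interface
 public:
  virtual ~Kernel() = default;
  virtual void Compute() = 0;
};

class ImageKernel : public Kernel {  // image2d_t-backed implementation
 public:
  void Compute() override { std::cout << "image path\n"; }
};

class BufferKernel : public Kernel {  // plain cl_mem buffer implementation
 public:
  void Compute() override { std::cout << "buffer path\n"; }
};

class Functor {                      // the op-level wrapper
 public:
  explicit Functor(bool use_image_memory) {
    if (use_image_memory) {
      kernel_.reset(new ImageKernel());
    } else {
      kernel_.reset(new BufferKernel());
    }
  }
  void operator()() { kernel_->Compute(); }  // delegate unconditionally

 private:
  std::unique_ptr<Kernel> kernel_;
};

int main() {
  Functor(true)();   // prints "image path"
  Functor(false)();  // prints "buffer path"
}

Ops that only have an image implementation so far (reduce_mean above, for one) hit MACE_NOT_IMPLEMENTED on the buffer path; softmax later in this commit is the first op here wired to a real opencl::buffer kernel.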
@@ -13,119 +13,31 @@
 // limitations under the License.
 #include "mace/kernels/resize_bicubic.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/resize_bicubic.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = runtime->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= base) {
-    lws[0] = std::min<uint32_t>(gws[0], base);
-  } else {
-    lws[0] = gws[0] / 8;
-    if (lws[0] == 0) {
-      lws[0] = gws[0];
-    }
-  }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = gws[2] / 8;
-  if (lws[2] == 0) {
-    lws[2] = gws[2];
-  }
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                              1);
-  return lws;
-}
-}  // namespace
+template <typename T>
+ResizeBicubicFunctor<DeviceType::GPU, T>::ResizeBicubicFunctor(
+    OpKernelContext *context,
+    bool align_corners,
+    const std::vector<index_t> &size)
+    : OpKernel(context) {
+  MACE_CHECK(size.size() == 2);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ResizeBicubicKernel<T>(align_corners,
+                                                            size[0],
+                                                            size[1]));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t out_height = out_height_;
-  const index_t out_width = out_width_;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(out_width),
-                           static_cast<uint32_t>(out_height * batch)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    auto dt = DataTypeToEnum<T>::value;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
-    built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("resize_bicubic",
-                             kernel_name,
-                             built_options,
-                             &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-    float height_scale =
-        CalculateResizeScale(in_height, out_height, align_corners_);
-    float width_scale =
-        CalculateResizeScale(in_width, out_width, align_corners_);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, height_scale);
-    kernel_.setArg(idx++, width_scale);
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct ResizeBicubicFunctor<DeviceType::GPU, float>;
......
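The removed host code computed height_scale and width_scale with CalculateResizeScale before binding kernel arguments. That helper is defined elsewhere in mace/kernels; the sketch below restates the usual TensorFlow-style convention it is assumed to follow, not the verbatim MACE source:

#include <cstdint>

// Assumed semantics of CalculateResizeScale: with align_corners the first
// and last samples of input and output coincide, so the step between
// samples is (in - 1) / (out - 1); otherwise pixels are treated as cells
// and the step is in / out.
float ResizeScale(int64_t in_size, int64_t out_size, bool align_corners) {
  return (align_corners && out_size > 1)
             ? (in_size - 1) / static_cast<float>(out_size - 1)
             : in_size / static_cast<float>(out_size);
}

For example, resizing 4 -> 7 gives a scale of 0.5 with align_corners and about 0.571 without.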
@@ -13,122 +13,29 @@
 // limitations under the License.
 #include "mace/kernels/resize_bilinear.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/tensor.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/resize_bilinear.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    if (lws[1] >= base) {
-      lws[0] = std::min<uint32_t>(gws[0], base);
-    } else {
-      lws[0] = gws[0] / 8;
-      if (lws[0] == 0) {
-        lws[0] = gws[0];
-      }
-    }
-    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-    const uint32_t lws_size = lws[0] * lws[1];
-    lws[2] = gws[2] / 8;
-    if (lws[2] == 0) {
-      lws[2] = gws[2];
-    }
-    lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
-                                1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+ResizeBilinearFunctor<DeviceType::GPU, T>::ResizeBilinearFunctor(
+    OpKernelContext *context,
+    const std::vector<index_t> &size,
+    bool align_corners) : OpKernel(context) {
+  MACE_CHECK(size.size() == 2);
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ResizeBilinearKernel<T>(align_corners,
+                                                             size[0],
+                                                             size[1]));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t in_height = input->dim(1);
-  const index_t in_width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t out_height = out_height_;
-  const index_t out_width = out_width_;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(out_width),
-                           static_cast<uint32_t>(out_height * batch)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
-    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("resize_bilinear",
-                             kernel_name,
-                             built_options,
-                             &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    MACE_CHECK(out_height > 0 && out_width > 0);
-    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
-    float height_scale =
-        CalculateResizeScale(in_height, out_height, align_corners_);
-    float width_scale =
-        CalculateResizeScale(in_width, out_width, align_corners_);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, height_scale);
-    kernel_.setArg(idx++, width_scale);
-    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
-    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
}
 template struct ResizeBilinearFunctor<DeviceType::GPU, float>;
......
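resize_bicubic.cc and resize_bilinear.cc each dropped a near-identical anonymous-namespace LocalWS heuristic that sized the local workgroup from the device's global memory cache (the image kernels now carry it). Restated as a self-contained function, taking the cache size directly instead of an OpenCLRuntime*; the kBaseGPUMemCacheSize value below is only a placeholder, the real constant lives in MACE's OpenCL helper code:

#include <algorithm>
#include <cstdint>
#include <vector>

const uint32_t kBaseGPUMemCacheSize = 16384;  // illustrative placeholder

std::vector<uint32_t> LocalWS(uint64_t cache_size,
                              const uint32_t *gws,
                              uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    lws[0] = lws[1] = lws[2] = 1;
  } else {
    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
    // Favor a wide dimension 0 only when dimension 1 already covers the
    // cache-derived base; otherwise fall back to an eighth of gws[0].
    if (lws[1] >= base) {
      lws[0] = std::min<uint32_t>(gws[0], base);
    } else {
      lws[0] = gws[0] / 8;
      if (lws[0] == 0) lws[0] = gws[0];
    }
    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
    const uint32_t lws_size = lws[0] * lws[1];
    lws[2] = gws[2] / 8;
    if (lws[2] == 0) lws[2] = gws[2];
    // Dimension 2 takes whatever workgroup budget remains, at least 1.
    lws[2] = std::max<uint32_t>(
        std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
  }
  return lws;
}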
@@ -13,110 +13,28 @@
 // limitations under the License.
 #include "mace/kernels/softmax.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/buffer/softmax.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/softmax.h"
 namespace mace {
 namespace kernels {
-namespace {
-std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
-                              const uint32_t *gws,
-                              const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(4, 0);
-  if (kwg_size == 0) {
-    lws[0] = lws[1] = lws[2] = 1;
-  } else {
-    uint64_t cache_size = runtime->device_global_mem_cache_size();
-    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-    if (gws[0] < base) {
-      lws[0] = gws[0];
-    } else {
-      lws[0] = gws[0] / base;
-    }
-    lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-    lws[2] = std::max<uint32_t>(std::min<uint32_t>(
-        gws[2], kwg_size / (lws[0] * lws[1])), 1);
-  }
-  return lws;
-}
-}  // namespace
+template <typename T>
+SoftmaxFunctor<DeviceType::GPU, T>::SoftmaxFunctor(OpKernelContext *context)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SoftmaxKernel<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::SoftmaxKernel<T>);
+  }
+}
 template <typename T>
 MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                                                           Tensor *output,
                                                           StatsFuture *future) {
-  index_t batch = 0;
-  index_t height = 0;
-  index_t width = 0;
-  index_t channels = 0;
-  if (logits->dim_size() == 2) {
-    batch = logits->dim(0);
-    height = 1;
-    width = 1;
-    channels = logits->dim(1);
-  } else if (logits->dim_size() == 4) {
-    batch = logits->dim(0);
-    height = logits->dim(1);
-    width = logits->dim(2);
-    channels = logits->dim(3);
-  } else {
-    MACE_NOT_IMPLEMENTED;
-  }
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const int remain_channels = channel_blocks * 4 - channels;
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
-    built_options.emplace("-Dsoftmax=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, logits->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(logits->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int>(channels));
-    kernel_.setArg(idx++, remain_channels);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = logits->shape();
-  }
-  std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("softmax_opencl_kernel", batch, height, width, channels);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, logits, output, future);
 }
 template struct SoftmaxFunctor<DeviceType::GPU, float>;
......
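Softmax is the first op in this section with a genuine buffer fallback: its constructor picks opencl::buffer::SoftmaxKernel<T> when image memory is off instead of aborting. The channel tiling the removed host code relied on is worth restating, since every image kernel here packs 4 channels per pixel; RoundUpDiv4 exists in MACE's helpers, this mirrors it:

#include <cstdint>
#include <iostream>

// OpenCL image pixels hold 4 values, so channels are processed in
// blocks of 4.
int64_t RoundUpDiv4(int64_t v) { return (v + 3) >> 2; }

int main() {
  const int64_t channels = 10;
  const int64_t channel_blocks = RoundUpDiv4(channels);           // 3
  const int64_t remain_channels = channel_blocks * 4 - channels;  // 2
  // The softmax kernel must exclude the padded lanes from the
  // normalizing sum, which is why remain_channels was a kernel argument.
  std::cout << channel_blocks << " " << remain_channels << "\n";
}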
@@ -16,81 +16,32 @@
 #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
 #include "mace/kernels/space_to_batch.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/space_to_batch.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+SpaceToBatchFunctor<DeviceType::GPU, T>::SpaceToBatchFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &paddings,
+    const std::vector<int> &block_shape)
+    : SpaceToBatchFunctorBase(context, paddings, block_shape) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SpaceToBatchKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
-    Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
+    const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
   std::vector<index_t> output_shape(4, 0);
   CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC,
                                    output_shape.data());
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(
-      batch_tensor->ResizeImage(output_shape, output_image_shape));
-  const char *kernel_name = "space_to_batch";
-  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
-  const uint32_t gws[3] = {
-      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
-      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-    kernel_.setArg(idx++, block_shape_[0]);
-    kernel_.setArg(idx++, block_shape_[1]);
-    kernel_.setArg(idx++, paddings_[0]);
-    kernel_.setArg(idx++, paddings_[2]);
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
-    space_shape_ = space_tensor->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
-             batch_tensor->dim(2), batch_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, space_tensor, paddings_, block_shape_,
+                          output_shape, batch_tensor, future);
 }
 template struct SpaceToBatchFunctor<DeviceType::GPU, float>;
......
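Note the interface change here: space_tensor is now const, and paddings_, block_shape_, and the precomputed output shape are passed into Compute explicitly rather than being read by the kernel from functor members. A sketch of the NHWC output-shape rule that CalculateSpaceToBatchOutputShape is assumed to implement (the real helper lives elsewhere in mace/kernels; the {top, bottom, left, right} padding layout is inferred from the paddings_[0]/paddings_[2] arguments in the removed setArg code):

#include <cstdint>
#include <vector>

std::vector<int64_t> SpaceToBatchShape(
    const std::vector<int64_t> &in,   // NHWC input shape
    const std::vector<int> &block,    // {block_h, block_w}
    const std::vector<int> &pad) {    // {top, bottom, left, right}, assumed
  const int64_t padded_h = in[1] + pad[0] + pad[1];
  const int64_t padded_w = in[2] + pad[2] + pad[3];
  return {in[0] * block[0] * block[1],  // batch grows by the block area
          padded_h / block[0],          // padded dims must divide evenly
          padded_w / block[1],
          in[3]};                       // channels unchanged
}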
@@ -13,91 +13,27 @@
 // limitations under the License.
 #include "mace/kernels/space_to_depth.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/space_to_depth.h"
 namespace mace {
 namespace kernels {
 template <typename T>
+SpaceToDepthOpFunctor<DeviceType::GPU, T>::SpaceToDepthOpFunctor(
+    OpKernelContext *context,
+    const int block_size)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SpaceToDepthKernel<T>(block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+template <typename T>
 MaceStatus SpaceToDepthOpFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input, Tensor *output, StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t input_height = input->dim(1);
-  const index_t input_width = input->dim(2);
-  const index_t input_depth = input->dim(3);
-  MACE_CHECK((input_depth % 4) == 0,
-             "input channel should be dividable by 4");
-  MACE_CHECK(
-      (input_width % block_size_ == 0) && (input_height % block_size_ == 0),
-      "input width and height should be dividable by block_size");
-  const index_t output_height = input_height / block_size_;
-  const index_t output_width = input_width / block_size_;
-  const index_t output_depth = input_depth * block_size_ * block_size_;
-  const index_t input_depth_blocks = RoundUpDiv4(input_depth);
-  const index_t output_depth_blocks = RoundUpDiv4(output_depth);
-  std::vector<index_t> output_shape = {batch, output_height, output_width,
-                                       output_depth};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    const char *kernel_name = "space_to_depth";
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_depth",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(input_depth_blocks),
-                           static_cast<uint32_t>(input_width),
-                           static_cast<uint32_t>(input_height * batch)};
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(block_size_));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(input_depth_blocks));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_height * batch));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_width));
-    kernel_.setArg(idx++, static_cast<int32_t>(output_depth_blocks));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
-                                  input->dim(1), input->dim(2), input->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output, future);
 }
 template struct SpaceToDepthOpFunctor<DeviceType::GPU, float>;
......
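The input checks and output-shape math deleted from the functor body above (now the image kernel's job) reduce to a few lines, restated here for reference:

#include <cassert>
#include <cstdint>

// space_to_depth, NHWC: [batch, H, W, C] -> [batch, H/b, W/b, C*b*b],
// with the removed code requiring C % 4 == 0, H % b == 0, W % b == 0.
void SpaceToDepthShape(int64_t h, int64_t w, int64_t c, int b,
                       int64_t *out_h, int64_t *out_w, int64_t *out_c) {
  assert(c % 4 == 0 && h % b == 0 && w % b == 0);
  *out_h = h / b;
  *out_w = w / b;
  *out_c = c * b * b;  // each b x b spatial block becomes extra channels
}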
@@ -13,107 +13,28 @@
 // limitations under the License.
 #include "mace/kernels/split.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/split.h"
 namespace mace {
 namespace kernels {
+template <typename T>
+SplitFunctor<DeviceType::GPU, T>::SplitFunctor(OpKernelContext *context,
+                                               const int32_t axis)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::SplitKernel<T>(axis));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     const std::vector<Tensor *> &output_list,
     StatsFuture *future) {
-  const index_t input_channels = input->dim(3);
-  const size_t outputs_count = output_list.size();
-  const index_t output_channels = input_channels / outputs_count;
-  MACE_CHECK(output_channels % 4 == 0)
-      << "output channels of split op must be divisible by 4";
-  std::vector<index_t> output_shape(
-      {input->dim(0), input->dim(1), input->dim(2), output_channels});
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-  for (size_t i = 0; i < outputs_count; ++i) {
-    MACE_RETURN_IF_ERROR(
-        output_list[i]->ResizeImage(output_shape, image_shape));
-  }
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
-    built_options.emplace("-Dsplit=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("split",
-                                              kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  const index_t channel_blk = RoundUpDiv4(output_channels);
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(input->dim(2)),
-      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
-  };
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  cl::Event event;
-  CallStats call_stats{INT64_MAX, 0};
-  for (size_t i = 0; i < outputs_count; ++i) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
-    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
-    cl_int error;
-    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    } else {
-      std::vector<uint32_t> roundup_gws(lws.size());
-      for (size_t j = 0; j < 3; ++j) {
-        roundup_gws[j] = RoundUp(gws[j], lws[j]);
-      }
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel_, cl::NullRange,
-          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-    }
-    MACE_CL_RET_STATUS(error);
-    OUT_OF_RANGE_VALIDATION(kernel_error_);
-    if (future != nullptr && runtime->is_profiling_enabled()) {
-      event.wait();
-      CallStats tmp_stats;
-      runtime->GetCallStats(event, &tmp_stats);
-      call_stats.start_micros =
-          std::min<int64_t>(tmp_stats.start_micros, call_stats.start_micros);
-      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
-    }
-  }
-  if (future != nullptr) {
-    future->wait_fn = [runtime, call_stats](CallStats *stats) {
-      if (stats != nullptr) {
-        stats->start_micros = call_stats.start_micros;
-        stats->end_micros = stats->start_micros + call_stats.end_micros;
-      }
-    };
-  }
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, output_list, future);
 }
 template struct SplitFunctor<DeviceType::GPU, float>;
......
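The removed split body is the one place in this section that launched one kernel per output tensor, so it could not hand a single cl::Event to the StatsFuture; instead it folded per-launch timings into one CallStats. That merging logic in isolation (CallStats is assumed to carry start/end microseconds, as in mace/public/mace.h; the struct below is a local stand-in):

#include <algorithm>
#include <cstdint>

struct CallStats {
  int64_t start_micros;
  int64_t end_micros;
};

// total should start as {INT64_MAX, 0}, matching the removed code.
void Accumulate(const CallStats &launch, CallStats *total) {
  // Keep the earliest start seen across launches...
  total->start_micros = std::min(launch.start_micros, total->start_micros);
  // ...and sum the durations, so end_micros holds total busy time.
  total->end_micros += launch.end_micros - launch.start_micros;
}

The reported window is then start_micros plus the accumulated duration, which deliberately ignores any gaps between the per-output launches.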
@@ -13,239 +13,49 @@
 // limitations under the License.
 #include "mace/kernels/winograd_transform.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
+#include "mace/kernels/opencl/image/winograd_transform.h"
 namespace mace {
 namespace kernels {
 template <typename T>
+WinogradTransformFunctor<DeviceType::GPU, T>::WinogradTransformFunctor(
+    OpKernelContext *context,
+    const Padding &padding_type,
+    const std::vector<int> &paddings,
+    const int block_size) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::WinogradTransformKernel<T>(
+        padding_type, paddings, block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+template <typename T>
 MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
-  auto runtime = context_->device()->opencl_runtime();
-  if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    if (wino_blk_size_ == 4) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4");
-      built_options.emplace("-Dwinograd_transform_4x4="
-                            + obfuscated_kernel_name);
-    } else if (wino_blk_size_ == 2) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
-      built_options.emplace("-Dwinograd_transform_2x2="
-                            + obfuscated_kernel_name);
-    } else {
-      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
-      return MACE_SUCCESS;
-    }
-    built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  std::vector<index_t> output_shape(4);
-  std::vector<index_t> filter_shape = {1, input_tensor->dim(3), 3, 3};
-  std::vector<int> paddings(2);
-  if (paddings_.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
-        input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
-        strides_.data(), padding_type_, output_shape.data(), paddings.data());
-  } else {
-    paddings = paddings_;
-    CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
-                   paddings_.data(), dilations_.data(), strides_.data(),
-                   RoundType::FLOOR, output_shape.data());
-  }
-  const index_t round_h =
-      (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_;
-  const index_t round_w =
-      (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_;
-  const index_t out_width = input_tensor->dim(0) * round_h * round_w;
-  const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-  const float round_w_r = 1.f / static_cast<float>(round_w);
-  const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2);
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))
-  };
-  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
-    output_shape = {blk_sqr, input_tensor->dim(3), out_width};
-    std::vector<index_t> padded_output_shape = {
-        output_shape[0], output_shape[1], output_shape[2], 1
-    };
-    std::vector<size_t> image_shape;
-    CalImage2DShape(padded_output_shape,
-                    BufferType::IN_OUT_HEIGHT,
-                    &image_shape);
-    // remove unused last dimension
-    MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input_tensor->opencl_image()));
-    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
-    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
-    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
-    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
-    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
-    input_shape_ = input_tensor->shape();
-  }
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
-  std::string tuning_key = Concat("winograd_transform_kernel",
-                                  output_tensor->dim(0),
-                                  output_tensor->dim(1),
-                                  output_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_tensor, output_tensor, future);
 }
+template <typename T>
+WinogradInverseTransformFunctor<DeviceType::GPU, T>::WinogradInverseTransformFunctor( // NOLINT(whitespace/line_length)
+    OpKernelContext *context,
+    const ActivationType activation,
+    const float relux_max_limit,
+    const int block_size) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::WinogradInverseTransformKernel<T>(
+        activation, relux_max_limit, block_size));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor*> &inputs,
     Tensor *output_tensor,
     StatsFuture *future) {
-  auto runtime = context_->device()->opencl_runtime();
-  const Tensor *input_tensor = inputs[0];
-  const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
-  if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name;
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    if (wino_blk_size_ == 4) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4");
-      built_options.emplace("-Dwinograd_inverse_transform_4x4="
-                            + obfuscated_kernel_name);
-    } else if (wino_blk_size_ == 2) {
-      obfuscated_kernel_name =
-          MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
-      built_options.emplace("-Dwinograd_inverse_transform_2x2="
-                            + obfuscated_kernel_name);
-    } else {
-      MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd.");
-      return MACE_SUCCESS;
-    }
-    built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpCompatibleCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToUpCompatibleCLCMDDt(DataTypeToEnum<T>::value));
-    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    switch (activation_) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation_;
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  Tensor::MappingGuard output_shape_guard(inputs[1]);
-  const int32_t *output_shape_data = inputs[1]->data<int32_t>();
-  const index_t batch = output_shape_data[0];
-  const index_t height = output_shape_data[1];
-  const index_t width = output_shape_data[2];
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(input_tensor->dim(2)),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
-  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
-    std::vector<index_t> output_shape = {batch, height, width,
-                                         input_tensor->dim(1)};
-    std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
-    MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape));
-    const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_;
-    const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_;
-    const float round_hw_r = 1.f / static_cast<float>(round_h * round_w);
-    const float round_w_r = 1.f / static_cast<float>(round_w);
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    kernel_.setArg(
-        idx++,
-        *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
-    if (bias != nullptr) {
-      kernel_.setArg(idx++,
-                     *(static_cast<const cl::Image2D *>(bias->opencl_image())));
-    }
-    kernel_.setArg(
-        idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
-    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
-    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-    kernel_.setArg(idx++, round_hw_r);
-    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
-    kernel_.setArg(idx++, round_w_r);
-    kernel_.setArg(idx++, relux_max_limit_);
-    input_shape_ = input_tensor->shape();
-  }
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
-  std::string tuning_key =
-      Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
-             output_tensor->dim(1), output_tensor->dim(2),
-             output_tensor->dim(3), input_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, inputs, output_tensor, future);
 }
 template struct WinogradTransformFunctor<DeviceType::GPU, float>;
......
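The tile bookkeeping removed from both winograd functors is easy to lose in the diff. For an m x m output block with 3x3 filters (m is wino_blk_size_, 2 or 4), each transformed tile holds (m + 2) * (m + 2) elements, and one image is covered by ceil(H / m) * ceil(W / m) tiles. Restated from the removed round_h/round_w/blk_sqr code:

#include <cstdint>

void WinogradTiles(int64_t out_h, int64_t out_w, int64_t batch, int m,
                   int64_t *num_tiles, int64_t *tile_elems) {
  const int64_t round_h = (out_h + m - 1) / m;  // ceil(out_h / m)
  const int64_t round_w = (out_w + m - 1) / m;  // ceil(out_w / m)
  *num_tiles = batch * round_h * round_w;       // out_width in the old code
  *tile_elems = (m + 2) * (m + 2);              // blk_sqr in the old code
}

The reciprocals round_hw_r and round_w_r that were bound as kernel arguments are just 1 / (round_h * round_w) and 1 / round_w, precomputed on the host so the kernel can recover tile coordinates without integer division.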
@@ -23,32 +23,18 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
-struct PadFunctorBase : OpKernel {
-  PadFunctorBase(OpKernelContext *context,
-                 const std::vector<int> &paddings,
-                 const float constant_value)
-      : OpKernel(context),
-        paddings_(paddings),
-        constant_value_(constant_value) {}
-  std::vector<int> paddings_;
-  float constant_value_;
-};
 template<DeviceType D, typename T>
-struct PadFunctor : public PadFunctorBase {
+struct PadFunctor : OpKernel {
   PadFunctor(OpKernelContext *context,
              const std::vector<int> &paddings,
              const float constant_value)
-      : PadFunctorBase(context, paddings, constant_value) {}
+      : OpKernel(context),
+        paddings_(paddings),
+        constant_value_(constant_value) {}
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future) {
@@ -93,24 +79,32 @@ struct PadFunctor : OpKernel {
     return MACE_SUCCESS;
   }
+  std::vector<int> paddings_;
+  float constant_value_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLPadKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPadKernel);
+};
 template <typename T>
-struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
+struct PadFunctor<DeviceType::GPU, T> : OpKernel {
   PadFunctor(OpKernelContext *context,
              const std::vector<int> &paddings,
-             const float constant_value)
-      : PadFunctorBase(context, paddings, constant_value) {}
+             const float constant_value);
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLPadKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
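From here on the header diffs all take the same three-part shape: an abstract OpenCL*Kernel interface under MACE_ENABLE_OPENCL, a GPU functor holding std::unique_ptr<OpenCL*Kernel>, and an out-of-line constructor so public headers no longer pull in cl2_header.h. The MACE_VIRTUAL_EMPTY_DESTRUCTOR line matters because deletion happens through the base pointer; a plausible illustration of why (the macro's real definition lives in MACE's utility headers, the expansion below is only assumed):

#include <memory>

// Hypothetical stand-in for MACE_VIRTUAL_EMPTY_DESTRUCTOR: without a
// virtual destructor on the base, deleting Derived through Base* is
// undefined behavior.
#define VIRTUAL_EMPTY_DESTRUCTOR(ClassName) \
 public:                                    \
  virtual ~ClassName() {}

class Base {
 public:
  virtual void Compute() = 0;
  VIRTUAL_EMPTY_DESTRUCTOR(Base);
};

class Derived : public Base {
 public:
  void Compute() override {}
};

int main() {
  std::unique_ptr<Base> k(new Derived());  // destroyed via virtual ~Base
  k->Compute();
}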
@@ -29,10 +29,6 @@
 #include <arm_neon.h>
 #endif
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 enum PoolingType {
@@ -84,8 +80,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
                          strides,
                          padding_type,
                          paddings,
-                         dilations) {
-  }
+                         dilations) {}
   void MaxPooling(const float *input,
                   const index_t *in_shape,
@@ -455,6 +450,21 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLPoolingKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const PoolingType pooling_type,
+      const int *kernels,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPoolingKernel);
+};
 template <typename T>
 struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
   PoolingFunctor(OpKernelContext *context,
@@ -463,23 +473,13 @@ struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
-                 const int *dilations)
-      : PoolingFunctorBase(context,
-                           pooling_type,
-                           kernels,
-                           strides,
-                           padding_type,
-                           paddings,
-                           dilations) {
-  }
+                 const int *dilations);
   MaceStatus operator()(const Tensor *input_tensor,
                         Tensor *output_tensor,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLPoolingKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -25,33 +25,15 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif
 namespace mace {
 namespace kernels {
-struct ReduceFunctorBase : OpKernel {
-  ReduceFunctorBase(OpKernelContext *context,
-                    const std::vector<int> &axis,
-                    const bool keep_dims)
-      : OpKernel(context),
-        keep_dims_(keep_dims),
-        axis_(axis) {}
-  bool keep_dims_;
-  bool reduce_first_axis_;
-  const std::vector<int> axis_;
-  std::vector<int> data_reshape_;
-  std::vector<index_t> out_shape_;
-};
 template <DeviceType D, typename T>
-struct ReduceMeanFunctor : ReduceFunctorBase {
+struct ReduceMeanFunctor : OpKernel {
   ReduceMeanFunctor(OpKernelContext *context,
                     const std::vector<int> &axis,
                     const bool keep_dims)
-      : ReduceFunctorBase(context, axis, keep_dims) {}
+      : OpKernel(context), axis_(axis), keep_dims_(keep_dims) {}
   void Simplify(const Tensor *input) {
     std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false);
@@ -217,25 +199,35 @@ struct ReduceMeanFunctor : OpKernel {
     Compute(input, output);
     return MACE_SUCCESS;
   }
+  const std::vector<int> axis_;
+  bool keep_dims_;
+  bool reduce_first_axis_;
+  std::vector<int> data_reshape_;
+  std::vector<index_t> out_shape_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLReduceMeanKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLReduceMeanKernel);
+};
 template <typename T>
-struct ReduceMeanFunctor<DeviceType::GPU, T>
-    : ReduceFunctorBase {
+struct ReduceMeanFunctor<DeviceType::GPU, T> : OpKernel {
   ReduceMeanFunctor(OpKernelContext *context,
-                    const std::vector<int> axis,
-                    const bool keep_dims)
-      : ReduceFunctorBase(context, axis, keep_dims) {}
+                    const std::vector<int> &axis,
+                    const bool keep_dims);
   MaceStatus operator()(const Tensor *input,
-                        Tensor *output_tensor,
+                        Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLReduceMeanKernel> kernel_;
 };
 #endif
......
@@ -25,10 +25,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/utils/logging.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -141,32 +137,20 @@ inline void ResizeImage(const float *images,
   }
 }
-struct ResizeBicubicFunctorBase : OpKernel {
-  ResizeBicubicFunctorBase(OpKernelContext *context,
-                           const std::vector<index_t> &size,
-                           bool align_corners)
-      : OpKernel(context), align_corners_(align_corners) {
-    MACE_CHECK(size.size() == 2);
-    out_height_ = size[0];
-    out_width_ = size[1];
-  }
- protected:
-  bool align_corners_;
-  index_t out_height_;
-  index_t out_width_;
-};
 template<DeviceType D, typename T>
 struct ResizeBicubicFunctor;
 template<>
-struct ResizeBicubicFunctor<DeviceType::CPU, float>
-    : ResizeBicubicFunctorBase {
+struct ResizeBicubicFunctor<DeviceType::CPU, float> : OpKernel {
   ResizeBicubicFunctor(OpKernelContext *context,
-                       const std::vector<index_t> &size,
-                       bool align_corners)
-      : ResizeBicubicFunctorBase(context, size, align_corners) {}
+                       const bool align_corners,
+                       const std::vector<index_t> &size)
+      : OpKernel(context),
+        align_corners_(align_corners) {
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -205,25 +189,34 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>
     return MACE_SUCCESS;
   }
+  bool align_corners_;
+  index_t out_height_;
+  index_t out_width_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLResizeBicubicKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBicubicKernel);
+};
 template<typename T>
 struct ResizeBicubicFunctor<DeviceType::GPU, T>
-    : ResizeBicubicFunctorBase {
+    : OpKernel {
   ResizeBicubicFunctor(OpKernelContext *context,
-                       const std::vector<index_t> &size,
-                       bool align_corners)
-      : ResizeBicubicFunctorBase(context, size, align_corners) {}
+                       bool align_corners,
+                       const std::vector<index_t> &size);
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLResizeBicubicKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/utils/quantize.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -179,30 +175,17 @@ inline void ResizeImageNHWC(const T *images,
   }
 }
-struct ResizeBilinearFunctorBase : OpKernel {
-  ResizeBilinearFunctorBase(OpKernelContext *context,
-                            const std::vector<index_t> &size,
-                            bool align_corners)
-      : OpKernel(context),
-        align_corners_(align_corners) {
-    MACE_CHECK(size.size() == 2);
-    out_height_ = size[0];
-    out_width_ = size[1];
-  }
- protected:
-  bool align_corners_;
-  index_t out_height_;
-  index_t out_width_;
-};
 template<DeviceType D, typename T>
-struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
+struct ResizeBilinearFunctor : OpKernel {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
                         bool align_corners)
-      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+      : OpKernel(context), align_corners_(align_corners) {
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
                         StatsFuture *future) {
@@ -255,14 +238,22 @@ struct ResizeBilinearFunctor : OpKernel {
     return MACE_SUCCESS;
   }
+  bool align_corners_;
+  index_t out_height_;
+  index_t out_width_;
 };
 template<DeviceType D>
-struct ResizeBilinearFunctor<D, uint8_t> : ResizeBilinearFunctorBase {
+struct ResizeBilinearFunctor<D, uint8_t> : OpKernel {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
                         bool align_corners)
-      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+      : OpKernel(context), align_corners_(align_corners) {
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -316,25 +307,34 @@ struct ResizeBilinearFunctor<D, uint8_t> : OpKernel {
     return MACE_SUCCESS;
   }
+  bool align_corners_;
+  index_t out_height_;
+  index_t out_width_;
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLResizeBilinearKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBilinearKernel);
+};
 template<typename T>
 struct ResizeBilinearFunctor<DeviceType::GPU, T>
-    : ResizeBilinearFunctorBase {
+    : OpKernel {
   ResizeBilinearFunctor(OpKernelContext *context,
                         const std::vector<index_t> &size,
-                        bool align_corners)
-      : ResizeBilinearFunctorBase(context, size, align_corners) {}
+                        bool align_corners);
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLResizeBilinearKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -30,10 +30,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/kernels/quantize.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -356,17 +352,23 @@ struct SoftmaxFunctor<DeviceType::CPU, uint8_t> : OpKernel {
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLSoftmaxKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *logits,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSoftmaxKernel);
+};
 template<typename T>
 struct SoftmaxFunctor<DeviceType::GPU, T> : OpKernel {
-  explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
+  explicit SoftmaxFunctor(OpKernelContext *context);
   MaceStatus operator()(const Tensor *logits,
                         Tensor *output,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLSoftmaxKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
@@ -23,10 +23,6 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"
-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
 namespace mace {
 namespace kernels {
@@ -102,7 +98,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
                       const std::vector<int> &block_shape)
       : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
-  MaceStatus operator()(Tensor *space_tensor,
+  MaceStatus operator()(const Tensor *space_tensor,
                         Tensor *batch_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -212,7 +208,7 @@ struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
                       const std::vector<int> &block_shape)
       : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
-  MaceStatus operator()(Tensor *space_tensor,
+  MaceStatus operator()(const Tensor *space_tensor,
                         Tensor *batch_tensor,
                         StatsFuture *future) {
     MACE_UNUSED(future);
@@ -311,21 +307,29 @@ struct SpaceToBatchFunctor<DeviceType::CPU, uint8_t> : SpaceToBatchFunctorBase {
 };
 #ifdef MACE_ENABLE_OPENCL
+class OpenCLSpaceToBatchKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *space_tensor,
+      const std::vector<int> &paddings,
+      const std::vector<int> &block_shape,
+      const std::vector<index_t> &output_shape,
+      Tensor *batch_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToBatchKernel);
+};
 template <typename T>
 struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(OpKernelContext *context,
                       const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape)
-      : SpaceToBatchFunctorBase(context, paddings, block_shape) {}
+                      const std::vector<int> &block_shape);
-  MaceStatus operator()(Tensor *space_tensor,
+  MaceStatus operator()(const Tensor *space_tensor,
                         Tensor *batch_tensor,
                         StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> space_shape_;
+  std::unique_ptr<OpenCLSpaceToBatchKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL
......
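Each new kernel interface ends with MACE_VIRTUAL_EMPTY_DESTRUCTOR(...) rather than a hand-written destructor. The macro's definition is not part of this diff; a plausible expansion (an assumption, shown only to make the interfaces self-explanatory) is:

    // Hypothetical expansion -- the real macro lives in MACE's utility
    // headers and may differ in detail.
    #define MACE_VIRTUAL_EMPTY_DESTRUCTOR(ClassName) \
     public:                                         \
      virtual ~ClassName() {}

A virtual destructor matters here because the functors destroy the concrete image/buffer kernels through the base-class pointer held in kernel_.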
...@@ -22,10 +22,6 @@ ...@@ -22,10 +22,6 @@
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -91,20 +87,24 @@ struct SpaceToDepthOpFunctor : OpKernel { ...@@ -91,20 +87,24 @@ struct SpaceToDepthOpFunctor : OpKernel {
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLSpaceToDepthKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
Tensor *output,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToDepthKernel);
};
template<typename T> template<typename T>
struct SpaceToDepthOpFunctor<DeviceType::GPU, T> : OpKernel { struct SpaceToDepthOpFunctor<DeviceType::GPU, T> : OpKernel {
explicit SpaceToDepthOpFunctor(OpKernelContext *context, explicit SpaceToDepthOpFunctor(OpKernelContext *context,
const int block_size) const int block_size);
: OpKernel(context), block_size_(block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
const int block_size_; std::unique_ptr<OpenCLSpaceToDepthKernel> kernel_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -25,24 +25,13 @@ ...@@ -25,24 +25,13 @@
#include "mace/kernels/kernel.h" #include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct SplitFunctorBase : OpKernel {
SplitFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_;
};
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct SplitFunctor : SplitFunctorBase { struct SplitFunctor : OpKernel {
SplitFunctor(OpKernelContext *context, const int32_t axis) SplitFunctor(OpKernelContext *context, const int32_t axis)
: SplitFunctorBase(context, axis) {} : OpKernel(context), axis_(axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -88,20 +77,28 @@ struct SplitFunctor : SplitFunctorBase { ...@@ -88,20 +77,28 @@ struct SplitFunctor : SplitFunctorBase {
return MACE_SUCCESS; return MACE_SUCCESS;
} }
int32_t axis_;
}; };
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
class OpenCLSplitKernel {
public:
virtual MaceStatus Compute(
OpKernelContext *context,
const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) = 0;
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSplitKernel);
};
template<typename T> template<typename T>
struct SplitFunctor<DeviceType::GPU, T> : SplitFunctorBase { struct SplitFunctor<DeviceType::GPU, T> : OpKernel {
SplitFunctor(OpKernelContext *context, const int32_t axis) SplitFunctor(OpKernelContext *context, const int32_t axis);
: SplitFunctorBase(context, axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; std::unique_ptr<OpenCLSplitKernel> kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
......
...@@ -23,132 +23,63 @@ ...@@ -23,132 +23,63 @@
#include "mace/kernels/activation.h" #include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
Removed:
struct WinogradTransformFunctorBase : OpKernel {
  WinogradTransformFunctorBase(OpKernelContext *context,
                               const Padding &padding_type,
                               const std::vector<int> &paddings,
                               const int block_size)
      : OpKernel(context),
        strides_({1, 1}),
        dilations_({1, 1}),
        padding_type_(padding_type),
        paddings_(paddings),
        wino_blk_size_(block_size) {}
  const std::vector<int> strides_; // [stride_h, stride_w]
  const std::vector<int> dilations_; // [dilation_h, dilation_w]
  Padding padding_type_;
  std::vector<int> paddings_;
  const int wino_blk_size_;
};
template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
  WinogradTransformFunctor(OpKernelContext *context,
                           const Padding &padding_type,
                           const std::vector<int> &paddings,
                           const int block_size)
      : WinogradTransformFunctorBase(context,
                                     padding_type,
                                     paddings,
                                     block_size) {}
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(input);
    MACE_UNUSED(output);
    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
    return MACE_SUCCESS;
  }
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T>
    : WinogradTransformFunctorBase {
  WinogradTransformFunctor(OpKernelContext *context,
                           const Padding &padding_type,
                           const std::vector<int> &paddings,
                           const int block_size)
      : WinogradTransformFunctorBase(context,
                                     padding_type,
                                     paddings,
                                     block_size) {}
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
  std::unique_ptr<BufferBase> kernel_error_;
  std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
struct WinogradInverseTransformFunctorBase : OpKernel {
  WinogradInverseTransformFunctorBase(OpKernelContext *context,
                                      const ActivationType activation,
                                      const float relux_max_limit,
                                      const int block_size)
      : OpKernel(context),
        wino_blk_size_(block_size),
        activation_(activation),
        relux_max_limit_(relux_max_limit) {}
  const int wino_blk_size_;
  const ActivationType activation_;
  const float relux_max_limit_;
};
template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(OpKernelContext *context,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const int block_size)
      : WinogradInverseTransformFunctorBase(
            context, activation, relux_max_limit, block_size) {}
  MaceStatus operator()(const std::vector<const Tensor*> &inputs,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(inputs);
    MACE_UNUSED(output);
    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
    return MACE_SUCCESS;
  }
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
    : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(OpKernelContext *context,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const int block_size)
      : WinogradInverseTransformFunctorBase(
            context, activation, relux_max_limit, block_size) {}
  MaceStatus operator()(const std::vector<const Tensor*> &inputs,
                        Tensor *output,
                        StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
  std::unique_ptr<BufferBase> kernel_error_;
  std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL

Replaced with:
template <DeviceType D, typename T>
struct WinogradTransformFunctor;
#ifdef MACE_ENABLE_OPENCL
class OpenCLWinogradTransformKernel {
 public:
  virtual MaceStatus Compute(
      OpKernelContext *context,
      const Tensor *input,
      Tensor *output,
      StatsFuture *future) = 0;
  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradTransformKernel);
};
template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T> : OpKernel {
  WinogradTransformFunctor(OpKernelContext *context,
                           const Padding &padding_type,
                           const std::vector<int> &paddings,
                           const int block_size);
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);
  std::unique_ptr<OpenCLWinogradTransformKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor;
#ifdef MACE_ENABLE_OPENCL
class OpenCLWinogradInverseTransformKernel {
 public:
  virtual MaceStatus Compute(
      OpKernelContext *context,
      const std::vector<const Tensor*> &inputs,
      Tensor *output,
      StatsFuture *future) = 0;
  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradInverseTransformKernel);
};
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T> : OpKernel {
  WinogradInverseTransformFunctor(OpKernelContext *context,
                                  const ActivationType activation,
                                  const float relux_max_limit,
                                  const int block_size);
  MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                        Tensor *output,
                        StatsFuture *future);
  std::unique_ptr<OpenCLWinogradInverseTransformKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
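Besides moving the cl::Kernel state behind the abstract kernel interfaces, both Winograd functors switch from a generic CPU stub that failed at runtime with MACE_NOT_IMPLEMENTED to a primary template that is declared but never defined, so instantiating an unsupported device/type combination now fails at compile time. A self-contained illustration of the idiom (stand-in types, not MACE code):

    #include <iostream>

    enum DeviceType { CPU, GPU };

    template <DeviceType D, typename T>
    struct WinogradTransformFunctor;  // declared only: no generic fallback

    template <typename T>
    struct WinogradTransformFunctor<GPU, T> {  // the one supported device
      void operator()() const { std::cout << "GPU Winograd transform\n"; }
    };

    int main() {
      WinogradTransformFunctor<GPU, float> gpu;
      gpu();
      // WinogradTransformFunctor<CPU, float> cpu;  // error: incomplete type
      return 0;
    }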
...@@ -77,6 +77,14 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -77,6 +77,14 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
return MaceStatus::MACE_INVALID_ARGS; return MaceStatus::MACE_INVALID_ARGS;
} }
const int mem_type_i =
ProtoArgHelper::GetOptionalArg<NetDef, int>(
*net_def, "opencl_mem_type",
static_cast<MemoryType>(MemoryType::GPU_IMAGE));
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) { if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
...@@ -97,6 +105,8 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -97,6 +105,8 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
<< " vs " << MakeString(net_max_image_size); << " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
#endif #endif
......
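The availability check above now reads the memory type that the converter serialized into the model (see the add_opencl_informations pass later in this diff), defaults to GPU_IMAGE for models converted before this change, and applies the image-support and max-image-size checks only on the image path. A condensed, self-contained sketch of that control flow (stand-in types; ProtoArgHelper is reduced to a nullable pointer):

    #include <iostream>

    enum MemoryType { GPU_IMAGE = 0, GPU_BUFFER = 1 };

    MemoryType ReadMemType(const int *serialized_arg) {
      // Stand-in for ProtoArgHelper::GetOptionalArg<NetDef, int>: use the
      // value stored in the model when present, else default to GPU_IMAGE.
      return static_cast<MemoryType>(serialized_arg ? *serialized_arg
                                                    : GPU_IMAGE);
    }

    bool CheckGPUAvailability(const int *serialized_arg, bool image_support) {
      const MemoryType mem_type = ReadMemType(serialized_arg);
      if (mem_type == GPU_IMAGE) {
        // Image-only constraints (CL_DEVICE_IMAGE_SUPPORT, the 2D image
        // size limits) are skipped entirely when the model uses buffers.
        if (!image_support) return false;
      }
      return true;
    }

    int main() {
      int buffer_arg = GPU_BUFFER;
      std::cout << CheckGPUAvailability(&buffer_arg, /*image_support=*/false)
                << "\n";  // prints 1: buffer models run without image support
      return 0;
    }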
...@@ -54,14 +54,14 @@ cc_library( ...@@ -54,14 +54,14 @@ cc_library(
"*_test.cc", "*_test.cc",
"*_benchmark.cc", "*_benchmark.cc",
"ops_test_util.cc", "ops_test_util.cc",
"buffer_to_image.cc", "buffer_transform.cc",
"image_to_buffer.cc", "buffer_inverse_transform.cc",
"lstmcell.cc", "lstmcell.cc",
], ],
) + if_opencl_enabled( ) + if_opencl_enabled(
[ [
"buffer_to_image.cc", "buffer_transform.cc",
"image_to_buffer.cc", "buffer_inverse_transform.cc",
"lstmcell.cc", "lstmcell.cc",
], ],
), ),
......
...@@ -36,7 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> { ...@@ -36,7 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *batch_tensor = this->Input(INPUT); const Tensor *batch_tensor = this->Input(INPUT);
Tensor *space_tensor = this->Output(OUTPUT); Tensor *space_tensor = this->Output(OUTPUT);
return functor_(space_tensor, const_cast<Tensor *>(batch_tensor), future); return functor_(batch_tensor, space_tensor, future);
} }
private: private:
......
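The BatchToSpaceND call site above (and the matching SpaceToBatchND one later in this diff) drops the const_cast and passes the input first, output second, matching the functors' new const Tensor * signatures. Beyond tidiness, the compiler now enforces that inputs stay read-only, as this tiny self-contained example shows (stand-in Tensor):

    struct Tensor {
      float data[4];
    };

    // Input is const, output is mutable -- no cast needed at the call site.
    void BatchToSpace(const Tensor *batch, Tensor *space) {
      space->data[0] = batch->data[0];
      // batch->data[0] = 0.0f;  // would no longer compile
    }

    int main() {
      Tensor in = {{1.0f, 2.0f, 3.0f, 4.0f}};
      Tensor out = {};
      BatchToSpace(&in, &out);
      return 0;
    }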
...@@ -12,23 +12,23 @@ ...@@ -12,23 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/buffer_to_image.h" #include "mace/ops/buffer_inverse_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
void Register_BufferToImage(OperatorRegistryBase *op_registry) { void Register_BufferInverseTransform(OperatorRegistryBase *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::GPU, float>); BufferInverseTransformOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
BufferToImageOp<DeviceType::GPU, half>); BufferInverseTransformOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -12,19 +12,19 @@ ...@@ -12,19 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_ #ifndef MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_
#define MACE_OPS_IMAGE_TO_BUFFER_H_ #define MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/image_to_buffer.h" #include "mace/kernels/buffer_inverse_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> { class BufferInverseTransformOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context) BufferInverseTransformOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, context), : Operator<D, T>(op_def, context),
functor_(context, functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
...@@ -40,7 +40,7 @@ class ImageToBufferOp : public Operator<D, T> { ...@@ -40,7 +40,7 @@ class ImageToBufferOp : public Operator<D, T> {
} }
private: private:
kernels::ImageToBufferFunctor<D, T> functor_; kernels::BufferInverseTransformFunctor<D, T> functor_;
protected: protected:
MACE_OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
...@@ -50,4 +50,4 @@ class ImageToBufferOp : public Operator<D, T> { ...@@ -50,4 +50,4 @@ class ImageToBufferOp : public Operator<D, T> {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_ #endif // MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_
...@@ -24,7 +24,7 @@ template <DeviceType D, typename T> ...@@ -24,7 +24,7 @@ template <DeviceType D, typename T>
void TestBidirectionTransform(const int type, void TestBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -37,7 +37,7 @@ void TestBidirectionTransform(const int type, ...@@ -37,7 +37,7 @@ void TestBidirectionTransform(const int type,
// Run // Run
net.RunOp(D); net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -159,7 +159,7 @@ template <DeviceType D, typename T> ...@@ -159,7 +159,7 @@ template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, void TestDiffTypeBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -172,7 +172,7 @@ void TestDiffTypeBidirectionTransform(const int type, ...@@ -172,7 +172,7 @@ void TestDiffTypeBidirectionTransform(const int type,
// Run // Run
net.RunOp(D); net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -198,7 +198,7 @@ void TestStringHalfBidirectionTransform(const int type, ...@@ -198,7 +198,7 @@ void TestStringHalfBidirectionTransform(const int type,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const unsigned char *input_data) { const unsigned char *input_data) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -213,7 +213,7 @@ void TestStringHalfBidirectionTransform(const int type, ...@@ -213,7 +213,7 @@ void TestStringHalfBidirectionTransform(const int type,
// Run // Run
net.RunOp(D); net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("I2BOutput") .Output("I2BOutput")
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
......
...@@ -12,23 +12,23 @@ ...@@ -12,23 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/image_to_buffer.h" #include "mace/ops/buffer_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
void Register_ImageToBuffer(OperatorRegistryBase *op_registry) { void Register_BufferTransform(OperatorRegistryBase *op_registry) {
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<float>("T") .TypeConstraint<float>("T")
.Build(), .Build(),
ImageToBufferOp<DeviceType::GPU, float>); BufferTransformOp<DeviceType::GPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer") MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform")
.Device(DeviceType::GPU) .Device(DeviceType::GPU)
.TypeConstraint<half>("T") .TypeConstraint<half>("T")
.Build(), .Build(),
ImageToBufferOp<DeviceType::GPU, half>); BufferTransformOp<DeviceType::GPU, half>);
} }
} // namespace ops } // namespace ops
......
...@@ -12,19 +12,19 @@ ...@@ -12,19 +12,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_ #ifndef MACE_OPS_BUFFER_TRANSFORM_H_
#define MACE_OPS_BUFFER_TO_IMAGE_H_ #define MACE_OPS_BUFFER_TRANSFORM_H_
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h" #include "mace/kernels/buffer_transform.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> { class BufferTransformOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context) BufferTransformOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, context), : Operator<D, T>(op_def, context),
functor_(context, functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
...@@ -41,7 +41,7 @@ class BufferToImageOp : public Operator<D, T> { ...@@ -41,7 +41,7 @@ class BufferToImageOp : public Operator<D, T> {
} }
private: private:
kernels::BufferToImageFunctor<D, T> functor_; kernels::BufferTransformFunctor<D, T> functor_;
protected: protected:
MACE_OP_INPUT_TAGS(INPUT); MACE_OP_INPUT_TAGS(INPUT);
...@@ -50,4 +50,4 @@ class BufferToImageOp : public Operator<D, T> { ...@@ -50,4 +50,4 @@ class BufferToImageOp : public Operator<D, T> {
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_ #endif // MACE_OPS_BUFFER_TRANSFORM_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class BufferTransformTest : public OpsTestBase {
protected:
virtual void SetUp() {
OpTestContext::Get()->SetOCLBufferTestFlag();
}
};
namespace {
template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
// Run
net.RunOp(DeviceType::GPU);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("TransformedOutput")
.Output("Output")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
net.GetOutput("Output")->UnderlyingBuffer());
} else {
// Check
ExpectTensorNear<OrgType>(*net.GetOutput("Input"),
*net.GetOutput("Output"),
1e-3, 1e-4);
}
}
} // namespace
TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(kernels::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(kernels::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
namespace {
template <typename T>
void TestArgumentTransform(const index_t input_size) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run
net.RunOp(DeviceType::GPU);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]);
// Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor,
1e-3, 1e-4);
}
} // namespace
TEST_F(BufferTransformTest, Argument) {
TestArgumentTransform<half>(30);
TestArgumentTransform<half>(32);
}
} // namespace test
} // namespace ops
} // namespace mace
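Two details of the new test file are worth spelling out. First, when source and destination types match on the buffer path, the transform degenerates to a no-op and the test asserts that input and output share the same UnderlyingBuffer(), i.e. zero copy. Second, the argument test expects RoundUp<index_t>(input_size, 4): ARGUMENT buffers are padded to a multiple of four elements, which plausibly matches the 4-wide vector accesses of the OpenCL kernels (that motivation is an inference, not stated in the diff). A self-contained sketch of such a RoundUp helper:

    #include <cassert>
    #include <cstdint>

    typedef int64_t index_t;

    // Round value up to the next multiple of 'multiple' (both assumed > 0).
    template <typename T>
    T RoundUp(T value, T multiple) {
      return (value + multiple - 1) / multiple * multiple;
    }

    int main() {
      assert(RoundUp<index_t>(30, 4) == 32);  // padded, as the test expects
      assert(RoundUp<index_t>(32, 4) == 32);  // already aligned
      return 0;
    }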
...@@ -40,8 +40,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -40,8 +40,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f), OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetOptionalArg<int>( static_cast<bool>(OperatorBase::GetOptionalArg<int>(
"is_filter_transformed", false)), "is_filter_transformed", false))) {}
context->workspace()->GetScratchBuffer(D)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -23,21 +23,26 @@ namespace mace { ...@@ -23,21 +23,26 @@ namespace mace {
namespace ops { namespace ops {
namespace test { namespace test {
class Conv2dOpTest : public OpsTestBase {}; class Conv2dOpTest : public OpsTestBase {
protected:
virtual void SetUp() {
OpTestContext::Get()->SetOCLImageTestFlag();
}
};
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3VALID() { void TestNHWCSimple3x3VALID() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
...@@ -50,7 +55,6 @@ void TestNHWCSimple3x3VALID() { ...@@ -50,7 +55,6 @@ void TestNHWCSimple3x3VALID() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID) .AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -77,7 +81,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -77,7 +81,7 @@ void TestNHWCSimple3x3VALID() {
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D, T>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
...@@ -85,7 +89,11 @@ void TestNHWCSimple3x3VALID() { ...@@ -85,7 +89,11 @@ void TestNHWCSimple3x3VALID() {
} }
auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.1f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); if (DataTypeToEnum<T>::value == DataType::DT_FLOAT) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
}
} }
template <DeviceType D, typename T> template <DeviceType D, typename T>
...@@ -93,14 +101,14 @@ void TestNHWCSimple3x3SAME() { ...@@ -93,14 +101,14 @@ void TestNHWCSimple3x3SAME() {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2}, "Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>( net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3}, "Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
...@@ -113,7 +121,6 @@ void TestNHWCSimple3x3SAME() { ...@@ -113,7 +121,6 @@ void TestNHWCSimple3x3SAME() {
.AddIntsArg("strides", {1, 1}) .AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME) .AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
...@@ -140,7 +147,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -140,7 +147,7 @@ void TestNHWCSimple3x3SAME() {
net.RunOp(D); net.RunOp(D);
// Transfer output // Transfer output
ImageToBuffer<D, T>(&net, "OutputImage", "Output", ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} else { } else {
...@@ -151,7 +158,11 @@ void TestNHWCSimple3x3SAME() { ...@@ -151,7 +158,11 @@ void TestNHWCSimple3x3SAME() {
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); if (DataTypeToEnum<T>::value == DataType::DT_FLOAT) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
}
} }
} // namespace } // namespace
...@@ -165,6 +176,11 @@ TEST_F(Conv2dOpTest, OPENCLSimple) { ...@@ -165,6 +176,11 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3SAME<DeviceType::GPU, float>(); TestNHWCSimple3x3SAME<DeviceType::GPU, float>();
} }
TEST_F(Conv2dOpTest, OPENCLHalfSimple) {
TestNHWCSimple3x3VALID<DeviceType::GPU, half>();
TestNHWCSimple3x3SAME<DeviceType::GPU, half>();
}
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() { void TestNHWCSimple3x3WithoutBias() {
...@@ -638,7 +654,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -638,7 +654,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
auto func = [&](int stride_h, int stride_w, Padding padding) { auto func = [&](int stride_h, int stride_w, Padding padding) {
// generate random input // generate random input
index_t batch = 3; index_t batch = 1;
index_t height = input_shape[0]; index_t height = input_shape[0];
index_t width = input_shape[1]; index_t width = input_shape[1];
index_t kernel_h = filter_shape[0]; index_t kernel_h = filter_shape[0];
...@@ -713,7 +729,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -713,7 +729,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-2);
}; };
func(1, 1, VALID); func(1, 1, VALID);
......
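The tolerance changes above track what fp16 can actually deliver: the new half-precision Simple tests accept 1e-3 absolute/relative error instead of the fp32 1e-5, while the existing half complex-conv test tightens its relative bound from 1e-1 to 1e-2 (and shrinks the batch to 1 to keep runtime down). With a 10-bit mantissa, fp16 has a relative precision of about 2^-11 ~= 4.9e-4, so accumulated convolution sums routinely deviate from the fp32 reference by around 1e-3. A one-liner to see the scale:

    #include <cstdio>

    int main() {
      // Machine-epsilon scale for IEEE fp16 (10 explicit mantissa bits).
      std::printf("%g\n", 1.0 / (1 << 11));  // prints 0.000488281
      return 0;
    }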
...@@ -26,7 +26,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -26,7 +26,7 @@ TEST(CoreTest, INIT_MODE) {
Workspace ws; Workspace ws;
op_defs.emplace_back(OperatorDef()); op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input") .Input("Input")
.Output("B2IOutput") .Output("B2IOutput")
.AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
...@@ -43,7 +43,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -43,7 +43,7 @@ TEST(CoreTest, INIT_MODE) {
} }
op_defs.emplace_back(OperatorDef()); op_defs.emplace_back(OperatorDef());
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput") .Input("B2IOutput")
.Output("Output") .Output("Output")
.AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
......
...@@ -250,19 +250,19 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -250,19 +250,19 @@ void TestNxNS12(const index_t height, const index_t width) {
Padding type) { Padding type) {
// generate random input // generate random input
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 5; index_t batch = 1;
index_t input_channels = 3 + rand_r(&seed) % 16; index_t channel = 32;
index_t multiplier = 1; index_t multiplier = 1;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, input_channels}); "Input", {batch, height, width, channel});
net.AddRandomInput<DeviceType::GPU, float>( net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Filter", {multiplier, channel, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::GPU, float>("Bias", net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier * input_channels}); {multiplier * channel});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -275,6 +275,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -275,6 +275,8 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on cpu // Run on cpu
...@@ -302,6 +304,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -302,6 +304,8 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
......
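The depthwise test above now appends a fused RELUX activation with max_limit 6.0 (i.e. ReLU6) to both the CPU reference run and the GPU run, so validation covers the activation-fusion path as well. For reference, RELUX clamps as follows (a standalone definition consistent with MACE's activation kernels):

    #include <algorithm>
    #include <cassert>

    // RELUX: min(max(x, 0), max_limit); with max_limit == 6 this is ReLU6.
    float Relux(float x, float max_limit) {
      return std::min(std::max(x, 0.0f), max_limit);
    }

    int main() {
      assert(Relux(-1.0f, 6.0f) == 0.0f);
      assert(Relux(3.5f, 6.0f) == 3.5f);
      assert(Relux(9.0f, 6.0f) == 6.0f);
      return 0;
    }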
...@@ -67,8 +67,8 @@ extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry) ...@@ -67,8 +67,8 @@ extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry)
extern void Register_WinogradTransform(OperatorRegistryBase *op_registry); extern void Register_WinogradTransform(OperatorRegistryBase *op_registry);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
extern void Register_BufferToImage(OperatorRegistryBase *op_registry); extern void Register_BufferTransform(OperatorRegistryBase *op_registry);
extern void Register_ImageToBuffer(OperatorRegistryBase *op_registry); extern void Register_BufferInverseTransform(OperatorRegistryBase *op_registry);
extern void Register_LSTMCell(OperatorRegistryBase *op_registry); extern void Register_LSTMCell(OperatorRegistryBase *op_registry);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} // namespace ops } // namespace ops
...@@ -125,8 +125,8 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { ...@@ -125,8 +125,8 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() {
ops::Register_WinogradTransform(this); ops::Register_WinogradTransform(this);
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
ops::Register_BufferToImage(this); ops::Register_BufferTransform(this);
ops::Register_ImageToBuffer(this); ops::Register_BufferInverseTransform(this);
ops::Register_LSTMCell(this); ops::Register_LSTMCell(this);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -27,18 +27,11 @@ OpTestContext *OpTestContext::Get(int num_threads, ...@@ -27,18 +27,11 @@ OpTestContext *OpTestContext::Get(int num_threads,
return &instance; return &instance;
} }
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
}
Device *OpTestContext::GetDevice(DeviceType device_type) {
return device_map_[device_type].get();
}
OpTestContext::OpTestContext(int num_threads, OpTestContext::OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy, CPUAffinityPolicy cpu_affinity_policy,
bool use_gemmlowp) bool use_gemmlowp)
: gpu_context_(new GPUContext()) { : gpu_context_(new GPUContext()),
opencl_mem_types_({MemoryType::GPU_IMAGE}) {
device_map_[DeviceType::CPU] = std::unique_ptr<Device>( device_map_[DeviceType::CPU] = std::unique_ptr<Device>(
new CPUDevice(num_threads, new CPUDevice(num_threads,
cpu_affinity_policy, cpu_affinity_policy,
...@@ -50,6 +43,30 @@ OpTestContext::OpTestContext(int num_threads, ...@@ -50,6 +43,30 @@ OpTestContext::OpTestContext(int num_threads,
GPUPriorityHint::PRIORITY_NORMAL)); GPUPriorityHint::PRIORITY_NORMAL));
} }
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
}
Device *OpTestContext::GetDevice(DeviceType device_type) {
return device_map_[device_type].get();
}
std::vector<MemoryType> OpTestContext::opencl_mem_types() {
return opencl_mem_types_;
}
void OpTestContext::SetOCLBufferTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_BUFFER};
}
void OpTestContext::SetOCLImageTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_IMAGE};
}
void OpTestContext::SetOCLImageAndBufferTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER};
}
} // namespace test } // namespace test
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -120,7 +120,10 @@ class OpTestContext { ...@@ -120,7 +120,10 @@ class OpTestContext {
bool use_gemmlowp = true); bool use_gemmlowp = true);
std::shared_ptr<GPUContext> gpu_context() const; std::shared_ptr<GPUContext> gpu_context() const;
Device *GetDevice(DeviceType device_type); Device *GetDevice(DeviceType device_type);
std::vector<MemoryType> opencl_mem_types();
void SetOCLBufferTestFlag();
void SetOCLImageTestFlag();
void SetOCLImageAndBufferTestFlag();
private: private:
OpTestContext(int num_threads, OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy, CPUAffinityPolicy cpu_affinity_policy,
...@@ -128,6 +131,7 @@ class OpTestContext { ...@@ -128,6 +131,7 @@ class OpTestContext {
MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
std::shared_ptr<GPUContext> gpu_context_; std::shared_ptr<GPUContext> gpu_context_;
std::vector<MemoryType> opencl_mem_types_;
std::map<DeviceType, std::unique_ptr<Device>> device_map_; std::map<DeviceType, std::unique_ptr<Device>> device_map_;
}; };
...@@ -459,9 +463,20 @@ class OpsTestNet { ...@@ -459,9 +463,20 @@ class OpsTestNet {
// Test and benchmark should setup model once and run multiple times. // Test and benchmark should setup model once and run multiple times.
// Setup time should not be counted during benchmark. // Setup time should not be counted during benchmark.
MaceStatus RunOp(DeviceType device) { MaceStatus RunOp(DeviceType device) {
if (device == DeviceType::GPU) {
auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
for (auto type : opencl_mem_types) {
OpTestContext::Get()->GetDevice(device)
->opencl_runtime()->set_mem_type(type);
Setup(device);
MACE_RETURN_IF_ERROR(Run());
}
return MACE_SUCCESS;
} else {
Setup(device); Setup(device);
return Run(); return Run();
} }
}
// DEPRECATED(liyin): // DEPRECATED(liyin):
// Test and benchmark should setup model once and run multiple times. // Test and benchmark should setup model once and run multiple times.
...@@ -512,6 +527,7 @@ class OpsTestBase : public ::testing::Test { ...@@ -512,6 +527,7 @@ class OpsTestBase : public ::testing::Test {
} }
virtual void TearDown() { virtual void TearDown() {
OpTestContext::Get()->SetOCLImageTestFlag();
} }
}; };
...@@ -747,7 +763,7 @@ void BufferToImage(OpsTestNet *net, ...@@ -747,7 +763,7 @@ void BufferToImage(OpsTestNet *net,
const int wino_block_size = 2) { const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest") OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
...@@ -755,7 +771,7 @@ void BufferToImage(OpsTestNet *net, ...@@ -755,7 +771,7 @@ void BufferToImage(OpsTestNet *net,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef()); .Finalize(net->NewOperatorDef());
// Run // TODO(liuqi): Use AddNewOperatorDef, and run all ops with same NetDef.
net->RunOp(D); net->RunOp(D);
net->Sync(); net->Sync();
...@@ -769,7 +785,7 @@ void ImageToBuffer(OpsTestNet *net, ...@@ -769,7 +785,7 @@ void ImageToBuffer(OpsTestNet *net,
const int wino_block_size = 2) { const int wino_block_size = 2) {
MACE_CHECK_NOTNULL(net); MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest") OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input(input_name) .Input(input_name)
.Output(output_name) .Output(output_name)
.AddIntArg("buffer_type", type) .AddIntArg("buffer_type", type)
......
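Taken together, the OpTestContext flags and the reworked RunOp mean a single GPU test body now executes once per configured OpenCL memory type: image-only by default, buffer-only after SetOCLBufferTestFlag(), or both after SetOCLImageAndBufferTestFlag(); the new TearDown resets the flag to image so one test's setting cannot leak into the next. A reduced sketch of that loop (stand-in types, not MACE code):

    #include <iostream>
    #include <vector>

    enum class MemoryType { GPU_IMAGE, GPU_BUFFER };

    void SetupAndRun(MemoryType type) {
      // The real RunOp calls set_mem_type(type), Setup(device), then Run().
      std::cout << (type == MemoryType::GPU_IMAGE ? "image pass\n"
                                                  : "buffer pass\n");
    }

    int main() {
      // SetOCLImageAndBufferTestFlag() would configure both entries.
      std::vector<MemoryType> opencl_mem_types = {MemoryType::GPU_IMAGE,
                                                  MemoryType::GPU_BUFFER};
      for (MemoryType type : opencl_mem_types) {
        SetupAndRun(type);
      }
      return 0;
    }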
...@@ -111,6 +111,7 @@ void Pooling(int iters, ...@@ -111,6 +111,7 @@ void Pooling(int iters,
#define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \ #define MACE_BM_POOLING(N, C, H, W, K, S, PA, PO) \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, CPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, CPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, GPU); \ MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, float, GPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, half, GPU); \
MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, uint8_t, CPU); MACE_BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, uint8_t, CPU);
......
...@@ -27,8 +27,8 @@ class ResizeBicubicOp : public Operator<D, T> { ...@@ -27,8 +27,8 @@ class ResizeBicubicOp : public Operator<D, T> {
ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, context), : Operator<D, T>(operator_def, context),
functor_(context, functor_(context,
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), OperatorBase::GetOptionalArg<bool>("align_corners", false),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1})) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -36,7 +36,7 @@ class SpaceToBatchNDOp : public Operator<D, T> { ...@@ -36,7 +36,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *space_tensor = this->Input(INPUT); const Tensor *space_tensor = this->Input(INPUT);
Tensor *batch_tensor = this->Output(OUTPUT); Tensor *batch_tensor = this->Output(OUTPUT);
return functor_(const_cast<Tensor *>(space_tensor), batch_tensor, future); return functor_(space_tensor, batch_tensor, future);
} }
private: private:
......
...@@ -106,6 +106,7 @@ def main(unused_args): ...@@ -106,6 +106,7 @@ def main(unused_args):
option.winograd = FLAGS.winograd option.winograd = FLAGS.winograd
option.quantize = FLAGS.quantize option.quantize = FLAGS.quantize
option.quantize_range_file = FLAGS.quantize_range_file option.quantize_range_file = FLAGS.quantize_range_file
option.cl_mem_type = FLAGS.cl_mem_type
input_node_names = FLAGS.input_node.split(',') input_node_names = FLAGS.input_node.split(',')
input_node_shapes = FLAGS.input_shape.split(':') input_node_shapes = FLAGS.input_shape.split(':')
...@@ -323,6 +324,11 @@ def parse_args(): ...@@ -323,6 +324,11 @@ def parse_args():
type=str, type=str,
default="", default="",
help="file path of quantize range for each tensor") help="file path of quantize range for each tensor")
parser.add_argument(
"--cl_mem_type",
type=str,
default="image",
help="which memory type to use.[image|buffer]")
return parser.parse_known_args() return parser.parse_known_args()
......
...@@ -131,8 +131,8 @@ class MaceKeyword(object): ...@@ -131,8 +131,8 @@ class MaceKeyword(object):
mace_output_node_name = 'mace_output_node' mace_output_node_name = 'mace_output_node'
mace_buffer_type = 'buffer_type' mace_buffer_type = 'buffer_type'
mace_mode = 'mode' mace_mode = 'mode'
mace_buffer_to_image = 'BufferToImage' mace_buffer_transform = 'BufferTransform'
mace_image_to_buffer = 'ImageToBuffer' mace_buffer_inverse_transform = 'BufferInverseTransform'
# arg related str # arg related str
mace_padding_str = 'padding' mace_padding_str = 'padding'
mace_padding_values_str = 'padding_values' mace_padding_values_str = 'padding_values'
...@@ -175,6 +175,7 @@ class MaceKeyword(object): ...@@ -175,6 +175,7 @@ class MaceKeyword(object):
mace_opencl_max_image_size = "opencl_max_image_size" mace_opencl_max_image_size = "opencl_max_image_size"
mace_seperate_buffer_str = 'seperate_buffer' mace_seperate_buffer_str = 'seperate_buffer'
mace_scalar_input_index_str = 'scalar_input_index' mace_scalar_input_index_str = 'scalar_input_index'
mace_opencl_mem_type = "opencl_mem_type"
class TransformerRule(Enum): class TransformerRule(Enum):
...@@ -194,7 +195,7 @@ class TransformerRule(Enum): ...@@ -194,7 +195,7 @@ class TransformerRule(Enum):
RESHAPE_FC_WEIGHT = 14 RESHAPE_FC_WEIGHT = 14
TRANSPOSE_DATA_FORMAT = 15 TRANSPOSE_DATA_FORMAT = 15
TRANSFORM_GLOBAL_CONV_TO_FC = 16 TRANSFORM_GLOBAL_CONV_TO_FC = 16
TRANSFORM_BUFFER_IMAGE = 17 ADD_BUFFER_TRANSFORM = 17
ADD_DEVICE = 18 ADD_DEVICE = 18
SORT_BY_EXECUTION = 19 SORT_BY_EXECUTION = 19
ADD_IN_OUT_TENSOR_INFO = 20 ADD_IN_OUT_TENSOR_INFO = 20
...@@ -208,6 +209,7 @@ class TransformerRule(Enum): ...@@ -208,6 +209,7 @@ class TransformerRule(Enum):
TRANSFORM_FAKE_QUANTIZE = 28 TRANSFORM_FAKE_QUANTIZE = 28
CHECK_QUANTIZE_INFO = 29 CHECK_QUANTIZE_INFO = 29
REARRANGE_BATCH_TO_SPACE = 30 REARRANGE_BATCH_TO_SPACE = 30
ADD_OPENCL_INFORMATIONS = 31
class ConverterInterface(object): class ConverterInterface(object):
...@@ -265,6 +267,7 @@ class ConverterOption(object): ...@@ -265,6 +267,7 @@ class ConverterOption(object):
self._quantize = False self._quantize = False
self._quantize_range_file = "" self._quantize_range_file = ""
self._transformer_option = None self._transformer_option = None
self._cl_mem_type = ""
@property @property
def input_nodes(self): def input_nodes(self):
...@@ -298,6 +301,10 @@ class ConverterOption(object): ...@@ -298,6 +301,10 @@ class ConverterOption(object):
def transformer_option(self): def transformer_option(self):
return self._transformer_option return self._transformer_option
@property
def cl_mem_type(self):
return self._cl_mem_type
@input_nodes.setter @input_nodes.setter
def input_nodes(self, input_nodes): def input_nodes(self, input_nodes):
for node in input_nodes: for node in input_nodes:
...@@ -338,6 +345,10 @@ class ConverterOption(object): ...@@ -338,6 +345,10 @@ class ConverterOption(object):
def transformer_option(self, transformer_option): def transformer_option(self, transformer_option):
self._transformer_option = transformer_option self._transformer_option = transformer_option
@cl_mem_type.setter
def cl_mem_type(self, cl_mem_type):
self._cl_mem_type = cl_mem_type
def disable_transpose_filters(self): def disable_transpose_filters(self):
if TransformerRule.TRANSPOSE_FILTERS in self._transformer_option: if TransformerRule.TRANSPOSE_FILTERS in self._transformer_option:
self._transformer_option.remove(TransformerRule.TRANSPOSE_FILTERS) self._transformer_option.remove(TransformerRule.TRANSPOSE_FILTERS)
...@@ -377,11 +388,12 @@ class ConverterOption(object): ...@@ -377,11 +388,12 @@ class ConverterOption(object):
# Mace model structure related transformation # Mace model structure related transformation
TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.ADD_IN_OUT_TENSOR_INFO,
# Device related transformation # Device related transformation
TransformerRule.TRANSFORM_BUFFER_IMAGE, TransformerRule.ADD_BUFFER_TRANSFORM,
TransformerRule.ADD_DEVICE, TransformerRule.ADD_DEVICE,
# Data type related transformation # Data type related transformation
TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE,
# Transform finalization # Transform finalization
TransformerRule.ADD_OPENCL_INFORMATIONS,
TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES, TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES,
# for quantization entropy calibration use # for quantization entropy calibration use
TransformerRule.SORT_BY_EXECUTION, TransformerRule.SORT_BY_EXECUTION,
......
...@@ -80,8 +80,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -80,8 +80,8 @@ class Transformer(base_converter.ConverterInterface):
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC:
self.transform_global_conv_to_fc, self.transform_global_conv_to_fc,
TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight,
TransformerRule.TRANSFORM_BUFFER_IMAGE: TransformerRule.ADD_BUFFER_TRANSFORM:
self.transform_buffer_image, self.add_buffer_transform,
TransformerRule.QUANTIZE_NODES: TransformerRule.QUANTIZE_NODES:
self.quantize_nodes, self.quantize_nodes,
TransformerRule.ADD_QUANTIZE_TENSOR_RANGE: TransformerRule.ADD_QUANTIZE_TENSOR_RANGE:
...@@ -94,6 +94,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -94,6 +94,8 @@ class Transformer(base_converter.ConverterInterface):
self.update_float_op_data_type, self.update_float_op_data_type,
TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES: TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES:
self.add_mace_input_and_output_nodes, self.add_mace_input_and_output_nodes,
TransformerRule.ADD_OPENCL_INFORMATIONS:
self.add_opencl_informations,
TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution,
TransformerRule.CHECK_QUANTIZE_INFO: TransformerRule.CHECK_QUANTIZE_INFO:
self.check_quantize_info, self.check_quantize_info,
...@@ -1269,13 +1271,13 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1269,13 +1271,13 @@ class Transformer(base_converter.ConverterInterface):
return False return False
def buffer_to_image(self, op, input_idx, input_type): def buffer_transform(self, op, input_idx, input_type):
net = self._model net = self._model
input_name = op.input[input_idx] input_name = op.input[input_idx]
op_def = net.op.add() op_def = net.op.add()
op_def.name = input_name.replace(':', '_') + "_b2i" op_def.name = input_name.replace(':', '_') + "_b2i"
output_name = op_def.name output_name = op_def.name
op_def.type = MaceKeyword.mace_buffer_to_image op_def.type = MaceKeyword.mace_buffer_transform
op_def.input.extend([input_name]) op_def.input.extend([input_name])
op_def.output.extend([output_name]) op_def.output.extend([output_name])
...@@ -1307,65 +1309,66 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1307,65 +1309,66 @@ class Transformer(base_converter.ConverterInterface):
self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
img_shape[1]) img_shape[1])
def transform_buffer_image(self): def add_buffer_transform(self):
if self._option.device != DeviceType.GPU.value: if self._option.device != DeviceType.GPU.value:
return False return False
print("Transform buffer to image") print("Add buffer transform op")
net = self._model net = self._model
for op in net.op: for op in net.op:
if op.type == MaceOp.Conv2D.name \ if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name: or op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER) self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER)
if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name: if len(op.input) >= 3 and op.type == MaceOp.Conv2D.name:
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name: elif len(op.input) >= 4 and op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT) self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.DepthwiseConv2d.name: elif op.type == MaceOp.DepthwiseConv2d.name:
self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) self.buffer_transform(op, 1, OpenCLBufferType.DW_CONV2D_FILTER)
if len(op.input) >= 3: if len(op.input) >= 3:
-                self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.BiasAdd.name:
-                self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.Eltwise.name and len(op.input) == 2:
                 if op.input[0] in self._consts \
                         and len(self._consts[op.input[0]].dims) == 1:
-                    self.buffer_to_image(op, 0, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 0, OpenCLBufferType.ARGUMENT)
                 if op.input[1] in self._consts \
                         and len(self._consts[op.input[1]].dims) == 1:
-                    self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.FoldedBatchNorm.name:
-                self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
-                self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
                 if len(op.input) >= 4:
-                    self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.MatMul.name and \
                     ConverterUtil.get_arg(op,
                                           MaceKeyword.mace_winograd_filter_transformed) is not None:  # noqa
-                self.buffer_to_image(op, 0, OpenCLBufferType.WINOGRAD_FILTER)
+                self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER)
             elif op.type == MaceOp.WinogradInverseTransform.name \
                     and len(op.input) >= 3:
-                self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.FullyConnected.name:
-                self.buffer_to_image(op, 1, OpenCLBufferType.WEIGHT_WIDTH)
+                self.buffer_transform(op, 1, OpenCLBufferType.WEIGHT_WIDTH)
                 if len(op.input) >= 3:
-                    self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.Activation.name:
                 if ConverterUtil.get_arg(op,
                                          MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
-                    self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT)
             elif op.type == MaceOp.LSTMCell.name:
                 if op.input[1] in self._consts:
-                    self.buffer_to_image(op, 1,
-                                         OpenCLBufferType.IN_OUT_CHANNEL)
-                self.buffer_to_image(op, 2, OpenCLBufferType.IN_OUT_CHANNEL)
-                self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
+                    self.buffer_transform(op, 1,
+                                          OpenCLBufferType.IN_OUT_CHANNEL)
+                self.buffer_transform(op, 2, OpenCLBufferType.IN_OUT_CHANNEL)
+                self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT)
                 if op.input[4] in self._consts:
-                    self.buffer_to_image(op, 4,
-                                         OpenCLBufferType.IN_OUT_CHANNEL)
+                    self.buffer_transform(op, 4,
+                                          OpenCLBufferType.IN_OUT_CHANNEL)

         # Add OpenCL max image size
-        arg = net.arg.add()
-        arg.name = MaceKeyword.mace_opencl_max_image_size
-        arg.ints.extend(self._opencl_max_image_size)
+        if self._option.cl_mem_type == "image":
+            arg = net.arg.add()
+            arg.name = MaceKeyword.mace_opencl_max_image_size
+            arg.ints.extend(self._opencl_max_image_size)
...
@@ -1376,7 +1379,7 @@ class Transformer(base_converter.ConverterInterface):
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(input_node.name)
-            op_def.type = MaceKeyword.mace_buffer_to_image
+            op_def.type = MaceKeyword.mace_buffer_transform
             op_def.input.extend([new_input_name])
             op_def.output.extend([input_node.name])
             output_shape = op_def.output_shape.add()
...
@@ -1394,7 +1397,7 @@ class Transformer(base_converter.ConverterInterface):
                 + '_' + output_node.name
             op_def = self._model.op.add()
             op_def.name = self.normalize_op_name(output_name)
-            op_def.type = MaceKeyword.mace_image_to_buffer
+            op_def.type = MaceKeyword.mace_buffer_inverse_transform
             op_def.input.extend([output_node.name])
             op_def.output.extend([output_name])
             if output_node.shape:
...
@@ -1920,3 +1923,16 @@ class Transformer(base_converter.ConverterInterface):
                         and op.type != MaceOp.Dequantize.name):  # noqa
                     mace_check(len(op.output) == len(op.quantize_info),
                                "missing quantize info: %s" % op)
+
+    def add_opencl_informations(self):
+        if self._option.device != DeviceType.GPU.value:
+            return False
+
+        print("Add OpenCL informations")
+        net = self._model
+        arg = net.arg.add()
+        arg.name = MaceKeyword.mace_opencl_mem_type
+        arg.i = mace_pb2.GPU_IMAGE if self._option.cl_mem_type == "image" \
+            else mace_pb2.GPU_BUFFER
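Note: the mace_opencl_mem_type value written here is an ordinary integer argument on the NetDef, so any later pass can recover it by name. Below is a minimal, self-contained sketch of that lookup; ArgStub, NetDefStub, and the enum values are hypothetical stand-ins for the real mace_pb2 messages, and the real code uses ConverterUtil.get_arg, as the memory-optimizer hunk further down shows.

GPU_IMAGE, GPU_BUFFER = 0, 1  # assumed values, for illustration only


class ArgStub(object):
    def __init__(self, name, i):
        self.name = name
        self.i = i


class NetDefStub(object):
    def __init__(self, args):
        self.arg = args


def get_arg(net_def, name):
    # Linear scan by argument name, mirroring ConverterUtil.get_arg.
    for arg in net_def.arg:
        if arg.name == name:
            return arg
    return None


net = NetDefStub([ArgStub('opencl_mem_type', GPU_BUFFER)])
mem_type_arg = get_arg(net, 'opencl_mem_type')
cl_mem_type = mem_type_arg.i if mem_type_arg is not None else None
print(cl_mem_type)  # 1, i.e. GPU_BUFFER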
...
@@ -18,6 +18,8 @@ from mace.proto import mace_pb2
 from mace.python.tools.converter_tool import base_converter as cvt
 from mace.python.tools.converter_tool.base_converter import DeviceType
+from mace.python.tools.converter_tool.base_converter import ConverterUtil
+from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import OpenCLBufferType
...
@@ -56,6 +58,10 @@ class MemoryOptimizer(object):
         self.total_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
+        ocl_mem_type_arg = ConverterUtil.get_arg(
+            net_def, MaceKeyword.mace_opencl_mem_type)
+        self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \
+            else None

         consumers = {}
         for op in net_def.op:
...
@@ -223,13 +229,13 @@ class MemoryOptimizer(object):
 class GPUMemoryOptimizer(MemoryOptimizer):
     def op_need_optimize_memory(self, op):
-        if op.type == 'BufferToImage':
+        if op.type == MaceKeyword.mace_buffer_transform:
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
-        return op.type != 'ImageToBuffer'
+        return op.type != MaceKeyword.mace_buffer_inverse_transform

-    def get_op_mem_block(self, op_type, output_shape, output_type):
+    def get_op_image_mem_block(self, op_type, output_shape):
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
             buffer_shape = list(output_shape) + [1]
             mem_block = MemoryBlock(
...
@@ -264,6 +270,16 @@ class GPUMemoryOptimizer(MemoryOptimizer):
                     buffer_shape))
         return mem_block

+    def get_op_buffer_mem_block(self, output_shape):
+        return MemoryBlock(mace_pb2.GPU_BUFFER,
+                           [reduce(operator.mul, output_shape, 1), 1])
+
+    def get_op_mem_block(self, op_type, output_shape, output_type):
+        if self.cl_mem_type == mace_pb2.GPU_IMAGE:
+            return self.get_op_image_mem_block(op_type, output_shape)
+        else:
+            return self.get_op_buffer_mem_block(output_shape)
+
     def mem_size(self, memory_block):
         if memory_block.mem_type == mace_pb2.GPU_IMAGE:
             return memory_block.block[0] * memory_block.block[1] * 4
...
@@ -295,6 +311,7 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             max_image_size_x = max(max_image_size_x, block.x)
             max_image_size_y = max(max_image_size_y, block.y)

-        # Update OpenCL max image size
-        net_ocl_max_img_size_arg = None
-        for arg in self.net_def.arg:
+        if self.cl_mem_type == mace_pb2.GPU_IMAGE:
+            # Update OpenCL max image size
+            net_ocl_max_img_size_arg = None
+            for arg in self.net_def.arg:
...
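The two sizing schemes above differ in shape and units: an image block is a 2-D extent in pixels where each pixel packs 4 channel values, while a buffer block is simply the flattened element count of the output tensor. A small, self-contained sketch of the arithmetic follows; the function names are illustrative, not MACE's API.

import operator
from functools import reduce  # `reduce` is a builtin in Python 2, which these tools target


def buffer_block(output_shape):
    # GPU_BUFFER: [total element count, 1], as in get_op_buffer_mem_block.
    return [reduce(operator.mul, output_shape, 1), 1]


def image_mem_size(block):
    # GPU_IMAGE: width * height pixels, 4 values per pixel (RGBA channels),
    # matching the factor of 4 in mem_size above.
    return block[0] * block[1] * 4


print(buffer_block([1, 224, 224, 3]))  # [150528, 1]
print(image_mem_size([224, 224]))      # 200704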
...
@@ -69,7 +69,7 @@ void BufferToImage(const std::string &input_name,
                    const int mode = NetMode::NORMAL) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp")
+  ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
@@ -93,7 +93,7 @@ void ImageToBuffer(const std::string &input_name,
                    NetDef *net_def) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp")
+  ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
...
@@ -70,7 +70,7 @@ void BufferToImage(const std::string &input_name,
                    const int mode = NetMode::NORMAL) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("BufferToImage", "BufferToImageOp")
+  ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
@@ -95,7 +95,7 @@ void ImageToBuffer(const std::string &input_name,
                    NetDef *net_def) {
   OperatorDef operator_def;
-  ops::test::OpDefBuilder("ImageToBuffer", "ImageToBufferOp")
+  ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp")
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", buffer_type)
...
...
@@ -33,6 +33,12 @@ namespace mace {
   CLASSNAME &operator=(const CLASSNAME &) = delete
 #endif

+#ifndef MACE_VIRTUAL_EMPTY_DESTRUCTOR
+#define MACE_VIRTUAL_EMPTY_DESTRUCTOR(CLASSNAME) \
+ public:                                         \
+  virtual ~CLASSNAME() {}
+#endif
+
 template <typename Integer>
 Integer RoundUp(Integer i, Integer factor) {
   return (i + factor - 1) / factor * factor;
...
...
@@ -23,30 +23,38 @@ def _opencl_encrypt_kernel_impl(repository_ctx):
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/activation.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/addn.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_norm.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/batch_to_space.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/bias_add.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_to_image.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/buffer_transform.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/channel_shuffle.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/common.h"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/concat.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_3x3.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/conv_2d_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/crop.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/deconv_2d.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depth_to_space.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/eltwise.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/fully_connected.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/lstmcell.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/matmul.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pad.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/pooling_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/reduce_mean.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bicubic.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/resize_bilinear.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/split.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/softmax_buffer.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_batch.cl"))
+    unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/space_to_depth.cl"))
     unused_var = repository_ctx.path(Label("//:mace/kernels/opencl/cl/winograd_transform.cl"))

     python_bin_path = repository_ctx.which("python")
...
...
@@ -203,6 +203,7 @@ class YAMLKeyword(object):
     validation_inputs_data = 'validation_inputs_data'
     validation_threshold = 'validation_threshold'
     graph_optimize_options = 'graph_optimize_options'  # internal use for now
+    cl_mem_type = 'cl_mem_type'


 class ModuleName(object):
...
@@ -692,7 +693,7 @@ def get_model_files(model_file_path,
     return model_file, weight_file


-def convert_model(configs):
+def convert_model(configs, cl_mem_type):
     # Remove previous output dirs
     library_name = configs[YAMLKeyword.library_name]
     if not os.path.exists(BUILD_OUTPUT_DIR):
...
@@ -735,6 +736,10 @@ def convert_model(configs):
             StringFormatter.block("Convert %s model" % model_name))
         model_config = configs[YAMLKeyword.models][model_name]
         runtime = model_config[YAMLKeyword.runtime]
+        if cl_mem_type:
+            model_config[YAMLKeyword.cl_mem_type] = cl_mem_type
+        else:
+            model_config[YAMLKeyword.cl_mem_type] = "image"

         model_file_path, weight_file_path = get_model_files(
             model_config[YAMLKeyword.model_file_path],
...
@@ -769,6 +774,7 @@ def convert_model(configs):
             model_config[YAMLKeyword.obfuscate],
             configs[YAMLKeyword.model_graph_format],
             data_type,
+            model_config[YAMLKeyword.cl_mem_type],
             ",".join(model_config.get(YAMLKeyword.graph_optimize_options, [])))

         if configs[YAMLKeyword.model_graph_format] == ModelFormat.file:
...
@@ -844,7 +850,7 @@ def convert_func(flags):
     print_configuration(configs)

-    convert_model(configs)
+    convert_model(configs, flags.cl_mem_type)

     if configs[YAMLKeyword.model_graph_format] == ModelFormat.code:
         build_model_lib(configs, flags.address_sanitizer)
...
@@ -1683,6 +1689,11 @@ def parse_args():
         'convert',
         parents=[all_type_parent_parser, convert_run_parent_parser],
         help='convert to mace model (file or code)')
+    convert.add_argument(
+        "--cl_mem_type",
+        type=str,
+        default=None,
+        help="Which type of OpenCL memory type to use [image | buffer].")
     convert.set_defaults(func=convert_func)
     run = subparsers.add_parser(
         'run',
...
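Because the new --cl_mem_type flag defaults to None, convert_model can distinguish "not given" from an explicit choice and fall back to the image path. A self-contained sketch of that precedence follows; resolve_cl_mem_type is an illustrative helper, not part of the tool.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cl_mem_type",
    type=str,
    default=None,
    help="Which type of OpenCL memory type to use [image | buffer].")


def resolve_cl_mem_type(flag_value):
    # Mirrors convert_model above: an explicit flag wins, otherwise "image".
    return flag_value if flag_value else "image"


args = parser.parse_args(["--cl_mem_type", "buffer"])
print(resolve_cl_mem_type(args.cl_mem_type))  # buffer
print(resolve_cl_mem_type(None))              # image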
...
@@ -340,7 +340,7 @@ def bazel_build(target,
                 enable_neon=True,
                 enable_opencl=True,
                 address_sanitizer=False,
-                symbol_hidden=False,
+                symbol_hidden=True,
                 extra_args=""):
     print("* Build %s with ABI %s" % (target, abi))
     if abi == "host":
...
@@ -560,6 +560,7 @@ def gen_model_code(model_codegen_dir,
                    obfuscate,
                    model_graph_format,
                    data_type,
+                   cl_mem_type,
                    graph_optimize_options):
     bazel_build_common("//mace/python/tools:converter")
...
@@ -591,6 +592,7 @@ def gen_model_code(model_codegen_dir,
         "--model_graph_format=%s" % model_graph_format,
         "--data_type=%s" % data_type,
         "--graph_optimize_options=%s" % graph_optimize_options,
+        "--cl_mem_type=%s" % cl_mem_type,
         _fg=True)
...
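End to end, the option travels from the command line through gen_model_code into the converter invocation as one more --key=value argument. A sketch of that assembly step; build_converter_args is a hypothetical stand-in for the sh.python call the real tool uses.

def build_converter_args(model_graph_format, data_type,
                         graph_optimize_options, cl_mem_type):
    # The converter binary receives the memory type like any other option.
    return [
        "--model_graph_format=%s" % model_graph_format,
        "--data_type=%s" % data_type,
        "--graph_optimize_options=%s" % graph_optimize_options,
        "--cl_mem_type=%s" % cl_mem_type,
    ]


print(build_converter_args("file", "fp16_fp32", "", "buffer"))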