Merge branch 'gpu-buffer' into 'master'

Refactor OpenCL kernel for supporting buffer. See merge request !803

Merge branch 'gpu-buffer' into 'master'
Refactor OpenCL kernel for supporting buffer. See merge request !803
e2a40a03 · 刘琦 · f2f0a76c · e3fb9821 · e2a40a03 · e2a40a03
183 changed file
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,6 +8,7 @@ stages:
  - ops_test
  - api_test
  - python_tools_tests
+  - model_tests
  - build_android_demo
  - ops_benchmark
  - extra_tests
@@ -113,6 +114,18 @@ python_tools_tests:
      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
      python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+
+model_tests:
+  stage: model_tests
+  script:
+    - pwd
+    - rm -rf mace-models
+    - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
+    - CONF_FILE=mace-models/mobilenet-v1/mobilenet-v1.yml
+    - >
+      python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file --cl_mem_type=buffer || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
    - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
    - >
      python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;

--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -14,6 +14,8 @@

 #include "mace/core/device.h"

+#include "mace/core/buffer.h"
+
 namespace mace {

 CPUDevice::CPUDevice(const int num_threads,
@@ -21,7 +23,8 @@ CPUDevice::CPUDevice(const int num_threads,
                     const bool use_gemmlowp)
    : cpu_runtime_(new CPURuntime(num_threads,
                                  policy,
-                                  use_gemmlowp)) {}
+                                  use_gemmlowp)),
+      scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}

 CPUDevice::~CPUDevice() = default;

@@ -31,6 +34,7 @@ CPURuntime *CPUDevice::cpu_runtime() {

 #ifdef MACE_ENABLE_OPENCL
 OpenCLRuntime *CPUDevice::opencl_runtime() {
+  LOG(FATAL) << "CPU device should not call OpenCL Runtime";
  return nullptr;
 }
 #endif
@@ -43,4 +47,8 @@ DeviceType CPUDevice::device_type() const {
  return DeviceType::CPU;
 }

+ScratchBuffer *CPUDevice::scratch_buffer() {
+  return scratch_buffer_.get();
+}
+
 }  // namespace mace
--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -26,6 +26,8 @@

 namespace mace {

+class ScratchBuffer;
+
 class Device {
 public:
  virtual ~Device() {}
@@ -37,6 +39,7 @@ class Device {

  virtual Allocator *allocator() = 0;
  virtual DeviceType device_type() const = 0;
+  virtual ScratchBuffer *scratch_buffer() = 0;
 };

 class CPUDevice : public Device {
@@ -53,9 +56,11 @@ class CPUDevice : public Device {

  Allocator *allocator() override;
  DeviceType device_type() const override;
+  ScratchBuffer *scratch_buffer() override;

 private:
  std::unique_ptr<CPURuntime> cpu_runtime_;
+  std::unique_ptr<ScratchBuffer> scratch_buffer_;
 };

 }  // namespace mace

--- a/mace/core/future.h
+++ b/mace/core/future.h
@@ -15,7 +15,9 @@
 #ifndef MACE_CORE_FUTURE_H_
 #define MACE_CORE_FUTURE_H_

+#include <algorithm>
 #include <functional>
+#include <vector>

 #include "mace/utils/logging.h"

@@ -25,9 +27,7 @@ struct CallStats;

 // Wait the call to finish and get the stats if param is not nullptr
 struct StatsFuture {
-  std::function<void(CallStats *)> wait_fn = [](CallStats *) {
-    LOG(FATAL) << "wait_fn must be properly set";
-  };
+  std::function<void(CallStats *)> wait_fn;
 };

 inline void SetFutureDefaultWaitFn(StatsFuture *future) {
@@ -41,6 +41,29 @@ inline void SetFutureDefaultWaitFn(StatsFuture *future) {
  }
 }

+inline void MergeMultipleFutureWaitFn(
+    const std::vector<StatsFuture> &org_futures,
+    StatsFuture *dst_future) {
+  if (dst_future != nullptr) {
+    dst_future->wait_fn = [org_futures](CallStats *stats) {
+      if (stats != nullptr) {
+        stats->start_micros = INT64_MAX;
+        stats->end_micros = 0;
+        for (auto &org_future : org_futures) {
+          CallStats tmp_stats;
+          if (org_future.wait_fn != nullptr) {
+            org_future.wait_fn(&tmp_stats);
+            stats->start_micros = std::min(stats->start_micros,
+                                           tmp_stats.start_micros);
+            stats->end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+          }
+        }
+        stats->end_micros += stats->start_micros;
+      }
+    };
+  }
+}
+
 }  // namespace mace

 #endif  // MACE_CORE_FUTURE_H_
--- a/mace/core/runtime/opencl/gpu_device.cc
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -14,6 +14,8 @@

 #include "mace/core/runtime/opencl/gpu_device.h"

+#include "mace/core/buffer.h"
+
 namespace mace {

 GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
@@ -27,7 +29,8 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
    CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
    runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                               opencl_binary_storage, tuner)),
-    allocator_(new OpenCLAllocator(runtime_.get())) {}
+    allocator_(new OpenCLAllocator(runtime_.get())),
+    scratch_buffer_(new ScratchBuffer(allocator_.get())) {}

 GPUDevice::~GPUDevice() = default;

@@ -43,4 +46,8 @@ DeviceType GPUDevice::device_type() const {
  return DeviceType::GPU;
 }

+ScratchBuffer *GPUDevice::scratch_buffer() {
+  return scratch_buffer_.get();
+}
+
 }  // namespace mace
--- a/mace/core/runtime/opencl/gpu_device.h
+++ b/mace/core/runtime/opencl/gpu_device.h
@@ -37,9 +37,11 @@ class GPUDevice : public CPUDevice {
  OpenCLRuntime *opencl_runtime() override;
  Allocator *allocator() override;
  DeviceType device_type() const override;
+  ScratchBuffer *scratch_buffer() override;
 private:
  std::unique_ptr<OpenCLRuntime> runtime_;
  std::unique_ptr<OpenCLAllocator> allocator_;
+  std::unique_ptr<ScratchBuffer> scratch_buffer_;
 };

 }  // namespace mace

--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -31,8 +31,6 @@

 namespace mace {

-std::string kOpenCLParameterPath;  // NOLINT(runtime/string)
-
 extern const std::map<std::string, std::vector<unsigned char>>
    kEncryptedProgramMap;

@@ -286,7 +284,8 @@ OpenCLRuntime::OpenCLRuntime(
    is_opencl_avaliable_(false),
    is_profiling_enabled_(false),
    opencl_version_(CL_VER_UNKNOWN),
-    gpu_type_(UNKNOWN) {
+    gpu_type_(UNKNOWN),
+    mem_type_(MemoryType::GPU_IMAGE) {
  std::vector<cl::Platform> all_platforms;
  cl::Platform::get(&all_platforms);
  if (all_platforms.size() == 0) {
@@ -471,6 +470,14 @@ uint32_t OpenCLRuntime::device_compute_units() const {
  return device_compute_units_;
 }

+bool OpenCLRuntime::UseImageMemory() {
+  return this->mem_type_ == MemoryType::GPU_IMAGE;
+}
+
+void OpenCLRuntime::set_mem_type(MemoryType type) {
+  this->mem_type_ = type;
+}
+
 bool OpenCLRuntime::BuildProgramFromCache(
    const std::string &built_program_key,
    const std::string &build_options_str,

--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -25,6 +25,7 @@
 #include "mace/core/file_storage.h"
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/proto/mace.pb.h"
 #include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/tuner.h"
@@ -82,6 +83,9 @@ class OpenCLRuntime {
  uint32_t device_compute_units() const;
  Tuner<uint32_t> *tuner();
  bool is_opencl_avaliable();
+  // TODO(liuqi): remove this function in the future, make decision at runtime.
+  bool UseImageMemory();
+  void set_mem_type(MemoryType type);

  void GetCallStats(const cl::Event &event, CallStats *stats);
  uint64_t GetDeviceMaxWorkGroupSize();
@@ -129,6 +133,7 @@ class OpenCLRuntime {
  bool is_profiling_enabled_;
  OpenCLVersion opencl_version_;
  GPUType gpu_type_;
+  MemoryType mem_type_;
  // All OpenCL object must be a pointer and manually deleted before unloading
  // OpenCL library.
  std::shared_ptr<cl::Context> context_;

--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -101,13 +101,14 @@ enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
 class Tensor {
 public:
  Tensor(Allocator *alloc, DataType type,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
      : allocator_(alloc),
        dtype_(type),
        buffer_(nullptr),
        is_buffer_owner_(true),
        unused_(false),
-        name_(""),
+        name_(name),
        is_weight_(is_weight),
        scale_(0.f),
        zero_point_(0),
@@ -115,12 +116,13 @@ class Tensor {
        maxval_(0.f) {}

  Tensor(BufferBase *buffer, DataType dtype,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
    : dtype_(dtype),
      buffer_(buffer),
      is_buffer_owner_(false),
      unused_(false),
-      name_(""),
+      name_(name),
      is_weight_(is_weight),
      scale_(0.f),
      zero_point_(0),
@@ -129,12 +131,13 @@ class Tensor {

  Tensor(const BufferSlice &buffer_slice,
         DataType dtype,
-         bool is_weight = false)
+         bool is_weight = false,
+         const std::string name = "")
      : dtype_(dtype),
        buffer_slice_(buffer_slice),
        is_buffer_owner_(false),
        unused_(false),
-        name_(""),
+        name_(name),
        is_weight_(is_weight),
        scale_(0.f),
        zero_point_(0),
@@ -152,6 +155,8 @@ class Tensor {
    }
  }

+  inline std::string name() const { return name_; }
+
  inline DataType dtype() const { return dtype_; }

  inline void SetDtype(DataType dtype) { dtype_ = dtype; }
@@ -188,11 +193,15 @@ class Tensor {
    shape_configured_ = shape_configured;
  }

+  inline const std::vector<index_t> &buffer_shape() const {
+    return buffer_shape_;
+  }
+
  inline index_t dim_size() const { return shape_.size(); }

  inline index_t dim(unsigned int index) const {
-    MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
-               shape_.size());
+    MACE_CHECK(index < shape_.size(),
+               name_, ": Dim out of range: ", index, " >= ", shape_.size());
    return shape_[index];
  }

@@ -214,12 +223,12 @@ class Tensor {

 #ifdef MACE_ENABLE_OPENCL
  inline cl::Image *opencl_image() const {
-    MACE_CHECK(has_opencl_image(), "do not have image");
+    MACE_CHECK(has_opencl_image(), name_, " do not have image");
    return static_cast<cl::Image *>(buffer_->buffer());
  }

  inline cl::Buffer *opencl_buffer() const {
-    MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
+    MACE_CHECK(has_opencl_buffer(), name_, " do not have opencl buffer");
    return static_cast<cl::Buffer *>(buffer_->buffer());
  }
 #endif
@@ -268,12 +277,14 @@ class Tensor {

  inline MaceStatus Resize(const std::vector<index_t> &shape) {
    shape_ = shape;
+    buffer_shape_ = shape;
    image_shape_.clear();
    if (buffer_ != nullptr) {
-      MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
+      MACE_CHECK(!has_opencl_image(),
+                 name_, ": Cannot resize image, use ResizeImage.");
      if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) {
-        LOG(WARNING) << "Resize buffer from size " << buffer_->size() << " to "
-                     << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
+        LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size()
+                     << " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE;
        return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE);
      }
      return MaceStatus::MACE_SUCCESS;
@@ -296,19 +307,22 @@ class Tensor {
    allocator_ = other.allocator_;
    dtype_ = other.dtype_;
    shape_ = other.shape_;
+    buffer_shape_ = other.buffer_shape_;
    image_shape_ = other.image_shape_;
  }

  inline MaceStatus ResizeImage(const std::vector<index_t> &shape,
                                const std::vector<size_t> &image_shape) {
    shape_ = shape;
+    buffer_shape_ = shape;
    image_shape_ = image_shape;
    if (buffer_ == nullptr) {
      MACE_CHECK(is_buffer_owner_);
      buffer_ = new Image(allocator_);
      return buffer_->Allocate(image_shape, dtype_);
    } else {
-      MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
+      MACE_CHECK(has_opencl_image(),
+                 name_, ": Cannot ResizeImage buffer, use Resize.");
      Image *image = dynamic_cast<Image *>(buffer_);
      MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
                     image_shape[1] <= image->image_shape()[1],
@@ -366,8 +380,6 @@ class Tensor {

  inline BufferBase *UnderlyingBuffer() const { return buffer_; }

-  inline void SetSourceOpName(const std::string name) { name_ = name; }
-
  inline void DebugPrint() const {
    using namespace numerical_chars;  // NOLINT(build/namespaces)
    std::stringstream os;
@@ -459,9 +471,12 @@ class Tensor {
 private:
  Allocator *allocator_;
  DataType dtype_;
+  // the shape of buffer(logical)
  std::vector<index_t> shape_;
  std::vector<index_t> shape_configured_;
  std::vector<size_t> image_shape_;
+  // the shape of buffer(physical storage)
+  std::vector<index_t> buffer_shape_;
  BufferBase *buffer_;
  BufferSlice buffer_slice_;
  bool is_buffer_owner_;

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -44,8 +44,7 @@ bool HasQuantizeOp(const NetDef &net_def) {
 }
 }  // namespace

-Workspace::Workspace() :
-    host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
+Workspace::Workspace() = default;

 Tensor *Workspace::CreateTensor(const std::string &name,
                                Allocator *alloc,
@@ -54,8 +53,8 @@ Tensor *Workspace::CreateTensor(const std::string &name,
    VLOG(3) << "Tensor " << name << " already exists. Skipping.";
  } else {
    VLOG(3) << "Creating Tensor " << name;
-    tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type));
-    tensor_map_[name]->SetSourceOpName(name);
+    tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
+                                                           false, name));
  }
  return GetTensor(name);
 }
@@ -171,7 +170,10 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
                tensor_buffer_.get(), const_tensor.offset(),
                const_tensor.data_size() *
                    GetEnumTypeSize(const_tensor.data_type())),
-                       const_tensor.data_type(), true));
+                       const_tensor.data_type(),
+                       true,
+                       const_tensor.name()));
+
        tensor->Reshape(dims);
        tensor->SetScale(const_tensor.scale());
        tensor->SetZeroPoint(const_tensor.zero_point());
@@ -275,7 +277,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
        std::unique_ptr<BufferBase> tensor_buf(
            new Buffer(device->allocator()));
        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)));
+            mem_block.x() * GetEnumTypeSize(dtype)
+                + MACE_EXTRA_BUFFER_PAD_SIZE));
        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                          std::move(tensor_buf));
      }
@@ -301,10 +304,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
          }
          std::unique_ptr<Tensor> tensor
              (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
-                          output_type));
-          tensor->SetSourceOpName(op.name());
-          if (device_type == DeviceType::GPU) {
-            VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+                          output_type, false, op.output(i)));
+          if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
+            VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
                    << " Mem: " << mem_ids[i]
                    << " Image shape: "
                    << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
@@ -312,8 +314,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                    << ", "
                    << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
                        ->image_shape()[1];
-          } else if (device_type == DeviceType::CPU) {
-            VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+          } else {
+            VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
                    << " Mem: " << mem_ids[i]
                    << ", Buffer size: " << tensor->UnderlyingBuffer()->size();
          }
@@ -356,14 +358,6 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
  return MaceStatus::MACE_SUCCESS;
 }

-ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
-  if (device_type == CPU) {
-    return host_scratch_buffer_.get();
-  } else {
-    return nullptr;
-  }
-}
-
 void Workspace::RemoveUnusedBuffer() {
  auto iter = tensor_map_.begin();
  auto end_iter = tensor_map_.end();

--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -52,8 +52,6 @@ class Workspace {
                             Device *device,
                             const unsigned char *model_data);

-  ScratchBuffer *GetScratchBuffer(DeviceType device_type);
-
  void RemoveUnusedBuffer();

  void RemoveAndReloadBuffer(const NetDef &net_def,
@@ -64,15 +62,12 @@ class Workspace {
  MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
                                      Device *device);

-  Device *device_;
-
  TensorMap tensor_map_;

  std::unique_ptr<BufferBase> tensor_buffer_;

  PreallocatedPooledAllocator preallocated_allocator_;

-  std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
  bool fused_buffer_;

  MACE_DISABLE_COPY_AND_ASSIGN(Workspace);

--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -32,6 +32,8 @@ cc_library(
    ) + if_opencl_enabled(glob(
        [
            "opencl/*.cc",
+            "opencl/image/*.cc",
+            "opencl/buffer/*.cc",
        ],
        exclude = [
            "opencl/*_test.cc",
@@ -43,14 +45,16 @@ cc_library(
            "arm/*.h",
        ],
        exclude = [
-            "buffer_to_image.h",
-            "image_to_buffer.h",
+            "buffer_transform.h",
+            "buffer_inverse_transform.h",
            "lstmcell.h",
        ],
    ) + if_opencl_enabled(glob([
        "opencl/*.h",
-        "buffer_to_image.h",
-        "image_to_buffer.h",
+        "opencl/image/*.h",
+        "opencl/buffer/*.h",
+        "buffer_transform.h",
+        "buffer_inverse_transform.h",
        "lstmcell.h",
    ])),
    copts = [

--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -26,10 +26,6 @@
 #include "mace/core/types.h"
 #include "mace/kernels/kernel.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -164,15 +160,22 @@ class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLActivationKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *alpha,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel);
+};
 template <typename T>
 class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
 public:
  ActivationFunctor(OpKernelContext *context,
                    ActivationType type,
-                    T relux_max_limit)
-      : OpKernel(context),
-        activation_(type),
-        relux_max_limit_(static_cast<T>(relux_max_limit)) {}
+                    T relux_max_limit);

  MaceStatus operator()(const Tensor *input,
                        const Tensor *alpha,
@@ -180,13 +183,7 @@ class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
                        StatsFuture *future);

 private:
-  ActivationType activation_;
-  T relux_max_limit_;
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::string tuning_key_prefix_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLActivationKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -26,10 +26,6 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/kernel.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -96,17 +92,23 @@ struct AddNFunctor : OpKernel {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLAddNKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_tensors,
+      Tensor *output_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel);
+};
 template <typename T>
 struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
-  explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
+  explicit AddNFunctor(OpKernelContext *context);
  MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
                        Tensor *output_tensor,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLAddNKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -26,41 +26,22 @@
 #include "mace/kernels/activation.h"
 #include "mace/public/mace.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

-struct BatchNormFunctorBase : OpKernel {
-  BatchNormFunctorBase(OpKernelContext *context,
-                       bool folded_constant,
-                       const ActivationType activation,
-                       const float relux_max_limit)
-    : OpKernel(context),
-      folded_constant_(folded_constant),
-      activation_(activation),
-      relux_max_limit_(relux_max_limit) {}
-
-  const bool folded_constant_;
-  const ActivationType activation_;
-  const float relux_max_limit_;
-};
-
 template<DeviceType D, typename T>
 struct BatchNormFunctor;

 template<>
-struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::CPU, float> : OpKernel {
  BatchNormFunctor(OpKernelContext *context,
                   const bool folded_constant,
                   const ActivationType activation,
                   const float relux_max_limit)
-      : BatchNormFunctorBase(context,
-                             folded_constant,
-                             activation,
-                             relux_max_limit) {}
+      : OpKernel(context),
+        folded_constant_(folded_constant),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit) {}

  MaceStatus operator()(const Tensor *input,
                  const Tensor *scale,
@@ -133,19 +114,33 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {

    return MACE_SUCCESS;
  }
+
+  const bool folded_constant_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBatchNormKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *scale,
+      const Tensor *offset,
+      const Tensor *mean,
+      const Tensor *var,
+      const float epsilon,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel);
+};
 template<typename T>
-struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
+struct BatchNormFunctor<DeviceType::GPU, T> : OpKernel {
  BatchNormFunctor(OpKernelContext *context,
                   const bool folded_constant,
                   const ActivationType activation,
-                   const float relux_max_limit)
-      : BatchNormFunctorBase(context,
-                             folded_constant,
-                             activation,
-                             relux_max_limit) {}
+                   const float relux_max_limit);
  MaceStatus operator()(const Tensor *input,
                        const Tensor *scale,
                        const Tensor *offset,
@@ -154,10 +149,7 @@ struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
                        const float epsilon,
                        Tensor *output,
                        StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBatchNormKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/batch_to_space.h
+++ b/mace/kernels/batch_to_space.h
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -51,7 +47,8 @@ struct BatchToSpaceFunctorBase : OpKernel {
  void CalculateBatchToSpaceOutputShape(const Tensor *input_tensor,
                                        const DataFormat data_format,
                                        index_t *output_shape) {
-    MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
+    MACE_CHECK(input_tensor->dim_size() == 4,
+               "Input(", input_tensor->name(), ") shape should be 4D");
    index_t batch = input_tensor->dim(0);
    index_t channels = 0;
    index_t height = 0;
@@ -96,8 +93,8 @@ struct BatchToSpaceFunctor<DeviceType::CPU, float> : BatchToSpaceFunctorBase {
                      const std::vector<int> &block_shape)
      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}

-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                        StatsFuture *future) {
    MACE_UNUSED(future);

@@ -191,8 +188,8 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
                      const std::vector<int> &block_shape)
      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}

-  MaceStatus operator()(Tensor *space_tensor,
-                        Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                        StatsFuture *future) {
    MACE_UNUSED(future);

@@ -272,21 +269,29 @@ struct BatchToSpaceFunctor<CPU, uint8_t> : BatchToSpaceFunctorBase {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBatchToSpaceKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *batch_tensor,
+      const std::vector<int> &paddings,
+      const std::vector<int> &block_shape,
+      const std::vector<index_t> &output_shape,
+      Tensor *space_tensor,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel);
+};
 template <typename T>
 struct BatchToSpaceFunctor<DeviceType::GPU, T> : BatchToSpaceFunctorBase {
  BatchToSpaceFunctor(OpKernelContext *context,
                      const std::vector<int> &paddings,
-                      const std::vector<int> &block_shape)
-      : BatchToSpaceFunctorBase(context, paddings, block_shape) {}
+                      const std::vector<int> &block_shape);

-  MaceStatus operator()(Tensor *space_tensor,
-                  Tensor *batch_tensor,
+  MaceStatus operator()(const Tensor *batch_tensor,
+                        Tensor *space_tensor,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> space_shape_;
+  std::unique_ptr<OpenCLBatchToSpaceKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -24,10 +24,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -96,18 +92,26 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLBiasAddKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *bias,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel);
+};
+
 template <typename T>
 struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
-  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
-      : BiasAddFunctorBase(context, data_format) {}
+  BiasAddFunctor(OpKernelContext *context, const DataFormat data_format);
  MaceStatus operator()(const Tensor *input,
                        const Tensor *bias,
                        Tensor *output,
                        StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+
+  std::unique_ptr<OpenCLBiasAddKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
-#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
+#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
+#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_

 #include <memory>
 #include <vector>
@@ -26,18 +26,19 @@
 namespace mace {
 namespace kernels {

-struct BufferToImageFunctorBase : OpKernel {
-  explicit BufferToImageFunctorBase(OpKernelContext *context,
+struct BufferInverseTransformFunctorBase : OpKernel {
+  BufferInverseTransformFunctorBase(OpKernelContext *context,
                                    const int wino_blk_size)
-    : OpKernel(context), wino_blk_size_(wino_blk_size) {}
+    : OpKernel(context),
+      wino_blk_size_(wino_blk_size) {}
  const int wino_blk_size_;
 };

 template <DeviceType D, typename T>
-struct BufferToImageFunctor : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(OpKernelContext *context,
+struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase {
+  explicit BufferInverseTransformFunctor(OpKernelContext *context,
                                         const int wino_blk_size)
-      : BufferToImageFunctorBase(context, wino_blk_size) {}
+    : BufferInverseTransformFunctorBase(context, wino_blk_size) {}
  MaceStatus operator()(const Tensor *input,
                        const BufferType type,
                        Tensor *output,
@@ -51,22 +52,31 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
  }
 };

+class OpenCLBufferInverseTransformKernel {
+ public:
+  virtual MaceStatus Compute(OpKernelContext *context,
+                             const Tensor *input,
+                             const BufferType type,
+                             const int wino_blk_size,
+                             Tensor *output,
+                             StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel)
+};
+
 template <typename T>
-struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
-  explicit BufferToImageFunctor(OpKernelContext *context,
-                                const int wino_blk_size)
-      : BufferToImageFunctorBase(context, wino_blk_size) {}
+struct BufferInverseTransformFunctor<DeviceType::GPU, T>
+    : BufferInverseTransformFunctorBase {
+  explicit BufferInverseTransformFunctor(OpKernelContext *context,
+                                         const int wino_blk_size);
  MaceStatus operator()(const Tensor *input,
                        const BufferType type,
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
 };

 }  // namespace kernels
 }  // namespace mace

-#endif  // MACE_KERNELS_BUFFER_TO_IMAGE_H_
+#endif  // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_
--- a/mace/kernels/image_to_buffer.h
+++ b/mace/kernels/image_to_buffer.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_KERNELS_IMAGE_TO_BUFFER_H_
-#define MACE_KERNELS_IMAGE_TO_BUFFER_H_
+#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_
+#define MACE_KERNELS_BUFFER_TRANSFORM_H_

 #include <memory>
 #include <vector>
@@ -26,18 +26,19 @@
 namespace mace {
 namespace kernels {

-struct ImageToBufferFunctorBase : OpKernel {
-  ImageToBufferFunctorBase(OpKernelContext *context,
+struct BufferTransformFunctorBase : OpKernel {
+  explicit BufferTransformFunctorBase(OpKernelContext *context,
                                      const int wino_blk_size)
-    : OpKernel(context),
-      wino_blk_size_(wino_blk_size) {}
+    : OpKernel(context), wino_blk_size_(wino_blk_size) {}
  const int wino_blk_size_;
 };

 template <DeviceType D, typename T>
-struct ImageToBufferFunctor : ImageToBufferFunctorBase {
-  ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
-    : ImageToBufferFunctorBase(context, wino_blk_size) {}
+struct BufferTransformFunctor : BufferTransformFunctorBase {
+  BufferTransformFunctor(OpKernelContext *context,
+                         const int wino_blk_size)
+      : BufferTransformFunctorBase(context, wino_blk_size) {}
+
  MaceStatus operator()(const Tensor *input,
                        const BufferType type,
                        Tensor *output,
@@ -51,22 +52,30 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
  }
 };

+class OpenCLBufferTransformKernel {
+ public:
+  virtual MaceStatus Compute(OpKernelContext *context,
+                             const Tensor *input,
+                             const BufferType type,
+                             const int wino_blk_size,
+                             Tensor *output,
+                             StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel)
+};
+
 template <typename T>
-struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
-  ImageToBufferFunctor(OpKernelContext *context,
-                                const int wino_blk_size)
-      : ImageToBufferFunctorBase(context, wino_blk_size) {}
+struct BufferTransformFunctor<DeviceType::GPU, T> : BufferTransformFunctorBase {
+  BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size);
+
  MaceStatus operator()(const Tensor *input,
                        const BufferType type,
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
 };

 }  // namespace kernels
 }  // namespace mace

-#endif  // MACE_KERNELS_IMAGE_TO_BUFFER_H_
+#endif  // MACE_KERNELS_BUFFER_TRANSFORM_H_
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -71,20 +71,24 @@ struct ChannelShuffleFunctor : OpKernel {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLChannelShuffleKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel);
+};
 template<typename T>
 struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
-  ChannelShuffleFunctor(OpKernelContext *context, const int groups)
-      : OpKernel(context), groups_(groups) {}
+  ChannelShuffleFunctor(OpKernelContext *context, const int groups);

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  const int groups_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLChannelShuffleKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -24,24 +24,13 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

-struct ConcatFunctorBase : OpKernel {
-  ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
-      : OpKernel(context), axis_(axis) {}
-
-  int32_t axis_;
-};
-
 template <DeviceType D, typename T>
-struct ConcatFunctor : ConcatFunctorBase {
+struct ConcatFunctor : OpKernel {
  ConcatFunctor(OpKernelContext *context, const int32_t axis)
-      : ConcatFunctorBase(context, axis) {}
+      : OpKernel(context), axis_(axis) {}

  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                        Tensor *output,
@@ -98,21 +87,29 @@ struct ConcatFunctor : ConcatFunctorBase {

    return MACE_SUCCESS;
  }
+
+  int32_t axis_;
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLConcatKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_list,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel);
+};
 template <typename T>
-struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
-  ConcatFunctor(OpKernelContext *context, const int32_t axis)
-      : ConcatFunctorBase(context, axis) {}
+struct ConcatFunctor<DeviceType::GPU, T> : OpKernel {
+  ConcatFunctor(OpKernelContext *context, const int32_t axis);

  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                        Tensor *output,
                        StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+
+  std::unique_ptr<OpenCLConcatKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -35,10 +35,6 @@
 #include "mace/kernels/quantize.h"
 #include "mace/utils/utils.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -78,8 +74,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                const int *dilations,
                const ActivationType activation,
                const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
+                const bool is_filter_transformed)
    : Conv2dFunctorBase(context,
                        strides,
                        padding_type,
@@ -88,8 +83,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                        activation,
                        relux_max_limit),
      transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
-      is_filter_transformed_(is_filter_transformed),
-      scratch_(scratch) {}
+      is_filter_transformed_(is_filter_transformed) {}

  void Conv2dGeneral(const float *input,
                     const float *filter,
@@ -494,14 +488,15 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
    }

    // Init scratch buffer
-    scratch_->Rewind();
-    scratch_->GrowSize(total_scratch_size);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
    Tensor
-      transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
+      transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT);
    Tensor
-      transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
-    Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
-    Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+      transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT);
+    Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT);
+    Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT);
    const index_t extra_input_shape[4] =
        {batch, input_channels, extra_input_height, extra_input_width};
    const index_t extra_output_shape[4] =
@@ -559,7 +554,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          transformed_output_data,
                          pad_output,
                          &sgemm_,
-                          scratch_);
+                          scratch);
      };
    } else if (use_neon_3x3_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
@@ -588,7 +583,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                         channels,
                         pad_output,
                         &sgemm_,
-                         scratch_);
+                         scratch);
      };
    } else if (use_neon_5x5_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
@@ -735,7 +730,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {

  Tensor transformed_filter_;
  bool is_filter_transformed_;
-  ScratchBuffer *scratch_;
  SGemm sgemm_;
 };

@@ -748,16 +742,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
                const int *dilations,
                const ActivationType activation,
                const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
+                const bool is_filter_transformed)
      : Conv2dFunctorBase(context,
                          strides,
                          padding_type,
                          paddings,
                          dilations,
                          activation,
-                          relux_max_limit),
-        scratch_(scratch) {
+                          relux_max_limit) {
    MACE_UNUSED(is_filter_transformed);
  }

@@ -926,13 +918,14 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    bool im2col_required =
        filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1;
    total_scratch_size += (im2col_required ? im2col_size : 0);
-    scratch_->Rewind();
-    scratch_->GrowSize(total_scratch_size);
+    ScratchBuffer *scratch = context_->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);

    std::unique_ptr<Tensor> zero_bias;
    const int32_t *bias_data = nullptr;
    if (bias == nullptr) {
-      zero_bias.reset(new Tensor(scratch_->Scratch(zero_bias_size), DT_INT32));
+      zero_bias.reset(new Tensor(scratch->Scratch(zero_bias_size), DT_INT32));
      zero_bias->Reshape({channels});
      zero_bias->Clear();
      bias_data = zero_bias->data<int32_t>();
@@ -944,7 +937,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    auto gemm_input_data = input_data;
    if (im2col_required) {
      // prepare im2col
-      im2col.reset(new Tensor(scratch_->Scratch(im2col_size), DT_UINT8));
+      im2col.reset(new Tensor(scratch->Scratch(im2col_size), DT_UINT8));
      uint8_t *im2col_data = im2col->mutable_data<uint8_t>();
      Im2col(input_data, input->shape(), filter_h, filter_w, stride_h,
             stride_w, static_cast<uint8_t>(input->zero_point()),
@@ -976,12 +969,28 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {

    return MACE_SUCCESS;
  }
-
-  ScratchBuffer *scratch_;
 };

 #ifdef MACE_ENABLE_OPENCL
-template<typename T>
+class OpenCLConv2dKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel);
+};
+
+template <typename T>
 struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
  Conv2dFunctor(OpKernelContext *context,
                const int *strides,
@@ -990,18 +999,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                const int *dilations,
                const ActivationType activation,
                const float relux_max_limit,
-                const bool is_filter_transformed,
-                ScratchBuffer *scratch)
-    : Conv2dFunctorBase(context,
-                        strides,
-                        padding_type,
-                        paddings,
-                        dilations,
-                        activation,
-                        relux_max_limit) {
-    MACE_UNUSED(is_filter_transformed);
-    MACE_UNUSED(scratch);
-  }
+                const bool is_filter_transformed);

  MaceStatus operator()(const Tensor *input,
                        const Tensor *filter,
@@ -1009,10 +1007,7 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLConv2dKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -210,6 +210,20 @@ void CalcOutputSize(const index_t *input_shape,
  }
 }

+void CalcNCHWInputShape(const index_t *output_shape,
+                        const index_t *filter_shape,
+                        const int *strides,
+                        const int *dilations,
+                        index_t *input_shape) {
+  MACE_CHECK_NOTNULL(input_shape);
+  input_shape[0] = output_shape[0];
+  input_shape[1] = filter_shape[1];
+  input_shape[2] = (output_shape[2] - 1) * strides[0] +
+      (filter_shape[2] - 1) * dilations[0] + 1;
+  input_shape[3] = (output_shape[3] - 1) * strides[1] +
+      (filter_shape[3] - 1) * dilations[1] + 1;
+}
+
 void CalcOutputSize(const index_t *input_shape,   // NHWC
                    const index_t *filter_shape,  // OIHW
                    const int *padding_size,
@@ -234,8 +248,8 @@ void CalcNCHWOutputSize(const index_t *input_shape,   // NCHW

 void CalPaddingSize(const index_t *input_shape,   // NCHW
                    const index_t *filter_shape,  // OIHW
-                    const int *dilations,
                    const int *strides,
+                    const int *dilations,
                    Padding padding,
                    int *padding_size) {
  MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,

--- a/mace/kernels/conv_pool_2d_util.h
+++ b/mace/kernels/conv_pool_2d_util.h
@@ -84,6 +84,12 @@ void CalcNCHWOutputSize(const index_t *input_shape,
                    const RoundType round_type,
                    index_t *output_shape);

+void CalcNCHWInputShape(const index_t *output_shape,
+                        const index_t *filter_shape,
+                        const int *strides,
+                        const int *dilations,
+                        index_t *input_shape);
+
 void CalPaddingSize(const index_t *input_shape,   // NCHW
                    const index_t *filter_shape,  // OIHW
                    const int *dilations,
@@ -91,6 +97,7 @@ void CalPaddingSize(const index_t *input_shape,   // NCHW
                    Padding padding,
                    int *padding_size);

+
 MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input,
                               const int pad_top, const int pad_bottom,
                               const int pad_left, const int pad_right,

--- a/mace/kernels/crop.h
+++ b/mace/kernels/crop.h
@@ -24,32 +24,18 @@
 #include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

-struct CropFunctorBase : OpKernel {
-  CropFunctorBase(OpKernelContext *context,
+template <DeviceType D, typename T>
+struct CropFunctor : OpKernel {
+  CropFunctor(OpKernelContext *context,
              const int axis,
              const std::vector<int> &offset)
      : OpKernel(context),
        axis_(axis),
        offset_(offset) {}

-  const int axis_;
-  std::vector<int> offset_;
-};
-
-template <DeviceType D, typename T>
-struct CropFunctor : CropFunctorBase {
-  CropFunctor(OpKernelContext *context,
-              const int axis,
-              const std::vector<int> &offset)
-      : CropFunctorBase(context, axis, offset) {}
-
  void crop_copy(const T* input_data, T* output_data,
                 const std::vector<index_t> &input_shape,
                 const std::vector<index_t> &output_shape,
@@ -121,23 +107,31 @@ struct CropFunctor : CropFunctorBase {

    return MACE_SUCCESS;
  }
+
+  const int axis_;
+  std::vector<int> offset_;
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLCropKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const std::vector<const Tensor *> &input_list,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel);
+};
 template <typename T>
-struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase {
+struct CropFunctor<DeviceType::GPU, T> : OpKernel {
  CropFunctor(OpKernelContext *context,
              const int axis,
-              const std::vector<int> &offset)
-      : CropFunctorBase(context, axis, offset) {}
+              const std::vector<int> &offset);

  MaceStatus operator()(const std::vector<const Tensor *> &input_list,
                        Tensor *output,
                        StatsFuture *future);
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLCropKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -28,10 +28,6 @@
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/utils/utils.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -317,6 +313,22 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLDeconv2dKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const int *padding_data,
+      const ActivationType activation,
+      const float relux_max_limit,
+      const std::vector<index_t> &output_shape,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel);
+};
 template <typename T>
 struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
  Deconv2dFunctor(OpKernelContext *context,
@@ -325,14 +337,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
                  const std::vector<int> &paddings,
                  const std::vector<index_t> &output_shape,
                  const ActivationType activation,
-                  const float relux_max_limit)
-      : Deconv2dFunctorBase(context,
-                            strides,
-                            padding_type,
-                            paddings,
-                            output_shape,
-                            activation,
-                            relux_max_limit) {}
+                  const float relux_max_limit);

  MaceStatus operator()(const Tensor *input,
                  const Tensor *filter,
@@ -341,10 +346,7 @@ struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
                  Tensor *output,
                  StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLDeconv2dKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -93,20 +93,24 @@ struct DepthToSpaceOpFunctor : OpKernel {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLDepthToSpaceKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel);
+};
 template<typename T>
 struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
  DepthToSpaceOpFunctor(OpKernelContext *context,
-                        const int block_size)
-      : OpKernel(context), block_size_(block_size) {}
+                        const int block_size);
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future);

-  const int block_size_;
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLDepthToSpaceKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -501,6 +501,24 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLDepthwiseConv2dKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel);
+};
+
 template<typename T>
 struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
  : DepthwiseConv2dFunctorBase {
@@ -510,14 +528,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
-                         const float relux_max_limit)
-    : DepthwiseConv2dFunctorBase(context,
-                                 strides,
-                                 padding_type,
-                                 paddings,
-                                 dilations,
-                                 activation,
-                                 relux_max_limit) {}
+                         const float relux_max_limit);

  MaceStatus operator()(const Tensor *input,
                        const Tensor *filter,
@@ -525,10 +536,7 @@ struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLDepthwiseConv2dKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -27,10 +27,6 @@
 #include "mace/kernels/kernel.h"
 #include "mace/utils/quantize.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -805,11 +801,12 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
  }
 }

-struct EltwiseFunctorBase : OpKernel {
-  EltwiseFunctorBase(OpKernelContext *context,
+template <DeviceType D, typename T>
+struct EltwiseFunctor : OpKernel {
+  EltwiseFunctor(OpKernelContext *context,
                 const EltwiseType type,
                 const std::vector<float> &coeff,
-                     const float scalar_input,
+                 const float scalar_input,  // float as it comes from arg
                 const int32_t scalar_input_index,
                 const DataFormat data_format)
      : OpKernel(context),
@@ -819,28 +816,6 @@ struct EltwiseFunctorBase : OpKernel {
        scalar_input_index_(scalar_input_index),
        data_format_(data_format) {}

-  EltwiseType type_;
-  std::vector<float> coeff_;
-  float scalar_input_;
-  int32_t scalar_input_index_;
-  DataFormat data_format_;
-};
-
-template <DeviceType D, typename T>
-struct EltwiseFunctor : EltwiseFunctorBase {
-  EltwiseFunctor(OpKernelContext *context,
-                 const EltwiseType type,
-                 const std::vector<float> &coeff,
-                 const float scalar_input,  // float as it comes from arg
-                 const int32_t scalar_input_index,
-                 const DataFormat data_format)
-      : EltwiseFunctorBase(context,
-                           type,
-                           coeff,
-                           scalar_input,
-                           scalar_input_index,
-                           data_format) {}
-
  template <typename DstType>
  MaceStatus DoEltwise(const Tensor *input0,
                       const Tensor *input1,
@@ -957,23 +932,28 @@ struct EltwiseFunctor : EltwiseFunctorBase {
    }
  }

+  EltwiseType type_;
+  std::vector<float> coeff_;
+  float scalar_input_;
+  int32_t scalar_input_index_;
+  DataFormat data_format_;
  Tensor scalar_tensor_;
 };

 template <>
-struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {
+struct EltwiseFunctor<DeviceType::CPU, uint8_t> : OpKernel {
  EltwiseFunctor(OpKernelContext *context,
                 const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float scalar_input,  // float as it comes from arg
                 const int32_t scalar_input_index,
                 const DataFormat data_format)
-      : EltwiseFunctorBase(context,
-                           type,
-                           coeff,
-                           scalar_input,
-                           scalar_input_index,
-                           data_format) {}
+      : OpKernel(context),
+        type_(type),
+        coeff_(coeff),
+        scalar_input_(scalar_input),
+        scalar_input_index_(scalar_input_index),
+        data_format_(data_format) {}

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,
@@ -1093,33 +1073,41 @@ struct EltwiseFunctor<DeviceType::CPU, uint8_t> : EltwiseFunctorBase {

    return MACE_SUCCESS;
  }
+
+  EltwiseType type_;
+  std::vector<float> coeff_;
+  float scalar_input_;
+  int32_t scalar_input_index_;
+  DataFormat data_format_;
+  Tensor scalar_tensor_;
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLEltwiseKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input0,
+      const Tensor *input1,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel);
+};
 template <typename T>
-struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
+struct EltwiseFunctor<DeviceType::GPU, T> : OpKernel {
  EltwiseFunctor(OpKernelContext *context,
                 const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float scalar_input,
                 const int32_t scalar_input_index,
-                 const DataFormat data_format)
-      : EltwiseFunctorBase(context,
-                           type,
-                           coeff,
-                           scalar_input,
-                           scalar_input_index,
-                           data_format) {}
+                 const DataFormat data_format);

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLEltwiseKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -151,12 +151,24 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLFullyConnectedKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *weight,
+      const Tensor *bias,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel);
+};
 template <typename T>
 struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
  FullyConnectedFunctor(OpKernelContext *context,
                        const ActivationType activation,
-                        const float relux_max_limit)
-      : FullyConnectedBase(context, activation, relux_max_limit) {}
+                        const float relux_max_limit);

  MaceStatus operator()(const Tensor *input,
                        const Tensor *weight,
@@ -164,11 +176,7 @@ struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
                        Tensor *output,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  std::vector<uint32_t> gws_;
-  std::vector<uint32_t> lws_;
-  std::vector<index_t> input_shape_;
-  std::unique_ptr<BufferBase> kernel_error_;
+  std::unique_ptr<OpenCLFullyConnectedKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/lstmcell.h
+++ b/mace/kernels/lstmcell.h
@@ -35,11 +35,23 @@ namespace kernels {
 template <DeviceType D, typename T>
 struct LSTMCellFunctor;

+class OpenCLLSTMCellKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *pre_output,
+      const Tensor *weight,
+      const Tensor *bias,
+      const Tensor *pre_cell,
+      Tensor *cell,
+      Tensor *output,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel);
+};
 template <typename T>
 struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
-  LSTMCellFunctor(OpKernelContext *context, T forget_bias)
-      : OpKernel(context),
-        forget_bias_(static_cast<T>(forget_bias)) {}
+  LSTMCellFunctor(OpKernelContext *context, T forget_bias);
  MaceStatus operator()(const Tensor *input,
                        const Tensor *pre_output,
                        const Tensor *weight,
@@ -49,11 +61,7 @@ struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
                        Tensor *output,
                        StatsFuture *future);

-  T forget_bias_;
-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
-  std::vector<index_t> input_shape_;
+  std::unique_ptr<OpenCLLSTMCellKernel> kernel_;
 };

 }  // namespace kernels

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -34,10 +34,6 @@
 #include "mace/kernels/gemmlowp_util.h"
 #include "mace/kernels/sgemm.h"

-#ifdef MACE_ENABLE_OPENCL
-#include "mace/core/runtime/opencl/cl2_header.h"
-#endif  // MACE_ENABLE_OPENCL
-
 namespace mace {
 namespace kernels {

@@ -89,7 +85,7 @@ struct MatMulFunctor : OpKernel {
    const index_t height_b = B->dim(rank - 2);
    const index_t width_b = B->dim(rank - 1);

-    auto scratch_buffer = context_->workspace()->GetScratchBuffer(D);
+    auto scratch_buffer = context_->device()->scratch_buffer();
    scratch_buffer->Rewind();
    index_t scratch_size = C->raw_max_size();
    if (!A->is_weight()) {
@@ -112,7 +108,7 @@ struct MatMulFunctor : OpKernel {
               A->is_weight(),
               B->is_weight(),
               c_ptr_base,
-               scratch_buffer);
+               context_->device()->scratch_buffer());
    return MACE_SUCCESS;
  }

@@ -218,9 +214,21 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
 };

 #ifdef MACE_ENABLE_OPENCL
+class OpenCLMatMulKernel {
+ public:
+  virtual MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *A,
+      const Tensor *B,
+      Tensor *C,
+      bool transpose_a,
+      bool transpose_b,
+      StatsFuture *future) = 0;
+  MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel);
+};
 template <typename T>
 struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
-  explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
+  explicit MatMulFunctor(OpKernelContext *context);

  MaceStatus operator()(const Tensor *A,
                        const Tensor *B,
@@ -229,9 +237,7 @@ struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
                        bool transpose_b,
                        StatsFuture *future);

-  cl::Kernel kernel_;
-  uint32_t kwg_size_;
-  std::unique_ptr<BufferBase> kernel_error_;
+  std::unique_ptr<OpenCLMatMulKernel> kernel_;
 };
 #endif  // MACE_ENABLE_OPENCL


--- a/mace/kernels/opencl/activation.cc
+++ b/mace/kernels/opencl/activation.cc
@@ -13,96 +13,31 @@
 // limitations under the License.

 #include "mace/kernels/activation.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+
+#include "mace/kernels/opencl/image/activation.h"

 namespace mace {
 namespace kernels {
+
+template <typename T>
+ActivationFunctor<DeviceType::GPU, T>::ActivationFunctor(
+    OpKernelContext *context,
+    ActivationType type,
+    T relux_max_limit) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(
+        new opencl::image::ActivationKernel<T>(type, relux_max_limit));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
    const Tensor *alpha,
    Tensor *output,
    StatsFuture *future) {
-  const index_t batch = input->dim(0);
-  const index_t height = input->dim(1);
-  const index_t width = input->dim(2);
-  const index_t channels = input->dim(3);
-
-  const index_t channel_blocks = RoundUpDiv4(channels);
-
-  auto runtime = context_->device()->opencl_runtime();
-
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
-    built_options.emplace("-Dactivation=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    switch (activation_) {
-      case RELU:
-        tuning_key_prefix_ = "relu_opencl_kernel";
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        tuning_key_prefix_ = "relux_opencl_kernel";
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case PRELU:
-        tuning_key_prefix_ = "prelu_opencl_kernel";
-        built_options.emplace("-DUSE_PRELU");
-        break;
-      case TANH:
-        tuning_key_prefix_ = "tanh_opencl_kernel";
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        tuning_key_prefix_ = "sigmoid_opencl_kernel";
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation_;
-    }
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
-                                              built_options, &kernel_));
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    int idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    if (activation_ == PRELU) {
-      MACE_CHECK_NOTNULL(alpha);
-      kernel_.setArg(idx++, *(alpha->opencl_image()));
-    }
-    kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-
-    input_shape_ = input->shape();
-  }
-
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
-             output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, alpha, output, future);
 }

 template struct ActivationFunctor<DeviceType::GPU, float>;

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -13,97 +13,32 @@
 // limitations under the License.

 #include "mace/kernels/addn.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+
+#include "mace/kernels/opencl/image/addn.h"

 namespace mace {
 namespace kernels {

+template <typename T>
+AddNFunctor<DeviceType::GPU, T>::AddNFunctor(OpKernelContext *context)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(
+        new opencl::image::AddNKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+
 template <typename T>
 MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
    const std::vector<const Tensor *> &input_tensors,
    Tensor *output_tensor,
    StatsFuture *future) {
-  size_t size = input_tensors.size();
-  MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
-
-  const index_t batch = input_tensors[0]->dim(0);
-  const index_t height = input_tensors[0]->dim(1);
-  const index_t width = input_tensors[0]->dim(2);
-  const index_t channels = input_tensors[0]->dim(3);
-
-  auto runtime = context_->device()->opencl_runtime();
-
-  for (size_t i = 1; i < size; ++i) {
-    MACE_CHECK_NOTNULL(input_tensors[i]);
-    MACE_CHECK(batch == input_tensors[i]->dim(0));
-    MACE_CHECK(height == input_tensors[i]->dim(1));
-    MACE_CHECK(width == input_tensors[i]->dim(2));
-    MACE_CHECK(channels == input_tensors[i]->dim(3));
-  }
-
-  if (kernel_.get() == nullptr) {
-    if (input_tensors.size() > 4) {
-      MACE_NOT_IMPLEMENTED;
-    }
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
-    built_options.emplace("-Daddn=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
-
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
-                                              built_options, &kernel_));
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-
-  std::vector<index_t> output_shape = input_tensors[0]->shape();
-
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t width_pixels = channel_blocks * width;
-  const index_t batch_height_pixels = batch * height;
-
-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-
-  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    &output_image_shape);
-    MACE_RETURN_IF_ERROR(
-        output_tensor->ResizeImage(output_shape, output_image_shape));
-
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_2D_GWS_ARGS(kernel_);
-    for (auto input : input_tensors) {
-      kernel_.setArg(idx++, *(input->opencl_image()));
-    }
-    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
-
-    input_shape_ = input_tensors[0]->shape();
-  }
-
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
-  std::string tuning_key =
-      Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
-             output_tensor->dim(2), output_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input_tensors, output_tensor, future);
 }

 template struct AddNFunctor<DeviceType::GPU, float>;
-
 template struct AddNFunctor<DeviceType::GPU, half>;

 }  // namespace kernels

--- a/mace/kernels/opencl/batch_norm.cc
+++ b/mace/kernels/opencl/batch_norm.cc
@@ -13,14 +13,26 @@
 // limitations under the License.

 #include "mace/kernels/batch_norm.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/batch_norm.h"

 namespace mace {
 namespace kernels {

+template <typename T>
+BatchNormFunctor<DeviceType::GPU, T>::BatchNormFunctor(
+    OpKernelContext *context,
+    const bool folded_constant,
+    const ActivationType activation,
+    const float relux_max_limit)
+    : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::BatchNormKernel<T>(
+        folded_constant, activation, relux_max_limit));
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+
 template <typename T>
 MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
    const Tensor *input,
@@ -31,84 +43,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
    const float epsilon,
    Tensor *output,
    StatsFuture *future) {
-  MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
-
-  const index_t batch = input->dim(0);
-  const index_t height = input->dim(1);
-  const index_t width = input->dim(2);
-  const index_t channels = input->dim(3);
-
-  const index_t channel_blocks = RoundUpDiv4(channels);
-
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
-  auto runtime = context_->device()->opencl_runtime();
-
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    auto dt = DataTypeToEnum<T>::value;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
-    built_options.emplace("-Dbatch_norm=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    if (folded_constant_) {
-      built_options.emplace("-DFOLDED_CONSTANT");
-    }
-    switch (activation_) {
-      case NOOP:
-        break;
-      case RELU:
-        built_options.emplace("-DUSE_RELU");
-        break;
-      case RELUX:
-        built_options.emplace("-DUSE_RELUX");
-        break;
-      case TANH:
-        built_options.emplace("-DUSE_TANH");
-        break;
-      case SIGMOID:
-        built_options.emplace("-DUSE_SIGMOID");
-        break;
-      default:
-        LOG(FATAL) << "Unknown activation type: " << activation_;
-    }
-
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
-                                              built_options, &kernel_));
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(scale->opencl_image()));
-    kernel_.setArg(idx++, *(offset->opencl_image()));
-    if (!folded_constant_) {
-      kernel_.setArg(idx++, *(mean->opencl_image()));
-      kernel_.setArg(idx++, *(var->opencl_image()));
-      kernel_.setArg(idx++, epsilon);
-    }
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, relux_max_limit_);
-
-    input_shape_ = input->shape();
-  }
-
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
-             output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, scale, offset, mean,
+                          var, epsilon, output, future);
 }

 template struct BatchNormFunctor<DeviceType::GPU, float>;

--- a/mace/kernels/opencl/batch_to_space.cc
+++ b/mace/kernels/opencl/batch_to_space.cc
@@ -16,84 +16,31 @@
 #define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_

 #include "mace/kernels/batch_to_space.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/batch_to_space.h"

 namespace mace {
 namespace kernels {

+template <typename T>
+BatchToSpaceFunctor<DeviceType::GPU, T>::BatchToSpaceFunctor(
+    OpKernelContext *context,
+    const std::vector<int> &paddings,
+    const std::vector<int> &block_shape)
+    : BatchToSpaceFunctorBase(context, paddings, block_shape) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::BatchToSpaceKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
 template <typename T>
 MaceStatus BatchToSpaceFunctor<DeviceType::GPU, T>::operator()(
-    Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) {
+    const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) {
  std::vector<index_t> output_shape(4, 0);
  CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC,
                                   output_shape.data());
-
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                  &output_image_shape);
-  MACE_RETURN_IF_ERROR(
-      space_tensor->ResizeImage(output_shape, output_image_shape));
-
-  const uint32_t chan_blk =
-      static_cast<uint32_t>(RoundUpDiv4(batch_tensor->dim(3)));
-
-  const uint32_t gws[3] = {
-      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
-      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-
-  auto runtime = context_->device()->opencl_runtime();
-
-  if (kernel_.get() == nullptr) {
-    const char *kernel_name = "batch_to_space";
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::stringstream kernel_name_ss;
-    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-    built_options.emplace(kernel_name_ss.str());
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_to_space",
-                                              obfuscated_kernel_name,
-                                              built_options,
-                                              &kernel_));
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
-    kernel_.setArg(idx++, *(space_tensor->opencl_image()));
-    kernel_.setArg(idx++, block_shape_[0]);
-    kernel_.setArg(idx++, block_shape_[1]);
-    kernel_.setArg(idx++, paddings_[0]);
-    kernel_.setArg(idx++, paddings_[2]);
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(0)));
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
-    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
-
-    space_shape_ = space_tensor->shape();
-  }
-
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1),
-             batch_tensor->dim(2), batch_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_,
+                          output_shape, space_tensor, future);
 }

 template struct BatchToSpaceFunctor<DeviceType::GPU, float>;

--- a/mace/kernels/opencl/bias_add.cc
+++ b/mace/kernels/opencl/bias_add.cc
@@ -13,13 +13,23 @@
 // limitations under the License.

 #include "mace/kernels/bias_add.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/bias_add.h"

 namespace mace {
 namespace kernels {

+template <typename T>
+BiasAddFunctor<DeviceType::GPU, T>::BiasAddFunctor(
+    OpKernelContext *context,
+    const DataFormat data_format)
+    : BiasAddFunctorBase(context, data_format) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::BiasAddKernel<T>);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+
 template <typename T>
 MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          const Tensor *bias,
@@ -27,75 +37,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          StatsFuture *future) {
  MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC,
             "gpu only support biasadd for 4-dimensional NHWC format tensor");
-
-  const index_t batch = input->dim(0);
-  const index_t height = input->dim(1);
-  const index_t width = input->dim(2);
-  const index_t channels = input->dim(3);
-
-  const index_t channel_blocks = RoundUpDiv4(channels);
-
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
-  auto runtime = context_->device()->opencl_runtime();
-
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    auto dt = DataTypeToEnum<T>::value;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
-    built_options.emplace("-Dbias_add=" + kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
-                                              built_options, &kernel_));
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, *(bias->opencl_image()));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-    input_shape_ = input->shape();
-  }
-
-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-
-  cl::Event event;
-  cl_int error;
-  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  } else {
-    std::vector<uint32_t> roundup_gws(lws.size());
-    for (size_t i = 0; i < lws.size(); ++i) {
-      if (lws[i] != 0) roundup_gws[i] = RoundUp(gws[i], lws[i]);
-    }
-
-    error = runtime->command_queue().enqueueNDRangeKernel(
-        kernel_, cl::NullRange,
-        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
-  }
-  MACE_CL_RET_STATUS(error);
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
-
-  return MACE_SUCCESS;
+  return kernel_->Compute(context_, input, bias, output, future);
 }

 template struct BiasAddFunctor<DeviceType::GPU, float>;

--- a/mace/kernels/opencl/buffer/buffer_inverse_transform.h
+++ b/mace/kernels/opencl/buffer/buffer_inverse_transform.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
+#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
+
+#include "mace/kernels/buffer_inverse_transform.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+MaceStatus BufferTypeTransform(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future);
+
+template <typename T>
+class BufferInverseTransform: public OpenCLBufferInverseTransformKernel {
+ public:
+  MaceStatus Compute(OpKernelContext *context,
+                     const Tensor *input,
+                     const BufferType type,
+                     const int wino_blk_size,
+                     Tensor *output,
+                     StatsFuture *future) override;
+ private:
+  cl::Kernel kernel_;
+};
+
+template <typename T>
+MaceStatus BufferInverseTransform<T>::Compute(OpKernelContext *context,
+                                              const Tensor *input,
+                                              const BufferType type,
+                                              const int wino_blk_size,
+                                              Tensor *output,
+                                              StatsFuture *future) {
+  MACE_UNUSED(type);
+  MACE_UNUSED(wino_blk_size);
+  const DataType dt = DataTypeToEnum<T>::value;
+  if (input->dtype() != output->dtype()) {
+    return BufferTypeTransform(context, &kernel_, input, dt, output, future);
+  } else {
+    SetFutureDefaultWaitFn(future);
+    output->ReuseTensorBuffer(*input);
+    return MaceStatus::MACE_SUCCESS;
+  }
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_
--- a/mace/kernels/opencl/buffer/buffer_transform.cc
+++ b/mace/kernels/opencl/buffer/buffer_transform.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/kernels/opencl/buffer/buffer_transform.h"
+
+#include <vector>
+#include <set>
+#include <string>
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+MaceStatus TransformConv2DFilter(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future) {
+  const index_t out_chan = input->dim(0);
+  const index_t in_chan = input->dim(1);
+  const index_t filter_height = input->dim(2);
+  const index_t filter_width = input->dim(3);
+
+  std::vector<index_t> transformed_shape = {
+      filter_height, filter_width,
+      RoundUpDiv4(out_chan),
+      RoundUp<index_t>(in_chan, 4),
+      4,
+  };
+  uint32_t gws[3];
+  gws[0] = static_cast<uint32_t>(transformed_shape[3]);
+  gws[1] = static_cast<uint32_t>(transformed_shape[2]);
+  gws[2] = static_cast<uint32_t>(filter_height * filter_width);
+  MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
+  output->Reshape(input->shape());
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    MACE_OUT_OF_RANGE_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_conv_filter");
+    built_options.emplace("-Dtransform_conv_filter=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
+                                              kernel_name,
+                                              built_options,
+                                              kernel));
+  }
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+
+  uint32_t idx = 0;
+  MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
+  MACE_SET_3D_GWS_ARGS(*kernel, gws);
+  kernel->setArg(idx++, *(input->opencl_buffer()));
+  MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+             "buffer offset not aligned");
+  kernel->setArg(idx++,
+                 static_cast<uint32_t>(input->buffer_offset() /
+                     GetEnumTypeSize(input->dtype())));
+  kernel->setArg(idx++, *(output->opencl_buffer()));
+  kernel->setArg(idx++, static_cast<int32_t>(out_chan));
+  kernel->setArg(idx++, static_cast<int32_t>(in_chan));
+  kernel->setArg(idx++, static_cast<int32_t>(filter_height));
+  kernel->setArg(idx++, static_cast<int32_t>(filter_width));
+  kernel->setArg(idx++, static_cast<int32_t>(
+      in_chan * filter_height * filter_width));
+
+  std::string tuning_key =
+      Concat("transform_conv_filter",
+             transformed_shape[0],
+             transformed_shape[1],
+             transformed_shape[2],
+             transformed_shape[3]);
+  std::vector<uint32_t> lws = {4, 4, 4, 0};
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
+                                           gws, lws, future));
+  MACE_OUT_OF_RANGE_VALIDATION
+  // Mark the buffer unused.
+  const_cast<Tensor *>(input)->MarkUnused();
+  return MACE_SUCCESS;
+}
+
+MaceStatus TransformDWConv2DFilter(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future) {
+  const index_t multiplier = input->dim(0);
+  const index_t in_chan = input->dim(1);
+  const index_t filter_height = input->dim(2);
+  const index_t filter_width = input->dim(3);
+
+  std::vector<index_t> transformed_shape = {
+      multiplier, RoundUpDiv4(in_chan),
+      filter_height, filter_width, 4,
+  };
+  uint32_t gws[3];
+  gws[0] = static_cast<uint32_t>(filter_width);
+  gws[1] = static_cast<uint32_t>(filter_height);
+  gws[2] = static_cast<uint32_t>(transformed_shape[1]);
+  MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
+  output->Reshape(input->shape());
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_dw_conv_filter");
+    built_options.emplace("-Dtransform_dw_conv_filter=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
+                                              kernel_name,
+                                              built_options,
+                                              kernel));
+  }
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+
+  uint32_t idx = 0;
+  MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
+  MACE_SET_3D_GWS_ARGS(*kernel, gws);
+  kernel->setArg(idx++, *(input->opencl_buffer()));
+  MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+             "buffer offset not aligned");
+  kernel->setArg(idx++,
+                 static_cast<uint32_t>(input->buffer_offset() /
+                     GetEnumTypeSize(input->dtype())));
+  kernel->setArg(idx++, *(output->opencl_buffer()));
+  kernel->setArg(idx++, static_cast<int32_t>(in_chan));
+  kernel->setArg(idx++, static_cast<int32_t>(filter_height * filter_width));
+
+  std::string tuning_key =
+      Concat("transform_conv_filter",
+             transformed_shape[0],
+             transformed_shape[1],
+             transformed_shape[2],
+             transformed_shape[3]);
+  std::vector<uint32_t> lws = {4, 4, 4, 0};
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
+                                           gws, lws, future));
+  MACE_OUT_OF_RANGE_VALIDATION
+  // Mark the buffer unused.
+  const_cast<Tensor *>(input)->MarkUnused();
+  return MACE_SUCCESS;
+}
+
+MaceStatus TransformArgument(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future) {
+  const index_t size = input->dim(0);
+
+  std::vector<index_t> transformed_shape = {RoundUp<index_t>(size, 4)};
+  uint32_t gws = static_cast<uint32_t>(RoundUpDiv4(transformed_shape[0]));
+  MACE_RETURN_IF_ERROR(output->Resize(transformed_shape));
+  output->Reshape(input->shape());
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_arg");
+    built_options.emplace("-Dtransform_arg=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
+                                              kernel_name,
+                                              built_options,
+                                              kernel));
+  }
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+
+  uint32_t idx = 0;
+  MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->UnderlyingBuffer()->size());
+  kernel->setArg(idx++, gws);
+  kernel->setArg(idx++, *(input->opencl_buffer()));
+  MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+             "buffer offset not aligned");
+  kernel->setArg(idx++,
+                 static_cast<uint32_t>(input->buffer_offset() /
+                     GetEnumTypeSize(input->dtype())));
+  kernel->setArg(idx++, *(output->opencl_buffer()));
+  kernel->setArg(idx++, static_cast<int32_t>(size));
+
+  const uint32_t lws =
+      static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
+  cl::Event event;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange, cl::NDRange(gws),
+        cl::NDRange(lws), nullptr, &event);
+  } else {
+    uint32_t roundup_gws = RoundUp(gws, lws);
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange, cl::NDRange(roundup_gws),
+        cl::NDRange(lws), nullptr, &event);
+  }
+  MACE_CL_RET_STATUS(error);
+  MACE_OUT_OF_RANGE_VALIDATION
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }
+  // Mark the buffer unused.
+  const_cast<Tensor *>(input)->MarkUnused();
+  return MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer/buffer_transform.h
+++ b/mace/kernels/opencl/buffer/buffer_transform.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
+#define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
+
+#include <vector>
+
+#include "mace/kernels/buffer_transform.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+MaceStatus BufferTypeTransform(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future);
+
+MaceStatus TransformConv2DFilter(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future);
+
+MaceStatus TransformDWConv2DFilter(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future);
+
+MaceStatus TransformArgument(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future);
+
+
+template <typename T>
+class BufferTransform: public OpenCLBufferTransformKernel {
+ public:
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const BufferType type,
+      const int wino_blk_size,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus BufferTransform<T>::Compute(OpKernelContext *context,
+                                       const Tensor *input,
+                                       const BufferType type,
+                                       const int wino_blk_size,
+                                       Tensor *output,
+                                       StatsFuture *future) {
+  MACE_UNUSED(type);
+  MACE_UNUSED(wino_blk_size);
+  const DataType dt = DataTypeToEnum<T>::value;
+  switch (type) {
+    case CONV2D_FILTER:
+      return TransformConv2DFilter(context, &kernel_, input,
+                                   dt, output, future);
+    case DW_CONV2D_FILTER:
+      return TransformDWConv2DFilter(context, &kernel_, input,
+                                     dt, output, future);
+    case ARGUMENT:
+      return TransformArgument(context, &kernel_, input, dt, output, future);
+    default:
+      if (input->dtype() != dt) {
+        return BufferTypeTransform(context, &kernel_, input,
+                                   dt, output, future);
+      } else {
+        SetFutureDefaultWaitFn(future);
+        output->ReuseTensorBuffer(*input);
+        return MaceStatus::MACE_SUCCESS;
+      }
+  }
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_
--- a/mace/kernels/opencl/buffer/buffer_type_transform.cc
+++ b/mace/kernels/opencl/buffer/buffer_type_transform.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/activation.h"
+#include "mace/kernels/conv_2d.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+
+MaceStatus BufferTypeTransform(
+    OpKernelContext *context,
+    cl::Kernel *kernel,
+    const Tensor *input,
+    const DataType dt,
+    Tensor *output,
+    StatsFuture *future) {
+  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION
+
+  const uint32_t gws =
+      static_cast<uint32_t>(RoundUpDiv4(output->size()));
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_data_type");
+    built_options.emplace("-Dtransform_data_type=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform",
+                                              kernel_name,
+                                              built_options,
+                                              kernel));
+  }
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+  uint32_t idx = 0;
+  MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
+  kernel->setArg(idx++, gws);
+  kernel->setArg(idx++, *(input->opencl_buffer()));
+  MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0,
+             "buffer offset not aligned");
+  kernel->setArg(idx++,
+                 static_cast<uint32_t>(input->buffer_offset() /
+                     GetEnumTypeSize(input->dtype())));
+  kernel->setArg(idx++, *(output->opencl_buffer()));
+
+  const uint32_t lws =
+      static_cast<uint32_t>(RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize()));
+  cl::Event event;
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange, cl::NDRange(gws),
+        cl::NDRange(lws), nullptr, &event);
+  } else {
+    uint32_t roundup_gws = RoundUp(gws, lws);
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange, cl::NDRange(roundup_gws),
+        cl::NDRange(lws), nullptr, &event);
+  }
+  MACE_CL_RET_STATUS(error);
+  MACE_OUT_OF_RANGE_VALIDATION
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }
+  // Mark the buffer unused.
+  const_cast<Tensor *>(input)->MarkUnused();
+  return MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer/conv_2d.h
+++ b/mace/kernels/opencl/buffer/conv_2d.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
+#define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
+
+#include "mace/kernels/conv_2d.h"
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "mace/kernels/opencl/buffer/utils.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+namespace conv2d {
+
+extern MaceStatus Conv2d1x1(OpKernelContext *context,
+                            cl::Kernel *kernel,
+                            const Tensor *padded_input,
+                            const Tensor *filter,
+                            const Tensor *bias,
+                            const int *strides,
+                            const DataType dt,
+                            const ActivationType activation,
+                            const float relux_max_limit,
+                            const bool input_changed,
+                            Tensor *output,
+                            StatsFuture *future);
+
+extern MaceStatus Conv2dGeneral(OpKernelContext *context,
+                                cl::Kernel *kernel,
+                                const Tensor *input,
+                                const Tensor *filter,
+                                const Tensor *bias,
+                                const int *strides,
+                                const int *dilations,
+                                const DataType dt,
+                                const ActivationType activation,
+                                const float relux_max_limit,
+                                const bool input_changed,
+                                Tensor *output,
+                                StatsFuture *future);
+}  // namespace conv2d
+
+template <typename T>
+class Conv2dKernel : public OpenCLConv2dKernel {
+ public:
+  Conv2dKernel() : old_scratch_size_(0) {}
+
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  index_t old_scratch_size_;
+  cl::Kernel kernels_[2];
+  uint32_t kwg_size_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus Conv2dKernel<T>::Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) {
+  StatsFuture pad_future, conv_future;
+  index_t filter_h = filter->dim(2);
+  index_t filter_w = filter->dim(3);
+  // Reshape output
+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  if (padding_data.empty()) {
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input->shape().data(), filter->shape().data(), dilations, strides,
+        padding_type, output_shape.data(), paddings.data());
+  } else {
+    paddings = padding_data;
+    CalcOutputSize(input->shape().data(), filter->shape().data(),
+                   padding_data.data(), dilations, strides, RoundType::FLOOR,
+                   output_shape.data());
+  }
+
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  // calculate padded input shape
+  index_t width = output_shape[2];
+  index_t channels = output_shape[3];
+
+  index_t input_height = input->dim(1);
+  index_t input_width = input->dim(2);
+  index_t input_channels = input->dim(3);
+
+  int pad_top = paddings[0] >> 1;
+  int pad_left = paddings[1] >> 1;
+
+  MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
+  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
+             input_channels);
+
+  std::function<MaceStatus(const Tensor *input, Tensor *output)> conv_func;
+
+  // Mark whether input changed or not
+  bool input_changed = !IsVecEqual(input_shape_, input->shape());
+  input_shape_ = input->shape();
+
+  bool use_1x1 = filter_h == 1 && filter_w == 1;
+
+  std::vector<index_t> padded_output_shape = output_shape;
+  index_t tile_w, tile_c = 4;
+  if (use_1x1) {
+    tile_w = 2;
+  } else {
+    tile_w = 4;
+  }
+  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
+
+  std::vector<index_t> padded_input_shape = input->shape();
+  padded_input_shape[1] = input_height + paddings[0];
+  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
+      (filter_w - 1) * dilations[1] + 1;
+  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
+
+  const Tensor *padded_input_ptr = input;
+  // pad input
+  std::unique_ptr<Tensor> padded_input;
+  if (padded_input_shape[1] != input_height ||
+      padded_input_shape[2] != input_width ||
+      padded_input_shape[3] != input_channels) {
+    // decide scratch size before allocate it
+    index_t total_scratch_size = 0;
+    index_t padded_input_size = 0;
+
+    padded_input_size =
+        std::accumulate(padded_input_shape.begin(),
+                        padded_input_shape.end(),
+                        1,
+                        std::multiplies<index_t>())
+            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
+    total_scratch_size += padded_input_size;
+
+    // Init scratch buffer
+    ScratchBuffer *scratch = context->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
+    if (old_scratch_size_ != scratch->size()) {
+      input_changed |= scratch->size() != old_scratch_size_;
+      old_scratch_size_ = scratch->size();
+    }
+
+    padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
+                                  input->dtype()));
+
+    padded_input->Resize(padded_input_shape);
+    PadInput(context, &kernels_[0], input, pad_top, pad_left,
+             input_changed, padded_input.get(), &pad_future);
+    padded_input_ptr = padded_input.get();
+  }
+
+  if (use_1x1) {
+    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
+      return conv2d::Conv2d1x1(
+          context, &kernels_[1], pad_input, filter, bias, strides,
+          DataTypeToEnum<T>::v(), activation, relux_max_limit,
+          input_changed, output, &conv_future);
+    };
+  } else {
+    conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
+      return conv2d::Conv2dGeneral(
+        context, &kernels_[1], pad_input, filter, bias, strides, dilations,
+        DataTypeToEnum<T>::v(), activation, relux_max_limit,
+        input_changed, output, &conv_future);
+    };
+  }
+  MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
+  MergeMultipleFutureWaitFn({pad_future, conv_future}, future);
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_
--- a/mace/kernels/opencl/buffer/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/buffer/conv_2d_1x1.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/activation.h"
+#include "mace/kernels/conv_2d.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+namespace conv2d {
+
+MaceStatus Conv2d1x1(OpKernelContext *context,
+                     cl::Kernel *kernel,
+                     const Tensor *padded_input,
+                     const Tensor *filter,
+                     const Tensor *bias,
+                     const int *strides,
+                     const DataType dt,
+                     const ActivationType activation,
+                     const float relux_max_limit,
+                     const bool input_changed,
+                     Tensor *output,
+                     StatsFuture *future) {
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channel = output->dim(3);
+
+  const index_t in_height = padded_input->dim(1);
+  const index_t in_width = padded_input->dim(2);
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
+    built_options.emplace("-Dconv2d=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
+    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
+
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1_buffer",
+                                              kernel_name,
+                                              built_options, kernel));
+  }
+
+  const uint32_t gws[2] = {static_cast<uint32_t>(
+                               RoundUpDiv4(channel) *
+                                   RoundUpDiv<index_t>(width, 2)),
+                           static_cast<uint32_t>(height * batch)};
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+  if (input_changed) {
+    uint32_t idx = 0;
+    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
+    MACE_SET_2D_GWS_ARGS(*kernel, gws);
+    kernel->setArg(idx++, *(padded_input->opencl_buffer()));
+    kernel->setArg(idx++, *(filter->opencl_buffer()));
+    if (bias != nullptr) {
+      kernel->setArg(idx++, *(bias->opencl_buffer()));
+    }
+    kernel->setArg(idx++, static_cast<int32_t>(in_height));
+    kernel->setArg(idx++, static_cast<int32_t>(in_width));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
+    kernel->setArg(idx++,
+                   static_cast<int32_t>(filter->buffer_shape()[3]));
+    kernel->setArg(idx++, static_cast<int32_t>(height));
+    kernel->setArg(idx++, static_cast<int32_t>(width));
+    kernel->setArg(idx++, static_cast<int32_t>(channel));
+    kernel->setArg(idx++, strides[0]);
+    kernel->setArg(idx++, strides[1]);
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, *(output->opencl_buffer()));
+  }
+
+  std::string tuning_key =
+      Concat("conv2d_1x1_buffer", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
+  std::vector<uint32_t> lws = {16, 4, 0};
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
+                                           lws, future));
+  MACE_OUT_OF_RANGE_VALIDATION;
+  return MACE_SUCCESS;
+}
+
+}  // namespace conv2d
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer/conv_2d_general.cc
+++ b/mace/kernels/opencl/buffer/conv_2d_general.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/activation.h"
+#include "mace/kernels/conv_2d.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+namespace conv2d {
+
+MaceStatus Conv2dGeneral(OpKernelContext *context,
+                         cl::Kernel *kernel,
+                         const Tensor *padded_input,
+                         const Tensor *filter,
+                         const Tensor *bias,
+                         const int *strides,
+                         const int *dilations,
+                         const DataType dt,
+                         const ActivationType activation,
+                         const float relux_max_limit,
+                         const bool input_changed,
+                         Tensor *output,
+                         StatsFuture *future) {
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channel = output->dim(3);
+
+  const index_t in_height = padded_input->dim(1);
+  const index_t in_width = padded_input->dim(2);
+  const index_t in_channel = padded_input->dim(3);
+
+  const index_t filter_height = filter->dim(2);
+  const index_t filter_width = filter->dim(3);
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv2d");
+    built_options.emplace("-Dconv2d=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
+    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
+
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_buffer",
+                                              kernel_name,
+                                              built_options, kernel));
+  }
+
+  const uint32_t gws[2] = {static_cast<uint32_t>(
+                               RoundUpDiv4(channel) * RoundUpDiv4(width)),
+                           static_cast<uint32_t>(height * batch)};
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+  if (input_changed) {
+    auto filter_buffer_shape = filter->buffer_shape();
+    uint32_t idx = 0;
+    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
+    MACE_SET_2D_GWS_ARGS(*kernel, gws)
+    kernel->setArg(idx++, *(padded_input->opencl_buffer()));
+    kernel->setArg(idx++, *(filter->opencl_buffer()));
+    if (bias != nullptr) {
+      kernel->setArg(idx++, *(bias->opencl_buffer()));
+    }
+    kernel->setArg(idx++, static_cast<int32_t>(in_height));
+    kernel->setArg(idx++, static_cast<int32_t>(in_width));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_input->dim(3)));
+    kernel->setArg(idx++, static_cast<int32_t>(filter_height));
+    kernel->setArg(idx++, static_cast<int32_t>(filter_width));
+    kernel->setArg(idx++,
+                   static_cast<int32_t>(filter_buffer_shape[3]));
+    kernel->setArg(idx++, static_cast<int32_t>(
+        filter_buffer_shape[2] * filter_buffer_shape[3]
+            * filter_buffer_shape[4]));
+    kernel->setArg(idx++, static_cast<int32_t>(height));
+    kernel->setArg(idx++, static_cast<int32_t>(width));
+    kernel->setArg(idx++, static_cast<int32_t>(channel));
+    kernel->setArg(idx++, strides[0]);
+    kernel->setArg(idx++, strides[1]);
+    kernel->setArg(idx++, static_cast<int32_t>(
+        dilations[0] * in_width * in_channel));
+    kernel->setArg(idx++, static_cast<int32_t>(
+        dilations[1] * in_channel));
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, *(output->opencl_buffer()));
+  }
+
+  std::string tuning_key =
+      Concat("conv2d_general_buffer", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3), filter_height, filter_width);
+  std::vector<uint32_t> lws = {16, 4, 0};
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws,
+                                           lws, future));
+  MACE_OUT_OF_RANGE_VALIDATION
+  return MACE_SUCCESS;
+}
+
+}  // namespace conv2d
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer/depthwise_conv2d.cc
+++ b/mace/kernels/opencl/buffer/depthwise_conv2d.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/kernels/opencl/buffer/depthwise_conv2d.h"
+
+#include <set>
+#include <string>
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+namespace depthwise {
+
+MaceStatus DepthwiseConv2d(OpKernelContext *context,
+                           cl::Kernel *kernel,
+                           const Tensor *padded_input,   // NHWC
+                           const Tensor *filter,  // HWIM
+                           const Tensor *bias,
+                           const int *strides,
+                           const int *dilations,
+                           const DataType dt,
+                           const ActivationType activation,
+                           const float relux_max_limit,
+                           const bool input_changed,
+                           Tensor *output,
+                           StatsFuture *future) {
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channel = output->dim(3);
+
+  const index_t in_height = padded_input->dim(1);
+  const index_t in_width = padded_input->dim(2);
+  const index_t in_channel = padded_input->dim(3);
+
+  const index_t filter_height = filter->dim(2);
+  const index_t filter_width = filter->dim(3);
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION
+
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
+    built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(padded_input->dtype()));
+    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      default:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
+
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("depthwise_conv2d_buffer", kernel_name,
+                             built_options, kernel));
+  }
+
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(RoundUpDiv4(channel) * RoundUpDiv4(width)),
+      static_cast<uint32_t>(height * batch)
+  };
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+  if (input_changed) {
+    uint32_t idx = 0;
+    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
+    MACE_SET_2D_GWS_ARGS(*kernel, gws);
+    kernel->setArg(idx++, *(padded_input->opencl_buffer()));
+    kernel->setArg(idx++, *(filter->opencl_buffer()));
+    if (bias != nullptr) {
+      kernel->setArg(idx++, *(bias->opencl_buffer()));
+    }
+    kernel->setArg(idx++, static_cast<uint32_t>(in_height));
+    kernel->setArg(idx++, static_cast<uint32_t>(in_width));
+    kernel->setArg(idx++, static_cast<uint32_t>(in_channel));
+    kernel->setArg(idx++, static_cast<uint32_t>(filter_height));
+    kernel->setArg(idx++, static_cast<uint32_t>(filter_width));
+    kernel->setArg(idx++, static_cast<uint32_t>(filter_height * filter_width));
+    kernel->setArg(idx++, static_cast<uint32_t>(height));
+    kernel->setArg(idx++, static_cast<uint32_t>(width));
+    kernel->setArg(idx++, static_cast<uint32_t>(channel));
+    kernel->setArg(idx++, static_cast<uint32_t>(strides[0]));
+    kernel->setArg(idx++, static_cast<uint32_t>(strides[1]));
+    kernel->setArg(idx++, static_cast<int32_t>(
+        dilations[0] * in_width * in_channel));
+    kernel->setArg(idx++, static_cast<int32_t>(
+        dilations[1] * in_channel));
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, *(output->opencl_buffer()));
+  }
+
+  std::vector<uint32_t> lws = {16, 4, 0};
+  std::string tuning_key =
+      Concat("depthwise_conv2d_buffer_kernel", in_height, in_width, in_channel,
+             filter_height, filter_width, channel);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
+                                           gws, lws, future));
+
+  MACE_OUT_OF_RANGE_VALIDATION
+  return MACE_SUCCESS;
+}
+
+}  // namespace depthwise
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer/depthwise_conv2d.h
+++ b/mace/kernels/opencl/buffer/depthwise_conv2d.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
+#define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
+
+#include "mace/kernels/depthwise_conv2d.h"
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "mace/kernels/opencl/buffer/utils.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+namespace depthwise {
+
+MaceStatus DepthwiseConv2d(OpKernelContext *context,
+                           cl::Kernel *kernel,
+                           const Tensor *padded_input,   // NHWC
+                           const Tensor *filter,  // HWIM
+                           const Tensor *bias,
+                           const int *strides,
+                           const int *dilations,
+                           const DataType dt,
+                           const ActivationType activation,
+                           const float relux_max_limit,
+                           const bool input_changed,
+                           Tensor *output,
+                           StatsFuture *future);
+}  // namespace depthwise
+
+
+template <typename T>
+class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
+ public:
+  DepthwiseConv2dKernel() : old_scratch_size_(0) {}
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      const Tensor *bias,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      const ActivationType activation,
+      const float relux_max_limit,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  index_t old_scratch_size_;
+  cl::Kernel kernels_[2];
+  uint32_t kwg_size_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus DepthwiseConv2dKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *input,
+    const Tensor *filter,
+    const Tensor *bias,
+    const int *strides,
+    const Padding &padding_type,
+    const std::vector<int> &padding_data,
+    const int *dilations,
+    const ActivationType activation,
+    const float relux_max_limit,
+    Tensor *output,
+    StatsFuture *future) {
+  StatsFuture pad_future, dw_conv_future;
+  index_t filter_w = filter->dim(3);
+
+  // Create a fake conv_2d filter to calculate the paddings and output size
+  std::vector<index_t> fake_filter_shape(4);
+  fake_filter_shape[0] = filter->dim(0) * filter->dim(1);
+  fake_filter_shape[1] = filter->dim(1);
+  fake_filter_shape[2] = filter->dim(2);
+  fake_filter_shape[3] = filter->dim(3);
+
+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  if (padding_data.empty()) {
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input->shape().data(), fake_filter_shape.data(), dilations, strides,
+        padding_type, output_shape.data(), paddings.data());
+  } else {
+    paddings = padding_data;
+    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
+                   padding_data.data(), dilations, strides, RoundType::FLOOR,
+                   output_shape.data());
+  }
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  // calculate padded input shape
+  index_t width = output_shape[2];
+  index_t channels = output_shape[3];
+
+  index_t input_height = input->dim(1);
+  index_t input_width = input->dim(2);
+  index_t input_channels = input->dim(3);
+
+  int pad_top = paddings[0] >> 1;
+  int pad_left = paddings[1] >> 1;
+
+  MACE_CHECK(filter->dim(0) == 1, "Multiplier > 1 not supported");
+  MACE_CHECK(filter->dim(0) * input_channels == channels);
+  MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
+             input_channels);
+
+  // Mark whether input changed or not
+  bool input_changed = !IsVecEqual(input_shape_, input->shape());
+  input_shape_ = input->shape();
+
+  std::vector<index_t> padded_output_shape = output_shape;
+  index_t tile_w = 4, tile_c = 4;
+  padded_output_shape[2] = RoundUp<index_t>(width, tile_w);
+
+  std::vector<index_t> padded_input_shape = input->shape();
+  padded_input_shape[1] = input_height + paddings[0];
+  padded_input_shape[2] = (padded_output_shape[2] - 1) * strides[1] +
+      (filter_w - 1) * dilations[1] + 1;
+  padded_input_shape[3] = RoundUp<index_t>(input_channels, tile_c);
+
+  const Tensor *padded_input_ptr = input;
+  // pad input
+  std::unique_ptr<Tensor> padded_input;
+  if (padded_input_shape[1] != input_height ||
+      padded_input_shape[2] != input_width ||
+      padded_input_shape[3] != input_channels) {
+    index_t total_scratch_size = 0;
+    index_t padded_input_size = 0;
+
+    padded_input_size =
+        std::accumulate(padded_input_shape.begin(),
+                        padded_input_shape.end(),
+                        1,
+                        std::multiplies<index_t>())
+            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
+    total_scratch_size += padded_input_size;
+
+    // Init scratch buffer
+    ScratchBuffer *scratch = context->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
+    if (old_scratch_size_ != scratch->size()) {
+      input_changed |= scratch->size() != old_scratch_size_;
+      old_scratch_size_ = scratch->size();
+    }
+
+    padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
+                                  input->dtype()));
+
+    padded_input->Resize(padded_input_shape);
+    PadInput(context, &kernels_[0], input, pad_top, pad_left,
+             input_changed, padded_input.get(), &pad_future);
+    padded_input_ptr = padded_input.get();
+  }
+
+  MACE_RETURN_IF_ERROR(
+      depthwise::DepthwiseConv2d(
+          context, &kernels_[1], padded_input_ptr, filter, bias, strides,
+          dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
+          input_changed, output, &dw_conv_future));
+  MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_
--- a/mace/kernels/opencl/buffer/pooling.h
+++ b/mace/kernels/opencl/buffer/pooling.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
+#define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
+
+#include "mace/kernels/pooling.h"
+
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/buffer/utils.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+template <typename T>
+class PoolingKernel : public OpenCLPoolingKernel {
+ public:
+  PoolingKernel() : old_scratch_size_(0) {}
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *input,
+      const PoolingType pooling_type,
+      const int *kernels,
+      const int *strides,
+      const Padding &padding_type,
+      const std::vector<int> &padding_data,
+      const int *dilations,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  index_t old_scratch_size_;
+  cl::Kernel kernels_[2];
+  uint32_t kwg_size_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus PoolingKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *input,
+    const PoolingType pooling_type,
+    const int *kernels,
+    const int *strides,
+    const Padding &padding_type,
+    const std::vector<int> &padding_data,
+    const int *dilations,
+    Tensor *output,
+    StatsFuture *future) {
+  MACE_CHECK(dilations[0] == 1 && dilations[1] == 1)
+    << "Pooling opencl kernel not support dilation yet";
+
+  StatsFuture pad_future, pooling_future;
+
+  index_t input_channels = input->dim(3);
+
+  std::vector<index_t> output_shape(4);
+  std::vector<index_t> filter_shape = {input->dim(3), input->dim(3),
+                                       kernels[0], kernels[1]};
+
+  std::vector<int> paddings(2);
+  if (padding_data.empty()) {
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input->shape().data(), filter_shape.data(), dilations, strides,
+        padding_type, output_shape.data(), paddings.data());
+  } else {
+    paddings = padding_data;
+    CalcOutputSize(input->shape().data(), filter_shape.data(),
+                   padding_data.data(), dilations, strides, RoundType::CEIL,
+                   output_shape.data());
+  }
+
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+
+  // Mark whether input changed or not
+  bool input_changed = !IsVecEqual(input_shape_, input->shape());
+  input_shape_ = input->shape();
+
+  auto runtime = context->device()->opencl_runtime();
+
+  // pad input
+  std::vector<index_t> padded_input_shape = input->shape();
+  padded_input_shape[3] = RoundUp<index_t>(input_channels, 4);
+
+  const Tensor *padded_input_ptr = input;
+  // pad input
+  std::unique_ptr<Tensor> padded_input;
+  if (padded_input_shape[3] != input_channels) {
+    index_t total_scratch_size = 0;
+    index_t padded_input_size = 0;
+
+    padded_input_size =
+        std::accumulate(padded_input_shape.begin(),
+                        padded_input_shape.end(),
+                        1,
+                        std::multiplies<index_t>())
+            * GetEnumTypeSize(input->dtype()) + MACE_EXTRA_BUFFER_PAD_SIZE;
+    total_scratch_size += padded_input_size;
+
+    // Init scratch buffer
+    ScratchBuffer *scratch = context->device()->scratch_buffer();
+    scratch->Rewind();
+    scratch->GrowSize(total_scratch_size);
+    if (old_scratch_size_ != scratch->size()) {
+      input_changed |= scratch->size() != old_scratch_size_;
+      old_scratch_size_ = scratch->size();
+    }
+
+    padded_input.reset(new Tensor(scratch->Scratch(padded_input_size),
+                                  input->dtype()));
+
+    padded_input->Resize(padded_input_shape);
+    PadInput(context, &kernels_[0], input, 0, 0,
+             input_changed, padded_input.get(), &pad_future);
+    padded_input_ptr = padded_input.get();
+  }
+
+  cl::Kernel *kernel = &kernels_[1];
+  MACE_OUT_OF_RANGE_DEFINITION
+
+  if (kernel->get() == nullptr) {
+    const DataType dt = DataTypeToEnum<T>::value;
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
+    built_options.emplace("-Dpooling=" + kernel_name);
+
+    if (pooling_type == MAX && input->dtype() == output->dtype()) {
+      built_options.emplace("-DIN_DATA_TYPE=" +
+          DtToCLDt(input->dtype()));
+      built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
+      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    } else {
+      built_options.emplace("-DIN_DATA_TYPE=" +
+          DtToCLDt(input->dtype()));
+      built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
+      built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    }
+    if (pooling_type == AVG) {
+      built_options.emplace("-DPOOL_AVG");
+    }
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling_buffer",
+                                              kernel_name,
+                                              built_options,
+                                              kernel));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+  }
+
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(RoundUpDiv4(output->dim(3))),
+      static_cast<uint32_t>(output->dim(2)),
+      static_cast<uint32_t>(output->dim(0) * output->dim(1)),
+  };
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+  if (input_changed) {
+    uint32_t idx = 0;
+    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, output->size());
+    MACE_SET_3D_GWS_ARGS(*kernel, gws);
+    kernel->setArg(idx++, *(padded_input_ptr->opencl_buffer()));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(1)));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(2)));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_input_ptr->dim(3)));
+    kernel->setArg(idx++, static_cast<int32_t>(output->dim(1)));
+    kernel->setArg(idx++, static_cast<int32_t>(output->dim(3)));
+    kernel->setArg(idx++, paddings[0] / 2);
+    kernel->setArg(idx++, paddings[1] / 2);
+    kernel->setArg(idx++, strides[0]);
+    kernel->setArg(idx++, strides[1]);
+    kernel->setArg(idx++, kernels[0]);
+    kernel->setArg(idx++, kernels[1]);
+    kernel->setArg(idx++, *(output->opencl_buffer()));
+  }
+
+  const std::vector<uint32_t> lws = {4, 4, 4, 0};
+  std::string tuning_key =
+      Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
+             output->dim(2), output->dim(3));
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
+                                           gws, lws, &pooling_future));
+  MACE_OUT_OF_RANGE_VALIDATION
+  MergeMultipleFutureWaitFn({pad_future, pooling_future}, future);
+
+  return MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_BUFFER_POOLING_H_
--- a/mace/kernels/opencl/buffer/softmax.h
+++ b/mace/kernels/opencl/buffer/softmax.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
+#define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
+
+#include "mace/kernels/softmax.h"
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+template <typename T>
+class SoftmaxKernel : public OpenCLSoftmaxKernel {
+ public:
+  MaceStatus Compute(
+      OpKernelContext *context,
+      const Tensor *logits,
+      Tensor *output,
+      StatsFuture *future) override;
+
+ private:
+  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  std::vector<index_t> input_shape_;
+};
+
+template <typename T>
+MaceStatus SoftmaxKernel<T>::Compute(
+    OpKernelContext *context,
+    const Tensor *logits,
+    Tensor *output,
+    StatsFuture *future) {
+  index_t batch = 0;
+  index_t height = 0;
+  index_t width = 0;
+  index_t channels = 0;
+
+  if (logits->dim_size() == 2) {
+    batch = logits->dim(0);
+    height = 1;
+    width = 1;
+    channels = logits->dim(1);
+
+  } else if (logits->dim_size() == 4) {
+    batch = logits->dim(0);
+    height = logits->dim(1);
+    width = logits->dim(2);
+    channels = logits->dim(3);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const int remain_channels = channel_blocks * 4 - channels;
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION
+
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
+    built_options.emplace("-Dsoftmax=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(logits->dtype()));
+    built_options.emplace("-DOUT_DATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax_buffer", kernel_name,
+                                              built_options, &kernel_));
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  }
+  MACE_OUT_OF_RANGE_INIT(kernel_);
+  if (!IsVecEqual(input_shape_, logits->shape())) {
+    uint32_t idx = 0;
+    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(kernel_, output->size());
+    MACE_SET_3D_GWS_ARGS(kernel_, gws);
+    kernel_.setArg(idx++, *(logits->opencl_buffer()));
+    kernel_.setArg(idx++, static_cast<int>(height));
+    kernel_.setArg(idx++, static_cast<int>(channels));
+    kernel_.setArg(idx++, remain_channels);
+    kernel_.setArg(idx++, *(output->opencl_buffer()));
+
+    input_shape_ = logits->shape();
+  }
+
+  std::vector<uint32_t> lws = {4, 4, 4, 0};
+  std::string tuning_key =
+      Concat("softmax_opencl_kernel", batch, height, width, channels);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
+                                           gws, lws, future));
+  MACE_OUT_OF_RANGE_VALIDATION
+  return MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_
--- a/mace/kernels/opencl/buffer/utils.cc
+++ b/mace/kernels/opencl/buffer/utils.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/kernels/opencl/buffer/utils.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/helper.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+MaceStatus PadInput(OpKernelContext *context,
+                    cl::Kernel *kernel,
+                    const Tensor *input,
+                    const int pad_top,
+                    const int pad_left,
+                    const bool input_changed,
+                    Tensor *padded_input,
+                    StatsFuture *future) {
+  const index_t batch = input->dim(0);
+  const index_t in_height = input->dim(1);
+  const index_t in_width = input->dim(2);
+  const index_t in_channel = input->dim(3);
+  const index_t padded_height = padded_input->dim(1);
+  const index_t padded_width = padded_input->dim(2);
+  const index_t padded_channel = padded_input->dim(3);
+
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(padded_width * RoundUpDiv4(padded_channel)),
+      static_cast<uint32_t>(padded_height * batch)
+  };
+
+  auto runtime = context->device()->opencl_runtime();
+  MACE_OUT_OF_RANGE_DEFINITION;
+
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("Dpad_input");
+    built_options.emplace("-Dpad_input=" + kernel_name);
+    built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype()));
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(input->dtype()));
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel(
+        "buffer_transform",
+        kernel_name,
+        built_options,
+        kernel));
+  }
+
+  MACE_OUT_OF_RANGE_INIT(*kernel);
+  if (input_changed) {
+    uint32_t idx = 0;
+    MACE_BUFF_OUT_OF_RANGE_SET_ARGS(*kernel, padded_input->size());
+    MACE_SET_2D_GWS_ARGS(*kernel, gws)
+    kernel->setArg(idx++, *(input->opencl_buffer()));
+    kernel->setArg(idx++, static_cast<int32_t>(in_height));
+    kernel->setArg(idx++, static_cast<int32_t>(in_width));
+    kernel->setArg(idx++, static_cast<int32_t>(in_channel));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_height));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_width));
+    kernel->setArg(idx++, static_cast<int32_t>(padded_channel));
+    kernel->setArg(idx++, pad_top);
+    kernel->setArg(idx++, pad_left);
+    kernel->setArg(idx++, *(padded_input->opencl_buffer()));
+  }
+  std::string tuning_key =
+      Concat("pad_input", batch, in_height, in_width, in_channel,
+             padded_height, padded_width, padded_channel);
+  std::vector<uint32_t> lws = {8, 4, 0};
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
+                                           gws, lws, future));
+  MACE_OUT_OF_RANGE_VALIDATION
+  return MACE_SUCCESS;
+}
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer/utils.h
+++ b/mace/kernels/opencl/buffer/utils.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
+#define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
+
+#include "mace/core/future.h"
+#include "mace/core/op_kernel_context.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace kernels {
+namespace opencl {
+namespace buffer {
+
+MaceStatus PadInput(OpKernelContext *context,
+                    cl::Kernel *kernel,
+                    const Tensor *input,
+                    const int pad_top,
+                    const int pad_left,
+                    const bool input_changed,
+                    Tensor *padded_input,
+                    StatsFuture *future);
+
+}  // namespace buffer
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace mace
+#endif  // MACE_KERNELS_OPENCL_BUFFER_UTILS_H_
--- a/mace/kernels/opencl/buffer_inverse_transform.cc
+++ b/mace/kernels/opencl/buffer_inverse_transform.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/kernels/buffer_inverse_transform.h"
+#include "mace/kernels/opencl/image/image_to_buffer.h"
+#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h"
+
+namespace mace {
+namespace kernels {
+
+template<typename T>
+BufferInverseTransformFunctor<
+    DeviceType::GPU, T>::BufferInverseTransformFunctor(
+    OpKernelContext *context,
+    const int wino_blk_size)
+  : BufferInverseTransformFunctorBase(context, wino_blk_size) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ImageToBuffer<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
+  }
+}
+
+template <typename T>
+MaceStatus BufferInverseTransformFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
+    const BufferType type,
+    Tensor *output,
+    StatsFuture *future) {
+  return kernel_->Compute(context_, input, type,
+                          wino_blk_size_, output, future);
+}
+
+template struct BufferInverseTransformFunctor<DeviceType::GPU, float>;
+template struct BufferInverseTransformFunctor<DeviceType::GPU, half>;
+
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/buffer_transform.cc
+++ b/mace/kernels/opencl/buffer_transform.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/kernels/buffer_transform.h"
+#include "mace/kernels/opencl/image/buffer_to_image.h"
+#include "mace/kernels/opencl/buffer/buffer_transform.h"
+
+namespace mace {
+namespace kernels {
+
+template<typename T>
+BufferTransformFunctor<DeviceType::GPU, T>::BufferTransformFunctor(
+    OpKernelContext *context,
+    const int wino_blk_size)
+  : BufferTransformFunctorBase(context, wino_blk_size) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::BufferToImage<T>);
+  } else {
+    kernel_.reset(new opencl::buffer::BufferTransform<T>);
+  }
+}
+
+template <typename T>
+MaceStatus BufferTransformFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input,
+    const BufferType type,
+    Tensor *output,
+    StatsFuture *future) {
+  return kernel_->Compute(context_, input, type,
+                          wino_blk_size_, output, future);
+}
+
+template struct BufferTransformFunctor<DeviceType::GPU, float>;
+template struct BufferTransformFunctor<DeviceType::GPU, half>;
+
+}  // namespace kernels
+}  // namespace mace
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -13,73 +13,26 @@
 // limitations under the License.

 #include "mace/kernels/channel_shuffle.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/opencl/helper.h"
-#include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
+#include "mace/kernels/opencl/image/channel_shuffle.h"

 namespace mace {
 namespace kernels {

 template <typename T>
-MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
-    const Tensor *input, Tensor *output, StatsFuture *future) {
-  MACE_RETURN_IF_ERROR(output->ResizeLike(input));
-
-  const index_t batch = input->dim(0);
-  const index_t height = input->dim(1);
-  const index_t width = input->dim(2);
-  const index_t channels = input->dim(3);
-  const index_t channels_per_group = channels / groups_;
-  MACE_CHECK(channels_per_group % 4 == 0,
-             "channels per group must be multiple of 4");
-  MACE_CHECK(groups_ % 4 == 0, "groups must be multiple of 4");
-  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);
-
-  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
-  auto runtime = context_->device()->opencl_runtime();
-
-  if (kernel_.get() == nullptr) {
-    std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
-    NON_UNIFORM_WG_CONFIG;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
-    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
-    MACE_RETURN_IF_ERROR(
-        runtime->BuildKernel("channel_shuffle", kernel_name,
-                             built_options, &kernel_));
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  }
-
-  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    OUT_OF_RANGE_SET_ARG;
-    SET_3D_GWS_ARGS(kernel_);
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, groups_);
-    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
-    kernel_.setArg(idx++, *(output->opencl_image()));
-
-    input_shape_ = input->shape();
+ChannelShuffleFunctor<DeviceType::GPU, T>::ChannelShuffleFunctor(
+    OpKernelContext *context,
+    const int groups) : OpKernel(context) {
+  if (context->device()->opencl_runtime()->UseImageMemory()) {
+    kernel_.reset(new opencl::image::ChannelShuffleKernel<T>(groups));
+  } else {
+    MACE_NOT_IMPLEMENTED;
  }
+}

-  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
-  std::string tuning_key =
-      Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
-                                           gws, lws, future));
-  OUT_OF_RANGE_VALIDATION(kernel_error_);
-  return MACE_SUCCESS;
+template <typename T>
+MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
+    const Tensor *input, Tensor *output, StatsFuture *future) {
+  return kernel_->Compute(context_, input, output, future);
 }

 template struct ChannelShuffleFunctor<DeviceType::GPU, float>;

--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
 #include <common.h>

-__kernel void activation(KERNEL_ERROR_PARAMS
+__kernel void activation(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
 #ifdef USE_PRELU

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
 #include <common.h>

-__kernel void addn(KERNEL_ERROR_PARAMS
+__kernel void addn(OUT_OF_RANGE_PARAMS
                   GLOBAL_WORK_GROUP_SIZE_DIM2
                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,

--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(KERNEL_ERROR_PARAMS
+__kernel void batch_norm(OUT_OF_RANGE_PARAMS
                         GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __read_only image2d_t scale,

--- a/mace/kernels/opencl/cl/batch_to_space.cl
+++ b/mace/kernels/opencl/cl/batch_to_space.cl
 #include <common.h>

-__kernel void batch_to_space(KERNEL_ERROR_PARAMS
+__kernel void batch_to_space(OUT_OF_RANGE_PARAMS
                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,

--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(KERNEL_ERROR_PARAMS
+__kernel void bias_add(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __read_only image2d_t bias,

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
 #include <common.h>

-__kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* OIHW */
                                     __private const int input_offset,
@@ -52,7 +52,7 @@ __kernel void filter_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
+__kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* OIHW */
                                     __private const int out_channel,
@@ -102,7 +102,7 @@ __kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
 }

 // TODO(liuqi): Support multiplier > 1
-__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
                                        GLOBAL_WORK_GROUP_SIZE_DIM2
                                        __global const DATA_TYPE *input, /* MIHW */
                                        __private const int input_offset,
@@ -154,7 +154,7 @@ __kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
@@ -196,7 +196,7 @@ __kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
+__kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS
                                     GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
@@ -236,7 +236,7 @@ __kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
  }
 }

-__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global const DATA_TYPE *input,
                                  __private const int input_offset,
@@ -272,7 +272,7 @@ __kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
+__kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global DATA_TYPE *output,
                                  __private const int count,
@@ -306,7 +306,7 @@ __kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
 }


-__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
@@ -349,7 +349,7 @@ __kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
+__kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
@@ -387,7 +387,7 @@ __kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
  output[offset] = values.w;
 }

-__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
@@ -430,7 +430,7 @@ __kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, // OIHW
                                            __private const int input_offset,
@@ -475,7 +475,7 @@ __kernel void weight_height_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
+__kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS
                                            GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //OIHW
                                            __private const int out_channels,
@@ -517,7 +517,7 @@ __kernel void weight_height_image_to_buffer(KERNEL_ERROR_PARAMS
 }


-__kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
+__kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, // OIHW
                                           __private const int input_offset,
@@ -565,7 +565,7 @@ __kernel void weight_width_buffer_to_image(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
+__kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS
                                           GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global DATA_TYPE *output, // OIHW
                                           __private const int in_channels,
@@ -609,7 +609,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS
 }

 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS
                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __global const DATA_TYPE *input, //Oc, Ic, H, W
                                              __private const int input_offset,
@@ -714,7 +714,7 @@ __kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS
 }

 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
                                              GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __global DATA_TYPE *output, //Oc, Ic, H, W
                                              __private const int height,
@@ -757,7 +757,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS
 }

 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
@@ -891,7 +891,7 @@ PROCESS(7);
 #undef PROCESS

 }
-__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,
@@ -933,7 +933,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS
 }

 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global const DATA_TYPE *input, //Oc, Ic, H, W
                                                  __private const int input_offset,
@@ -1040,7 +1040,7 @@ __kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS
 #undef PROCESS

 }
-__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS
+__kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
                                                  GLOBAL_WORK_GROUP_SIZE_DIM2
                                                  __global DATA_TYPE *output, //Oc, Ic, H, W
                                                  __private const int height,

--- a/mace/kernels/opencl/cl/buffer_transform.cl
+++ b/mace/kernels/opencl/cl/buffer_transform.cl
+#include <common.h>
+
+__kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
+                        GLOBAL_WORK_GROUP_SIZE_DIM2
+                        __global IN_DATA_TYPE *input,
+                        __private const int in_height,
+                        __private const int in_width,
+                        __private const int in_chan,
+                        __private const int padded_height,
+                        __private const int padded_width,
+                        __private const int padded_chan,
+                        __private const int pad_top,
+                        __private const int pad_left,
+                        __global DATA_TYPE *output) {
+  const int padded_wc_blk_idx = get_global_id(0);
+  const int padded_hb_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (padded_wc_blk_idx >= global_size_dim0 ||
+      padded_hb_idx >= global_size_dim1) {
+    return;
+  }
+#endif
+  const int padded_chan_blk = (padded_chan + 3) >> 2;
+  const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
+  const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
+  const int batch_idx = padded_hb_idx / padded_height;
+  const int padded_height_idx = padded_hb_idx % padded_height;
+  const int padded_chan_idx = padded_chan_blk_idx << 2;
+  const int in_height_idx = padded_height_idx - pad_top;
+  const int in_width_idx = padded_width_idx - pad_left;
+  const int padded_offset = mad24(mad24(mad24(batch_idx, padded_height, padded_height_idx),
+      padded_width, padded_width_idx), padded_chan, padded_chan_idx);
+  const int in_offset = mad24(mad24(mad24(batch_idx, in_height, in_height_idx),
+      in_width, in_width_idx), in_chan, padded_chan_idx);
+
+  DATA_TYPE4 value = 0;
+  if (0 <= in_height_idx && in_height_idx < in_height &&
+      0 <= in_width_idx && in_width_idx < in_width) {
+    const int remain_chan = in_chan - padded_chan_idx;
+    if (remain_chan < 4) {
+      switch (remain_chan) {
+        case 3:
+          value.z = CONVERT(input[in_offset + 2]);
+        case 2:
+          value.y = CONVERT(input[in_offset + 1]);
+        case 1:
+          value.x = CONVERT(input[in_offset]);
+      }
+    } else {
+      value = CONVERT4(vload4(0, input + in_offset));
+    }
+  }
+  vstore4(value, 0, output + padded_offset);
+  CHECK_OUT_OF_RANGE_FOR_BUFFER(padded_offset + 3);
+}
+
+// OIHW -> [H, W, (O+3) / 4, I, 4]
+__kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
+                                    GLOBAL_WORK_GROUP_SIZE_DIM3
+                                    __global IN_DATA_TYPE *input,  // OIHW
+                                    __private const int input_offset,
+                                    __global DATA_TYPE *output,
+                                    __private const int out_chan,
+                                    __private const int in_chan,
+                                    __private const int height,
+                                    __private const int width,
+                                    __private const int inner_size) {
+  const int in_chan_idx = get_global_id(0);
+  const int out_chan_blk_idx = get_global_id(1);
+  const int hw_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (in_chan_idx >= global_size_dim0 ||
+      out_chan_blk_idx >= global_size_dim1 ||
+      hw_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int t_in_chan = global_size_dim0;
+  const int out_chan_blk = global_size_dim1;
+
+  const int h_idx = hw_idx / width;
+  const int w_idx = hw_idx % width;
+  const int out_chan_idx = out_chan_blk_idx << 2;
+  const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
+      height, h_idx), width, w_idx) + input_offset;
+  const int out_offset = (mad24(mad24(mad24(h_idx, width, w_idx),
+      out_chan_blk, out_chan_blk_idx), t_in_chan, in_chan_idx) << 2);
+
+  DATA_TYPE4 value = 0;
+  if (in_chan_idx < in_chan) {
+    if (out_chan_idx + 3 < out_chan) {
+      value.x = CONVERT(input[in_offset]);
+      value.y = CONVERT(input[in_offset + inner_size]);
+      value.z = CONVERT(input[in_offset + 2 * inner_size]);
+      value.w = CONVERT(input[in_offset + 3 * inner_size]);
+    } else {
+      const int diff = out_chan - out_chan_idx;
+      switch(diff) {
+        case 3:
+          value.z = CONVERT(input[in_offset + 2 * inner_size]);
+        case 2:
+          value.y = CONVERT(input[in_offset + inner_size]);
+        case 1:
+          value.x = CONVERT(input[in_offset]);
+      }
+    }
+  }
+  VSTORE4(value, output, out_offset);
+}
+
+// MIHW -> [M, (I+3) / 4, H, W, 4]
+__kernel void transform_dw_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
+                                       GLOBAL_WORK_GROUP_SIZE_DIM3
+                                       __global IN_DATA_TYPE *input,  // MIHW
+                                       __private const int input_offset,
+                                       __global DATA_TYPE *output,
+                                       __private const int in_chan,
+                                       __private const int in_hw) {
+  const int width_idx = get_global_id(0);
+  const int height_idx = get_global_id(1);
+  const int in_chan_blk_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (width_idx >= global_size_dim0 ||
+      height_idx >= global_size_dim1 ||
+      in_chan_blk_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+  const int width = global_size_dim0;
+  const int height = global_size_dim1;
+  const int in_chan_idx = in_chan_blk_idx << 2;
+
+  const int in_offset = mad24(in_chan_idx, in_hw,
+      mad24(height_idx, width, width_idx)) + input_offset;
+  const int out_offset = mad24(in_chan_blk_idx, in_hw,
+      mad24(height_idx, width, width_idx)) << 2;
+
+  DATA_TYPE4 value = 0;
+  if (in_chan_idx + 3 < in_chan) {
+    value.x = CONVERT(input[in_offset]);
+    value.y = CONVERT(input[in_offset + in_hw]);
+    value.z = CONVERT(input[in_offset + (in_hw << 1)]);
+    value.w = CONVERT(input[in_offset + in_hw + (in_hw << 1)]);
+  } else {
+    const int diff = in_chan - in_chan_idx;
+    switch(diff) {
+      case 3:
+        value.z = CONVERT(input[in_offset + (in_hw << 1)]);
+      case 2:
+        value.y = CONVERT(input[in_offset + in_hw]);
+      case 1:
+        value.x = CONVERT(input[in_offset]);
+    }
+  }
+  VSTORE4(value, output, out_offset);
+}
+
+__kernel void transform_arg(BUFFER_OUT_OF_RANGE_PARAMS
+                            __private const int global_size_dim0,
+                            __global IN_DATA_TYPE *input,
+                            __private const int input_offset,
+                            __global DATA_TYPE *output,
+                            __private int size) {
+  const int blk_idx = get_global_id(0);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (blk_idx >= global_size_dim0) {
+    return;
+  }
+#endif
+  const int idx = blk_idx << 2;
+  const int diff = size - idx;
+  const int in_idx = idx + input_offset;
+  DATA_TYPE4 value = 0;
+  if (diff < 4) {
+    switch (diff) {
+      case 3:
+        value.z = CONVERT(input[in_idx + 2]);
+      case 2:
+        value.y = CONVERT(input[in_idx + 1]);
+      case 1:
+        value.x = CONVERT(input[in_idx]);
+    }
+  } else {
+    value = CONVERT4(vload4(0, input + in_idx));
+  }
+
+  VSTORE4(value, output, idx);
+}
+
+__kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
+                                  __private const int global_size_dim0,
+                                  __global IN_DATA_TYPE *input,
+                                  __private const int input_offset,
+                                  __global DATA_TYPE *output) {
+  const int out_idx = get_global_id(0);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_idx >= global_size_dim0) {
+    return;
+  }
+#endif
+
+  DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
+  vstore4(input_value, out_idx, output);
+}
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
 #include <common.h>

 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(KERNEL_ERROR_PARAMS
+__kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
                              GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int groups,

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -24,19 +24,13 @@
 #define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)

 #define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
+#define OUT_DATA_TYPE4 VEC_DATA_TYPE(OUT_DATA_TYPE, 4)

-#ifdef OUT_OF_RANGE_CHECK
-#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
-  check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error);
-#else
-#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
-#endif
+#define CONVERT_STR(value, type) convert_##type((value))

-#define READ_IMAGET(image, sampler, coord) \
-  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
-#define WRITE_IMAGET(image, coord, value)        \
-  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)   \
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
+#define CONVERT_TO(value, type) CONVERT_STR(value, type)
+#define CONVERT(value) CONVERT_TO(value, DATA_TYPE)
+#define CONVERT4(value) CONVERT_TO(value, DATA_TYPE4)

 #define GLOBAL_WORK_GROUP_SIZE_DIM2       \
    __private const int global_size_dim0, \
@@ -47,16 +41,37 @@
    __private const int global_size_dim1, \
    __private const int global_size_dim2,

+// oorc for 'Out Of Range Check'
 #ifdef OUT_OF_RANGE_CHECK
+#define OUT_OF_RANGE_PARAMS \
+  __global int *oorc_flag,

-#define KERNEL_ERROR_PARAMS \
-  __global char *kernel_error,
+#define BUFFER_OUT_OF_RANGE_PARAMS      \
+  __global int *oorc_flag,              \
+  __private const int oorc_output_length,

+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
+  check_out_of_range_for_image2d(image, (coord).x, (coord).y, oorc_flag);
+
+#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx) \
+  check_out_of_range_for_buffer(oorc_output_length, (idx), oorc_flag);
 #else
+#define OUT_OF_RANGE_PARAMS
+#define BUFFER_OUT_OF_RANGE_PARAMS
+#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
+#define CHECK_OUT_OF_RANGE_FOR_BUFFER(idx)
+#endif

-#define KERNEL_ERROR_PARAMS
+#define READ_IMAGET(image, sampler, coord) \
+  CMD_TYPE(read_image, CMD_DATA_TYPE)(image, sampler, coord)
+#define WRITE_IMAGET(image, coord, value)        \
+  CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)   \
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
+
+#define VSTORE4(data, output, offset)         \
+  CHECK_OUT_OF_RANGE_FOR_BUFFER((offset) + 3) \
+  vstore4(data, 0, output + (offset));

-#endif

 __constant sampler_t SAMPLER =
    CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
@@ -66,6 +81,7 @@ inline float4 do_sigmoid(float4 in) {
  return native_recip(1.0f + native_exp(-in));
 }

+#ifdef DATA_TYPE
 inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #ifdef USE_PRELU
                                DATA_TYPE4 prelu_alpha,
@@ -89,17 +105,25 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #endif
  return out;
 }
+#endif

 inline void check_out_of_range_for_image2d(__write_only image2d_t image,
                                           __private const int x,
                                           __private const int y,
-                                           global char *kernel_error) {
-#ifdef OUT_OF_RANGE_CHECK
+                                           __global int *oorc_flag) {
  int2 image_dim = get_image_dim(image);
  if (x >= image_dim.x || y >= image_dim.y) {
-    *kernel_error = 1;
+    *oorc_flag = 1;
  }
-#endif
 }

+inline void check_out_of_range_for_buffer(__private const int length,
+                                          __private const int idx,
+                                          __global int *oorc_flag) {
+  if (idx >= length) {
+    *oorc_flag = idx - length + 1;
+  }
+}
+
+
 #endif  // MACE_KERNELS_OPENCL_CL_COMMON_H_
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,7 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }

 // Supported data type: half/float
-__kernel void concat_channel(KERNEL_ERROR_PARAMS
+__kernel void concat_channel(OUT_OF_RANGE_PARAMS
                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t input0,
                             __read_only image2d_t input1,
@@ -84,7 +84,7 @@ __kernel void concat_channel(KERNEL_ERROR_PARAMS
 }

 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
+__kernel void concat_channel_multi(OUT_OF_RANGE_PARAMS
                                   GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input,
                                   __private const int chan_blk_offset,

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
 #include <common.h>

-__kernel void conv_2d(KERNEL_ERROR_PARAMS
+__kernel void conv_2d(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */

--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
 #include <common.h>

-__kernel void conv_2d_1x1(KERNEL_ERROR_PARAMS
+__kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
                          GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */

--- a/mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1_buffer.cl
+#include <common.h>
+
+__kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
+                     GLOBAL_WORK_GROUP_SIZE_DIM2
+                     __global IN_DATA_TYPE *padded_input,
+                     __global IN_DATA_TYPE *filter,
+#ifdef BIAS
+                     __global IN_DATA_TYPE *bias,
+#endif
+                     __private const int in_height,
+                     __private const int in_width,
+                     __private const int in_chan,
+                     __private const int filter_in_chan,
+                     __private const int out_height,
+                     __private const int out_width,
+                     __private const int out_chan,
+                     __private const int stride_h,
+                     __private const int stride_w,
+                     __private const float relux_max_limit,
+                     __global OUT_DATA_TYPE *output) {
+  const int out_wc_blk_idx = get_global_id(0);
+  const int out_hb_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_wc_blk_idx >= global_size_dim0 ||
+      out_hb_idx >= global_size_dim1) {
+    return;
+  }
+#endif
+  const int out_chan_blk = (out_chan + 3) >> 2;
+
+  const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
+  const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
+
+  const int batch_idx = out_hb_idx / out_height;
+  const int out_height_idx = out_hb_idx % out_height;
+  const int out_width_idx = out_width_blk_idx << 1;
+  const int out_chan_idx = out_chan_blk_idx << 2;
+
+  const int in_height_idx = mul24(out_height_idx, stride_h);
+  const int in_width_idx = mul24(out_width_idx, stride_w);
+  const int strided_chan = mul24(in_chan, stride_w);
+
+#ifdef BIAS
+  DATA_TYPE4 out0 = CONVERT4(vload4(0, bias + out_chan_idx));
+  DATA_TYPE4 out1 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+#endif
+
+  int in_offset = mul24(mad24(mad24(batch_idx, in_height, in_height_idx),
+      in_width, in_width_idx), in_chan);
+  int filter_offset = mul24(out_chan_blk_idx, filter_in_chan) << 2;
+  DATA_TYPE4 in0, in1;
+  DATA_TYPE4 w0, w1, w2, w3;
+
+  for (int in_chan_idx = 0; in_chan_idx < in_chan; in_chan_idx += 4) {
+    w0 = CONVERT4(vload4(0, filter + filter_offset));
+    w1 = CONVERT4(vload4(0, filter + filter_offset + 4));
+    w2 = CONVERT4(vload4(0, filter + filter_offset + 8));
+    w3 = CONVERT4(vload4(0, filter + filter_offset + 12));
+
+    in0 = CONVERT4(vload4(0, padded_input + in_offset));
+    in1 = CONVERT4(vload4(0, padded_input + in_offset + strided_chan));
+
+    out0 = mad((DATA_TYPE4)(in0.x), w0, out0);
+    out0 = mad((DATA_TYPE4)(in0.y), w1, out0);
+    out0 = mad((DATA_TYPE4)(in0.z), w2, out0);
+    out0 = mad((DATA_TYPE4)(in0.w), w3, out0);
+
+    out1 = mad((DATA_TYPE4)(in1.x), w0, out1);
+    out1 = mad((DATA_TYPE4)(in1.y), w1, out1);
+    out1 = mad((DATA_TYPE4)(in1.z), w2, out1);
+    out1 = mad((DATA_TYPE4)(in1.w), w3, out1);
+
+    filter_offset += 16;
+    in_offset += 4;
+  }
+
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
+  out0 = do_activation(out0, relux_max_limit);
+  out1 = do_activation(out1, relux_max_limit);
+#endif
+
+  int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
+      out_width, out_width_idx), out_chan, out_chan_idx);
+
+#define WRITE_OUTPUT(i) \
+  if (out_chan_idx + 4 > out_chan) {           \
+    const int diff = out_chan - out_chan_idx;  \
+    switch(diff) {                             \
+      case 3:                                  \
+        output[out_offset + 2] = CONVERT_TO(out##i.z, OUT_DATA_TYPE);     \
+      case 2:                                  \
+        output[out_offset + 1] = CONVERT_TO(out##i.y, OUT_DATA_TYPE);     \
+      case 1:                                  \
+        output[out_offset] = CONVERT_TO(out##i.x, OUT_DATA_TYPE);         \
+    }                                          \
+    CHECK_OUT_OF_RANGE_FOR_BUFFER(out_offset + diff - 1); \
+  } else {                                     \
+    VSTORE4(CONVERT_TO(out##i, OUT_DATA_TYPE4), output, out_offset);   \
+  }
+
+  WRITE_OUTPUT(0);
+  if (out_width_idx + 1 >= out_width) return;
+  out_offset += out_chan;
+  WRITE_OUTPUT(1);
+#undef WRITE_OUTPUT
+
+}
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
 #include <common.h>

-__kernel void conv_2d_3x3(KERNEL_ERROR_PARAMS
+__kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
                          GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */

--- a/mace/kernels/opencl/cl/conv_2d_buffer.cl
+++ b/mace/kernels/opencl/cl/conv_2d_buffer.cl
--- a/mace/kernels/opencl/cl/crop.cl
+++ b/mace/kernels/opencl/cl/crop.cl
 #include <common.h>

-__kernel void crop(KERNEL_ERROR_PARAMS
+__kernel void crop(OUT_OF_RANGE_PARAMS
                   GLOBAL_WORK_GROUP_SIZE_DIM3
                   __read_only image2d_t input,
                   __private const int offset_b,

--- a/mace/kernels/opencl/cl/deconv_2d.cl
+++ b/mace/kernels/opencl/cl/deconv_2d.cl
 #include <common.h>

-__kernel void deconv_2d(KERNEL_ERROR_PARAMS
+__kernel void deconv_2d(OUT_OF_RANGE_PARAMS
                        GLOBAL_WORK_GROUP_SIZE_DIM3
                        __read_only image2d_t input,
                        __read_only image2d_t weights,

--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
 #include <common.h>

-__kernel void depth_to_space(KERNEL_ERROR_PARAMS
+__kernel void depth_to_space(OUT_OF_RANGE_PARAMS
                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t input,
                             __private const int block_size,

--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
 #include <common.h>

 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
+__kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
@@ -136,7 +136,7 @@ __kernel void depthwise_conv2d(KERNEL_ERROR_PARAMS
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }

-__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
+__kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS
                                  GLOBAL_WORK_GROUP_SIZE_DIM3
                                  __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                  __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */

--- a/mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d_buffer.cl
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
 #include <common.h>

-__kernel void eltwise(KERNEL_ERROR_PARAMS
+__kernel void eltwise(OUT_OF_RANGE_PARAMS
                      GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input0,
 #if INPUT_TYPE == 1

--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
--- a/mace/kernels/opencl/cl/lstmcell.cl
+++ b/mace/kernels/opencl/cl/lstmcell.cl
 #include <common.h>

-__kernel void lstmcell(KERNEL_ERROR_PARAMS
+__kernel void lstmcell(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM2
                       __read_only image2d_t input,
                       __read_only image2d_t pre_output,

--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
--- a/mace/kernels/opencl/cl/pad.cl
+++ b/mace/kernels/opencl/cl/pad.cl
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
--- a/mace/kernels/opencl/cl/pooling_buffer.cl
+++ b/mace/kernels/opencl/cl/pooling_buffer.cl
--- a/mace/kernels/opencl/cl/reduce_mean.cl
+++ b/mace/kernels/opencl/cl/reduce_mean.cl
--- a/mace/kernels/opencl/cl/resize_bicubic.cl
+++ b/mace/kernels/opencl/cl/resize_bicubic.cl
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
--- a/mace/kernels/opencl/cl/softmax_buffer.cl
+++ b/mace/kernels/opencl/cl/softmax_buffer.cl
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
--- a/mace/kernels/opencl/cl/space_to_depth.cl
+++ b/mace/kernels/opencl/cl/space_to_depth.cl
--- a/mace/kernels/opencl/cl/split.cl
+++ b/mace/kernels/opencl/cl/split.cl
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
--- a/mace/kernels/opencl/conv_2d.cc
+++ b/mace/kernels/opencl/conv_2d.cc
--- a/mace/kernels/opencl/crop.cc
+++ b/mace/kernels/opencl/crop.cc
--- a/mace/kernels/opencl/deconv_2d.cc
+++ b/mace/kernels/opencl/deconv_2d.cc
--- a/mace/kernels/opencl/depth_to_space.cc
+++ b/mace/kernels/opencl/depth_to_space.cc
--- a/mace/kernels/opencl/depthwise_conv2d.cc
+++ b/mace/kernels/opencl/depthwise_conv2d.cc
--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
--- a/mace/kernels/opencl/fully_connected.cc
+++ b/mace/kernels/opencl/fully_connected.cc
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
--- a/mace/kernels/opencl/image/activation.h
+++ b/mace/kernels/opencl/image/activation.h
--- a/mace/kernels/opencl/image/addn.h
+++ b/mace/kernels/opencl/image/addn.h
--- a/mace/kernels/opencl/image/batch_norm.h
+++ b/mace/kernels/opencl/image/batch_norm.h
--- a/mace/kernels/opencl/image/batch_to_space.h
+++ b/mace/kernels/opencl/image/batch_to_space.h
--- a/mace/kernels/opencl/image/bias_add.h
+++ b/mace/kernels/opencl/image/bias_add.h
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
--- a/mace/kernels/opencl/image/channel_shuffle.h
+++ b/mace/kernels/opencl/image/channel_shuffle.h
--- a/mace/kernels/opencl/image/concat.cc
+++ b/mace/kernels/opencl/image/concat.cc
--- a/mace/kernels/opencl/image/concat.h
+++ b/mace/kernels/opencl/image/concat.h
--- a/mace/kernels/opencl/image/conv_2d.h
+++ b/mace/kernels/opencl/image/conv_2d.h
--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
--- a/mace/kernels/opencl/conv_2d_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_3x3.cc
--- a/mace/kernels/opencl/conv_2d_general.cc
+++ b/mace/kernels/opencl/conv_2d_general.cc
--- a/mace/kernels/opencl/image/crop.h
+++ b/mace/kernels/opencl/image/crop.h
--- a/mace/kernels/opencl/image/deconv_2d.h
+++ b/mace/kernels/opencl/image/deconv_2d.h
--- a/mace/kernels/opencl/image/depth_to_space.h
+++ b/mace/kernels/opencl/image/depth_to_space.h
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
--- a/mace/kernels/opencl/image/depthwise_conv2d.h
+++ b/mace/kernels/opencl/image/depthwise_conv2d.h
--- a/mace/kernels/opencl/image/eltwise.h
+++ b/mace/kernels/opencl/image/eltwise.h
--- a/mace/kernels/opencl/image/fully_connected.h
+++ b/mace/kernels/opencl/image/fully_connected.h
--- a/mace/kernels/opencl/image_to_buffer.cc
+++ b/mace/kernels/opencl/image_to_buffer.cc
--- a/mace/kernels/opencl/image/lstm_cell.h
+++ b/mace/kernels/opencl/image/lstm_cell.h
--- a/mace/kernels/opencl/image/matmul.h
+++ b/mace/kernels/opencl/image/matmul.h
--- a/mace/kernels/opencl/image/pad.h
+++ b/mace/kernels/opencl/image/pad.h
--- a/mace/kernels/opencl/image/pooling.h
+++ b/mace/kernels/opencl/image/pooling.h
--- a/mace/kernels/opencl/image/reduce_mean.h
+++ b/mace/kernels/opencl/image/reduce_mean.h
--- a/mace/kernels/opencl/image/resize_bicubic.h
+++ b/mace/kernels/opencl/image/resize_bicubic.h
--- a/mace/kernels/opencl/image/resize_bilinear.h
+++ b/mace/kernels/opencl/image/resize_bilinear.h
--- a/mace/kernels/opencl/image/softmax.h
+++ b/mace/kernels/opencl/image/softmax.h
--- a/mace/kernels/opencl/image/space_to_batch.h
+++ b/mace/kernels/opencl/image/space_to_batch.h
--- a/mace/kernels/opencl/image/space_to_depth.h
+++ b/mace/kernels/opencl/image/space_to_depth.h
--- a/mace/kernels/opencl/image/split.h
+++ b/mace/kernels/opencl/image/split.h
--- a/mace/kernels/opencl/image/winograd_transform.h
+++ b/mace/kernels/opencl/image/winograd_transform.h
--- a/mace/kernels/opencl/lstmcell.cc
+++ b/mace/kernels/opencl/lstmcell.cc
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
--- a/mace/kernels/opencl/reduce_mean.cc
+++ b/mace/kernels/opencl/reduce_mean.cc
--- a/mace/kernels/opencl/resize_bicubic.cc
+++ b/mace/kernels/opencl/resize_bicubic.cc
--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
--- a/mace/kernels/opencl/space_to_batch.cc
+++ b/mace/kernels/opencl/space_to_batch.cc
--- a/mace/kernels/opencl/space_to_depth.cc
+++ b/mace/kernels/opencl/space_to_depth.cc
--- a/mace/kernels/opencl/split.cc
+++ b/mace/kernels/opencl/split.cc
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
--- a/mace/kernels/reduce_mean.h
+++ b/mace/kernels/reduce_mean.h
--- a/mace/kernels/resize_bicubic.h
+++ b/mace/kernels/resize_bicubic.h
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
--- a/mace/kernels/space_to_depth.h
+++ b/mace/kernels/space_to_depth.h
--- a/mace/kernels/split.h
+++ b/mace/kernels/split.h
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
--- a/mace/ops/batch_to_space.h
+++ b/mace/ops/batch_to_space.h
--- a/mace/ops/buffer_to_image.cc
+++ b/mace/ops/buffer_to_image.cc
--- a/mace/ops/image_to_buffer.h
+++ b/mace/ops/image_to_buffer.h
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
--- a/mace/ops/image_to_buffer.cc
+++ b/mace/ops/image_to_buffer.cc
--- a/mace/ops/buffer_to_image.h
+++ b/mace/ops/buffer_to_image.h
--- a/mace/ops/buffer_transform_test.cc
+++ b/mace/ops/buffer_transform_test.cc
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
--- a/mace/ops/core_test.cc
+++ b/mace/ops/core_test.cc
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
--- a/mace/ops/ops_register.cc
+++ b/mace/ops/ops_register.cc
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
--- a/mace/ops/resize_bicubic.h
+++ b/mace/ops/resize_bicubic.h
--- a/mace/ops/space_to_batch.h
+++ b/mace/ops/space_to_batch.h
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
--- a/repository/opencl-kernel/opencl_kernel_configure.bzl
+++ b/repository/opencl-kernel/opencl_kernel_configure.bzl
--- a/tools/converter.py
+++ b/tools/converter.py
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py