diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index bada99c06a997f804399d0d726629b88d4743e66..c17c4a1d9fcf83d8a6c8bfa7ead4fd3b3d5fe6b5 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -241,7 +241,9 @@ class Image : public BufferBase {
     mapped_buf_ = nullptr;
   };

-  void Resize(index_t size) {}
+  void Resize(index_t size) {
+    MACE_NOT_IMPLEMENTED;
+  }

   void Copy(void *src, index_t offset, index_t length) {
     MACE_NOT_IMPLEMENTED;
@@ -263,7 +265,11 @@ class Image : public BufferBase {

 class BufferSlice : public BufferBase {
  public:
-  BufferSlice() {}
+  BufferSlice()
+    : buffer_(nullptr),
+      mapped_buf_(nullptr),
+      offset_(0),
+      length_(0) {}
   BufferSlice(BufferBase *buffer, index_t offset, index_t length)
     : BufferBase(buffer->size()),
       buffer_(buffer),
@@ -284,12 +290,13 @@ class BufferSlice : public BufferBase {
                 other.length_) {}

   ~BufferSlice() {
-    if (mapped_buf_ != nullptr) {
+    if (buffer_ != nullptr && mapped_buf_ != nullptr) {
       UnMap();
     }
   }

   void *buffer() {
+    MACE_CHECK_NOTNULL(buffer_);
     return buffer_->buffer();
   };

@@ -330,6 +337,7 @@ class BufferSlice : public BufferBase {
   };

   void Resize(index_t size) {
+    MACE_NOT_IMPLEMENTED;
   }

   void Copy(void *src, index_t offset, index_t length) {
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index cfe832ed234a822e7713fc59e7531f8faa3f27e4..47fa3d11387c258e8cc96d55b7a9cca68a94f9e0 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -5,6 +5,7 @@
 #ifndef MACE_CORE_TENSOR_H_
 #define MACE_CORE_TENSOR_H_

+#include "mace/core/runtime/opencl/cl2.hpp"
 #include "mace/core/buffer.h"
 #include "mace/utils/logging.h"
 #include "mace/core/types.h"
@@ -112,10 +113,24 @@ class Tensor {
     return size() * SizeOfType();
   }

-  inline void *buffer() const {
-    MACE_CHECK(buffer_ != nullptr && buffer_->buffer() != nullptr,
-               "buffer is null");
-    return buffer_->buffer();
+  inline bool has_opencl_image() const {
+    return buffer_ != nullptr && !buffer_->OnHost()
+        && typeid(*buffer_) == typeid(Image);
+  }
+
+  inline bool has_opencl_buffer() const {
+    return buffer_ != nullptr && !buffer_->OnHost()
+        && !has_opencl_image();
+  }
+
+  inline cl::Image *opencl_image() const {
+    MACE_CHECK(has_opencl_image(), "do not have image");
+    return static_cast<cl::Image *>(buffer_->buffer());
+  }
+
+  inline cl::Buffer *opencl_buffer() const {
+    MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
+    return static_cast<cl::Buffer *>(buffer_->buffer());
   }

   inline index_t buffer_offset() const {
@@ -152,6 +167,7 @@
   inline void Resize(const std::vector<index_t> &shape) {
     shape_ = shape;
     if (buffer_ != nullptr) {
+      MACE_CHECK(!has_opencl_image(), "Cannot resize image, use ResizeImage.");
       buffer_->Resize(raw_size());
     } else {
       buffer_ = new Buffer(allocator_, raw_size());
@@ -159,20 +175,38 @@
-  inline void ResizeLike(const Tensor &other) {
-    Resize(other.shape());
-  }
-
-  inline void ResizeLike(const Tensor *other) {
-    Resize(other->shape());
-  }
-
   inline void ResizeImage(const std::vector<index_t> &shape,
                           const std::vector<size_t> &image_shape) {
     shape_ = shape;
     if (buffer_ == nullptr) {
       buffer_ = new Image(image_shape, dtype_);
       is_buffer_owner_ = true;
+    } else {
+      MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
+      Image *image = dynamic_cast<Image *>(buffer_);
+      MACE_CHECK(image_shape[0] <= image->image_shape()[0]
+                     && image_shape[1] <= image->image_shape()[1]);
+    }
+  }
+
+  inline void ResizeLike(const Tensor &other) {
+    ResizeLike(&other);
+  }
+
+  inline void ResizeLike(const Tensor *other) {
+    if (other->has_opencl_image()) {
+      if (is_buffer_owner_ && buffer_ != nullptr && !has_opencl_image()) {
+        delete buffer_;
+        buffer_ = nullptr;
+      }
+      ResizeImage(other->shape(),
+                  dynamic_cast<Image *>(other->UnderlyingBuffer())->image_shape());
+    } else {
+      if (is_buffer_owner_ && buffer_ != nullptr && has_opencl_image()) {
+        delete buffer_;
+        buffer_ = nullptr;
+      }
+      Resize(other->shape());
     }
   }
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index dee010875e853ba192cf05b90abfcdf5ee2cb48f..99b8a6bc80bed3d92c8649155b7aca1210cbd0a7 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -60,12 +60,12 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);

     int idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, *(input->opencl_image()));
     if (activation_ == PRELU) {
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(alpha->buffer())));
+      kernel_.setArg(idx++, *(alpha->opencl_image()));
     }
     kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(output->opencl_image()));
   }

   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 3495ddca6f4d097e7ff5252c433dbd3f7e08e2f2..3838808192420e8ade3127932c8db626aba8fbf0 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -58,9 +58,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     uint32_t idx = 0;
     for (auto input : input_tensors) {
       kernel_.setArg(idx++,
-                     *(static_cast<cl::Image2D *>(input->buffer())));
+                     *(input->opencl_image()));
     }
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
+    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
   }

   const uint32_t gws[2] = {
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 7696e875e538b0f06aefb4e30c4032fcba56a538..571bdd533e4051f841cef7efae702645023457f5 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -64,17 +64,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         runtime->BuildKernel("batch_norm", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input->buffer())));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(scale->buffer())));
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, *(scale->opencl_image()));
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(offset->buffer())));
+                   *(offset->opencl_image()));
     if (!folded_constant_) {
       kernel_.setArg(idx++,
-                     *(static_cast<cl::Image2D *>(mean->buffer())));
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(var->buffer())));
+                     *(mean->opencl_image()));
+      kernel_.setArg(idx++, *(var->opencl_image()));
       kernel_.setArg(idx++, epsilon);
     }
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, relux_max_limit_);
   }

diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index 84eff1bfabcaad80d913fdd1aa0a73279883e4ad..c8507433ca804150df5f4d4c3277b52ebdaddd1c 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -35,9 +35,9 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input->buffer())));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(bias->buffer())));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, *(bias->opencl_image()));
+    kernel_.setArg(idx++, *(output->opencl_image()));
   }

   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index bc906163b7913db55065a58b3938316f2c3a490a..19be430f8d47cf6b2ef2c0a2fd28b8856f911a18 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -77,7 +77,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
                                              built_options);

   uint32_t idx = 0;
-  b2f_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(buffer->buffer())));
+  b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
   if (!i2b_) {
     MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
@@ -93,8 +93,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
   }
-  b2f_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(image->buffer())));
-
+  b2f_kernel.setArg(idx++, *(image->opencl_image()));
   const std::vector<uint32_t> lws = {16, 64};
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 686e3a7add8d6c5d5ee73b892941a463aa1753b1..48466e6afaaf908dd8f1fbccbaa49fcf475aa26d 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -42,10 +42,10 @@ static void Concat2(cl::Kernel *kernel,
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(input0->buffer())));
-    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(input1->buffer())));
+    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(input0->opencl_image())));
+    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(input1->opencl_image())));
     kernel->setArg(idx++, static_cast<int>(input0->dim(3)));
-    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->opencl_image())));
   }

   const uint32_t gws[3] = {
@@ -90,9 +90,9 @@ static void ConcatN(cl::Kernel *kernel,
   for (int i = 0; i < inputs_count; ++i) {
     const Tensor *input = input_list[i];
     uint32_t idx = 0;
-    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(input->buffer())));
+    kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, static_cast<int>(chan_blk_offset));
-    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel->setArg(idx++, *(output->opencl_image()));

     index_t input_channel_blk = input->dim(3) / 4;
     chan_blk_offset += input_channel_blk;
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index bee0e12a8826c2cd4d7bbe28ba3e70c9fe42f259..b370b32bebf84d938a7d0f8482ecdaba98e498c5 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -71,15 +71,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,

     uint32_t idx = 0;
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input->buffer())));
+                   *(input->opencl_image()));
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(filter->buffer())));
+                   *(filter->opencl_image()));
     if (bias != nullptr) {
       kernel->setArg(idx++,
-                     *(static_cast<cl::Image2D *>(bias->buffer())));
+                     *(bias->opencl_image()));
     }
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(output->buffer())));
+                   *(output->opencl_image()));
     // FIXME handle flexable data type: half not supported
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<int>(input_height));
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index bb67717791fd05f820ad92f734af545fe2b99e1e..a7eb668ddf093c46112396ad45c4aa32700fea58 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -66,15 +66,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,

     uint32_t idx = 0;
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input->buffer())));
+                   *(input->opencl_image()));
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(filter->buffer())));
+                   *(filter->opencl_image()));
     if (bias != nullptr) {
       kernel->setArg(idx++,
-                     *(static_cast<cl::Image2D *>(bias->buffer())));
+                     *(bias->opencl_image()));
     }
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(output->buffer())));
+                   *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<int>(input->dim(1)));
     kernel->setArg(idx++, static_cast<int>(input->dim(2)));
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index af344c284fe04836d1d2ac23b4014ffdf76ac22b..5f3ffa5e90e291e9ccf4aace429e19b72ee430ce 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -66,15 +66,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,

     uint32_t idx = 0;
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input->buffer())));
+                   *(input->opencl_image()));
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(filter->buffer())));
+                   *(filter->opencl_image()));
     if (bias != nullptr) {
       kernel->setArg(idx++,
-                     *(static_cast<cl::Image2D *>(bias->buffer())));
+                     *(bias->opencl_image()));
     }
     kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(output->buffer())));
+                   *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<int>(input->dim(1)));
     kernel->setArg(idx++, static_cast<int>(input->dim(2)));
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 2942c5d060b9a240c0e9c3aa47cf6e2a82a6fdfd..3bbd4f438ce00567adebd450a4101037dd69a297 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -81,16 +81,15 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel->setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input->buffer())));
+    kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(
-        idx++, *(static_cast<cl::Image2D *>(filter->buffer())));
+        idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
       kernel->setArg(
-          idx++, *(static_cast<cl::Image2D *>(bias->buffer())));
+          idx++, *(bias->opencl_image()));
     }
     kernel->setArg(
-        idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+        idx++, *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<int>(input_height));
     kernel->setArg(idx++, static_cast<int>(input_width));
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index e49a36b242688757b7df33f9bec74746b771b003..8c589c2f16a8ed0c3e030c9e2b6d67ba02975298 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -39,14 +39,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,

     uint32_t idx = 0;
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input0->buffer())));
+                   *(input0->opencl_image()));
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input1->buffer())));
+                   *(input1->opencl_image()));
     if (!coeff_.empty()) {
       kernel_.setArg(idx++, coeff_[0]);
       kernel_.setArg(idx++, coeff_[1]);
     }
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(output->opencl_image()));
   }

   const uint32_t gws[2] = {
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 33e26ecab3a668cacf77ae7a16bcd61f13d87aa9..4a4eacc15d0f5d9fe8d1635483ffaaa35fa37ae1 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -61,15 +61,15 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(

     uint32_t idx = 0;
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(input->buffer())));
+                   *(input->opencl_image()));
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(weight->buffer())));
+                   *(weight->opencl_image()));
     if (bias != nullptr) {
       kernel_.setArg(idx++,
-                     *(static_cast<cl::Image2D *>(bias->buffer())));
+                     *(bias->opencl_image()));
     }
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(output->buffer())));
+                   *(output->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
     kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
     kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index c7f618496392bc3ff63905507ba7cbe416f38d0f..775608537a79107ff8d3d36221ac506f8f8c3b16 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -40,11 +40,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);

     uint32_t idx = 0;
+    kernel_.setArg(idx++, *(A->opencl_image()));
     kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(A->buffer())));
-    kernel_.setArg(idx++,
-                   *(static_cast<cl::Image2D *>(B->buffer())));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(C->buffer())));
+                   *(B->opencl_image()));
+    kernel_.setArg(idx++, *(C->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(height));
     kernel_.setArg(idx++, static_cast<int>(width));
     kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 2ec0e0845982ac32cb041203454342411f846e9f..1272a4fbfe716c9a2cc1b33ff6314c26fbb79630 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -65,7 +65,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
     kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
     kernel_.setArg(idx++, static_cast<int>(out_height));
@@ -73,7 +73,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, paddings[1] / 2);
     kernel_.setArg(idx++, strides_[0]);
     kernel_.setArg(idx++, kernels_[0]);
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(output->opencl_image()));
   }

   const uint32_t gws[3] = {
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index d8f4185ee91259604fb2c3a2e153202556f94695..5761d3cbb1f8b718947d4c3ae96c6f7f57e75d35 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -48,8 +48,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input->buffer())));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, height_scale);
     kernel_.setArg(idx++, width_scale);
     kernel_.setArg(idx++, static_cast<int>(in_height));
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 55a487757ebcc399d61813db5935259454dfd935..a3336aa6f721b51178d5ed136b81fe45c342dda0 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -35,10 +35,10 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);

     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(logits->buffer())));
+    kernel_.setArg(idx++, *(logits->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(channels));
     kernel_.setArg(idx++, remain_channels);
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, *(output->opencl_image()));
   }
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 5940f4d3fdc8996b87765977db3a7120a86abb09..2eb06027a83ed668795329cd525c8de5d7ba2668 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -42,11 +42,11 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor

     uint32_t idx = 0;
     if (b2s_) {
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(batch_tensor->buffer())));
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(space_tensor->buffer())));
+      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
+      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
     } else {
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(space_tensor->buffer())));
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(batch_tensor->buffer())));
+      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
+      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
     }
     kernel_.setArg(idx++, block_shape_[0]);
     kernel_.setArg(idx++, block_shape_[1]);
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 54511220fdc4ce1cec32f8e2a38f0fbf38b35519..8fd17f215e587b302cd7c90763cb655433a62788 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -49,8 +49,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
                                      built_options);

     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input_tensor->buffer())));
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
+    kernel_.setArg(idx++, *(input_tensor->opencl_image()));
+    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(input_tensor->dim(1)));
     kernel_.setArg(idx++, static_cast<int>(input_tensor->dim(2)));
     kernel_.setArg(idx++, static_cast<int>(input_tensor->dim(3)));
@@ -119,11 +119,11 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input_tensor->buffer())));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(input_tensor->opencl_image())));
     if (bias != nullptr) {
-      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(bias->buffer())));
+      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(bias->opencl_image())));
     }
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
     kernel_.setArg(idx++, static_cast<int>(output_shape[1]));
     kernel_.setArg(idx++, static_cast<int>(output_shape[2]));
     kernel_.setArg(idx++, static_cast<int>(round_h * round_w));
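
Reviewer note (illustrative sketch, not part of the patch): the snippet below shows how the new Tensor::has_opencl_image(), opencl_image() and opencl_buffer() accessors introduced in mace/core/tensor.h are intended to replace the raw static_cast on Tensor::buffer() when binding OpenCL kernel arguments. The helper name SetTensorArg and its signature are hypothetical and exist only for this example; it assumes, as the kernels above already do, that cl2.hpp's cl::Kernel::setArg accepts a cl::Image or cl::Buffer passed by reference.

#include "mace/core/runtime/opencl/cl2.hpp"  // cl::Kernel, cl::Image, cl::Buffer
#include "mace/core/tensor.h"                // Tensor with the new accessors

namespace mace {

// Hypothetical helper: bind a tensor to the next kernel argument, letting the
// accessor (which MACE_CHECKs the backing type) select image vs. buffer storage.
inline void SetTensorArg(cl::Kernel *kernel, uint32_t *idx, const Tensor *tensor) {
  if (tensor->has_opencl_image()) {
    // Image-backed GPU tensor: pass the cl::Image object directly.
    kernel->setArg((*idx)++, *(tensor->opencl_image()));
  } else {
    // Otherwise the tensor must be backed by a plain cl::Buffer.
    kernel->setArg((*idx)++, *(tensor->opencl_buffer()));
  }
}

}  // namespace mace

Compared with the old *(static_cast<cl::Image2D *>(x->buffer())) pattern, a mismatched backing type now fails loudly at the MACE_CHECK inside the accessor instead of silently reinterpreting the underlying cl_mem handle.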