Commit 3b55f153 authored by liuqi

Support the OpenCL image2d format and add buffer-to-image OpenCL kernels.

Parent 0f27434c
......@@ -10,6 +10,7 @@
#include "mace/core/common.h"
#include "mace/core/registry.h"
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
......@@ -26,8 +27,14 @@ class Allocator {
Allocator() {}
virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) = 0;
virtual void Delete(void *data) = 0;
virtual void DeleteImage(void *data) = 0;
virtual void *Map(void *buffer, size_t nbytes) = 0;
virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) = 0;
virtual bool OnHost() = 0;
......@@ -58,8 +65,19 @@ class CPUAllocator : public Allocator {
return data;
}
void *NewImage(const std::vector<size_t> &shape,
const DataType dt) override {
return nullptr;
}
void Delete(void *data) override { free(data); }
void DeleteImage(void *data) override { free(data); }
void *Map(void *buffer, size_t nbytes) override { return buffer; }
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override {
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) override {}
bool OnHost() override { return true; }
};
......
This diff is collapsed.
......@@ -51,6 +51,9 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
return false;
}
if (op_stats) {
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
......
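Note on the timing change above: OpenCL kernel enqueues are asynchronous, so per-op timestamps are only meaningful once the command queue has drained, which is why the hunk calls command_queue().finish() before reading the clock. A minimal sketch of the same pattern, assuming the cl2 C++ bindings already used elsewhere in this diff (the TimedRunMicros helper is hypothetical):

```cpp
#include <chrono>
#include <cstdint>
#include <functional>
#include "mace/core/runtime/opencl/cl2_header.h"

// Hypothetical helper: enqueue the work, then block on the queue so the
// wall-clock delta covers kernel execution, not just the enqueue call.
inline int64_t TimedRunMicros(cl::CommandQueue &queue,
                              const std::function<void()> &enqueue_work) {
  const auto start = std::chrono::high_resolution_clock::now();
  enqueue_work();   // e.g. op->Run(), which enqueues NDRange kernels
  queue.finish();   // wait for the device to finish everything queued so far
  const auto end = std::chrono::high_resolution_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
      .count();
}
```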
......@@ -8,6 +8,29 @@
namespace mace {
namespace {
static cl_channel_type DataTypeToCLChannelType(const DataType t) {
switch (t) {
case DT_HALF:
case DT_FLOAT:
return CL_FLOAT;
case DT_INT8:
case DT_INT16:
case DT_INT32:
return CL_SIGNED_INT32;
case DT_UINT8:
case DT_UINT16:
case DT_UINT32:
return CL_UNSIGNED_INT32;
default:
LOG(FATAL) << "Image doesn't support the data type: " << t;
return 0;
}
}
}
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
......@@ -21,6 +44,23 @@ void *OpenCLAllocator::New(size_t nbytes) {
return static_cast<void *>(buffer);
}
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << "Failed to create OpenCL image, error code: " << error;
return cl_image;
}
void OpenCLAllocator::Delete(void *buffer) {
if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
......@@ -28,6 +68,13 @@ void OpenCLAllocator::Delete(void *buffer) {
}
}
void OpenCLAllocator::DeleteImage(void *buffer) {
if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
delete cl_image;
}
}
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
......@@ -40,6 +87,29 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
return mapped_ptr;
}
// TODO: there is something wrong with the half type.
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) {
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch.resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Get()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
&mapped_image_pitch[1],
nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
......
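For orientation, a minimal sketch of how the new image methods of the Allocator interface fit together, based only on the signatures in this diff (the include path and the shape numbers are assumptions; an allocator could come from e.g. GetDeviceAllocator(DeviceType::OPENCL)):

```cpp
#include <vector>
#include "mace/core/allocator.h"  // assumed header of the Allocator interface above

void ImageAllocatorSketch(mace::Allocator *allocator) {
  // [image_width, image_height]; each pixel packs 4 values of the data type.
  std::vector<size_t> image_shape = {64, 32};  // hypothetical sizes
  void *image = allocator->NewImage(image_shape, mace::DT_FLOAT);

  // MapImage reports the mapping's [row_pitch, slice_pitch] in bytes.
  std::vector<size_t> pitch;
  float *host_ptr =
      static_cast<float *>(allocator->MapImage(image, image_shape, pitch));
  // ... read or write host_ptr, honoring pitch[0] bytes per image row ...

  allocator->Unmap(image, host_ptr);
  allocator->DeleteImage(image);
}
```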
......@@ -17,10 +17,25 @@ class OpenCLAllocator : public Allocator {
void *New(size_t nbytes) override;
/*
* image_shape must be 2-D: [image_width, image_height].
* The image is an Image2D with the RGBA (128-bit) channel order, so each
* pixel packs 4 consecutive values of the given data type.
*
* @ image_shape : [image_width, image_height].
*/
void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) override;
void Delete(void *buffer) override;
void DeleteImage(void *buffer) override;
void *Map(void *buffer, size_t nbytes) override;
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override;
void Unmap(void *buffer, void *mapped_ptr) override;
bool OnHost() override;
......
......@@ -120,6 +120,7 @@ const std::map<std::string, std::string>
{"relu", "relu.cl"},
{"resize_bilinear", "resize_bilinear.cl"},
{"space_to_batch", "space_to_batch.cl"},
{"buffer_to_image", "buffer_to_image.cl"},
};
void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
......
......@@ -101,6 +101,18 @@ class OpenCLLibraryImpl final {
const cl_event *,
cl_event *,
cl_int *);
using clEnqueueMapImageFunc = void *(*)(cl_command_queue,
cl_mem,
cl_bool,
cl_map_flags,
const size_t *,
const size_t *,
size_t *,
size_t *,
cl_uint,
const cl_event *,
cl_event *,
cl_int *);
using clCreateCommandQueueWithPropertiesFunc =
cl_command_queue (*)(cl_context /* context */,
cl_device_id /* device */,
......@@ -148,6 +160,11 @@ class OpenCLLibraryImpl final {
size_t,
void *,
size_t *);
using clGetImageInfoFunc = cl_int (*)(cl_mem,
cl_image_info,
size_t,
void *,
size_t *);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
......@@ -172,6 +189,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
DEFINE_FUNC_PTR(clReleaseCommandQueue);
DEFINE_FUNC_PTR(clEnqueueMapBuffer);
DEFINE_FUNC_PTR(clEnqueueMapImage);
DEFINE_FUNC_PTR(clRetainProgram);
DEFINE_FUNC_PTR(clGetProgramBuildInfo);
DEFINE_FUNC_PTR(clEnqueueReadBuffer);
......@@ -191,6 +209,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseDevice);
DEFINE_FUNC_PTR(clRetainEvent);
DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
DEFINE_FUNC_PTR(clGetImageInfo);
#undef DEFINE_FUNC_PTR
......@@ -294,6 +313,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
ASSIGN_FROM_DLSYM(clEnqueueMapImage);
ASSIGN_FROM_DLSYM(clRetainProgram);
ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
......@@ -313,6 +333,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseDevice);
ASSIGN_FROM_DLSYM(clRetainEvent);
ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
ASSIGN_FROM_DLSYM(clGetImageInfo);
#undef ASSIGN_FROM_DLSYM
......@@ -577,6 +598,31 @@ void *clEnqueueMapBuffer(cl_command_queue command_queue,
return nullptr;
}
}
void *clEnqueueMapImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t origin[3],
const size_t region[3],
size_t *image_row_pitch,
size_t *image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueMapImage;
if (func != nullptr) {
return func(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
}
cl_command_queue clCreateCommandQueueWithProperties(
cl_context context,
cl_device_id device,
......@@ -832,3 +878,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
return CL_OUT_OF_RESOURCES;
}
}
cl_int clGetImageInfo(cl_mem image,
cl_image_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetImageInfo;
if (func != nullptr) {
return func(image, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
}
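The additions above follow the wrapper's single pattern: each OpenCL entry point is a dlsym-resolved function pointer, and the exported symbol either forwards to it or fails gracefully with CL_OUT_OF_RESOURCES when the vendor library is unavailable. A stripped-down sketch of that pattern for one function (the names here are illustrative, not MACE's):

```cpp
#include <dlfcn.h>
#include "mace/core/runtime/opencl/cl2_header.h"  // for cl_mem, cl_int, CL_OUT_OF_RESOURCES

// Function-pointer type matching the real clReleaseMemObject signature.
using clReleaseMemObjectFunc = cl_int (*)(cl_mem);

struct OpenCLStub {
  clReleaseMemObjectFunc clReleaseMemObject = nullptr;

  explicit OpenCLStub(const char *vendor_lib_path) {
    if (void *handle = dlopen(vendor_lib_path, RTLD_LAZY | RTLD_LOCAL)) {
      clReleaseMemObject = reinterpret_cast<clReleaseMemObjectFunc>(
          dlsym(handle, "clReleaseMemObject"));
    }
  }
};

// Forward if the vendor library was found, otherwise report failure
// instead of crashing, mirroring the fallback branches in this diff.
cl_int ReleaseMemObject(const OpenCLStub &stub, cl_mem mem) {
  return stub.clReleaseMemObject != nullptr ? stub.clReleaseMemObject(mem)
                                            : CL_OUT_OF_RESOURCES;
}
```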
......@@ -23,6 +23,7 @@ namespace mace {
#define CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, INVALID, DEFAULT) \
switch (TYPE_ENUM) { \
CASE(half, SINGLE_ARG(STMTS)) \
CASE(float, SINGLE_ARG(STMTS)) \
CASE(double, SINGLE_ARG(STMTS)) \
CASE(int32_t, SINGLE_ARG(STMTS)) \
......@@ -68,20 +69,26 @@ class Tensor {
size_(0),
dtype_(DT_FLOAT),
buffer_(nullptr),
data_(nullptr){};
data_(nullptr),
is_image_(false){};
Tensor(Allocator *alloc, DataType type)
: alloc_(alloc),
size_(0),
dtype_(type),
buffer_(nullptr),
data_(nullptr){};
data_(nullptr),
is_image_(false){};
~Tensor() {
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before destroy");
if (buffer_ != nullptr) {
MACE_CHECK_NOTNULL(alloc_);
alloc_->Delete(buffer_);
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
}
}
......@@ -91,6 +98,10 @@ class Tensor {
inline const vector<index_t> &shape() const { return shape_; }
inline const vector<size_t> &image_shape() const { return image_shape_; }
inline const bool is_image() const { return is_image_; }
inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const {
......@@ -120,6 +131,11 @@ class Tensor {
}
}
inline void MapImage(std::vector<size_t> &mapped_image_pitch) const {
MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->MapImage(buffer_, image_shape_, mapped_image_pitch);
}
/*
* Unmap the device buffer
*/
......@@ -162,17 +178,53 @@ class Tensor {
inline void Resize(const vector<index_t> &shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size) {
if (size_ != size || is_image_) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
alloc_->Delete(buffer_);
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
is_image_ = false;
CASES(dtype_, buffer_ = alloc_->New(size_ * sizeof(T)));
}
}
inline void ResizeLike(const Tensor &other) { Resize(other.shape()); }
inline void ResizeImage(const vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size || !is_image_) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
is_image_ = true;
image_shape_ = image_shape;
buffer_ = alloc_->NewImage(image_shape, dtype_);
}
}
inline void ResizeLike(const Tensor &other) {
if (other.is_image()) {
ResizeImage(other.shape(), other.image_shape());
} else {
Resize(other.shape());
}
}
inline void ResizeLike(const Tensor *other) { Resize(other->shape()); }
inline void ResizeLike(const Tensor *other) {
if (other->is_image()) {
ResizeImage(other->shape(), other->image_shape());
} else {
Resize(other->shape());
}
}
template <typename T>
inline void Copy(const T *src, index_t size) {
......@@ -202,8 +254,6 @@ class Tensor {
for (int i : shape_) {
os << i << ", ";
}
LOG(INFO) << "Tensor shape: " << os.str()
<< " type: " << DataType_Name(dtype_);
os.str("");
os.clear();
......@@ -212,7 +262,7 @@ class Tensor {
if ( i != 0 && i % shape_[3] == 0) {
os << "\n";
}
CASES(dtype_, (os << this->data<T>()[i]) << ", ");
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
}
LOG(INFO) << os.str();
}
......@@ -228,20 +278,33 @@ class Tensor {
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
CopyBytes(other.raw_data(), size_ * SizeOfType());
if (is_image_) {
LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
} else {
CopyBytes(other.raw_data(), size_ * SizeOfType());
}
}
class MappingGuard {
public:
MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) tensor_->Map();
if (tensor_ != nullptr) {
if (tensor_->is_image()) {
tensor_->MapImage(mapped_image_pitch_);
} else {
tensor_->Map();
}
}
}
~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap();
}
inline const vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
private:
const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_;
};
private:
......@@ -261,6 +324,9 @@ class Tensor {
// Mapped buffer
mutable void *data_;
vector<index_t> shape_;
// Image for opencl
bool is_image_;
std::vector<size_t> image_shape_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
......
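A short sketch of how the image-aware MappingGuard above is meant to be used: when the tensor is an image, the mapped pointer is pitched, so rows must be addressed through mapped_image_pitch() rather than assumed contiguous. This is a reading of the Tensor additions in this hunk; the data<float>() accessor and the 4-floats-per-pixel assumption come from the rest of the diff.

```cpp
#include <cstring>
#include "mace/core/tensor.h"

// Copy a mapped image tensor row by row into a contiguous host buffer.
// Assumes `tensor` was set up with ResizeImage() on the OpenCL allocator.
void ReadImageTensor(const mace::Tensor &tensor, float *dst) {
  mace::Tensor::MappingGuard guard(&tensor);
  const auto &pitch = guard.mapped_image_pitch();   // [row, slice] in bytes
  const auto &image_shape = tensor.image_shape();   // [width, height]

  const char *src = reinterpret_cast<const char *>(tensor.data<float>());
  const size_t row_bytes = image_shape[0] * 4 * sizeof(float);  // 4 floats/pixel
  for (size_t y = 0; y < image_shape[1]; ++y) {
    std::memcpy(dst + y * image_shape[0] * 4, src + y * pitch[0], row_bytes);
  }
}
```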
......@@ -24,31 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
}
}
std::string DataTypeToCLType(const DataType dt) {
size_t GetEnumTypeSize(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
return sizeof(float);
case DT_HALF:
return "half";
return sizeof(half);
case DT_UINT8:
return "uchar";
return sizeof(uint8_t);
case DT_INT8:
return "char";
return sizeof(int8_t);
case DT_DOUBLE:
return "double";
return sizeof(double);
case DT_INT32:
return "int";
return sizeof(int32_t);
case DT_UINT32:
return "int";
return sizeof(uint32_t);
case DT_UINT16:
return "ushort";
return sizeof(uint16_t);
case DT_INT16:
return "short";
return sizeof(int16_t);
case DT_INT64:
return "long";
return sizeof(int64_t);
default:
LOG(FATAL) << "Unsupported data type";
return "";
return 0;
}
}
......
......@@ -7,12 +7,16 @@
#include "mace/core/common.h"
#include "mace/proto/mace.pb.h"
#include "mace/core/half.h"
namespace mace {
using half = half_float::half;
bool DataTypeCanUseMemcpy(DataType dt);
std::string DataTypeToCLType(const DataType dt);
size_t GetEnumTypeSize(const DataType dt);
template <class T>
struct IsValidDataType;
......@@ -43,6 +47,7 @@ struct EnumToDataType {}; // Specializations below
typedef TYPE Type; \
}
MATCH_TYPE_AND_ENUM(half, DT_HALF);
MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
MATCH_TYPE_AND_ENUM(double, DT_DOUBLE);
MATCH_TYPE_AND_ENUM(int32_t, DT_INT32);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
bool i2b_;
};
template<DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output) {
MACE_NOT_IMPLEMENTED;
}
};
template<typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
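The transform direction is carried by the i2b_ flag from the base struct: false turns a buffer into an image (resizing the image tensor from CalImage2DShape), true goes the other way around. A schematic of both directions, matching how the BufferToImage and ImageToBuffer ops later in this diff invoke the functor (float instantiation shown; the wrapper functions are illustrative):

```cpp
#include "mace/kernels/buffer_to_image.h"

// buffer -> image: the functor computes the image shape and resizes `image`.
void ToImage(mace::Tensor *buffer, mace::Tensor *image) {
  mace::kernels::BufferToImageFunctor<mace::DeviceType::OPENCL, float> b2i;
  b2i(buffer, mace::kernels::FILTER, image);
}

// image -> buffer: construct with i2b = true; the buffer tensor is still the
// first argument and the image the last, exactly as ImageToBufferOp calls it.
void ToBuffer(mace::Tensor *image, mace::Tensor *buffer) {
  mace::kernels::BufferToImageFunctor<mace::DeviceType::OPENCL, float> i2b(true);
  i2b(buffer, mace::kernels::FILTER, image);
}
```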
......@@ -15,9 +15,9 @@ template <DeviceType D, typename T>
struct Conv2dFunctor {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides,
const std::vector<int> &paddings,
const Padding &paddings,
const int *dilations)
: strides_(strides), paddings_(paddings), dilations_(dilations) {}
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
void operator()(const Tensor *input,
const Tensor *filter,
......@@ -27,6 +27,13 @@ struct Conv2dFunctor {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
output->Resize(output_shape);
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
......@@ -49,10 +56,10 @@ struct Conv2dFunctor {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
// The left-upper most offset of the padded input
int padded_h_start = 0 - paddings_[0] / 2;
int padded_w_start = 0 - paddings_[1] / 2;
index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2;
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
index_t kernel_size = input_channels * kernel_h * kernel_w;
......@@ -108,8 +115,8 @@ struct Conv2dFunctor {
}
const int *strides_; // [stride_h, stride_w]
std::vector<int> paddings_; // [padding_h, padding_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template <>
......
......@@ -49,6 +49,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
......@@ -77,8 +85,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor padded_input;
// Keep this alive during kernel execution
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
if (paddings[0] > 0 || paddings[1] > 0) {
ConstructInputWithPadding(input, paddings.data(), &padded_input);
input = &padded_input;
}
Tensor::MappingGuard input_mapper(input);
......
......@@ -4,6 +4,7 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/buffer_to_image.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
image->ResizeImage(buffer->shape(), image_shape);
} else {
image_shape = image->image_shape();
buffer->Resize(image->shape());
}
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOpenclCMDDataType(image->dtype()));
auto runtime = OpenCLRuntime::Get();
string kernel_name;
switch (type) {
case FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break;
case IN_OUT:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image";
break;
}
VLOG(0) << kernel_name;
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
kernel_name,
built_options);
uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
b2f_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(image->buffer())));
const size_t gws[3] = {image_shape[0],
image_shape[1],
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {kwg_size, 1, 1};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
template struct BufferToImageFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
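Note on the launch configuration above: the global range is exactly the 2-D image shape computed by CalImage2DShape (with a trailing 1), so each work item owns one RGBA pixel, with get_global_id(0) as the image x coordinate and get_global_id(1) as y. That is why the kernels below never bounds-check the image side and only guard the buffer tail, where fewer than four values remain to be stored.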
#include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, oc */
__private const int filter_w,
__private const int in_channel,
__private const int out_channel,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc */
__private const int filter_w,
__private const int in_channel,
__private const int out_channel,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (out_channel_idx + 4 > out_channel) {
const int diff = out_channel - out_channel_idx;
output[offset] = values.s0;
if (diff >= 2) {
output[offset+1] = values.s1;
}
if (diff >= 3) {
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (channel_idx + 4 > channels) {
const int diff = channels - channel_idx;
output[offset] = values.s0;
if (diff >= 2) {
output[offset+1] = values.s1;
}
if (diff >= 3) {
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* rank-1 argument */
__private const int count,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* rank-1 argument */
__private const int count,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (offset + 4 > count) {
const int diff = count - offset;
output[offset] = values.s0;
if (diff >= 2) {
output[offset+1] = values.s1;
}
if (diff >= 3) {
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
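The index arithmetic in these kernels is the whole contract between the HWIO filter buffer and its image layout [H*W*Ic, ceil(Oc/4)]: the image column enumerates (h, w, ic) in row-major order with ic fastest, the image row selects a block of four output channels, and the four RGBA lanes of a pixel are consecutive output channels. A host-side sketch of the same mapping, useful for spot-checking a single coordinate (the function name is illustrative):

```cpp
#include <cstddef>

// Buffer offset of the first element packed into image pixel (w, h) for an
// HWIO filter of shape [filter_h, filter_w, in_channel, out_channel].
// Mirrors the offset computation in filter_buffer_to_image above.
size_t FilterImagePixelToBufferOffset(int w, int h, int filter_w,
                                      int in_channel, int out_channel) {
  const int out_channel_idx = h * 4;       // 4 output channels per pixel
  const int hw_idx = w / in_channel;       // which (h_idx, w_idx) pair
  const int in_channel_idx = w % in_channel;
  const int h_idx = hw_idx / filter_w;
  const int w_idx = hw_idx % filter_w;
  return static_cast<size_t>(
      ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel +
      out_channel_idx);
}
```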
......@@ -11,4 +11,7 @@
#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
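These token-pasting macros are what let a single kernel source serve both float and half builds: with -DDATA_TYPE=float and -DCMD_DATA_TYPE=f (as emitted by DataTypeToCLType and DataTypeToOpenclCMDDataType elsewhere in this diff), VEC_DATA_TYPE(DATA_TYPE, 4) expands to float4 and CMD_TYPE(read_image, CMD_DATA_TYPE) to read_imagef; with -DDATA_TYPE=half and -DCMD_DATA_TYPE=h they expand to half4 and read_imageh.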
......@@ -165,47 +165,85 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
}
}
/* FIXME this is incomplete */
__kernel void conv_2d_1x1_v3(__read_only image3d_t input, /* n, c/4, h, w, 4 */
__global const float *filter, /* o, i, kh, kw */
__global const float *bias, /* o */
__write_only image3d_t output, /* n, c/4, h, w, 4 */
__private const int batch_num,
__private const int in_chan_num,
__private const int out_chan_num,
__private const int height,
__private const int width) {
int out_chan_blk = get_global_id(0);
int h = get_global_id(1);
int w = get_global_id(2);
// TODO : validation
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
__read_only image2d_t bias, /* cout%4 * cout/4 */
__write_only image2d_t output,
__private const int in_ch_blks,
__private const int width) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int in_chan_blk_num = (in_chan_num + 3) / 4;
int out_chan_blk_num = (out_chan_num + 3) / 4;
half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);
// Unrolling this loop hurts performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_x0 = in_x_base + out_w_blk;
const int in_x1 = in_x0 + out_w_blks;
const int in_x2 = in_x1 + out_w_blks;
const int in_x3 = in_x2 + out_w_blks;
in_x_base += width;
const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));
// The order matters: load the input first, then the filter. Why?
const int filter_x0 = in_ch_blk << 2;
const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
// Will prefetching into L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out1 += in1.x * weights0;
out2 += in2.x * weights0;
out3 += in3.x * weights0;
out0 += in0.y * weights1;
out1 += in1.y * weights1;
out2 += in2.y * weights1;
out3 += in3.y * weights1;
out0 += in0.z * weights2;
out1 += in1.z * weights2;
out2 += in2.z * weights2;
out3 += in3.z * weights2;
out0 += in0.w * weights3;
out1 += in1.w * weights3;
out2 += in2.w * weights3;
out3 += in3.w * weights3;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int out_x_offset = out_ch_blk * width;
const int w0 = out_w_blk;
write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);
for (int batch = 0; batch < batch_num; ++batch) {
float4 bias_value = vload4(out_chan_blk, bias);
__private float4 out = bias_value;
for (int in_chan_blk = 0; in_chan_blk < in_chan_blk_num; ++in_chan_blk) {
int in_d = batch * in_chan_blk_num + in_chan_blk;
float4 in = read_imagef(input, sampler, (int4)(in_d, h, w, 0));
const float *filter_base = filter + (out_chan_blk << 2) * in_chan_num;
float4 weights = vload4(in_chan_blk, filter_base);
out.x += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num);
out.y += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 2);
out.z += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 3);
out.w += dot(in, weights);
}
const int w1 = w0 + out_w_blks;
if (w1 >= width) return;
write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);
int out_d = batch * out_chan_blk_num + out_chan_blk;
int4 out_coord = (int4)(out_d, h, w, 0);
write_imagef(output, out_coord, out);
}
const int w2 = w1 + out_w_blks;
if (w2 >= width) return;
write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);
const int w3 = w2 + out_w_blks;
if (w3 >= width) return;
write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
}
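The kernel's work decomposition follows directly from the IN_OUT image layout [ceil(C/4)*W, N*H]: global dimension 0 walks output-channel blocks, dimension 1 walks groups of four output widths (each item writes out0..out3 at stride get_global_size(1)), and dimension 2 walks the fused N*H rows. A sketch of the global work size this indexing implies, derived from the kernel alone (the host-side NDRange later in this diff is hard-coded and may assume particular shapes):

```cpp
#include <array>
#include <cstddef>

// Global work size suggested by conv_2d_1x1's get_global_id usage,
// for an output tensor of shape [batch, out_channels, height, width].
std::array<size_t, 3> Conv1x1GlobalSize(size_t batch, size_t out_channels,
                                        size_t height, size_t width) {
  const size_t channel_blocks = (out_channels + 3) / 4;  // RGBA lanes per pixel
  const size_t width_blocks = (width + 3) / 4;           // 4 widths per work item
  return {channel_blocks, width_blocks, batch * height}; // dims 0, 1, 2
}
```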
......@@ -3,6 +3,7 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......@@ -47,10 +48,25 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
if (paddings_[0] > 0 || paddings_[1] > 0) {
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
ConstructInputWithPadding(input, paddings.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
} else {
conv2d_func(input, filter, bias, output);
......
......@@ -3,45 +3,14 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
void Conv1x1Naive(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int,
int>(program, "conv_2d_1x1_naive");
const index_t pixels = height * width;
cl_int error;
conv_2d(cl::EnqueueArgs(
runtime->command_queue(),
cl::NDRange(static_cast<int>(batch), static_cast<int>(channels),
static_cast<int>(pixels)),
cl::NDRange(1, 1, 128)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels), error);
MACE_CHECK(error == CL_SUCCESS);
}
void Conv1x1V2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -98,85 +67,44 @@ void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t pixels = height * width;
const index_t pixel_blocks = (pixels + 3) / 4;
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace("-DSTRIDE_1");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
const index_t channel_blocks = (channels + 3) / 4;
const index_t input_channel_blocks = (input_channels + 3) / 4;
// FIXME temp hacking
static std::map<std::ptrdiff_t, cl::Image3D> input_image_map;
static std::map<std::ptrdiff_t, cl::Image3D> output_image_map;
cl::Image3D input_image;
cl::Image3D output_image;
auto input_iter =
input_image_map.find(reinterpret_cast<std::ptrdiff_t>(input->buffer()));
if (input_iter != input_image_map.end()) {
input_image = input_iter->second;
} else {
// The batch dimension is collapsed with channel
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * input_channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
input_image = image;
input_image_map.clear();
input_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(input->buffer()),
image);
}
auto output_iter =
output_image_map.find(reinterpret_cast<std::ptrdiff_t>(output->buffer()));
if (output_iter != output_image_map.end()) {
output_image = output_iter->second;
} else {
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
output_image = image;
output_image_map.clear();
output_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(output->buffer()),
image);
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v3");
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, input_image);
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, output_image);
conv_2d_kernel.setArg(idx++, static_cast<int>(batch));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
static_cast<int>(width)),
cl::NDRange(1, 2, kwg_size / 2));
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height),
static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8));
MACE_CHECK(error == CL_SUCCESS, error);
}
......
......@@ -5,6 +5,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -5,6 +5,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
// [(c+3)/4*W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
image_shape[1] = shape[0] * shape[1];
}
// [H * W * Ic, (Oc + 3) / 4]
void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[2];
image_shape[1] = RoundUpDiv4(shape.back());
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[0]);
image_shape[1] = 1;
}
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape) {
switch (type) {
case FILTER:
CalFilterImageShape(shape, image_shape);
break;
case IN_OUT:
CalInOutputImageShape(shape, image_shape);
break;
case ARGUMENT:
CalArgImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
std::string DataTypeToCLType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
case DT_HALF:
return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DataTypeToOpenclCMDDataType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "f";
case DT_HALF:
return "h";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels
} // namespace mace
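As a concrete check of these layouts, using shapes from the tests later in this diff: an NHWC input of {3, 13, 17, 128} becomes an IN_OUT image of [ceil(128/4)*17, 3*13] = [544, 39]; an HWIO filter of {3, 3, 13, 17} becomes a FILTER image of [3*3*13, ceil(17/4)] = [117, 5]; and an argument vector of {11} becomes an ARGUMENT image of [ceil(11/4), 1] = [3, 1]. The divide-by-4 everywhere comes from each RGBA pixel packing four values.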
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
enum BufferType {
FILTER = 0,
IN_OUT = 1,
ARGUMENT = 2
};
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape);
std::string DataTypeToOpenclCMDDataType(const DataType dt);
std::string DataTypeToCLType(const DataType dt);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
......@@ -5,6 +5,7 @@
#include "mace/kernels/pooling.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -5,6 +5,7 @@
#include "mace/kernels/relu.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -5,6 +5,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -8,6 +8,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/buffer_to_image.h"
namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_
#define MACE_OPS_BUFFER_TO_IMAGE_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
namespace mace {
template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output);
return true;
}
private:
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/opencl/helper.h"
using namespace mace;
template<DeviceType D, typename T>
void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-5);
}
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 3, 5});
}
TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 13, 17});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 128, 512});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 3, 5});
}
TEST(BufferToImageTest, Filter3x3Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 13, 17});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
}
......@@ -17,9 +17,9 @@ template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> {
public:
Conv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws) {
functor_.strides_ = this->strides_.data();
functor_.dilations_ = this->dilations_.data();
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
......@@ -28,15 +28,6 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output);
return true;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/image_to_buffer.h"
namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_
#define MACE_OPS_IMAGE_TO_BUFFER_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
namespace mace {
template <DeviceType D, typename T>
class ImageToBufferOp: public Operator<D, T> {
public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor));
return true;
}
private:
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_
......@@ -12,6 +12,7 @@
#include "mace/core/net.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
......@@ -106,6 +107,32 @@ class OpsTestNet {
memcpy(input_data, data.data(), data.size() * sizeof(T));
}
template <DeviceType D, typename T>
void AddInputImageFromArray(const std::string &name,
const std::vector<index_t> &shape,
const std::vector<T> &data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
std::vector<size_t> image_shape;
input->ResizeImage(shape, image_shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
const T *data_ptr = data.data();
const int type_size = sizeof(T);
const int row_pitch = shape[3];
auto mapped_image_pitch = input_mapper.mapped_image_pitch();
const size_t slice_size = mapped_image_pitch[1] / sizeof(T);
for (int c = 0; c < shape[0] * shape[1]; ++c) {
T *input_ptr = input_data + c * slice_size;
for (int h = 0; h < shape[2]; ++h) {
memcpy(input_ptr, data_ptr, row_pitch * type_size);
input_ptr += mapped_image_pitch[0] / sizeof(T);
data_ptr += row_pitch;
}
}
}
template <DeviceType D, typename T>
void AddRepeatedInput(const std::string &name,
const std::vector<index_t> &shape,
......@@ -126,19 +153,26 @@ class OpsTestNet {
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
float *input_data = input->mutable_data<T>();
T *input_data = input->mutable_data<T>();
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1);
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
});
std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return half_float::half_cast<half>(positive ? std::abs(nd(gen)) : nd(gen));
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
});
}
}
OperatorDef *NewOperatorDef() {
op_defs_.clear();
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
......@@ -258,7 +292,8 @@ inline std::string ShapeToString(const Tensor &x) {
template <typename T>
struct is_floating_point_type {
static const bool value =
std::is_same<T, float>::value || std::is_same<T, double>::value;
std::is_same<T, float>::value || std::is_same<T, double>::value
|| std::is_same<T, half>::value;
};
template <typename T>
......@@ -314,6 +349,7 @@ struct Expector<T, true> {
<< " index = " << i;
}
}
};
template <typename T>
......@@ -330,6 +366,7 @@ std::string ToString(const T &input) {
return ss.str();
}
} // namespace mace
#endif // MACE_OPS_TEST_UTIL_H_
......@@ -13,6 +13,17 @@ Integer RoundUp(Integer i, Integer factor) {
return (i + factor - 1) / factor * factor;
}
template <typename Integer, uint32_t factor>
Integer RoundUpDiv(Integer i) {
return (i + factor - 1) / factor;
}
// Partial specialization of function templates is not allowed
template <typename Integer>
Integer RoundUpDiv4(Integer i) {
return (i + 3) >> 2;
}
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
return (a + b - 1) / b;
......
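RoundUpDiv4 is the ceil-divide that all of the image-shape math above relies on: RoundUpDiv4(128) == 32, RoundUpDiv4(17) == 5, and RoundUpDiv4(11) == 3, matching the pixel counts in the worked layout example earlier. The shift form (i + 3) >> 2 is equivalent to (i + 3) / 4 for non-negative integers.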