Commit bfbe1a30 authored by 李寅

Merge branch 'unify-cpu-gpu' into 'master'

Unify cpu gpu

See merge request !877
......@@ -69,9 +69,9 @@ in one deployment file.
- The output tensor name(s) (TensorFlow) or the top name(s) of the outputs' layer (Caffe).
If there is more than one tensor, use one line per tensor.
* - input_shapes
- The shapes of the input tensors, in NHWC order.
- The shapes of the input tensors; NHWC order by default.
* - output_shapes
- The shapes of the output tensors, in NHWC order.
- The shapes of the output tensors; NHWC order by default.
* - input_ranges
- The numerical range of the input tensors' data, default [-1, 1]. It is used only for testing.
* - validation_inputs_data
......@@ -84,6 +84,10 @@ in one deployment file.
- [optional] The data type used for the specified runtime. One of [fp16_fp32, fp32_fp32] for GPU (default fp16_fp32), [fp32] for CPU, and [uint8] for DSP.
* - input_data_types
- [optional] The input data type for specific ops (e.g. Gather); one of [int32, float32], defaulting to float32.
* - input_data_formats
- [optional] The data formats of the input tensors, each one of [NONE, NHWC]. Use NONE when an input has no data format (e.g. a non-4D tensor). If only a single format is specified, all inputs use that format; the default is NHWC.
* - output_data_formats
- [optional] The data formats of the output tensors, each one of [NONE, NHWC]. Use NONE when an output has no data format. If only a single format is specified, all outputs use that format; the default is NHWC.
* - limit_opencl_kernel_time
- [optional] Whether to split OpenCL kernels so each launch stays within 1 ms, keeping the UI responsive; default is 0.
* - obfuscate
......
# one yaml config file can contain multi device info
devices:
# The name of the device
nanopi:
# arm64 or armhf
target_abis: [arm64, armhf]
# device SoC; you can get it from the device manual
target_socs: RK3399
# device model full name
models: FriendlyElec Nanopi M4
# device ip address
address: 10.0.0.0
# login username
username: user
# login password; not required if you can log in to the device without a password
password: 1234567
raspberry:
target_abis: [armv7l]
target_socs: BCM2837
models: Raspberry Pi 3 Model B Plus Rev 1.3
address: 10.0.0.1
username: user
password: 123456
......@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
bool IsQuantizedModel(const NetDef &net_def) {
return
ProtoArgHelper::GetOptionalArg<NetDef, int>(net_def, "quantize_flag", 0)
== 1;
}
} // namespace mace
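For context, a quantized model is simply a NetDef whose "quantize_flag" argument equals 1. A minimal sketch of how a caller might branch on it (the helper name below is hypothetical, not part of this change):

// Hypothetical helper: pick a default tensor data type
// based on the model's quantize flag.
DataType DefaultDataTypeFor(const NetDef &net_def) {
  return IsQuantizedModel(net_def) ? DT_UINT8 : DT_FLOAT;
}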
......@@ -55,6 +55,8 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_;
};
bool IsQuantizedModel(const NetDef &def);
} // namespace mace
#endif // MACE_CORE_ARG_HELPER_H_
......@@ -233,6 +233,11 @@ class Image : public BufferBase {
}
}
inline DataType dtype() const {
MACE_CHECK_NOTNULL(buf_);
return data_type_;
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
......
......@@ -34,7 +34,7 @@ class Device {
#ifdef MACE_ENABLE_OPENCL
virtual OpenCLRuntime *opencl_runtime() = 0;
#endif
#endif // MACE_ENABLE_OPENCL
virtual CPURuntime *cpu_runtime() = 0;
virtual Allocator *allocator() = 0;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/memory_optimizer.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_set>
#include "mace/core/arg_helper.h"
#include "mace/core/macros.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
static const std::unordered_set<std::string> kReuseOp = {
"Reshape", "Identity", "Squeeze", "ExpandDims"
};
return kReuseOp.count(op_type) == 1;
}
void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) {
if (tensor_ref_count_.count(tensor_name) == 0) {
tensor_ref_count_.emplace(tensor_name, 1);
} else {
tensor_ref_count_[tensor_name] += 1;
}
}
void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) {
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (tensor_ref_count_.count(op_def->input(i)) == 1) {
tensor_ref_count_[op_def->input(i)] += 1;
}
}
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (tensor_ref_count_.count(op_def->output(i)) == 0) {
tensor_ref_count_.emplace(op_def->output(i), 0);
}
}
}
MemoryBlock MemoryOptimizer::CreateMemoryBlock(
std::vector<int64_t> shape,
DataType dt,
mace::MemoryType mem_type) {
MemoryBlock block;
#ifdef MACE_ENABLE_OPENCL
if (mem_type == MemoryType::GPU_IMAGE) {
std::vector<size_t> image_shape;
if (shape.size() == 2) {
shape = {shape[0], 1, 1, shape[1]};
} else {
MACE_CHECK(shape.size() == 4) << "GPU only supports 2D/4D input";
}
OpenCLUtil::CalImage2DShape(shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
block.set_x(image_shape[0]);
block.set_y(image_shape[1]);
return block;
}
#endif // MACE_ENABLE_OPENCL
MACE_UNUSED(mem_type);
int64_t op_mem_size = std::accumulate(shape.begin(),
shape.end(),
GetEnumTypeSize(dt),
std::multiplies<int64_t>());
block.set_x(op_mem_size);
block.set_y(1);
return block;
}
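To make the two branches concrete, here is a worked example with assumed values; the arithmetic follows directly from the code above.

// Illustrative only: shape = {1, 8, 8, 32}, dt = DT_HALF (2 bytes).
// GPU_IMAGE: CalImage2DShape(IN_OUT_CHANNEL) yields
//   x = RoundUpDiv4(32) * 8 = 64, y = 1 * 8 = 8  -> a 64x8 image.
// CPU/GPU buffer: x = 2 * 1 * 8 * 8 * 32 = 4096 bytes, y = 1.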
void MemoryOptimizer::Optimize(
const mace::OperatorDef *op_def,
const std::unordered_map<std::string, MemoryType> &mem_types) {
MACE_LATENCY_LOGGER(2, "Optimize memory");
if (op_def->output_size() != op_def->output_shape_size()) {
VLOG(1) << op_def->name()
<< ": the number of output shape "
<< "is not equal to the number of output";
return;
}
auto device = static_cast<DeviceType>(op_def->device_type());
DataType op_dtype = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
*op_def,
"T",
static_cast<int>(DT_FLOAT)));
MACE_CHECK(
op_def->output_type_size() == 0 ||
op_def->output_size() == op_def->output_type_size(),
"operator output size != operator output type size",
op_def->output_size(),
op_def->output_type_size());
DataType dt;
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) {
dt = op_def->output_type(i);
} else {
dt = op_dtype;
}
int best_mem_id = -1;
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (device == DeviceType::GPU) {
mem_type = mem_types.at(op_def->output(i));
}
auto shape = std::vector<int64_t>(
op_def->output_shape(i).dims().begin(),
op_def->output_shape(i).dims().end());
MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type);
MemoryBlock best_mem_block;
if (IsMemoryReuseOp(op_def->type())) {
if (tensor_mem_map_.count(op_def->input(0)) == 1) {
best_mem_id = tensor_mem_map_[op_def->input(0)].first;
}
} else {
auto shape = std::vector<int64_t>(
op_def->output_shape(i).dims().begin(),
op_def->output_shape(i).dims().end());
int64_t op_mem_size = op_mem_block.x() * op_mem_block.y();
int64_t best_added_mem_size = LLONG_MAX;
int64_t best_wasted_mem_size = LLONG_MAX;
int64_t old_mem_size = 0, new_mem_size = 0;
MemoryBlock new_mem_block;
for (auto idle_mem_id : idle_blocks_) {
if (mem_blocks_[idle_mem_id].mem_type() == mem_type) {
if (mem_type == MemoryType::GPU_IMAGE) {
// GPU Image could reuse memory with same data type only
if (mem_blocks_[idle_mem_id].data_type() != dt) {
continue;
}
old_mem_size =
mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y();
new_mem_block.set_x(std::max<int64_t>(mem_blocks_[idle_mem_id].x(),
op_mem_block.x()));
new_mem_block.set_y(std::max<int64_t>(mem_blocks_[idle_mem_id].y(),
op_mem_block.y()));
new_mem_size = new_mem_block.x() * new_mem_block.y();
} else {
old_mem_size = mem_blocks_[idle_mem_id].x();
new_mem_size = std::max(op_mem_size, old_mem_size);
new_mem_block.set_x(new_mem_size);
}
int64_t added_mem_size = new_mem_size - old_mem_size;
int64_t wasted_mem_size = new_mem_size - op_mem_size;
// minimize added_mem_size; if best_added_mem_size is 0,
// then minimize wasted_mem_size
if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size)
|| (best_added_mem_size == 0 &&
wasted_mem_size < best_wasted_mem_size)) {
best_mem_id = idle_mem_id;
best_added_mem_size = added_mem_size;
best_wasted_mem_size = wasted_mem_size;
best_mem_block = new_mem_block;
}
}
}
if (best_added_mem_size <= op_mem_size) {
best_mem_block.set_mem_id(best_mem_id);
best_mem_block.set_data_type(dt);
best_mem_block.set_mem_type(mem_type);
mem_blocks_[best_mem_id] = best_mem_block;
idle_blocks_.erase(best_mem_id);
} else {
best_mem_id = static_cast<int>(mem_blocks_.size());
best_mem_block.set_mem_id(best_mem_id);
best_mem_block.set_data_type(dt);
best_mem_block.set_mem_type(mem_type);
best_mem_block.set_x(op_mem_block.x());
best_mem_block.set_y(op_mem_block.y());
mem_blocks_.push_back(best_mem_block);
}
}
if (best_mem_id != -1) {
if (mem_ref_count_.count(best_mem_id) == 1) {
mem_ref_count_[best_mem_id] += 1;
} else {
mem_ref_count_[best_mem_id] = 1;
}
tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
}
}
// release references to the input tensors
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
auto &input_name = op_def->input(i);
if (tensor_ref_count_.count(input_name) == 1) {
tensor_ref_count_[input_name] -= 1;
if (tensor_ref_count_.at(input_name) == 0 &&
tensor_mem_map_.count(input_name) == 1) {
int mem_id = tensor_mem_map_.at(input_name).first;
mem_ref_count_[mem_id] -= 1;
if (mem_ref_count_.at(mem_id) == 0) {
idle_blocks_.insert(mem_id);
}
} else {
MACE_CHECK(tensor_ref_count_.at(input_name) >= 0);
}
}
}
}
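To make the best-fit reuse decision above concrete, a short trace with assumed sizes:

// Illustrative trace (CPU_BUFFER case, sizes made up):
//   op needs 800 bytes; idle blocks: {id 0: 512 B, id 1: 1024 B}
//   id 0: added = 800 - 512 = 288, wasted = 0
//   id 1: added = 0,               wasted = 1024 - 800 = 224
//   id 1 wins on smaller growth, and since best_added_mem_size (0)
//   <= op_mem_size (800), the op reuses block 1 instead of
//   appending a new block.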
const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
return mem_blocks_;
}
const std::unordered_map<std::string, std::pair<int, DataType>>&
MemoryOptimizer::tensor_mem_map() const {
return tensor_mem_map_;
}
std::string MemoryOptimizer::DebugInfo() const {
auto memory_type_to_str = [](const MemoryType type) -> std::string {
if (type == MemoryType::CPU_BUFFER) {
return "CPU_BUFFER";
} else if (type == MemoryType::GPU_BUFFER) {
return "GPU_BUFFER";
} else if (type == MemoryType::GPU_IMAGE) {
return "GPU_IMAGE";
} else {
return "UNKNOWN";
}
};
std::stringstream sstream;
sstream << "\n";
size_t block_size = mem_blocks_.size();
for (size_t i = 0; i < block_size; ++i) {
sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type())
<< " ";
if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) {
sstream << DataTypeToString(mem_blocks_[i].data_type()) << " "
"[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]";
} else {
sstream << "[" << mem_blocks_[i].x() << "]";
}
sstream << "\n";
}
return sstream.str();
}
} // namespace mace
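Putting the pieces together, the intended call order (mirrored by the SerialNet changes below) is roughly the following sketch, where output_mem_map is assumed to be the per-tensor memory-type map the caller built:

// Sketch, assuming a populated NetDef and output_mem_map.
MemoryOptimizer mem_optimizer;
for (const auto &op : net_def.op()) {
  mem_optimizer.UpdateTensorRef(&op);   // count producers and consumers
}
for (const auto &output_info : net_def.output_info()) {
  mem_optimizer.UpdateTensorRef(output_info.name());  // pin net outputs
}
for (const auto &op : net_def.op()) {
  mem_optimizer.Optimize(&op, output_mem_map);  // assign or reuse blocks
}
VLOG(1) << mem_optimizer.DebugInfo();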
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_
#define MACE_CORE_MEMORY_OPTIMIZER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
class MemoryBlock {
public:
inline void set_mem_id(int mem_id) {
mem_id_ = mem_id;
}
inline int mem_id() const {
return mem_id_;
}
inline void set_data_type(DataType data_type) {
data_type_ = data_type;
}
inline DataType data_type() const {
return data_type_;
}
inline void set_mem_type(MemoryType mem_type) {
mem_type_ = mem_type;
}
inline MemoryType mem_type() const {
return mem_type_;
}
inline void set_x(int64_t x) {
x_ = x;
}
inline int64_t x() const {
return x_;
}
inline void set_y(int64_t y) {
y_ = y;
}
inline int64_t y() const {
return y_;
}
private:
int mem_id_;
DataType data_type_;
MemoryType mem_type_;
int64_t x_;
int64_t y_;
};
class MemoryOptimizer {
public:
static bool IsMemoryReuseOp(const std::string &op_type);
void UpdateTensorRef(const std::string &tensor_name);
void UpdateTensorRef(const OperatorDef *op_def);
void Optimize(const OperatorDef *op_def,
const std::unordered_map<std::string, MemoryType> &mem_types);
const std::vector<MemoryBlock> &mem_blocks() const;
const std::unordered_map<std::string,
std::pair<int, DataType>> &tensor_mem_map() const;
std::string DebugInfo() const;
private:
MemoryBlock CreateMemoryBlock(std::vector<int64_t> shape,
DataType dt,
MemoryType mem_type);
private:
std::unordered_map<std::string, int> tensor_ref_count_;
std::vector<MemoryBlock> mem_blocks_;
// tensor name : <mem_id, data_type>
// Buffer memory does not carry a data type of its own, so store the data type here.
std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
std::unordered_map<int, int> mem_ref_count_;
std::set<int> idle_blocks_;
};
} // namespace mace
#endif // MACE_CORE_MEMORY_OPTIMIZER_H_
......@@ -18,6 +18,7 @@
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/core/op_context.h"
#include "mace/public/mace.h"
......@@ -25,13 +26,94 @@
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace {
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {}
MemoryType mem_type; // transformed memory type
DataType dtype;
std::vector<index_t> shape; // tensor shape
int op_idx;  // the operation that generates the tensor
};
#ifdef MACE_ENABLE_OPENCL
std::string TransformedName(const std::string &input_name,
const mace::MemoryType mem_type) {
std::stringstream ss;
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat data_format_flag,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
// Get available devices
auto available_devices = op_registry->AvailableDevices(op_def->type());
// Find the device type to run the op.
// If target_device_type is among the available devices, use it;
// otherwise, fall back to the CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context->set_device(target_device_);
if (target_device_->device_type() == DeviceType::GPU) {
construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
}
break;
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (data_format_flag == NHWC &&
op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
{0, 3, 1, 2});
for (int i = 0; i < 4; ++i) {
op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
}
}
}
}
construct_context->set_operator_def(op_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(construct_context, device_type));
return std::move(op);
}
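For example, with data_format_flag == NHWC a 4D CPU output shape is rewritten in place before the op is created; a minimal illustration of the transpose used above:

// Illustrative: {1, 224, 224, 3} (NHWC) becomes {1, 3, 224, 224} (NCHW),
// matching the layout CPU kernels compute in.
std::vector<index_t> nhwc{1, 224, 224, 3};
auto nchw = TransposeShape<index_t, index_t>(nhwc, {0, 3, 1, 2});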
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode)
MemoryOptimizer *mem_optimizer)
: NetBase(),
ws_(ws),
target_device_(target_device),
......@@ -40,44 +122,211 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Create Operations
DeviceType target_device_type = target_device_->device_type();
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
// add input information
MemoryType target_mem_type;
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
// net-level input data format; set to DF_NONE below if any input has no format
DataFormat data_format_flag = NHWC;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// Can only be NONE or NHWC
auto input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (!is_quantize_model &&
input_data_format == NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
} else if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#ifdef MACE_ENABLE_OPENCL
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
// Create the Operation
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(target_device_type));
if (op_device == target_device_type) {
// Get available devices (sorted based on priority)
OperatorDef temp_def(operator_def);
auto available_devices = op_registry->AvailableDevices(temp_def.type());
// Find the device type to run the op.
// If target_device_type is among the available devices, use it;
// otherwise, fall back to the CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context.set_device(cpu_device_);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context.set_device(target_device_);
break;
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
data_format_flag,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
const DataType dt =
static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
// the memory type of the operation's outputs
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether it is the output tensor of another operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
dt, out_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// graph-level tensor reference counting happens here.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
temp_def.set_device_type(device_type);
construct_context.set_operator_def(&temp_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(&construct_context, device_type, mode));
if (op) {
operators_.emplace_back(std::move(op));
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
}
#endif // MACE_ENABLE_OPENCL
operators_.emplace_back(std::move(op));
// graph-level tensor reference counting happens here.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
auto output_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_data_format);
operators_.emplace_back(std::move(transform_op));
// graph-level tensor reference counting happens here.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
}
}
}
#endif // MACE_ENABLE_OPENCL
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
}
// Do memory optimization
for (auto &op : operators_) {
VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type()
<< ", " << op->debug_def().type() << ">";
mem_optimizer->Optimize(op->operator_def().get(), output_mem_map);
}
VLOG(1) << mem_optimizer->DebugInfo();
}
MaceStatus SerialNet::Init() {
// TODO(liuqi): where to do memory reuse.
MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
OpInitContext init_context(ws_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
......@@ -95,18 +344,18 @@ MaceStatus SerialNet::Init() {
}
MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
// TODO(liuqi): In/Out Buffer Transform
MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net");
OpContext context(ws_, cpu_device_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
DeviceType device_type = op->device_type();
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(), ">",
". mem_id: ",
MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size()));
MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(),
", ",
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op->debug_def(), "T", static_cast<int>(DT_FLOAT)),
">");
if (device_type == target_device_->device_type()) {
context.set_device(target_device_);
} else {
......@@ -173,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
max_v = std::max(max_v, output_data[j]);
min_v = std::min(min_v, output_data[j]);
......@@ -189,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::vector<int> bin_distribution(bin_size, 0);
float bin_v = (max_v - min_v) / bin_size;
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
int ind = static_cast<int>((output_data[j] - min_v) / bin_v);
if (ind < 0)
ind = 0;
else if (ind > bin_size-1)
ind = bin_size-1;
bin_distribution[ind]++;
int index = static_cast<int>((output_data[j] - min_v) / bin_v);
if (index < 0)
index = 0;
else if (index > bin_size-1)
index = bin_size-1;
bin_distribution[index]++;
}
LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
<< "@@" << min_v << "," << max_v<< "@@"
......
......@@ -27,6 +27,7 @@ namespace mace {
class RunMetadata;
class Workspace;
class MemoryOptimizer;
class NetBase {
public:
......@@ -47,12 +48,20 @@ class SerialNet : public NetBase {
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode = NetMode::NORMAL);
MemoryOptimizer * mem_optimizer);
MaceStatus Init() override;
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat input_format,
bool is_quantize_model = false);
protected:
Workspace *ws_;
Device *target_device_;
......
......@@ -23,16 +23,12 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {}
OpConstructContext::OpConstructContext(OperatorDef *operator_def,
Workspace *ws,
Device *device)
: operator_def_(operator_def), ws_(ws), device_(device) {}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context)
: operator_def_(std::make_shared<OperatorDef>(*(context->operator_def())))
: operator_def_(context->operator_def())
{}
MaceStatus Operation::Init(OpInitContext *context) {
......@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) {
": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor);
}
// TODO(liuqi): filter transform
for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) {
// TODO(liuqi): Workspace should pre-allocate all of the output tensors
outputs_.push_back(ws->GetTensor(output_str));
} else {
MACE_CHECK(
......@@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) {
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type)));
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
}
return MaceStatus::MACE_SUCCESS;
......@@ -164,33 +157,34 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const {
OperatorDef *operator_def = context->operator_def();
const DataType dtype = static_cast<DataType>(
DeviceType device_type) const {
auto operator_def = context->operator_def();
DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "mode", static_cast<int>(NetMode::NORMAL));
const NetMode op_mode = static_cast<NetMode>(op_mode_i);
VLOG(3) << "Creating operator " << operator_def->name() << "("
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
if (op_mode == mode) {
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
} else {
return nullptr;
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
}
} // namespace mace
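A hedged sketch of what a lookup resolves to, assuming a "Conv2D" op registered for CPU with T = float, and that ws and op_def (a std::shared_ptr&lt;OperatorDef&gt;) already exist:

// Sketch only: resolve a creator through the registry.
OpConstructContext context(ws);
context.set_operator_def(op_def);  // op_def->type() == "Conv2D"
auto op = op_registry->CreateOperation(&context, DeviceType::CPU);
// Internally the key is
//   OpKeyBuilder("Conv2D").Device(DeviceType::CPU)
//       .TypeConstraint("T", DT_FLOAT).Build(),
// and a missing key is fatal.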
......@@ -33,14 +33,13 @@ namespace mace {
class OpConstructContext {
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device);
~OpConstructContext() = default;
inline void set_operator_def(OperatorDef *operator_def) {
inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
inline OperatorDef *operator_def() const {
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
}
......@@ -56,10 +55,19 @@ class OpConstructContext {
return device_;
}
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
private:
OperatorDef *operator_def_;
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
MemoryType output_mem_type_; // used for transform memory
};
......@@ -131,14 +139,18 @@ class Operation {
}
inline void set_debug_def(
const std::shared_ptr<const OperatorDef> &operator_def) {
const std::shared_ptr<OperatorDef> &operator_def) {
operator_def_ = operator_def;
}
inline bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected:
std::shared_ptr<const OperatorDef> operator_def_;
std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_;
......@@ -190,8 +202,7 @@ class OpRegistryBase {
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const;
DeviceType device_type) const;
template <class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
......
......@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime(
is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN),
mem_type_(MemoryType::GPU_IMAGE) {
mem_type_(MemoryType::GPU_IMAGE),
scratch_image_manager_(new ScratchImageManager) {
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
......@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_;
}
ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
return scratch_image_manager_.get();
}
} // namespace mace
......@@ -25,6 +25,7 @@
#include "mace/core/file_storage.h"
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/scratch_image.h"
#include "mace/proto/mace.pb.h"
#include "mace/utils/string_util.h"
#include "mace/utils/timer.h"
......@@ -82,6 +83,7 @@ class OpenCLRuntime {
uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
ScratchImageManager *scratch_image_manager() const;
bool is_opencl_avaliable();
// TODO(liuqi): remove this function in the future, make decision at runtime.
bool UseImageMemory();
......@@ -134,6 +136,7 @@ class OpenCLRuntime {
OpenCLVersion opencl_version_;
GPUType gpu_type_;
MemoryType mem_type_;
std::unique_ptr<ScratchImageManager> scratch_image_manager_;
// All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library.
std::shared_ptr<cl::Context> context_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_util.h"
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
namespace {
// [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic, H * W * (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1];
(*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* MIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[1]);
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[0]);
(*image_shape)[1] = 1;
}
// Only supports 3x3 for now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic * H * W, (Oc + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[0]);
}
// [(Ic + 3) / 4 * H * W, Oc]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3];
(*image_shape)[1] = shape[0];
}
} // namespace
void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const OpenCLBufferType type,
std::vector<size_t> *image_shape,
const int wino_block_size) {
MACE_CHECK_NOTNULL(image_shape);
switch (type) {
case CONV2D_FILTER:
CalConv2dFilterImageShape(shape, image_shape);
break;
case DW_CONV2D_FILTER:
CalDepthwiseConv2dFilterImageShape(shape, image_shape);
break;
case IN_OUT_CHANNEL:
CalInOutputImageShape(shape, image_shape);
break;
case ARGUMENT:
CalArgImageShape(shape, image_shape);
break;
case IN_OUT_HEIGHT:
CalInOutHeightImageShape(shape, image_shape);
break;
case IN_OUT_WIDTH:
CalInOutWidthImageShape(shape, image_shape);
break;
case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
break;
case WEIGHT_HEIGHT:
CalWeightHeightImageShape(shape, image_shape);
break;
case WEIGHT_WIDTH:
CalWeightWidthImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
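A quick usage example for the common activation case; the numbers follow from CalInOutputImageShape above.

// Example: a {1, 32, 32, 64} NHWC activation tensor.
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape({1, 32, 32, 64},
                            OpenCLBufferType::IN_OUT_CHANNEL,
                            &image_shape);
// image_shape == {512, 32}: RoundUpDiv4(64) * 32 = 512, 1 * 32 = 32.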
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const mace::MemoryType mem_type) {
std::unique_ptr<OperatorDef> op(new OperatorDef);
std::string op_name = "mace_node_" + output_name;
op->set_name(op_name);
op->set_type("BufferTransform");
op->add_input(input_name);
op->add_output(output_name);
Argument *arg = op->add_arg();
arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(OpenCLBufferType::IN_OUT_CHANNEL));
arg = op->add_arg();
arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type));
arg = op->add_arg();
arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg();
arg->set_name("device");
arg->set_i(DeviceType::GPU);
if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape();
for (auto value : input_shape) {
shape->add_dims(value);
}
}
return std::move(op);
}
} // namespace mace
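A usage sketch (the output tensor name below is arbitrary; in SerialNet it comes from TransformedName):

// Sketch: synthesize the implicit BufferTransform node that moves a
// tensor named "input" into half-precision GPU image memory.
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
    "input", {1, 224, 224, 3}, "input_transformed",
    DataType::DT_HALF, MemoryType::GPU_IMAGE);
// The returned def has type "BufferTransform" and carries the
// "buffer_type", "mem_type", "T" and "device" arguments set above.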
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#include <memory>
#include <string>
#include <vector>
#include "mace/core/types.h"
namespace mace {
enum OpenCLBufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
ARGUMENT = 2,
IN_OUT_HEIGHT = 3,
IN_OUT_WIDTH = 4,
WINOGRAD_FILTER = 5,
DW_CONV2D_FILTER = 6,
WEIGHT_HEIGHT = 7,
WEIGHT_WIDTH = 8,
};
class OpenCLUtil {
public:
static void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const OpenCLBufferType type,
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const MemoryType mem_type);
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/scratch_image.h"
#include <utility>
#include <vector>
namespace mace {
ScratchImageManager::ScratchImageManager() = default;
ScratchImageManager::~ScratchImageManager() = default;
Image *ScratchImageManager::Spawn(
Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id) {
// TODO(liuqi): not optimal memory reuse strategy
int found_image_idx = -1;
int image_count = static_cast<int>(reference_count_.size());
for (int i = 0; i < image_count; ++i) {
int count = reference_count_[i];
if (count == 0 && images_.at(i)->dtype() == dt) {
auto image_shape = images_.at(i)->image_shape();
if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) {
found_image_idx = i;
break;
}
}
}
// if not found
if (found_image_idx == -1) {
reference_count_.push_back(0);
images_[image_count] =
std::unique_ptr<Image>(new Image(allocator));
if (images_.at(image_count)->Allocate(shape, dt) !=
MaceStatus::MACE_SUCCESS) {
return nullptr;
}
found_image_idx = image_count;
VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape)
<< "<" << dt << ">";
}
reference_count_[found_image_idx] += 1;
*id = found_image_idx;
return images_.at(found_image_idx).get();
}
void ScratchImageManager::Deactive(int id) {
MACE_CHECK(reference_count_.size() > static_cast<size_t>(id)
&& reference_count_[id] > 0,
"Image id ", id, " exceed the vector size ",
reference_count_.size());
reference_count_[id] -= 1;
}
ScratchImage::ScratchImage(mace::ScratchImageManager *manager)
: manager_(manager), id_(-1) {}
ScratchImage::~ScratchImage() {
if (id_ >= 0) {
manager_->Deactive(id_);
}
}
Image* ScratchImage::Scratch(Allocator *allocator,
const std::vector<size_t> &shape,
const mace::DataType dt) {
return manager_->Spawn(allocator, shape, dt, &id_);
}
} // namespace mace
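A short RAII sketch of how a kernel is expected to borrow a scratch image; runtime and allocator are assumed to come from the surrounding OpContext:

// Sketch: the image stays reserved for the lifetime of `scratch`.
ScratchImage scratch(runtime->scratch_image_manager());
Image *image = scratch.Scratch(allocator, {512, 32}, DataType::DT_HALF);
// ~ScratchImage() calls Deactive(id), returning the image to the pool.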
......@@ -12,39 +12,47 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
#include "mace/core/buffer.h"
namespace mace {
class OpContext;
class Tensor;
class ScratchImageManager {
public:
ScratchImageManager();
~ScratchImageManager();
Image *Spawn(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id);
namespace ops {
void Deactive(int id);
class OpenCLWinogradTransformKernel {
public:
virtual MaceStatus Compute(
OpContext *context,
const Tensor *input,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel);
private:
std::unordered_map<int, std::unique_ptr<Image>> images_;
std::vector<int> reference_count_;
};
class OpenCLWinogradInverseTransformKernel {
class ScratchImage {
public:
virtual MaceStatus Compute(
OpContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel);
explicit ScratchImage(ScratchImageManager *);
~ScratchImage();
Image *Scratch(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt);
private:
ScratchImageManager *manager_;
int id_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
......@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
}
} // namespace numerical_chars
enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 };
class Tensor {
public:
......@@ -222,6 +222,25 @@ class Tensor {
return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
}
inline MemoryType memory_type() const {
MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty");
if (buffer_->OnHost()) {
return MemoryType::CPU_BUFFER;
} else if (typeid(*buffer_) == typeid(Image)) {
return MemoryType::GPU_IMAGE;
} else {
return MemoryType::GPU_BUFFER;
}
}
inline void set_data_format(DataFormat data_format) {
data_format_ = data_format;
}
inline DataFormat data_format() const {
return data_format_;
}
#ifdef MACE_ENABLE_OPENCL
inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), name_, " does not have an image");
......@@ -488,6 +507,7 @@ class Tensor {
int32_t zero_point_;
float minval_;
float maxval_;
DataFormat data_format_; // used for 4D input/output tensor
MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
};
......
......@@ -18,6 +18,7 @@
#include <utility>
#include "mace/core/arg_helper.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -27,13 +28,6 @@
namespace mace {
namespace {
bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
static const std::unordered_set<std::string> reuse_buffer_ops {
"Reshape", "Identity", "Squeeze"
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) {
if (op.type() == "Quantize") {
......@@ -48,13 +42,14 @@ Workspace::Workspace() = default;
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
DataType type) {
DataType type,
bool is_weight) {
if (HasTensor(name)) {
VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
false, name));
is_weight, name));
}
return GetTensor(name);
}
......@@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
fused_buffer_ = true;
}
}
return MaceStatus::MACE_SUCCESS;
}
if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status;
MaceStatus Workspace::PreallocateOutputTensor(
const mace::NetDef &net_def,
const mace::MemoryOptimizer *mem_optimizer,
Device *device) {
auto &mem_blocks = mem_optimizer->mem_blocks();
for (auto &mem_block : mem_blocks) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", memory type: " << mem_block.mem_type()
<< ", size: " << mem_block.x() << "x" << mem_block.y();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{static_cast<size_t>(mem_block.x()),
static_cast<size_t>(mem_block.y())}, mem_block.data_type()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first),
tensor_mem.second.second,
false, tensor_mem.first));
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
if (device_type == DeviceType::CPU) {
// add quantize info for output tensors.
if (device->device_type() == DeviceType::CPU) {
for (const auto &op : net_def.op()) {
VLOG(2) << "Add quantize info for op: " << op.name();
MACE_CHECK(op.quantize_info().empty()
......@@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id,
// as CPU&GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type && !op.mem_id().empty()) {
const DataType op_dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
break;
}
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
}
// TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.device_type() == device_type) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", device type: " << mem_block.device_type()
<< ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
}
VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type) {
if (!op.mem_id().empty()
&& ShouldPreallocateMemoryForOp(op)) {
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = dtype;
}
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
output_type, false, op.output(i)));
if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
} else {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
}
tensor_map_[op.output(i)] = std::move(tensor);
}
} else {
for (int i = 0; i < op.output().size(); ++i) {
MACE_CHECK(
op.output_type_size() == 0
|| op.output_size()
== op.output_type_size(),
"operator output size != operator output type size",
op.output_size(),
op.output_type_size());
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
op, "T", static_cast<int>(DT_FLOAT)));
}
CreateTensor(op.output(i),
device->allocator(),
output_type);
}
}
for (int output_idx = 0; output_idx < op.output_shape_size();
++output_idx) {
std::vector<index_t>
shape_configured(op.output_shape(output_idx).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = op.output_shape(output_idx).dims(dim);
}
tensor_map_[op.output(output_idx)]->SetShapeConfigured(
shape_configured);
}
}
}
return MaceStatus::MACE_SUCCESS;
}
void Workspace::RemoveUnusedBuffer() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
......@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
tensor_buffer_.reset(nullptr);
}
void Workspace::RemoveTensor(const std::string &name) {
auto iter = tensor_map_.find(name);
if (iter != tensor_map_.end()) {
tensor_map_.erase(iter);
}
}
} // namespace mace
......@@ -27,6 +27,8 @@
namespace mace {
class MemoryOptimizer;
class Workspace {
public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
......@@ -36,7 +38,8 @@ class Workspace {
Tensor *CreateTensor(const std::string &name,
Allocator *alloc,
DataType type);
DataType type,
bool is_weight = false);
inline bool HasTensor(const std::string &name) const {
return tensor_map_.find(name) != tensor_map_.end();
......@@ -52,12 +55,19 @@ class Workspace {
Device *device,
const unsigned char *model_data);
MaceStatus PreallocateOutputTensor(const NetDef &net_def,
const MemoryOptimizer *mem_optimizer,
Device *device);
void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data,
Allocator *alloc);
void RemoveTensor(const std::string &name);
private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
Device *device);
......
......@@ -20,9 +20,11 @@
#include <memory>
#include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/ops_registry.h"
#include "mace/ops/transpose.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
  // Check OpenCL availability
auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) {
LOG(WARNING) << "The device does not support OpenCL";
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
......@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
auto opencl_max_image_size = runtime->GetMaxImage2DSize();
if (opencl_max_image_size.empty()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
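    // The model records the largest 2-D OpenCL image it will request;
    // fail fast when the device limit is smaller.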
const std::vector<int64_t> net_max_image_size =
ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
*net_def, "opencl_max_image_size", {0, 0});
if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
|| static_cast<uint64_t>(net_max_image_size[1])
> opencl_max_image_size[1]) {
LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
<< " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -288,14 +269,17 @@ class MaceTensor::Impl {
public:
std::vector<int64_t> shape;
std::shared_ptr<float> data;
DataFormat format;
};
MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data) {
std::shared_ptr<float> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get());
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = shape;
impl_->data = data;
impl_->format = format;
}
MaceTensor::MaceTensor() {
......@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
}
MaceTensor::MaceTensor(const MaceTensor &&other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
}
MaceTensor &MaceTensor::operator=(const MaceTensor &other) {
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
return *this;
}
MaceTensor &MaceTensor::operator=(const MaceTensor &&other) {
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
return *this;
}
......@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
DataFormat MaceTensor::data_format() const {
return impl_->format;
}
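// A minimal usage sketch (shape and buffer sizes are illustrative):
//   auto buf = std::shared_ptr<float>(new float[1 * 224 * 224 * 3],
//                                     std::default_delete<float[]>());
//   MaceTensor input({1, 224, 224, 3}, buf, NHWC);
//   // input.data_format() == NHWC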
// Mace Engine
class MaceEngine::Impl {
public:
......@@ -355,6 +347,14 @@ class MaceEngine::Impl {
std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata);
private:
MaceStatus TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor);
MaceStatus TransposeOutput(const Tensor *output_tensor,
std::pair<const std::string, MaceTensor> *output);
private:
const unsigned char *model_data_;
size_t model_data_size_;
......@@ -363,11 +363,12 @@ class MaceEngine::Impl {
std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_;
std::map<std::string, mace::OutputInfo> output_info_map_;
bool is_quantized_model_;
#ifdef MACE_ENABLE_HEXAGON
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
MACE_DISABLE_COPY_AND_ASSIGN(Impl);
};
......@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
device_type_(config.impl_->device_type()),
device_(nullptr),
ws_(new Workspace()),
net_(nullptr)
net_(nullptr),
is_quantized_model_(false)
#ifdef MACE_ENABLE_HEXAGON
, hexagon_controller_(nullptr)
#endif
......@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
}
#endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
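  // Quantized models compute in NHWC even on CPU, so this flag gates the
  // layout transposes in TransposeInput/TransposeOutput.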
// Get input and output information.
for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info;
......@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
ws_->CreateTensor(MakeString("mace_input_node_", input_name),
device_->allocator(), DT_FLOAT);
ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
}
for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) {
......@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's outputs "
<< MakeString(MapKeys(output_info_map_));
}
ws_->CreateTensor(MakeString("mace_output_node_", output_name),
device_->allocator(), DT_FLOAT);
}
#ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) {
......@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init(
device_.get(),
model_data));
MemoryOptimizer mem_optimizer;
// Init model
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
NetMode::INIT));
MACE_RETURN_IF_ERROR(net->Init());
MACE_RETURN_IF_ERROR(net->Run());
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get()));
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
MACE_RETURN_IF_ERROR(net_->Init());
#ifdef MACE_ENABLE_HEXAGON
}
......@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() {
#endif
}
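// Callers exchange float tensors with the engine; float CPU models compute
// in NCHW while GPU and quantized models compute in NHWC, so 4-D inputs
// whose declared format differs from the runtime layout are transposed here.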
MaceStatus MaceEngine::Impl::TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor) {
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
std::vector<int> dst_dims = {0, 3, 1, 2};
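    // NHWC -> NCHW permutation, e.g. {1, 224, 224, 3} -> {1, 3, 224, 224}.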
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else if (
(is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
input.second.shape().size() == 4 &&
input.second.data_format() == DataFormat::NCHW) {
VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
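    // NCHW -> NHWC permutation, e.g. {1, 3, 224, 224} -> {1, 224, 224, 3}.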
input_tensor->set_data_format(DataFormat::NHWC);
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else {
input_tensor->set_data_format(input.second.data_format());
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
}
MaceStatus MaceEngine::Impl::TransposeOutput(
const mace::Tensor *output_tensor,
std::pair<const std::string, mace::MaceTensor> *output) {
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (device_->device_type() == DeviceType::CPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
MACE_CHECK(output_tensor->data_format() == NCHW);
VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else if (device_->device_type() == DeviceType::GPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
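      // Map the device tensor's layout to the caller's: NHWC -> NCHW by
      // default, NCHW -> NHWC when the device tensor is NCHW.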
std::vector<int> dst_dims = {0, 3, 1, 2};
if (output_tensor->data_format() == NCHW) {
dst_dims = {0, 2, 3, 1};
}
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
std::memcpy(output->second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
}
MaceStatus MaceEngine::Impl::Run(
const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs,
......@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.first));
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
{
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
}
Tensor *input_tensor = ws_->GetTensor(input.first);
MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor));
input_tensors.push_back(input_tensor);
}
for (auto &output : *outputs) {
......@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's outputs: "
<< MakeString(MapKeys(output_info_map_));
}
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first));
Tensor *output_tensor = ws_->GetTensor(output.first);
output_tensors.push_back(output_tensor);
}
#ifdef MACE_ENABLE_HEXAGON
......@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run(
}
#endif
for (auto &output : *outputs) {
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first));
Tensor *output_tensor = ws_->GetTensor(output.first);
// save output
if (output_tensor != nullptr && output.second.data() != nullptr) {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output.second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(output.second.shape())
<< " != " << MakeString<int64_t>(shape);
std::memcpy(output.second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output));
}
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -14,7 +14,6 @@ mace {
*mace*NetDef*;
*mace*MemoryType*;
*mace*DataType*;
*mace*MemoryArena*;
*mace*InputInfo*;
*mace*OutputInfo*;
*mace*OutputShape*;
......
......@@ -30,10 +30,8 @@ cc_library(
"arm/*_test.cc",
"ops_registry.cc",
"ops_test_util.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
"quantize.cc",
],
) + if_opencl_enabled(glob(
......@@ -41,10 +39,8 @@ cc_library(
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
exclude = [
"opencl/*_test.cc",
......
......@@ -19,6 +19,7 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/activation.h"
#endif // MACE_ENABLE_OPENCL
......@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
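    // PRELU carries a learned alpha tensor as input 1; transform it to the
    // GPU argument layout once at construction rather than on every run.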
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
......@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, float>("Alpha", {channels});
net.AddRandomInput<D, T>("Alpha", {channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluBM")
.Input("InputImage")
.Input("AlphaImage")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -310,27 +262,17 @@ void SigmoidBenchmark(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -30,32 +30,14 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
......@@ -129,34 +93,15 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -179,34 +124,15 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -232,45 +158,36 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}, true);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluTest")
.Input("InputImage")
.Input("AlphaImage")
.Output("OutputImage")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Activation", "PreluTest")
.Input("Input")
.Input("InputNCHW")
.Input("Alpha")
.Output("Output")
.Output("OutputNCHW")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
}
if (D == DeviceType::CPU) {
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
......@@ -290,32 +207,14 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
......@@ -343,32 +242,14 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
......
......@@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
}
if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(),
ops::BufferType::IN_OUT_CHANNEL);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -62,39 +62,15 @@ void SimpleAdd3() {
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1},
{-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4;
if (D == DeviceType::GPU) {
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
auto expected =
net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
......@@ -138,28 +114,10 @@ void RandomTest() {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
  // Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2,
1e-2);
}
}
......
......@@ -19,6 +19,7 @@
#include "mace/core/operator.h"
#include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/batch_norm.h"
#endif // MACE_ENABLE_OPENCL
......@@ -147,12 +148,27 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
ActivationType activation = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BatchNormKernel<T>(
epsilon, activation, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
    // Transform the scale/offset/mean/var argument tensors (inputs 1..N)
    // to the GPU argument layout.
int input_size = operator_def_->input_size();
for (int i = 1; i < input_size; ++i) {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
bool not_folded = this->InputSize() == 5;
......
......@@ -36,13 +36,12 @@ void BatchNorm(
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddRandomInput<D, T>("Scale", {channels}, true);
net.AddRandomInput<D, T>("Offset", {channels}, true);
net.AddRandomInput<D, T>("Mean", {channels}, true);
net.AddRandomInput<D, T>("Var", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BatchNorm", "BatchNormBM")
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
......@@ -50,30 +49,8 @@ void BatchNorm(
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// tuning
setenv("MACE_TUNING", "1", 1);
......
......@@ -28,10 +28,10 @@ void Simple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
net.AddInputFromArray<D, float>("Mean", {1}, {10});
net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}, true);
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}, true);
net.AddInputFromArray<D, float>("Mean", {1}, {10}, true);
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
......@@ -49,32 +49,17 @@ void Simple() {
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
// Check
......@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Tuning
......@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-5, 1e-4);
}
......@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
......@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-1, 1e-2);
}
......@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// tuning
......@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-5, 1e-4);
}
......@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
......@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-1, 1e-2);
}
......
......@@ -32,23 +32,13 @@ void BMBatchToSpace(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::CPU) {
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
......
......@@ -19,6 +19,7 @@
#include "mace/core/operator.h"
#include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/bias_add.h"
#endif // MACE_ENABLE_OPENCL
......@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
: Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
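    // The bias (input 1) is an argument tensor; transform it to the GPU
    // layout once at construction.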
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
......@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net;
// Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Bias", {channels}, true);
net.AddRandomInput<D, T>("Bias", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BiasAdd", "BiasAddBM")
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input")
.Input("Bias")
.AddIntArg("data_format", NCHW)
.AddIntArg("data_format", data_format)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("InputImage")
.Input("BiasImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -28,7 +28,7 @@ void BiasAddSimple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
......@@ -44,22 +44,13 @@ void BiasAddSimple() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
// Run on gpu
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
......@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
// Run on gpu
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace test
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BufferInverseTransformOp;
template <typename T>
class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferInverseTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ImageToBuffer<T>);
} else {
kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
}
}
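  // Run maps the OpenCL-side representation (an image when image memory is
  // in use, a transformed buffer otherwise) back to a plain GPU buffer.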
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
}
private:
const int wino_blk_size_;
std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
};
void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
BufferInverseTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace ops
} // namespace mace
......@@ -14,6 +14,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters,
mace::testing::StopTiming();
OpsTestNet net;
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width});
// Create output
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpDefBuilder("BufferToImage", "BufferToImageBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
auto transform_func = [&]() {
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context,
net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
b2i_output);
};
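  // Time the transform call directly instead of running an op graph.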
// Warm-up
net.Setup(D);
for (int i = 0; i < 5; ++i) {
net.Run();
transform_func();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
transform_func();
}
net.Sync();
}
......
......@@ -14,6 +14,7 @@
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace {
namespace ops {
......@@ -21,31 +22,27 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{1});
}
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, half>(OpenCLBufferType::ARGUMENT,
{11});
}
TEST(BufferToImageTest, ArgMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 1});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 2, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 13, 17, 128});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 64, 64, 256});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{5, 3, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{13, 17, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{512, 128, 1, 1});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{3, 5, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{17, 13, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{256, 128, 3, 3});
}
TEST(BufferToImageTest, WeightWidthSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{1, 3, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{1, 3, 3, 3});
}
TEST(BufferToImageTest, WeightWidthMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{11, 13, 13, 17});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightWidthLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{64, 64, 11, 13});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{64, 64, 11, 13});
}
TEST(BufferToImageTest, WeightHeightSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{2, 1, 1, 1});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{2, 1, 1, 1});
}
TEST(BufferToImageTest, WeightHeightMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{11, 13, 13, 17});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightHeightLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{64, 16, 11, 13});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{64, 16, 11, 13});
}
namespace {
template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type,
void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{11});
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT,
{11});
}
namespace {
template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type,
void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape,
const unsigned char *input_data) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2));
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{2}, input_data);
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT, {2}, input_data);
}
} // namespace test
......
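For reference, the four raw bytes fed to TestStringHalfBidirectionTransform above encode two little-endian IEEE-754 half values: 0x3CCD ≈ 1.2002 and 0x4033 ≈ 2.0996. A standalone decoding sketch (plain C++, no MACE types involved):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode a little-endian IEEE-754 binary16 value to float.
// Normal and subnormal cases only; Inf/NaN are not needed here.
static float HalfBitsToFloat(uint16_t h) {
  const int sign = (h >> 15) & 0x1;
  const int exp = (h >> 10) & 0x1F;
  const float mant = (h & 0x3FF) / 1024.0f;
  const float value = (exp == 0) ? std::ldexp(mant, -14)          // subnormal
                                 : std::ldexp(1.0f + mant, exp - 15);
  return sign ? -value : value;
}

int main() {
  // Same bytes as the test above: two halfs, little-endian.
  const unsigned char input_data[] = {0xCD, 0x3C, 0x33, 0x40};
  for (int i = 0; i < 2; ++i) {
    const uint16_t bits = static_cast<uint16_t>(
        input_data[2 * i] | (input_data[2 * i + 1] << 8));
    std::printf("0x%04X -> %g\n", bits, HalfBitsToFloat(bits));
  }
  return 0;  // prints ~1.2002 and ~2.0996
}
```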
......@@ -15,8 +15,7 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace {
namespace ops {
......@@ -29,29 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BufferToImage<T>);
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)),
out_mem_type_(static_cast<MemoryType>(Operation::GetOptionalArg<int>(
"mem_type", static_cast<int>(MemoryType::GPU_IMAGE)))) {}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
private:
const int wino_blk_size_;
std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
MemoryType out_mem_type_;
};
......
......@@ -15,6 +15,7 @@
#include <cstring>
#include "gtest/gtest.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase {
namespace {
template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type,
void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
// Run
net.RunOp(DeviceType::GPU);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("TransformedOutput")
.Output("Output")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
Tensor *bt_output = net.ws()->CreateTensor(
"BtOutput", context.device()->allocator(),
DataTypeToEnum<DstType>::value);
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, bt_output);
// Inverse Transform
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
......@@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL,
TestBidirectionTransform<float, half>(OpenCLBufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
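The HalfToHalf case above exercises the degenerate path of the transformer: when neither the memory type nor the data type changes, the output can simply alias the input, which is what the UnderlyingBuffer() equality check in TestBidirectionTransform asserts. A minimal sketch of that decision, with stand-in types rather than the actual MACE implementation:

```cpp
#include <cassert>

enum class MemoryType { GPU_BUFFER, GPU_IMAGE };
enum class DataType { DT_FLOAT, DT_HALF };

struct Buffer {};  // stand-in for opaque device memory

struct Tensor {
  MemoryType mem_type;
  DataType dtype;
  Buffer *buf;
};

// Hypothetical shortcut mirroring what the test asserts: when neither the
// memory type nor the data type changes, the transformer can alias the
// input buffer instead of launching a copy kernel.
bool TransformOrShare(const Tensor &in, Tensor *out) {
  if (in.mem_type == out->mem_type && in.dtype == out->dtype) {
    out->buf = in.buf;  // both tensors now view the same device memory
    return true;        // shared, no copy
  }
  // ... otherwise enqueue an OpenCL conversion kernel ...
  return false;
}

int main() {
  Buffer b;
  Tensor in{MemoryType::GPU_BUFFER, DataType::DT_HALF, &b};
  Tensor out{MemoryType::GPU_BUFFER, DataType::DT_HALF, nullptr};
  assert(TransformOrShare(in, &out));
  assert(out.buf == in.buf);  // matches the UnderlyingBuffer() check above
  return 0;
}
```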
namespace {
template <typename T>
void TestArgumentTransform(const index_t input_size) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run
net.RunOp(DeviceType::GPU);
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, output);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]);
EXPECT_EQ(expected_size, output->buffer_shape()[0]);
// Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor,
ExpectTensorNear<T>(*net.GetTensor("Input"), *output,
1e-3, 1e-4);
}
} // namespace
......
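The argument-transform test checks that a 1-D ARGUMENT buffer is padded to a multiple of 4 elements; the padding presumably lines up with the 4-wide vectorized accesses of the OpenCL kernels. The RoundUp used there, restated standalone:

```cpp
#include <cassert>
#include <cstdint>

using index_t = int64_t;

// RoundUp as exercised by the argument-transform test: pad a 1-D argument
// buffer (e.g. a bias) up to the next multiple of 4 elements.
template <typename T>
T RoundUp(T value, T multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  assert(RoundUp<index_t>(1, 4) == 4);
  assert(RoundUp<index_t>(5, 4) == 8);
  assert(RoundUp<index_t>(8, 4) == 8);
  return 0;
}
```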
......@@ -36,23 +36,11 @@ void ChannelShuffle(
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Softmax", "SoftmaxBM")
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("OutputImage")
.Input("Input")
.Output("Output")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = net.CreateTensor<float>(
{1, 1, 2, 16},
......
......@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation {
public:
explicit ConcatOpBase(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)) {}
axis_(Operation::GetOptionalArg<int>("axis", 3)),
checked_(false) {}
protected:
void Validate() {
......@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation {
protected:
int axis_;
bool checked_;
};
template <DeviceType D, class T>
......@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Validate();
if (!checked_) {
Validate();
if (this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
}
checked_ = true;
}
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
const Tensor *input0 = inputs.front();
......
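The lazy axis fix-up added to ConcatOp reflects the unified CPU/GPU graph convention: axis arguments stay in NHWC terms (note axis_arg = 3 paired with axis = 1 in the quantized test below), while CPU tensors are laid out NCHW, so the op translates the axis once on first Run. The mapping, restated standalone:

```cpp
#include <cassert>

// Map a concat axis expressed against NHWC onto an NCHW-resident tensor,
// mirroring the lazy fix-up in ConcatOp above (done once, on first Run):
// N(0)->0, H(1)->2, W(2)->3, C(3)->1.
int NHWCAxisToNCHW(int axis) {
  switch (axis) {
    case 0: return 0;   // batch stays in front
    case 1: return 2;   // height
    case 2: return 3;   // width
    case 3: return 1;   // channels move next to batch
    default: return axis;  // non-4D tensors keep their axis unchanged
  }
}

int main() {
  assert(NHWCAxisToNCHW(3) == 1);  // channel concat, the common case
  assert(NHWCAxisToNCHW(2) == 3);
  assert(NHWCAxisToNCHW(1) == 2);
  return 0;
}
```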
......@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128);
namespace {
template <typename T>
void OpenclConcatHelper(int iters,
void OpenCLConcatHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int concat_dim) {
......@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0")
.Input("InputImage1")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", concat_dim)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters,
#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
OpenCLConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
......
......@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) {
static unsigned int seed = time(NULL);
int dim = 5;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
int axis = 1;
// Construct graph
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
static unsigned int seed = time(NULL);
int dim = 4;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
int axis = 1;
int axis_arg = 3; // NHWC
// Construct graph
OpsTestNet net;
......@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
std::vector<index_t> output_shape = input_shapes[0];
output_shape[axis] = concat_axis_size;
net.AddRandomInput<DeviceType::CPU, float>(
"Output", output_shape, true, true);
"Output", output_shape, false, true, true);
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(MakeString("Input", i));
}
builder.AddIntArg("axis", axis)
builder.AddIntArg("axis", axis_arg)
.Output("Output")
.Finalize(net.NewOperatorDef());
......@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
net.RunOp();
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"QuantizedOutput", output_shape, true, true);
"QuantizedOutput", output_shape, false, true, true);
auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest");
for (int i = 0; i < num_inputs; ++i) {
q_builder = q_builder.Input(MakeString("QuantizedInput", i));
......@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i);
const std::string image_name = MakeString("InputImage", i);
concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
ops::BufferType::IN_OUT_CHANNEL);
}
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
const std::string image_name = MakeString("InputImage", i);
const std::string image_name = MakeString("Input", i);
builder = builder.Input(image_name);
}
builder.AddIntArg("axis", axis)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto output = net.GetOutput("Output");
......
......@@ -38,8 +38,9 @@
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/conv_2d.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/conv_2d.h"
#include "mace/ops/opencl/image/conv_2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
......@@ -958,13 +959,45 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
: ConvPool2dOpBase(context),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::Conv2dKernel<T>);
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
context->device()->opencl_runtime(),
context->workspace()->GetTensor(
operator_def_->input(1))->shape(),
std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
operator_def_->output_shape(0).dims().end()),
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT);
......@@ -974,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_,
output);
wino_block_size_, output);
}
private:
const ActivationType activation_;
const float relux_max_limit_;
std::unique_ptr<OpenCLConv2dKernel> kernel_;
int wino_block_size_;
private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
......
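The Conv2dOp constructor now pre-transforms the filter exactly once: if a Winograd block size of 2 or 4 is requested and CheckUseWinograd accepts the shapes, the filter goes to the WINOGRAD_FILTER layout; otherwise the block size is zeroed and the plain CONV2D_FILTER layout is used, so Run() takes the direct path. A condensed sketch with stand-in types; the profitability conditions here (3x3, stride 1, no dilation) are an assumption, since the real check also weighs the output shape and may adjust the block size:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using index_t = int64_t;
enum class FilterLayout { WINOGRAD_FILTER, CONV2D_FILTER };

// Stand-in for kernel_->CheckUseWinograd (hypothetical conditions): Winograd
// F(m, 3) is usually only worthwhile for 3x3, stride-1, non-dilated filters.
bool CheckUseWinograd(const std::vector<index_t> &filter_shape,  // OIHW
                      const int *strides, const int *dilations) {
  return filter_shape[2] == 3 && filter_shape[3] == 3 &&
         strides[0] == 1 && strides[1] == 1 &&
         dilations[0] == 1 && dilations[1] == 1;
}

// Mirrors the constructor logic above: decide the filter layout once, and
// zero the block size on fallback so Run() takes the direct-conv path.
FilterLayout ChooseFilterLayout(const std::vector<index_t> &filter_shape,
                                const int *strides, const int *dilations,
                                int *wino_block_size) {
  if ((*wino_block_size == 2 || *wino_block_size == 4) &&
      CheckUseWinograd(filter_shape, strides, dilations)) {
    return FilterLayout::WINOGRAD_FILTER;
  }
  *wino_block_size = 0;
  return FilterLayout::CONV2D_FILTER;
}

int main() {
  const int strides[] = {1, 1}, dilations[] = {1, 1};
  int blk = 4;
  assert(ChooseFilterLayout({32, 16, 3, 3}, strides, dilations, &blk) ==
         FilterLayout::WINOGRAD_FILTER);
  blk = 4;
  assert(ChooseFilterLayout({32, 16, 1, 1}, strides, dilations, &blk) ==
         FilterLayout::CONV2D_FILTER && blk == 0);
  return 0;
}
```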
......@@ -49,11 +49,10 @@ void Conv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
......@@ -63,26 +62,6 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters,
"Input", {batch, height, width, channels});
net.GetTensor("Input")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {output_channels, kernel_h, kernel_w, channels});
"Filter", {output_channels, kernel_h, kernel_w, channels}, true);
net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels});
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}, true);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......
This diff is collapsed.
......@@ -24,7 +24,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *dilations,
const int *strides,
Padding padding,
......@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *padding_size,
const int *dilations,
const int *strides,
......
......@@ -35,7 +35,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *dilations,
const int *strides,
Padding padding,
......@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,
void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *padding_size,
const int *dilations,
const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
std::unique_ptr<Tuner<uint32_t>> tuner;
Workspace ws;
op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = ws.CreateTensor("Input", device->allocator(),
DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3});
{
Tensor::MappingGuard input_mapper(input);
float *input_data = input->mutable_data<float>();
std::fill(input_data, input_data + input->size(), 1);
}
op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.Finalize(&op_defs[op_defs.size() - 1]);
NetDef net_def;
for (auto &op_def : op_defs) {
net_def.add_op()->CopyFrom(op_def);
}
std::shared_ptr<OpRegistry> op_registry(new OpRegistry());
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry.get(), &net_def, &ws, device,
NetMode::INIT));
MaceStatus status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = std::unique_ptr<NetBase>(new SerialNet(
op_registry.get(), &net_def, &ws, device));
status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
ExpectTensorNear<float>(*ws.GetTensor("Input"), *ws.GetTensor("Output"),
1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
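The INIT_MODE test above relies on two-phase execution: a SerialNet built with NetMode::INIT runs only the ops tagged mode = INIT (here the BufferTransform), so B2IOutput exists after the first pass while Output does not appear until the default-mode net runs. A minimal sketch of that selection, with hypothetical stand-in types:

```cpp
#include <cassert>
#include <string>
#include <vector>

enum class NetMode { INIT, NORMAL };

struct OpDef {
  std::string name;
  NetMode mode = NetMode::NORMAL;
};

// Hypothetical selector mirroring what the test observes: an INIT-mode net
// runs only ops tagged NetMode::INIT; a normal net runs the remaining ops.
std::vector<OpDef> SelectOps(const std::vector<OpDef> &all, NetMode pass) {
  std::vector<OpDef> picked;
  for (const auto &op : all) {
    if (op.mode == pass) picked.push_back(op);
  }
  return picked;
}

int main() {
  std::vector<OpDef> ops = {{"BufferTransform", NetMode::INIT},
                            {"BufferInverseTransform", NetMode::NORMAL}};
  // First pass materializes "B2IOutput"; the second produces "Output",
  // matching the two GetTensor checks in the test above.
  assert(SelectOps(ops, NetMode::INIT).size() == 1);
  assert(SelectOps(ops, NetMode::NORMAL).size() == 1);
  return 0;
}
```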
......@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6);
namespace {
template <typename T>
void OpenclCropHelper(int iters,
void OpenCLCropHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int crop_axis,
......@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropBM")
.Input("InputImage0")
.Input("InputImage1")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset})
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters,
_##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
OpenclCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
OpenCLCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##TYPE)
......
......@@ -34,14 +34,10 @@ void RunCrop(const std::vector<index_t> &input_shape,
net.AddRandomInput<D, float>("Input1", input_shape2);
if (D == GPU) {
BufferToImage<D, float>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropTest")
.Input("InputImage0")
.Input("InputImage1")
.Output("OutputImage")
.Input("Input0")
.Input("Input1")
.Output("Output")
.AddIntsArg("offset", offset)
.AddIntArg("axis", axis)
.Finalize(net.NewOperatorDef());
......@@ -66,10 +62,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
// Run
net.RunOp(D);
if (D == GPU) {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
}
......
......@@ -30,6 +30,7 @@
#include "mace/ops/arm/deconv_2d_neon.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/deconv_2d.h"
#endif // MACE_ENABLE_OPENCL
......@@ -358,11 +359,27 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::Deconv2dKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
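The bias-transform branches in the Deconv2dOp constructor encode the framework-specific input layouts: Caffe deconv is {input, filter[, bias]}, while the TensorFlow-style variant inserts an OutputShape tensor before the bias. Restated as a small helper (hypothetical; the real code inlines the branches):

```cpp
#include <cassert>

enum class FrameworkType { TENSORFLOW, CAFFE };

// Which input slot holds the bias depends on the source framework:
//   CAFFE:      {input, filter[, bias]}               -> bias at index 2
//   TENSORFLOW: {input, filter, output_shape[, bias]} -> bias at index 3
int BiasInputIndex(FrameworkType type, int input_size) {
  if (type == FrameworkType::CAFFE) return input_size >= 3 ? 2 : -1;
  return input_size >= 4 ? 3 : -1;  // -1 means "no bias input"
}

int main() {
  assert(BiasInputIndex(FrameworkType::CAFFE, 3) == 2);
  assert(BiasInputIndex(FrameworkType::TENSORFLOW, 4) == 3);
  assert(BiasInputIndex(FrameworkType::TENSORFLOW, 3) == -1);
  return 0;
}
```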
......@@ -47,40 +47,21 @@ static void Deconv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.AddInputFromArray<D, int32_t>("OutputShape", {4},
{batch, out_h, out_w, output_channels});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("OutputShape")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
{batch, out_h, out_w, output_channels},
true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
......
......@@ -41,40 +41,34 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ops::FrameworkType model_type) {
OpsTestNet net;
// Add input data
const index_t batch = input_shape[0];
const index_t out_channels = filter_shape[2];
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data);
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
// TODO(liutuo): remove the unused transform
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
ops::BufferType::CONV2D_FILTER);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
.AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef());
} else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("Input")
.Input("FilterOIHW")
.Input("OutputShape")
.Input("BiasImage")
.Output("OutputImage")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
......@@ -82,10 +76,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
}
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -102,7 +92,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef());
} else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
......@@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
int out_h = 0;
......@@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch,
output_shape.push_back(out_h);
output_shape.push_back(out_w);
output_shape.push_back(output_channels);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
} else {
paddings.push_back(padding);
paddings.push_back(padding);
......@@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch,
expected->Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("framework_type", model_type)
......@@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch,
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("BiasImage")
.Output("OutputImage")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntArg("framework_type", model_type)
......@@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch,
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4,
1e-4);
};
......
......@@ -36,23 +36,12 @@ void DepthToSpace(
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("Input")
.Output("Output")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("InputImage")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
"Output", NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage")
.Output("OutputImage")
.Input("Input")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
......@@ -134,28 +128,23 @@ void RandomTest(const int block_size,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImg")
.Input("Input")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
.Output("GPUOutput")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-5);
*net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3, 1e-4);
*net.GetOutput("GPUOutput"), 1e-3, 1e-4);
}
}
} // namespace
......
......@@ -34,8 +34,9 @@
#include "mace/public/mace.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/depthwise_conv2d.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/depthwise_conv2d.h"
#include "mace/ops/opencl/image/depthwise_conv2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
......@@ -490,11 +491,27 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters,
}
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
"Filter", {multiplier, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}, true);
} else {
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
"Filter", {kernel_h, kernel_w, input_channels, multiplier}, true);
net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>(
"Bias", {input_channels * multiplier});
"Bias", {input_channels * multiplier}, true);
}
if (D == DeviceType::CPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
......@@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......
This diff is collapsed.
......@@ -29,6 +29,7 @@
#include "mace/utils/utils.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/depthwise_deconv2d.h"
#endif // MACE_ENABLE_OPENCL
......@@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
......
......@@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{1, channels, kernel_h,
kernel_w});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
}
kernel_w}, true);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
......
This diff is collapsed.
......@@ -26,6 +26,7 @@
#include "mace/core/tensor.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/eltwise.h"
#endif // MACE_ENABLE_OPENCL
......@@ -1086,12 +1087,28 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::EltwiseKernel<T>(
type, coeff, scalar_input, scalar_input_index));
} else {
MACE_NOT_IMPLEMENTED;
}
// Transform filters
int input_size = operator_def_->input_size();
Workspace *ws = context->workspace();
for (int i = 0; i < input_size; ++i) {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input0 = this->Input(0);
......
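The EltwiseOp constructor now pre-transforms constant inputs: any tensor that is already present in the workspace and flagged as a weight (e.g. a scalar operand baked into the model) is converted to the ARGUMENT layout once, at construction, since activations only become available at Run(). A compact restatement with stand-in types:

```cpp
#include <cassert>
#include <vector>

// Stand-ins for the real workspace/tensor interfaces (hypothetical).
struct Tensor { bool is_weight = false; };

struct Workspace {
  std::vector<const Tensor *> tensors;  // nullptr if not yet materialized
  bool HasTensor(size_t i) const { return i < tensors.size() && tensors[i]; }
  const Tensor *GetTensor(size_t i) const { return tensors[i]; }
};

// Transform every constant input exactly once at construction time, as the
// Eltwise constructor above does: only tensors already in the workspace and
// flagged as weights qualify.
template <typename TransformFn>
int TransformConstantInputs(const Workspace &ws, int input_size,
                            TransformFn transform) {
  int transformed = 0;
  for (int i = 0; i < input_size; ++i) {
    if (ws.HasTensor(i) && ws.GetTensor(i)->is_weight) {
      transform(i);  // e.g. TransformFilter<T>(..., i, ARGUMENT, mem_type)
      ++transformed;
    }
  }
  return transformed;
}

int main() {
  Tensor activation, scalar_weight{true};
  Workspace ws{{&activation, &scalar_weight}};
  const int calls = TransformConstantInputs(ws, 2, [](int) {});
  assert(calls == 1);  // only the constant scalar gets pre-transformed
  return 0;
}
```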
This diff is collapsed. (8 collapsed files)
......@@ -31,6 +31,7 @@
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL
......@@ -351,11 +352,8 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::MatMulKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_UNUSED(context);
MACE_NOT_IMPLEMENTED;
}
MaceStatus Run(OpContext *context) override {
Validate();
......
This diff is collapsed. (2 collapsed files)
......@@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform(
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MaceStatus::MACE_SUCCESS;
}
......
This diff is collapsed. (67 collapsed files)