提交 e446bd65 编写于 作者: L liuqi

Feature: Support mixed usage of CPU and GPU

1. Support memory optimization at runtime
2. Support memory type transformation automatically at runtime.
3. Move gpu winograd convolution to Conv2D.
4. Support Data Type transformation automatically.
5. MaceTensor API support optional data type.
6. Add input_data_formats and output_data_formats at yaml
7. Related issue(#363)
上级 19dcf2c3
......@@ -69,9 +69,9 @@ in one deployment file.
- The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe).
If there are more than one tensors, use one line for a tensor.
* - input_shapes
- The shapes of the input tensors, in NHWC order.
- The shapes of the input tensors, default is NHWC order.
* - output_shapes
- The shapes of the output tensors, in NHWC order.
- The shapes of the output tensors, default is NHWC order.
* - input_ranges
- The numerical range of the input tensors' data, default [-1, 1]. It is only for test.
* - validation_inputs_data
......@@ -84,6 +84,10 @@ in one deployment file.
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
* - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats
- [optional] The format of the input tensors, one of [NONE, NHWC]. If there is no format of the input, please use NONE. If only one single format is specified, all inputs will use that format, default is NHWC order.
* - output_data_formats
- [optional] The format of the output tensors, one of [NONE, NHWC]. If there is no format of the output, please use NONE. If only one single format is specified, all outputs will use that format, default is NHWC order.
* - limit_opencl_kernel_time
- [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0.
* - obfuscate
......
# one yaml config file can contain multi device info
devices:
# The name of the device
nanopi:
# arm64 or armhf
target_abis: [arm64, armhf]
# device soc, you can get it from device manual
target_socs: RK3399
# device model full name
models: FriendlyElec Nanopi M4
# device ip address
address: 10.0.0.0
# login username
username: user
# login password; not required if you can log in to the device without a password
password: 1234567
raspberry:
target_abis: [armv7l]
target_socs: BCM2837
models: Raspberry Pi 3 Model B Plus Rev 1.3
address: 10.0.0.1
username: user
password: 123456
......@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
// Returns true iff the model's NetDef carries "quantize_flag" == 1.
bool IsQuantizedModel(const NetDef &net_def) {
  const int quantize_flag =
      ProtoArgHelper::GetOptionalArg<NetDef, int>(net_def, "quantize_flag", 0);
  return quantize_flag == 1;
}
} // namespace mace
......@@ -55,6 +55,8 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_;
};
bool IsQuantizedModel(const NetDef &def);
} // namespace mace
#endif // MACE_CORE_ARG_HELPER_H_
......@@ -233,6 +233,11 @@ class Image : public BufferBase {
}
}
// Data type of the elements stored in this image.
// The backing buffer must already exist; querying an unallocated image
// is a programming error.
inline DataType dtype() const {
  MACE_CHECK_NOTNULL(buf_);
  return data_type_;
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
......
......@@ -34,7 +34,7 @@ class Device {
#ifdef MACE_ENABLE_OPENCL
virtual OpenCLRuntime *opencl_runtime() = 0;
#endif
#endif // MACE_ENABLE_OPENCL
virtual CPURuntime *cpu_runtime() = 0;
virtual Allocator *allocator() = 0;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/memory_optimizer.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_set>
#include "mace/core/arg_helper.h"
#include "mace/core/macros.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
// Ops listed here forward their input buffer instead of producing a new
// allocation, so the optimizer lets their output share the input's block.
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOp = {
      "Reshape", "Identity", "Squeeze", "ExpandDims"};
  return kReuseOp.find(op_type) != kReuseOp.end();
}
// Increment the reference count of `tensor_name`.
// operator[] value-initializes a missing counter to 0 first, so a tensor
// seen for the first time ends up with a count of 1 — exactly the same
// result as the explicit insert-or-increment it replaces.
void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) {
  ++tensor_ref_count_[tensor_name];
}
// Account for one operator in the reference graph:
//  - each input that is already tracked gains one pending consumer;
//  - each output is registered with an initial reference count of zero
//    (consumers are added later when downstream ops are processed).
void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) {
  const int num_inputs = op_def->input_size();
  for (int idx = 0; idx < num_inputs; ++idx) {
    auto iter = tensor_ref_count_.find(op_def->input(idx));
    if (iter != tensor_ref_count_.end()) {
      iter->second += 1;
    }
  }
  const int num_outputs = op_def->output_size();
  for (int idx = 0; idx < num_outputs; ++idx) {
    // emplace is a no-op when the tensor is already registered.
    tensor_ref_count_.emplace(op_def->output(idx), 0);
  }
}
// Build the MemoryBlock describing the storage needed for a tensor of the
// given shape/data type on the given memory type.
//  - GPU_IMAGE: (x, y) is the 2-D OpenCL image extent.
//  - otherwise: x is the flat byte size (element size * product of dims),
//    y is fixed at 1.
MemoryBlock MemoryOptimizer::CreateMemoryBlock(
    std::vector<int64_t> shape,
    DataType dt,
    mace::MemoryType mem_type) {
  MemoryBlock block;
#ifdef MACE_ENABLE_OPENCL
  if (mem_type == MemoryType::GPU_IMAGE) {
    // Images only support 2-D or 4-D logical shapes; promote 2-D to 4-D.
    if (shape.size() == 2) {
      shape = {shape[0], 1, 1, shape[1]};
    } else {
      MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input";
    }
    std::vector<size_t> image_shape;
    OpenCLUtil::CalImage2DShape(shape,
                                OpenCLBufferType::IN_OUT_CHANNEL,
                                &image_shape);
    block.set_x(image_shape[0]);
    block.set_y(image_shape[1]);
    return block;
  }
#endif  // MACE_ENABLE_OPENCL
  MACE_UNUSED(mem_type);
  int64_t buffer_bytes = GetEnumTypeSize(dt);
  for (const int64_t dim : shape) {
    buffer_bytes *= dim;
  }
  block.set_x(buffer_bytes);
  block.set_y(1);
  return block;
}
// Greedy per-operator memory planning. For each output of `op_def`, either
// reuse an idle memory block (growing it if needed) or allocate a new one,
// then release the blocks of inputs whose last consumer this operator is.
// `mem_types` maps output tensor name -> memory type (used for GPU ops).
// Fix vs. original: removed an unused shadowing local `shape` vector that
// was re-declared (and never read) inside the non-reuse branch.
void MemoryOptimizer::Optimize(
    const mace::OperatorDef *op_def,
    const std::unordered_map<std::string, MemoryType> &mem_types) {
  MACE_LATENCY_LOGGER(2, "Optimize memory");
  // Without a shape for every output we cannot size blocks; skip this op.
  if (op_def->output_size() != op_def->output_shape_size()) {
    VLOG(1) << op_def->name()
            << ": the number of output shape "
            << "is not equal to the number of output";
    return;
  }
  auto device = static_cast<DeviceType>(op_def->device_type());
  // Op-level data type, used when no per-output type is given.
  DataType op_dtype = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
      *op_def,
      "T",
      static_cast<int>(DT_FLOAT)));
  MACE_CHECK(
      op_def->output_type_size() == 0 ||
          op_def->output_size() == op_def->output_type_size(),
      "operator output size != operator output type size",
      op_def->output_size(),
      op_def->output_type_size());
  DataType dt;
  int output_size = op_def->output_size();
  for (int i = 0; i < output_size; ++i) {
    if (i < op_def->output_type_size()) {
      dt = op_def->output_type(i);
    } else {
      dt = op_dtype;
    }
    int best_mem_id = -1;
    MemoryType mem_type = MemoryType::CPU_BUFFER;
    if (device == DeviceType::GPU) {
      mem_type = mem_types.at(op_def->output(i));
    }
    auto shape = std::vector<int64_t>(
        op_def->output_shape(i).dims().begin(),
        op_def->output_shape(i).dims().end());
    MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type);
    MemoryBlock best_mem_block;
    if (IsMemoryReuseOp(op_def->type())) {
      // Memory-reuse ops share their first input's block directly.
      if (tensor_mem_map_.count(op_def->input(0)) == 1) {
        best_mem_id = tensor_mem_map_[op_def->input(0)].first;
      }
    } else {
      int64_t op_mem_size = op_mem_block.x() * op_mem_block.y();
      int64_t best_added_mem_size = LLONG_MAX;
      int64_t best_wasted_mem_size = LLONG_MAX;
      int64_t old_mem_size = 0, new_mem_size = 0;
      MemoryBlock new_mem_block;
      // Scan idle blocks of the same memory type for the cheapest reuse.
      for (auto idle_mem_id : idle_blocks_) {
        if (mem_blocks_[idle_mem_id].mem_type() == mem_type) {
          if (mem_type == MemoryType::GPU_IMAGE) {
            // GPU Image could reuse memory with same data type only
            if (mem_blocks_[idle_mem_id].data_type() != dt) {
              continue;
            }
            old_mem_size =
                mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y();
            new_mem_block.set_x(std::max<int64_t>(mem_blocks_[idle_mem_id].x(),
                                                  op_mem_block.x()));
            new_mem_block.set_y(std::max<int64_t>(mem_blocks_[idle_mem_id].y(),
                                                  op_mem_block.y()));
            new_mem_size = new_mem_block.x() * new_mem_block.y();
          } else {
            old_mem_size = mem_blocks_[idle_mem_id].x();
            new_mem_size = std::max(op_mem_size, old_mem_size);
            new_mem_block.set_x(new_mem_size);
          }
          int64_t added_mem_size = new_mem_size - old_mem_size;
          int64_t wasted_mem_size = new_mem_size - op_mem_size;
          // minimize added_mem_size; if best_added_mem_size is 0,
          // then minimize wasted_mem_size
          if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size)
              || (best_added_mem_size == 0 &&
                  wasted_mem_size < best_wasted_mem_size)) {
            best_mem_id = idle_mem_id;
            best_added_mem_size = added_mem_size;
            best_wasted_mem_size = wasted_mem_size;
            best_mem_block = new_mem_block;
          }
        }
      }
      if (best_added_mem_size <= op_mem_size) {
        // Growing the best idle block is cheaper than a fresh allocation.
        best_mem_block.set_mem_id(best_mem_id);
        best_mem_block.set_data_type(dt);
        best_mem_block.set_mem_type(mem_type);
        mem_blocks_[best_mem_id] = best_mem_block;
        idle_blocks_.erase(best_mem_id);
      } else {
        // Allocate a brand-new block sized for this output.
        best_mem_id = static_cast<int>(mem_blocks_.size());
        best_mem_block.set_mem_id(best_mem_id);
        best_mem_block.set_data_type(dt);
        best_mem_block.set_mem_type(mem_type);
        best_mem_block.set_x(op_mem_block.x());
        best_mem_block.set_y(op_mem_block.y());
        mem_blocks_.push_back(best_mem_block);
      }
    }
    if (best_mem_id != -1) {
      if (mem_ref_count_.count(best_mem_id) == 1) {
        mem_ref_count_[best_mem_id] += 1;
      } else {
        mem_ref_count_[best_mem_id] = 1;
      }
      tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
    }
  }
  // De-reference input tensors; a block becomes idle (reusable) once the
  // reference counts of all tensors living in it drop to zero.
  int input_size = op_def->input_size();
  for (int i = 0; i < input_size; ++i) {
    auto &input_name = op_def->input(i);
    if (tensor_ref_count_.count(input_name) == 1) {
      tensor_ref_count_[input_name] -= 1;
      if (tensor_ref_count_.at(input_name) == 0 &&
          tensor_mem_map_.count(input_name) == 1) {
        int mem_id = tensor_mem_map_.at(input_name).first;
        mem_ref_count_[mem_id] -= 1;
        if (mem_ref_count_.at(mem_id) == 0) {
          idle_blocks_.insert(mem_id);
        }
      } else {
        MACE_CHECK(tensor_ref_count_.at(input_name) >= 0);
      }
    }
  }
}
// Read-only view of every memory block planned so far.
const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
  return mem_blocks_;
}
// Read-only mapping: tensor name -> <mem_id, data type> of its block.
const std::unordered_map<std::string, std::pair<int, DataType>>&
MemoryOptimizer::tensor_mem_map() const {
  return tensor_mem_map_;
}
std::string MemoryOptimizer::DebugInfo() const {
auto memory_type_to_str = [](const MemoryType type) -> std::string {
if (type == MemoryType::CPU_BUFFER) {
return "CPU_BUFFER";
} else if (type == MemoryType::GPU_BUFFER) {
return "GPU_BUFFER";
} else if (type == MemoryType::GPU_IMAGE) {
return "GPU_IMAGE";
} else {
return "UNKNOWN";
}
};
std::stringstream sstream;
sstream << "\n";
size_t block_size = mem_blocks_.size();
for (size_t i = 0; i < block_size; ++i) {
sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type())
<< " ";
if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) {
sstream << DataTypeToString(mem_blocks_[i].data_type()) << " "
"[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]";
} else {
sstream << "[" << mem_blocks_[i].x() << "]";
}
sstream << "\n";
}
return sstream.str();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_
#define MACE_CORE_MEMORY_OPTIMIZER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
// Describes one reusable chunk of device memory.
// For GPU images, (x, y) is the 2-D image extent; for flat buffers, x holds
// the byte size and y is 1 (see MemoryOptimizer::CreateMemoryBlock).
// Fix vs. original: members are brace-initialized. The optimizer copies
// partially-populated blocks around (e.g. a candidate block with only x/y
// set), which previously propagated indeterminate values.
class MemoryBlock {
 public:
  inline void set_mem_id(int mem_id) {
    mem_id_ = mem_id;
  }
  inline int mem_id() const {
    return mem_id_;
  }

  inline void set_data_type(DataType data_type) {
    data_type_ = data_type;
  }
  inline DataType data_type() const {
    return data_type_;
  }

  inline void set_mem_type(MemoryType mem_type) {
    mem_type_ = mem_type;
  }
  inline MemoryType mem_type() const {
    return mem_type_;
  }

  inline void set_x(int64_t x) {
    x_ = x;
  }
  inline int64_t x() const {
    return x_;
  }

  inline void set_y(int64_t y) {
    y_ = y;
  }
  inline int64_t y() const {
    return y_;
  }

 private:
  int mem_id_{-1};         // -1: not yet assigned by the optimizer
  DataType data_type_{};   // zero-initialized enum value
  MemoryType mem_type_{};  // zero-initialized enum value
  int64_t x_{0};
  int64_t y_{0};
};
// Greedy, lifetime-based memory planner.
// Usage: first call UpdateTensorRef(...) for every operator (and for the
// graph outputs by name) to build reference counts, then call Optimize(...)
// for each operator in execution order.
class MemoryOptimizer {
 public:
  // True for ops (Reshape, Identity, ...) that reuse their input's memory.
  static bool IsMemoryReuseOp(const std::string &op_type);
  // Increment the reference count of a single tensor by name.
  void UpdateTensorRef(const std::string &tensor_name);
  // Register an operator: bump counts of its inputs, register its outputs.
  void UpdateTensorRef(const OperatorDef *op_def);
  // Assign (or reuse) memory blocks for the outputs of one operator.
  // mem_types: output tensor name -> memory type, consulted for GPU ops.
  void Optimize(const OperatorDef *op_def,
                const std::unordered_map<std::string, MemoryType> &mem_types);

  const std::vector<MemoryBlock> &mem_blocks() const;

  const std::unordered_map<std::string,
                           std::pair<int, DataType>> &tensor_mem_map() const;

  std::string DebugInfo() const;

 private:
  // Compute the block extent for a tensor shape on the given memory type.
  MemoryBlock CreateMemoryBlock(std::vector<int64_t> shape,
                                DataType dt,
                                MemoryType mem_type);

 private:
  // tensor name -> number of not-yet-consumed references
  std::unordered_map<std::string, int> tensor_ref_count_;
  std::vector<MemoryBlock> mem_blocks_;
  // tensor name : <mem_id, data_type>
  // Buffer memory does not distinguish data types, so the data type is
  // recorded here alongside the block id.
  std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
  // mem_id -> number of live tensors currently occupying the block
  std::unordered_map<int, int> mem_ref_count_;
  // ids of blocks whose tensors are all dead and may be reused
  std::set<int> idle_blocks_;
};
} // namespace mace
#endif // MACE_CORE_MEMORY_OPTIMIZER_H_
......@@ -18,6 +18,7 @@
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/core/op_context.h"
#include "mace/public/mace.h"
......@@ -25,13 +26,94 @@
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace {
// Book-keeping for a tensor produced inside the net: where it currently
// lives and which operator produced it, used to decide whether a memory/
// data-type transform op must be inserted before a consumer.
struct InternalOutputInfo {
  InternalOutputInfo(const MemoryType mem_type,
                     const DataType dtype,
                     const std::vector<index_t> &shape,
                     int op_idx)
      : mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {}

  MemoryType mem_type;  // memory type the tensor currently resides in
  DataType dtype;
  std::vector<index_t> shape;  // tensor shape
  int op_idx;  // index of the operation which generated the tensor
               // (-1 for graph inputs)
};
#ifdef MACE_ENABLE_OPENCL
// Compose the unique name "<input>_mem_type_<type>" used for the tensor
// produced by a memory-type transform of `input_name`.
std::string TransformedName(const std::string &input_name,
                            const mace::MemoryType mem_type) {
  std::ostringstream name_stream;
  name_stream << input_name << "_mem_type_" << mem_type;
  return name_stream.str();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
// Create one Operation from `op_def`, choosing the device to run it on:
// the target device when the op supports it, otherwise CPU fallback.
// Also sets the construct context's output memory type and, for CPU ops in
// non-quantized models, rewrites 4-D NHWC output shapes to NCHW.
// Fix vs. original: `return std::move(op);` replaced by `return op;` —
// returning a local by name allows copy elision / implicit move, whereas
// the explicit std::move only inhibits NRVO (C++ Core Guidelines F.48).
std::unique_ptr<Operation> SerialNet::CreateOperation(
    const OpRegistryBase *op_registry,
    OpConstructContext *construct_context,
    std::shared_ptr<OperatorDef> op_def,
    DataFormat data_format_flag,
    bool is_quantize_model) {
  // Create the Operation
  DeviceType target_device_type = target_device_->device_type();
  // Get available devices
  auto available_devices = op_registry->AvailableDevices(op_def->type());
  // Find the device type to run the op.
  // If the target_device_type is in available devices, use it,
  // otherwise, fall back to the CPU device.
  DeviceType device_type = DeviceType::CPU;
  construct_context->set_device(cpu_device_);
  construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
  for (auto device : available_devices) {
    if (device == target_device_type) {
      device_type = target_device_type;
      construct_context->set_device(target_device_);
      if (target_device_->device_type() == DeviceType::GPU) {
        construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
      }
      break;
    }
  }
  op_def->set_device_type(device_type);
  // Transpose output shape if run on CPU (default format is NHWC)
  if (!is_quantize_model && device_type == DeviceType::CPU &&
      op_def->output_shape_size() == op_def->output_size()) {
    for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
      if (data_format_flag == NHWC &&
          op_def->output_shape(out_idx).dims_size() == 4) {
        // NHWC -> NCHW
        std::vector<index_t> output_shape =
            TransposeShape<index_t, index_t>(
                std::vector<index_t>(
                    op_def->output_shape(out_idx).dims().begin(),
                    op_def->output_shape(out_idx).dims().end()),
                {0, 3, 1, 2});
        for (int i = 0; i < 4; ++i) {
          op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
        }
      }
    }
  }
  construct_context->set_operator_def(op_def);
  std::unique_ptr<Operation> op(
      op_registry->CreateOperation(construct_context, device_type));
  return op;
}
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode)
MemoryOptimizer *mem_optimizer)
: NetBase(),
ws_(ws),
target_device_(target_device),
......@@ -40,49 +122,213 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Create Operations
DeviceType target_device_type = target_device_->device_type();
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
// add input information
MemoryType target_mem_type;
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
//
DataFormat data_format_flag = NHWC;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// Only could be NONE or NHWC
auto input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (!is_quantize_model &&
input_data_format == NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
} else if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#ifdef MACE_ENABLE_OPENCL
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
// Create the Operation
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(target_device_type));
if (op_device == target_device_type) {
// Get available devices (sorted based on priority)
OperatorDef temp_def(operator_def);
auto available_devices = op_registry->AvailableDevices(temp_def.type());
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context.set_device(cpu_device_);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context.set_device(target_device_);
break;
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
data_format_flag,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
const DataType dt =
static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
// the outputs' memory type of the operation
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether is the output tensor of other operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
dt, out_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
temp_def.set_device_type(device_type);
construct_context.set_operator_def(&temp_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(&construct_context, device_type, mode));
if (op) {
operators_.emplace_back(std::move(op));
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
}
#endif // MACE_ENABLE_OPENCL
operators_.emplace_back(std::move(op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
auto output_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
}
}
}
#endif // MACE_ENABLE_OPENCL
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
}
// Do memory optimization
for (auto &op : operators_) {
VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type()
<< ", " << op->debug_def().type() << ">";
mem_optimizer->Optimize(op->operator_def().get(), output_mem_map);
}
VLOG(1) << mem_optimizer->DebugInfo();
}
MaceStatus SerialNet::Init() {
MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
OpInitContext init_context(ws_);
// TODO(liuqi): where to do memory reuse.
if (target_device_->device_type() == DeviceType::GPU) {
}
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
DeviceType device_type = op->device_type();
......@@ -98,18 +344,18 @@ MaceStatus SerialNet::Init() {
}
MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
// TODO(liuqi): In/Out Buffer Transform
MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net");
OpContext context(ws_, cpu_device_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
DeviceType device_type = op->device_type();
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(), ">",
". mem_id: ",
MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size()));
MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(),
", ",
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op->debug_def(), "T", static_cast<int>(DT_FLOAT)),
">");
if (device_type == target_device_->device_type()) {
context.set_device(target_device_);
} else {
......@@ -176,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
max_v = std::max(max_v, output_data[j]);
min_v = std::min(min_v, output_data[j]);
......@@ -192,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::vector<int> bin_distribution(bin_size, 0);
float bin_v = (max_v - min_v) / bin_size;
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
int ind = static_cast<int>((output_data[j] - min_v) / bin_v);
if (ind < 0)
ind = 0;
else if (ind > bin_size-1)
ind = bin_size-1;
bin_distribution[ind]++;
int index = static_cast<int>((output_data[j] - min_v) / bin_v);
if (index < 0)
index = 0;
else if (index > bin_size-1)
index = bin_size-1;
bin_distribution[index]++;
}
LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
<< "@@" << min_v << "," << max_v<< "@@"
......
......@@ -27,6 +27,7 @@ namespace mace {
class RunMetadata;
class Workspace;
class MemoryOptimizer;
class NetBase {
public:
......@@ -47,12 +48,20 @@ class SerialNet : public NetBase {
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode = NetMode::NORMAL);
MemoryOptimizer * mem_optimizer);
MaceStatus Init() override;
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat input_format,
bool is_quantize_model = false);
protected:
Workspace *ws_;
Device *target_device_;
......
......@@ -23,16 +23,12 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {}
OpConstructContext::OpConstructContext(OperatorDef *operator_def,
Workspace *ws,
Device *device)
: operator_def_(operator_def), ws_(ws), device_(device) {}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context)
: operator_def_(std::make_shared<OperatorDef>(*(context->operator_def())))
: operator_def_(context->operator_def())
{}
MaceStatus Operation::Init(OpInitContext *context) {
......@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) {
": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor);
}
// TODO(liuqi): filter transform
for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) {
// TODO(liuqi): Workspace should pre-allocate all of the output tensors
outputs_.push_back(ws->GetTensor(output_str));
} else {
MACE_CHECK(
......@@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) {
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type)));
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
}
return MaceStatus::MACE_SUCCESS;
......@@ -164,33 +157,34 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const {
OperatorDef *operator_def = context->operator_def();
const DataType dtype = static_cast<DataType>(
DeviceType device_type) const {
auto operator_def = context->operator_def();
DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "mode", static_cast<int>(NetMode::NORMAL));
const NetMode op_mode = static_cast<NetMode>(op_mode_i);
VLOG(3) << "Creating operator " << operator_def->name() << "("
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
if (op_mode == mode) {
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
} else {
return nullptr;
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
}
} // namespace mace
......@@ -33,14 +33,13 @@ namespace mace {
class OpConstructContext {
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device);
~OpConstructContext() = default;
inline void set_operator_def(OperatorDef *operator_def) {
inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
inline OperatorDef *operator_def() const {
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
}
......@@ -56,10 +55,19 @@ class OpConstructContext {
return device_;
}
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
private:
OperatorDef *operator_def_;
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
MemoryType output_mem_type_; // used for transform memory
};
// memory_optimizer, device
......@@ -137,6 +145,10 @@ class Operation {
inline bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected:
std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_;
......@@ -190,8 +202,7 @@ class OpRegistryBase {
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const;
DeviceType device_type) const;
template <class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
......
......@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime(
is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN),
mem_type_(MemoryType::GPU_IMAGE) {
mem_type_(MemoryType::GPU_IMAGE),
scratch_image_manager_(new ScratchImageManager) {
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
......@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_;
}
ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
return scratch_image_manager_.get();
}
} // namespace mace
......@@ -25,6 +25,7 @@
#include "mace/core/file_storage.h"
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/scratch_image.h"
#include "mace/proto/mace.pb.h"
#include "mace/utils/string_util.h"
#include "mace/utils/timer.h"
......@@ -82,6 +83,7 @@ class OpenCLRuntime {
uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
ScratchImageManager *scratch_image_manager() const;
bool is_opencl_avaliable();
// TODO(liuqi): remove this function in the future, make decision at runtime.
bool UseImageMemory();
......@@ -134,6 +136,7 @@ class OpenCLRuntime {
OpenCLVersion opencl_version_;
GPUType gpu_type_;
MemoryType mem_type_;
std::unique_ptr<ScratchImageManager> scratch_image_manager_;
// All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library.
std::shared_ptr<cl::Context> context_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_util.h"
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
namespace {
// [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic, H * W * (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1];
(*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* MIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[1]);
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[0]);
(*image_shape)[1] = 1;
}
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic * H * W, (Oc + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[0]);
}
// [(Ic + 3) / 4 * H * W, Oc]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3];
(*image_shape)[1] = shape[0];
}
} // namespace
// Compute the 2D OpenCL image extents for a tensor of the given logical
// shape, dispatching on the buffer layout type. `wino_block_size` is only
// used for WINOGRAD_FILTER layouts.
void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                                 const OpenCLBufferType type,
                                 std::vector<size_t> *image_shape,
                                 const int wino_block_size) {
  MACE_CHECK_NOTNULL(image_shape);
  switch (type) {
    case CONV2D_FILTER:
      return CalConv2dFilterImageShape(shape, image_shape);
    case DW_CONV2D_FILTER:
      return CalDepthwiseConv2dFilterImageShape(shape, image_shape);
    case IN_OUT_CHANNEL:
      return CalInOutputImageShape(shape, image_shape);
    case ARGUMENT:
      return CalArgImageShape(shape, image_shape);
    case IN_OUT_HEIGHT:
      return CalInOutHeightImageShape(shape, image_shape);
    case IN_OUT_WIDTH:
      return CalInOutWidthImageShape(shape, image_shape);
    case WINOGRAD_FILTER:
      return CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
    case WEIGHT_HEIGHT:
      return CalWeightHeightImageShape(shape, image_shape);
    case WEIGHT_WIDTH:
      return CalWeightWidthImageShape(shape, image_shape);
    default:
      LOG(FATAL) << "Mace not supported yet.";
  }
}
// Build an OperatorDef describing a BufferTransform node that converts
// `input_name` into `output_name` with the requested data type and target
// memory type. The input shape, when known, is recorded as the output shape.
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
    const std::string &input_name,
    const std::vector<mace::index_t> &input_shape,
    const std::string &output_name,
    const mace::DataType dt,
    const mace::MemoryType mem_type) {
  auto op = std::make_shared<OperatorDef>();
  op->set_name("mace_node_" + output_name);
  op->set_type("BufferTransform");
  op->add_input(input_name);
  op->add_output(output_name);
  // Small helper to append an integer-valued argument to the op.
  auto add_int_arg = [&op](const std::string &arg_name, const int64_t value) {
    Argument *arg = op->add_arg();
    arg->set_name(arg_name);
    arg->set_i(value);
  };
  add_int_arg("buffer_type",
              static_cast<int32_t>(OpenCLBufferType::IN_OUT_CHANNEL));
  add_int_arg("mem_type", static_cast<int32_t>(mem_type));
  add_int_arg("T", static_cast<int32_t>(dt));
  add_int_arg("device", DeviceType::GPU);
  if (!input_shape.empty()) {
    OutputShape *shape = op->add_output_shape();
    for (const auto dim : input_shape) {
      shape->add_dims(dim);
    }
  }
  return op;
}
} // namespace mace
......@@ -12,33 +12,43 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_TRANSFORMER_H_
#define MACE_KERNELS_TRANSFORMER_H_
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#include "mace/core/transformer.h"
#include "mace/ops/opencl/common.h"
#include <memory>
#include <string>
#include <vector>
#include "mace/core/types.h"
namespace mace {
class OpContext;
namespace ops {
enum OpenCLBufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
ARGUMENT = 2,
IN_OUT_HEIGHT = 3,
IN_OUT_WIDTH = 4,
WINOGRAD_FILTER = 5,
DW_CONV2D_FILTER = 6,
WEIGHT_HEIGHT = 7,
WEIGHT_WIDTH = 8,
};
class Transformer : public TransformerBase {
class OpenCLUtil {
public:
// Transform source tensor to target.
std::vector<std::unique_ptr<OperatorDef>> ConstructTranformOp(
OperatorDef *op_def,
bool transform_filter = true) override;
private:
std::unique_ptr<OperatorDef> DoTransform(
mace::OperatorDef *op_def,
const int input_idx,
static void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const OpenCLBufferType type,
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const BufferType buffer_type,
const MemoryType mem_type);
};
} // namespace ops
} // namespace mace
#endif // MACE_KERNELS_TENSOR_TRANSFORMER_H_
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/scratch_image.h"
#include <utility>
#include <vector>
namespace mace {
ScratchImageManager::ScratchImageManager() = default;
ScratchImageManager::~ScratchImageManager() = default;
// Return an unused scratch Image that is at least `shape` large and holds
// data of type `dt`, reusing an existing image when possible and allocating
// a new one otherwise. On success `*id` receives the image's slot index
// (to be released later via Deactive) and the image's reference count is
// incremented. Returns nullptr if allocation of a new image fails.
Image *ScratchImageManager::Spawn(
    Allocator *allocator,
    const std::vector<size_t> &shape,
    const DataType dt,
    int *id) {
  // TODO(liuqi): not optimal memory reuse strategy
  int found_image_idx = -1;
  const int image_count = static_cast<int>(reference_count_.size());
  for (int i = 0; i < image_count; ++i) {
    // A candidate is reusable when it is unreferenced, stores the same data
    // type, and both of its 2D extents cover the requested shape.
    // BUG FIX: the candidate must be indexed with `i`; the previous code
    // indexed `images_` with the reference count (always 0 here), so every
    // iteration inspected image 0 instead of image i.
    if (reference_count_[i] == 0 && images_.at(i)->dtype() == dt) {
      auto image_shape = images_.at(i)->image_shape();
      if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) {
        found_image_idx = i;
        break;
      }
    }
  }
  // No reusable image found: allocate a fresh one in a new slot.
  if (found_image_idx == -1) {
    reference_count_.push_back(0);
    images_[image_count] = std::unique_ptr<Image>(new Image(allocator));
    if (images_.at(image_count)->Allocate(shape, dt) !=
        MaceStatus::MACE_SUCCESS) {
      return nullptr;
    }
    found_image_idx = image_count;
    VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape)
            << "<" << dt << ">";
  }
  reference_count_[found_image_idx] += 1;
  *id = found_image_idx;
  return images_.at(found_image_idx).get();
}
// Release one reference on the scratch image identified by `id`.
// `id` must be a valid slot with a positive reference count.
void ScratchImageManager::Deactive(int id) {
  const size_t idx = static_cast<size_t>(id);
  MACE_CHECK(reference_count_.size() > idx && reference_count_[id] > 0,
             "Image id ", id, " exceed the vector size ",
             reference_count_.size());
  --reference_count_[id];
}
// RAII handle over a scratch image slot: acquiring via Scratch() records the
// slot id, and the destructor releases it back to the manager.
ScratchImage::ScratchImage(ScratchImageManager *manager)
    : manager_(manager), id_(-1) {}

ScratchImage::~ScratchImage() {
  // Only release if an image was actually acquired.
  if (id_ < 0) {
    return;
  }
  manager_->Deactive(id_);
}

Image *ScratchImage::Scratch(Allocator *allocator,
                             const std::vector<size_t> &shape,
                             const DataType dt) {
  // Delegate to the manager; it stores the acquired slot index into id_,
  // which the destructor later uses to release the image.
  Image *image = manager_->Spawn(allocator, shape, dt, &id_);
  return image;
}
} // namespace mace
......@@ -12,39 +12,47 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
#include "mace/core/buffer.h"
namespace mace {
class OpContext;
class Tensor;
class ScratchImageManager {
public:
ScratchImageManager();
~ScratchImageManager();
Image *Spawn(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id);
namespace ops {
void Deactive(int id);
class OpenCLWinogradTransformKernel {
public:
virtual MaceStatus Compute(
OpContext *context,
const Tensor *input,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel);
private:
std::unordered_map<int, std::unique_ptr<Image>> images_;
std::vector<int> reference_count_;
};
class OpenCLWinogradInverseTransformKernel {
class ScratchImage {
public:
virtual MaceStatus Compute(
OpContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel);
explicit ScratchImage(ScratchImageManager *);
~ScratchImage();
Image *Scratch(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt);
private:
ScratchImageManager *manager_;
int id_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
......@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
}
} // namespace numerical_chars
enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 };
class Tensor {
public:
......@@ -223,7 +223,7 @@ class Tensor {
}
inline MemoryType memory_type() const {
MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" );
MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty");
if (buffer_->OnHost()) {
return MemoryType::CPU_BUFFER;
} else if (typeid(*buffer_) == typeid(Image)) {
......@@ -233,6 +233,14 @@ class Tensor {
}
}
inline void set_data_format(DataFormat data_format) {
data_format_ = data_format;
}
inline DataFormat data_format() const {
return data_format_;
}
#ifdef MACE_ENABLE_OPENCL
inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), name_, " do not have image");
......@@ -499,6 +507,7 @@ class Tensor {
int32_t zero_point_;
float minval_;
float maxval_;
DataFormat data_format_; // used for 4D input/output tensor
MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
};
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_TRANSFORMER_H_
#define MACE_CORE_TRANSFORMER_H_
#include "mace/proto/mace.pb.h"
namespace mace {
// Abstract interface for constructing data/memory transform operations
// (e.g. layout or memory-type conversion) for a given operator definition.
class TransformerBase {
 public:
  // Polymorphic base: a virtual destructor is required so that deleting a
  // derived transformer through a TransformerBase pointer is well-defined.
  virtual ~TransformerBase() = default;
  // Construct transform operation(s) for `op_def`. When `transform_filter`
  // is true, filter inputs are transformed as well.
  virtual std::vector<std::unique_ptr<OperatorDef>> ConstructTranformOp(
      OperatorDef *op_def,
      bool transform_filter = true) = 0;
};
} // namespace mace
#endif // MACE_CORE_TRANSFORMER_H_
......@@ -18,6 +18,7 @@
#include <utility>
#include "mace/core/arg_helper.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -27,13 +28,6 @@
namespace mace {
namespace {
bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
static const std::unordered_set<std::string> reuse_buffer_ops {
"Reshape", "Identity", "Squeeze"
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) {
if (op.type() == "Quantize") {
......@@ -48,13 +42,14 @@ Workspace::Workspace() = default;
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
DataType type) {
DataType type,
bool is_weight) {
if (HasTensor(name)) {
VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
false, name));
is_weight, name));
}
return GetTensor(name);
}
......@@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
fused_buffer_ = true;
}
}
return MaceStatus::MACE_SUCCESS;
}
if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status;
MaceStatus Workspace::PreallocateOutputTensor(
const mace::NetDef &net_def,
const mace::MemoryOptimizer *mem_optimizer,
Device *device) {
auto &mem_blocks = mem_optimizer->mem_blocks();
for (auto &mem_block : mem_blocks) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", memory type: " << mem_block.mem_type()
<< ", size: " << mem_block.x() << "x" << mem_block.y();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{static_cast<size_t>(mem_block.x()),
static_cast<size_t>(mem_block.y())}, mem_block.data_type()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first),
tensor_mem.second.second,
false, tensor_mem.first));
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
if (device_type == DeviceType::CPU) {
// add quantize info for output tensors.
if (device->device_type() == DeviceType::CPU) {
for (const auto &op : net_def.op()) {
VLOG(2) << "Add quantize info for op: " << op.name();
MACE_CHECK(op.quantize_info().empty()
......@@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id,
// as CPU&GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type && !op.mem_id().empty()) {
const DataType op_dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
break;
}
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
}
// TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.device_type() == device_type) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", device type: " << mem_block.device_type()
<< ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
}
VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type) {
if (!op.mem_id().empty()
&& ShouldPreallocateMemoryForOp(op)) {
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = dtype;
}
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
output_type, false, op.output(i)));
if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
} else {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
}
tensor_map_[op.output(i)] = std::move(tensor);
}
} else {
for (int i = 0; i < op.output().size(); ++i) {
MACE_CHECK(
op.output_type_size() == 0
|| op.output_size()
== op.output_type_size(),
"operator output size != operator output type size",
op.output_size(),
op.output_type_size());
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
op, "T", static_cast<int>(DT_FLOAT)));
}
CreateTensor(op.output(i),
device->allocator(),
output_type);
}
}
for (int output_idx = 0; output_idx < op.output_shape_size();
++output_idx) {
std::vector<index_t>
shape_configured(op.output_shape(output_idx).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = op.output_shape(output_idx).dims(dim);
}
tensor_map_[op.output(output_idx)]->SetShapeConfigured(
shape_configured);
}
}
}
return MaceStatus::MACE_SUCCESS;
}
void Workspace::RemoveUnusedBuffer() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
......@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
tensor_buffer_.reset(nullptr);
}
// Remove the tensor with the given name from the workspace, if present.
void Workspace::RemoveTensor(const std::string &name) {
  // std::map::erase(key) is a no-op when the key is absent, so no explicit
  // find/contains check is needed.
  tensor_map_.erase(name);
}
} // namespace mace
......@@ -27,6 +27,8 @@
namespace mace {
class MemoryOptimizer;
class Workspace {
public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
......@@ -36,7 +38,8 @@ class Workspace {
Tensor *CreateTensor(const std::string &name,
Allocator *alloc,
DataType type);
DataType type,
bool is_weight = false);
inline bool HasTensor(const std::string &name) const {
return tensor_map_.find(name) != tensor_map_.end();
......@@ -52,12 +55,19 @@ class Workspace {
Device *device,
const unsigned char *model_data);
MaceStatus PreallocateOutputTensor(const NetDef &net_def,
const MemoryOptimizer *mem_optimizer,
Device *device);
void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data,
Allocator *alloc);
void RemoveTensor(const std::string &name);
private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
Device *device);
......
......@@ -20,9 +20,11 @@
#include <memory>
#include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/ops_registry.h"
#include "mace/ops/transpose.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
// Check OpenCL avaliable
auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) {
LOG(WARNING) << "The device does not support OpenCL";
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
......@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
auto opencl_max_image_size = runtime->GetMaxImage2DSize();
if (opencl_max_image_size.empty()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
const std::vector<int64_t> net_max_image_size =
ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
*net_def, "opencl_max_image_size", {0, 0});
if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
|| static_cast<uint64_t>(net_max_image_size[1])
> opencl_max_image_size[1]) {
LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
<< " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -288,14 +269,17 @@ class MaceTensor::Impl {
public:
std::vector<int64_t> shape;
std::shared_ptr<float> data;
DataFormat format;
};
MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data) {
std::shared_ptr<float> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get());
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = shape;
impl_->data = data;
impl_->format = format;
}
MaceTensor::MaceTensor() {
......@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
}
// NOTE(review): the parameter is a *const* rvalue reference, so the members
// cannot actually be moved from -- this behaves exactly like a copy
// constructor (shape, data pointer and data format are copied).
MaceTensor::MaceTensor(const MaceTensor &&other) {
  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
  impl_->shape = other.shape();
  impl_->data = other.data();
  impl_->format = other.data_format();
}
// Copy assignment: copies the shape and data format, and shares the
// underlying float buffer via the shared_ptr (shallow copy of the data).
MaceTensor &MaceTensor::operator=(const MaceTensor &other) {
  impl_->shape = other.shape();
  impl_->data = other.data();
  impl_->format = other.data_format();
  return *this;
}
// NOTE(review): the parameter is a *const* rvalue reference, so nothing is
// actually moved -- this is equivalent to copy assignment.
MaceTensor &MaceTensor::operator=(const MaceTensor &&other) {
  impl_->shape = other.shape();
  impl_->data = other.data();
  impl_->format = other.data_format();
  return *this;
}
......@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
// Returns the data format (e.g. NHWC) this tensor's data was created with.
DataFormat MaceTensor::data_format() const {
  return impl_->format;
}
// Mace Engine
class MaceEngine::Impl {
public:
......@@ -355,6 +347,14 @@ class MaceEngine::Impl {
std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata);
private:
MaceStatus TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor);
MaceStatus TransposeOutput(const Tensor *output_tensor,
std::pair<const std::string, MaceTensor> *output);
private:
const unsigned char *model_data_;
size_t model_data_size_;
......@@ -363,11 +363,12 @@ class MaceEngine::Impl {
std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_;
std::map<std::string, mace::OutputInfo> output_info_map_;
bool is_quantized_model_;
#ifdef MACE_ENABLE_HEXAGON
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
std::map<std::string, mace::InputInfo> input_info_map_;
std::map<std::string, mace::OutputInfo> output_info_map_;
MACE_DISABLE_COPY_AND_ASSIGN(Impl);
};
......@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
device_type_(config.impl_->device_type()),
device_(nullptr),
ws_(new Workspace()),
net_(nullptr)
net_(nullptr),
is_quantized_model_(false)
#ifdef MACE_ENABLE_HEXAGON
, hexagon_controller_(nullptr)
#endif
......@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
}
#endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
// Get input and output information.
for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info;
......@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
ws_->CreateTensor(MakeString("mace_input_node_", input_name),
device_->allocator(), DT_FLOAT);
ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
}
for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) {
......@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's outputs "
<< MakeString(MapKeys(output_info_map_));
}
ws_->CreateTensor(MakeString("mace_output_node_", output_name),
device_->allocator(), DT_FLOAT);
}
#ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) {
......@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init(
device_.get(),
model_data));
MemoryOptimizer mem_optimizer;
// Init model
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
NetMode::INIT));
MACE_RETURN_IF_ERROR(net->Init());
MACE_RETURN_IF_ERROR(net->Run());
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get()));
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
MACE_RETURN_IF_ERROR(net_->Init());
#ifdef MACE_ENABLE_HEXAGON
}
......@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() {
#endif
}
// Copies one user-supplied input into its workspace tensor, transposing the
// data layout when the runtime expects a different format:
//   * float CPU models: 4-D NHWC inputs are transposed to NCHW;
//   * GPU or quantized models: 4-D NCHW inputs are transposed to NHWC;
//   * everything else (non-4D, already-matching format, NONE) is copied
//     verbatim after resizing the tensor to the caller's shape.
// Returns the status of the transpose/copy; the tensor is resized to the
// (possibly transposed) shape before any data is written.
MaceStatus MaceEngine::Impl::TransposeInput(
    const std::pair<const std::string, MaceTensor> &input,
    Tensor *input_tensor) {
  // Shared tail of both transpose branches: tag the tensor with its new
  // format, resize to the permuted shape, map it, and run the transpose.
  auto transpose_to = [&](DataFormat dst_format,
                          const std::vector<int> &dst_dims) -> MaceStatus {
    input_tensor->set_data_format(dst_format);
    std::vector<index_t> output_shape =
        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
    Tensor::MappingGuard input_guard(input_tensor);
    float *input_data = input_tensor->mutable_data<float>();
    return ops::Transpose(input.second.data().get(),
                          input.second.shape(),
                          dst_dims,
                          input_data);
  };

  if (device_->device_type() == DeviceType::CPU &&
      input.second.shape().size() == 4 &&
      input.second.data_format() == DataFormat::NHWC &&
      !is_quantized_model_) {
    VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
    return transpose_to(DataFormat::NCHW, {0, 3, 1, 2});
  } else if (
      (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
      input.second.shape().size() == 4 &&
      input.second.data_format() == DataFormat::NCHW) {
    VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
    return transpose_to(DataFormat::NHWC, {0, 2, 3, 1});
  } else {
    // Formats already agree (or the input is not 4-D): plain copy.
    input_tensor->set_data_format(input.second.data_format());
    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
    Tensor::MappingGuard input_guard(input_tensor);
    float *input_data = input_tensor->mutable_data<float>();
    memcpy(input_data, input.second.data().get(),
           input_tensor->size() * sizeof(float));
    return MaceStatus::MACE_SUCCESS;
  }
}
// Copies a finished workspace output tensor back into the caller-provided
// MaceTensor, transposing the layout when the runtime format differs from
// the format the caller recorded on the output:
//   * CPU, 4-D, formats differ: runtime result must be NCHW; transpose to
//     the caller's NHWC layout;
//   * GPU, 4-D, formats differ: transpose NHWC<->NCHW, direction chosen
//     from the runtime tensor's format;
//   * otherwise: shapes must match exactly and the data is memcpy'd.
// Returns MACE_INVALID_ARGS when the runtime tensor or user buffer is null.
MaceStatus MaceEngine::Impl::TransposeOutput(
const mace::Tensor *output_tensor,
std::pair<const std::string, mace::MaceTensor> *output) {
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (device_->device_type() == DeviceType::CPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
// Float CPU runtime computes in NCHW; anything else is unexpected here.
MACE_CHECK(output_tensor->data_format() == NCHW);
VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
// The caller-visible shape must equal the transposed runtime shape.
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else if (device_->device_type() == DeviceType::GPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
// Default direction is NHWC -> NCHW; flip it when the runtime tensor
// is already NCHW (so the caller wants NHWC back).
std::vector<int> dst_dims = {0, 3, 1, 2};
if (output_tensor->data_format() == NCHW) {
dst_dims = {0, 2, 3, 1};
}
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else {
// Formats agree (or output is not 4-D): validate shape and copy as-is.
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
std::memcpy(output->second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
} else {
// Either the graph produced no tensor or the caller supplied no buffer.
return MaceStatus::MACE_INVALID_ARGS;
}
}
MaceStatus MaceEngine::Impl::Run(
const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs,
......@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.first));
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
{
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
}
Tensor *input_tensor = ws_->GetTensor(input.first);
MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor));
input_tensors.push_back(input_tensor);
}
for (auto &output : *outputs) {
......@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's outputs: "
<< MakeString(MapKeys(output_info_map_));
}
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first));
Tensor *output_tensor = ws_->GetTensor(output.first);
output_tensors.push_back(output_tensor);
}
#ifdef MACE_ENABLE_HEXAGON
......@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run(
}
#endif
for (auto &output : *outputs) {
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first));
Tensor *output_tensor = ws_->GetTensor(output.first);
// save output
if (output_tensor != nullptr && output.second.data() != nullptr) {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output.second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(output.second.shape())
<< " != " << MakeString<int64_t>(shape);
std::memcpy(output.second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output));
}
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -14,7 +14,6 @@ mace {
*mace*NetDef*;
*mace*MemoryType*;
*mace*DataType*;
*mace*MemoryArena*;
*mace*InputInfo*;
*mace*OutputInfo*;
*mace*OutputShape*;
......
......@@ -30,10 +30,8 @@ cc_library(
"arm/*_test.cc",
"ops_registry.cc",
"ops_test_util.cc",
"buffer_inverse_transform.cc",
"buffer_transformer.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
"quantize.cc",
],
) + if_opencl_enabled(glob(
......@@ -41,10 +39,8 @@ cc_library(
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transformer.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
exclude = [
"opencl/*_test.cc",
......
......@@ -90,7 +90,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
}
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type)
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
......
......@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, float>("Alpha", {channels});
net.AddRandomInput<D, T>("Alpha", {channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluBM")
.Input("InputImage")
.Input("AlphaImage")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -310,27 +262,17 @@ void SigmoidBenchmark(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -30,32 +30,14 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
......@@ -129,34 +93,15 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -179,34 +124,15 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -232,43 +158,36 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}, true);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "PreluTest")
.Input("InputImage")
.Input("Input")
.Input("Alpha")
.Output("OutputImage")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Activation", "PreluTest")
.Input("Input")
.Input("InputNCHW")
.Input("Alpha")
.Output("Output")
.Output("OutputNCHW")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
}
if (D == DeviceType::CPU) {
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
......@@ -288,32 +207,14 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
......@@ -341,32 +242,14 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
......
......@@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
}
if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(),
ops::BufferType::IN_OUT_CHANNEL);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -62,39 +62,15 @@ void SimpleAdd3() {
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1},
{-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4;
if (D == DeviceType::GPU) {
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
auto expected =
net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
......@@ -138,28 +114,10 @@ void RandomTest() {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
// run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2,
1e-2);
}
}
......
......@@ -158,14 +158,16 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
}
// Transform filters
int input_size = operator_def_->input_size();
for (int i = 0; i < input_size; ++i) {
for (int i = 1; i < input_size; ++i) {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
if (input_tensor != nullptr && input_tensor->is_weight()) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
......
......@@ -36,13 +36,12 @@ void BatchNorm(
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddRandomInput<D, T>("Scale", {channels}, true);
net.AddRandomInput<D, T>("Offset", {channels}, true);
net.AddRandomInput<D, T>("Mean", {channels}, true);
net.AddRandomInput<D, T>("Var", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BatchNorm", "BatchNormBM")
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
......@@ -50,30 +49,8 @@ void BatchNorm(
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// tuning
setenv("MACE_TUNING", "1", 1);
......
......@@ -28,10 +28,10 @@ void Simple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
net.AddInputFromArray<D, float>("Mean", {1}, {10});
net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}, true);
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}, true);
net.AddInputFromArray<D, float>("Mean", {1}, {10}, true);
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
......@@ -49,32 +49,17 @@ void Simple() {
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
// Check
......@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Tuning
......@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-5, 1e-4);
}
......@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
......@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-1, 1e-2);
}
......@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// tuning
......@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-5, 1e-4);
}
......@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
......@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-1, 1e-2);
}
......
......@@ -32,23 +32,13 @@ void BMBatchToSpace(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::CPU) {
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
......
......@@ -108,7 +108,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type)
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
MaceStatus Run(OpContext *context) override {
......
......@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net;
// Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Bias", {channels}, true);
net.AddRandomInput<D, T>("Bias", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BiasAdd", "BiasAddBM")
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input")
.Input("Bias")
.AddIntArg("data_format", NCHW)
.AddIntArg("data_format", data_format)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("InputImage")
.Input("BiasImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -28,7 +28,7 @@ void BiasAddSimple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
......@@ -44,22 +44,13 @@ void BiasAddSimple() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
// Run on gpu
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
......@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
// Run on gpu
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace test
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
// Primary template: only the GPU specialization below is defined; other
// device types are intentionally unsupported for this op.
template <DeviceType D, class T>
class BufferInverseTransformOp;

// GPU op that transforms an OpenCL image (or GPU buffer) representation of a
// tensor back into a plain GPU buffer layout — the inverse of the
// BufferTransform op.  The concrete kernel is chosen once at construction
// time based on whether the OpenCL runtime is using image memory.
template <typename T>
class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
 public:
  explicit BufferInverseTransformOp(OpConstructContext *context)
      : Operation(context),
        // "wino_block_size" controls the Winograd tiling variant the
        // buffer layout was produced with; default 2 matches the
        // transform-side default in this version of the code.
        wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
    if (context->device()->opencl_runtime()->UseImageMemory()) {
      // Image memory in use: convert the OpenCL image back to a buffer.
      kernel_.reset(new opencl::image::ImageToBuffer<T>);
    } else {
      // Buffer-only path: undo the buffer-layout transform instead.
      kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
    }
  }
  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(0);
    Tensor *output = this->Output(0);
    // "buffer_type" describes which packed layout the input currently has
    // (filter, in/out channel, argument, ...); default is CONV2D_FILTER.
    // NOTE(review): this still uses ops::BufferType while other parts of
    // this commit migrate to OpenCLBufferType — presumably equivalent
    // enums mid-migration; confirm against the kernel's Compute signature.
    ops::BufferType type =
        static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
            "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
    return kernel_->Compute(context, input, type,
                            wino_blk_size_, output);
  }

 private:
  const int wino_blk_size_;
  // Device-specific kernel selected in the constructor; owns no extra state.
  std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
};
// Registers the BufferInverseTransform op with the global op registry for
// the GPU device, in both float and half precision variants.  CPU/DSP
// variants are deliberately not registered (GPU-only op).
void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
                   BufferInverseTransformOp, DeviceType::GPU, float);
  MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
                   BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace ops
} // namespace mace
......@@ -14,6 +14,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters,
mace::testing::StopTiming();
OpsTestNet net;
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width});
// Create output
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpDefBuilder("BufferToImage", "BufferToImageBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
auto transform_func = [&]() {
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context,
net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
b2i_output);
};
// Warm-up
net.Setup(D);
for (int i = 0; i < 5; ++i) {
net.Run();
transform_func();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
transform_func();
}
net.Sync();
}
......
......@@ -14,6 +14,7 @@
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace {
namespace ops {
......@@ -21,31 +22,27 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{1});
}
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, half>(OpenCLBufferType::ARGUMENT,
{11});
}
TEST(BufferToImageTest, ArgMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 1});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 2, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 13, 17, 128});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 64, 64, 256});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{5, 3, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{13, 17, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{512, 128, 1, 1});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{3, 5, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{17, 13, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{256, 128, 3, 3});
}
TEST(BufferToImageTest, WeightWidthSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{1, 3, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{1, 3, 3, 3});
}
TEST(BufferToImageTest, WeightWidthMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{11, 13, 13, 17});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightWidthLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{64, 64, 11, 13});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{64, 64, 11, 13});
}
TEST(BufferToImageTest, WeightHeightSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{2, 1, 1, 1});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{2, 1, 1, 1});
}
TEST(BufferToImageTest, WeightHeightMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{11, 13, 13, 17});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightHeightLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{64, 16, 11, 13});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{64, 16, 11, 13});
}
namespace {
template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type,
void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{11});
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT,
{11});
}
namespace {
template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type,
void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape,
const unsigned char *input_data) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2));
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{2}, input_data);
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT, {2}, input_data);
}
} // namespace test
......
......@@ -28,34 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)),
out_mem_type_(MemoryType::GPU_BUFFER),
transformer_(nullptr) {
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
if (context->device()->opencl_runtime()->UseImageMemory()) {
out_mem_type_ = MemoryType::GPU_IMAGE;
}
transformer_.reset(new OpenCLBufferTransformer<T>(in_mem_type,
out_mem_type_));
}
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)),
out_mem_type_(static_cast<MemoryType>(Operation::GetOptionalArg<int>(
"mem_type", static_cast<int>(MemoryType::GPU_IMAGE)))) {}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
return transformer_->Transform(
context, input, type, wino_blk_size_, out_mem_type_, output);
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
private:
const int wino_blk_size_;
MemoryType out_mem_type_;
std::unique_ptr<OpenCLBufferTransformer<T>> transformer_;
};
......
......@@ -15,6 +15,7 @@
#include <cstring>
#include "gtest/gtest.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase {
namespace {
template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type,
void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
// Run
net.RunOp(DeviceType::GPU);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("TransformedOutput")
.Output("Output")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
Tensor *bt_output = net.ws()->CreateTensor(
"BtOutput", context.device()->allocator(),
DataTypeToEnum<DstType>::value);
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, bt_output);
// Inverse Transform
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
......@@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL,
TestBidirectionTransform<float, half>(OpenCLBufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
namespace {
template <typename T>
void TestArgumentTransform(const index_t input_size) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run
net.RunOp(DeviceType::GPU);
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, output);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]);
EXPECT_EQ(expected_size, output->buffer_shape()[0]);
// Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor,
ExpectTensorNear<T>(*net.GetTensor("Input"), *output,
1e-3, 1e-4);
}
} // namespace
......
......@@ -36,23 +36,11 @@ void ChannelShuffle(
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Softmax", "SoftmaxBM")
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("OutputImage")
.Input("Input")
.Output("Output")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = net.CreateTensor<float>(
{1, 1, 2, 16},
......
......@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation {
public:
explicit ConcatOpBase(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)) {}
axis_(Operation::GetOptionalArg<int>("axis", 3)),
checked_(false) {}
protected:
void Validate() {
......@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation {
protected:
int axis_;
bool checked_;
};
template <DeviceType D, class T>
......@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Validate();
if (!checked_) {
Validate();
if (this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
}
checked_ = true;
}
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
const Tensor *input0 = inputs.front();
......
......@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128);
namespace {
template <typename T>
void OpenclConcatHelper(int iters,
void OpenCLConcatHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int concat_dim) {
......@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0")
.Input("InputImage1")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", concat_dim)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters,
#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
OpenCLConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
......
......@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) {
static unsigned int seed = time(NULL);
int dim = 5;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
int axis = 1;
// Construct graph
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
static unsigned int seed = time(NULL);
int dim = 4;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
int axis = 1;
int axis_arg = 3; // NHWC
// Construct graph
OpsTestNet net;
......@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
std::vector<index_t> output_shape = input_shapes[0];
output_shape[axis] = concat_axis_size;
net.AddRandomInput<DeviceType::CPU, float>(
"Output", output_shape, true, true);
"Output", output_shape, false, true, true);
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(MakeString("Input", i));
}
builder.AddIntArg("axis", axis)
builder.AddIntArg("axis", axis_arg)
.Output("Output")
.Finalize(net.NewOperatorDef());
......@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
net.RunOp();
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"QuantizedOutput", output_shape, true, true);
"QuantizedOutput", output_shape, false, true, true);
auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest");
for (int i = 0; i < num_inputs; ++i) {
q_builder = q_builder.Input(MakeString("QuantizedInput", i));
......@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i);
const std::string image_name = MakeString("InputImage", i);
concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
ops::BufferType::IN_OUT_CHANNEL);
}
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
const std::string image_name = MakeString("InputImage", i);
const std::string image_name = MakeString("Input", i);
builder = builder.Input(image_name);
}
builder.AddIntArg("axis", axis)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto output = net.GetOutput("Output");
......
......@@ -959,8 +959,9 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
: ConvPool2dOpBase(context),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
......@@ -969,13 +970,32 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
context->device()->opencl_runtime(),
context->workspace()->GetTensor(
operator_def_->input(1))->shape(),
std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
operator_def_->output_shape(0).dims().end()),
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
......@@ -987,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_,
output);
wino_block_size_, output);
}
private:
const ActivationType activation_;
const float relux_max_limit_;
std::unique_ptr<OpenCLConv2dKernel> kernel_;
int wino_block_size_;
private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
......
......@@ -49,11 +49,10 @@ void Conv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
......@@ -63,26 +62,6 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters,
"Input", {batch, height, width, channels});
net.GetTensor("Input")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {output_channels, kernel_h, kernel_w, channels});
"Filter", {output_channels, kernel_h, kernel_w, channels}, true);
net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels});
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}, true);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......
This diff is collapsed.
......@@ -24,7 +24,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *dilations,
const int *strides,
Padding padding,
......@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *padding_size,
const int *dilations,
const int *strides,
......
......@@ -35,7 +35,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *dilations,
const int *strides,
Padding padding,
......@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,
void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *padding_size,
const int *dilations,
const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Verifies NetMode::INIT semantics: a net constructed with NetMode::INIT runs
// only ops explicitly tagged with mode == INIT (here the BufferTransform),
// while untagged ops (the BufferInverseTransform) are deferred to a normally
// constructed net. Finally checks the two transforms round-trip the data.
TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
std::unique_ptr<Tuner<uint32_t>> tuner;
Workspace ws;
// First op: buffer -> transformed layout, tagged to run in the INIT phase.
op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]);
// Create a 1x3x3x3 input tensor filled with ones on the GPU allocator.
Tensor *input = ws.CreateTensor("Input", device->allocator(),
DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3});
{
// MappingGuard scopes the host mapping of device memory to this block.
Tensor::MappingGuard input_mapper(input);
float *input_data = input->mutable_data<float>();
std::fill(input_data, input_data + input->size(), 1);
}
// Second op: inverse transform back to a plain buffer; no "mode" arg, so it
// should NOT run during the INIT-mode pass.
op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.Finalize(&op_defs[op_defs.size() - 1]);
NetDef net_def;
for (auto &op_def : op_defs) {
net_def.add_op()->CopyFrom(op_def);
}
std::shared_ptr<OpRegistry> op_registry(new OpRegistry());
// Pass 1: INIT-mode net — expect only the INIT-tagged op to execute.
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry.get(), &net_def, &ws, device,
NetMode::INIT));
MaceStatus status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
// INIT pass produced the intermediate tensor but not the final output.
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
// Pass 2: default-mode net — the remaining (untagged) op now runs.
net = std::unique_ptr<NetBase>(new SerialNet(
op_registry.get(), &net_def, &ws, device));
status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
// Transform + inverse transform should reproduce the original values.
ExpectTensorNear<float>(*ws.GetTensor("Input"), *ws.GetTensor("Output"),
1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6);
namespace {
template <typename T>
void OpenclCropHelper(int iters,
void OpenCLCropHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int crop_axis,
......@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropBM")
.Input("InputImage0")
.Input("InputImage1")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset})
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters,
_##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
OpenclCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
OpenCLCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##TYPE)
......
......@@ -34,14 +34,10 @@ void RunCrop(const std::vector<index_t> &input_shape,
net.AddRandomInput<D, float>("Input1", input_shape2);
if (D == GPU) {
BufferToImage<D, float>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropTest")
.Input("InputImage0")
.Input("InputImage1")
.Output("OutputImage")
.Input("Input0")
.Input("Input1")
.Output("Output")
.AddIntsArg("offset", offset)
.AddIntArg("axis", axis)
.Finalize(net.NewOperatorDef());
......@@ -66,10 +62,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
// Run
net.RunOp(D);
if (D == GPU) {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
}
......
......@@ -30,6 +30,7 @@
#include "mace/ops/arm/deconv_2d_neon.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/deconv_2d.h"
#endif // MACE_ENABLE_OPENCL
......@@ -358,11 +359,27 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::Deconv2dKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
......@@ -47,40 +47,21 @@ static void Deconv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.AddInputFromArray<D, int32_t>("OutputShape", {4},
{batch, out_h, out_w, output_channels});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("OutputShape")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
{batch, out_h, out_w, output_channels},
true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
......
......@@ -41,40 +41,34 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ops::FrameworkType model_type) {
OpsTestNet net;
// Add input data
const index_t batch = input_shape[0];
const index_t out_channels = filter_shape[2];
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data);
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
// TODO(liutuo): remove the unused transform
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
ops::BufferType::CONV2D_FILTER);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
.AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef());
} else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("Input")
.Input("FilterOIHW")
.Input("OutputShape")
.Input("BiasImage")
.Output("OutputImage")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
......@@ -82,10 +76,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
}
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -102,7 +92,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef());
} else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
......@@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
int out_h = 0;
......@@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch,
output_shape.push_back(out_h);
output_shape.push_back(out_w);
output_shape.push_back(output_channels);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
} else {
paddings.push_back(padding);
paddings.push_back(padding);
......@@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch,
expected->Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("framework_type", model_type)
......@@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch,
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("BiasImage")
.Output("OutputImage")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntArg("framework_type", model_type)
......@@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch,
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4,
1e-4);
};
......
......@@ -36,23 +36,12 @@ void DepthToSpace(
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("Input")
.Output("Output")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("InputImage")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
"Output", NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage")
.Output("OutputImage")
.Input("Input")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
......@@ -134,28 +128,23 @@ void RandomTest(const int block_size,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImg")
.Input("Input")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
.Output("GPUOutput")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-5);
*net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3, 1e-4);
*net.GetOutput("GPUOutput"), 1e-3, 1e-4);
}
}
} // namespace
......
......@@ -499,13 +499,17 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
context,
operator_def_.get(),
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
......
......@@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters,
}
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
"Filter", {multiplier, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}, true);
} else {
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
"Filter", {kernel_h, kernel_w, input_channels, multiplier}, true);
net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>(
"Bias", {input_channels * multiplier});
"Bias", {input_channels * multiplier}, true);
}
if (D == DeviceType::CPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
......@@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......
This diff is collapsed.
......@@ -29,6 +29,7 @@
#include "mace/utils/utils.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/depthwise_deconv2d.h"
#endif // MACE_ENABLE_OPENCL
......@@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
......
......@@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{1, channels, kernel_h,
kernel_w});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
}
kernel_w}, true);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
......
This diff is collapsed.
......@@ -1097,13 +1097,16 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
}
// Transform filters
int input_size = operator_def_->input_size();
Workspace *ws = context->workspace();
for (int i = 0; i < input_size; ++i) {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
if (input_tensor != nullptr && input_tensor->is_weight()) {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
}
......
......@@ -30,37 +30,23 @@ void EltwiseBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input0", {n, h, w, c});
net.AddRandomInput<D, T>("Input1", {n, h, w, c});
if (D == DeviceType::GPU) {
BufferToImage<D, half>(&net, "Input0", "InputImg0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, half>(&net, "Input1", "InputImg1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("InputImg0")
.Input("InputImg1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", {1.2, 2.1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
.Finalize(net.NewOperatorDef());
net.AddRandomInput<D, T>("Input0", {n, h, w, c});
net.AddRandomInput<D, T>("Input1", {n, h, w, c});
} else {
net.TransformDataFormat<D, float>("Input0", NHWC,
"TInput0", NCHW);
net.TransformDataFormat<D, float>("Input1", NHWC,
"TInput1", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", {1.2, 2.1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("Output")
.Finalize(net.NewOperatorDef());
net.AddRandomInput<D, T>("Input0", {n, c, h, w});
net.AddRandomInput<D, T>("Input1", {n, c, h, w});
}
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input0")
.Input("Input1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", {1.2, 2.1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("Output")
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
......
此差异已折叠。
此差异已折叠。
......@@ -202,11 +202,14 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
}
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type)
== MaceStatus::MACE_SUCCESS);
context,
operator_def_.get(),
1,
OpenCLBufferType::WEIGHT_WIDTH,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type)
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -31,6 +31,7 @@
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL
......@@ -351,11 +352,8 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::MatMulKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_UNUSED(context);
MACE_NOT_IMPLEMENTED;
}
MaceStatus Run(OpContext *context) override {
Validate();
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册