diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 2163c44c1563e35404cf51eadd36c27f717969d0..1c32b7996e0255bc288b3310fd2db76a952e1112 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -69,9 -69,9 @@ in one deployment file. - The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe). If there are more than one tensors, use one line for a tensor. * - input_shapes - - The shapes of the input tensors, in NHWC order. + - The shapes of the input tensors, in NHWC order by default. * - output_shapes - - The shapes of the output tensors, in NHWC order. + - The shapes of the output tensors, in NHWC order by default. * - input_ranges - The numerical range of the input tensors' data, default [-1, 1]. It is only for test. * - validation_inputs_data @@ -84,6 +84,10 @@ in one deployment file. - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. * - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. + * - input_data_formats - [optional] The data formats of the input tensors, one of [NONE, NHWC]. Use NONE if an input has no data format. If only a single format is specified, it applies to all inputs; default is NHWC. + * - output_data_formats - [optional] The data formats of the output tensors, one of [NONE, NHWC]. Use NONE if an output has no data format. If only a single format is specified, it applies to all outputs; default is NHWC. + * - limit_opencl_kernel_time - [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0. 
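For illustration, the two new fields sit next to the existing tensor fields in a model's subgraph definition of the deployment file; a minimal sketch follows, in which the tensor names and shapes are placeholders rather than part of this change:

    input_tensors:
      - input
    input_shapes:
      - 1,224,224,3
    input_data_formats:
      - NHWC
    output_tensors:
      - output
    output_shapes:
      - 1,1001
    output_data_formats:
      - NONE
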
* - obfuscate diff --git a/docs/user_guide/devices/demo_device_nanopi.yml b/docs/user_guide/devices/demo_device_nanopi.yml new file mode 100644 index 0000000000000000000000000000000000000000..567f7c7e1ce08af39134527d9eae825a688cb76f --- /dev/null +++ b/docs/user_guide/devices/demo_device_nanopi.yml @@ -0,0 +1,23 @@ +# one yaml config file can contain multi device info +devices: + # The name of the device + nanopi: + # arm64 or armhf + target_abis: [arm64, armhf] + # device soc, you can get it from device manual + target_socs: RK3399 + # device model full name + models: FriendlyElec Nanopi M4 + # device ip address + address: 10.0.0.0 + # login username + username: user + # login password, is required when you can login into device without password + password: 1234567 + raspberry: + target_abis: [armv7l] + target_socs: BCM2837 + models: Raspberry Pi 3 Model B Plus Rev 1.3 + address: 10.0.0.1 + username: user + password: 123456 diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index 60fb38f7d71895db95ccd1ec88a765b5fecfc5cc..cd3c4d1f2071a547f4a5c034629b45deefe74b28 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false) MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) #undef MACE_GET_REPEATED_ARGUMENT_FUNC + + +bool IsQuantizedModel(const NetDef &net_def) { + return + ProtoArgHelper::GetOptionalArg(net_def, "quantize_flag", 0) + == 1; +} + } // namespace mace diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 50ec4eade9c05eb12d0b555595a665e590a14965..238b0800e5f971287bb1e85c592b9ea5af01eaf3 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -55,6 +55,8 @@ class ProtoArgHelper { std::map arg_map_; }; +bool IsQuantizedModel(const NetDef &def); + } // namespace mace #endif // MACE_CORE_ARG_HELPER_H_ diff --git a/mace/core/buffer.h b/mace/core/buffer.h index 521ccc82820597275adc387a3cd47e235e52df81..c859268f818d998983d610333636f187195e8aea 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -233,6 +233,11 @@ class Image : public BufferBase { } } + inline DataType dtype() const { + MACE_CHECK_NOTNULL(buf_); + return data_type_; + } + void *buffer() { MACE_CHECK_NOTNULL(buf_); return buf_; diff --git a/mace/core/device.h b/mace/core/device.h index bfa00b02f95c3fe9ab5af78dcc264f79ecc679df..b7fe5f329b99401d31b04af102b2ca1d32d06bff 100644 --- a/mace/core/device.h +++ b/mace/core/device.h @@ -34,7 +34,7 @@ class Device { #ifdef MACE_ENABLE_OPENCL virtual OpenCLRuntime *opencl_runtime() = 0; -#endif +#endif // MACE_ENABLE_OPENCL virtual CPURuntime *cpu_runtime() = 0; virtual Allocator *allocator() = 0; diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..f773befca2f686bc17062ecf2cde19f22c68a81e --- /dev/null +++ b/mace/core/memory_optimizer.cc @@ -0,0 +1,270 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/memory_optimizer.h" + +#include +#include +#include +#include +#include + +#include "mace/core/arg_helper.h" +#include "mace/core/macros.h" +#include "mace/utils/logging.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { + +bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) { + static const std::unordered_set kReuseOp = { + "Reshape", "Identity", "Squeeze", "ExpandDims" + }; + return kReuseOp.count(op_type) == 1; +} + +void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) { + if (tensor_ref_count_.count(tensor_name) == 0) { + tensor_ref_count_.emplace(tensor_name, 1); + } else { + tensor_ref_count_[tensor_name] += 1; + } +} + +void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) { + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (tensor_ref_count_.count(op_def->input(i)) == 1) { + tensor_ref_count_[op_def->input(i)] += 1; + } + } + int output_size = op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (tensor_ref_count_.count(op_def->output(i)) == 0) { + tensor_ref_count_.emplace(op_def->output(i), 0); + } + } +} + +MemoryBlock MemoryOptimizer::CreateMemoryBlock( + std::vector shape, + DataType dt, + mace::MemoryType mem_type) { + MemoryBlock block; +#ifdef MACE_ENABLE_OPENCL + if (mem_type == MemoryType::GPU_IMAGE) { + std::vector image_shape; + if (shape.size() == 2) { + shape = {shape[0], 1, 1, shape[1]}; + } else { + MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input"; + } + OpenCLUtil::CalImage2DShape(shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + block.set_x(image_shape[0]); + block.set_y(image_shape[1]); + return block; + } +#endif // MACE_ENABLE_OPENCL + MACE_UNUSED(mem_type); + int64_t op_mem_size = std::accumulate(shape.begin(), + shape.end(), + GetEnumTypeSize(dt), + std::multiplies()); + block.set_x(op_mem_size); + block.set_y(1); + return block; +} + +void MemoryOptimizer::Optimize( + const mace::OperatorDef *op_def, + const std::unordered_map &mem_types) { + MACE_LATENCY_LOGGER(2, "Optimize memory"); + if (op_def->output_size() != op_def->output_shape_size()) { + VLOG(1) << op_def->name() + << ": the number of output shape " + << "is not equal to the number of output"; + return; + } + + auto device = static_cast(op_def->device_type()); + DataType op_dtype = static_cast(ProtoArgHelper::GetOptionalArg( + *op_def, + "T", + static_cast(DT_FLOAT))); + MACE_CHECK( + op_def->output_type_size() == 0 || + op_def->output_size() == op_def->output_type_size(), + "operator output size != operator output type size", + op_def->output_size(), + op_def->output_type_size()); + DataType dt; + + int output_size = op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (i < op_def->output_type_size()) { + dt = op_def->output_type(i); + } else { + dt = op_dtype; + } + int best_mem_id = -1; + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (device == DeviceType::GPU) { + mem_type = mem_types.at(op_def->output(i)); + } + auto shape = std::vector( + op_def->output_shape(i).dims().begin(), + op_def->output_shape(i).dims().end()); + MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type); + MemoryBlock best_mem_block; + if (IsMemoryReuseOp(op_def->type())) { + if (tensor_mem_map_.count(op_def->input(0)) == 1) { + best_mem_id = 
tensor_mem_map_[op_def->input(0)].first; + } + } else { + auto shape = std::vector( + op_def->output_shape(i).dims().begin(), + op_def->output_shape(i).dims().end()); + + int64_t op_mem_size = op_mem_block.x() * op_mem_block.y(); + int64_t best_added_mem_size = LLONG_MAX; + int64_t best_wasted_mem_size = LLONG_MAX; + + int64_t old_mem_size = 0, new_mem_size = 0; + MemoryBlock new_mem_block; + for (auto idle_mem_id : idle_blocks_) { + if (mem_blocks_[idle_mem_id].mem_type() == mem_type) { + if (mem_type == MemoryType::GPU_IMAGE) { + // GPU Image could reuse memory with same data type only + if (mem_blocks_[idle_mem_id].data_type() != dt) { + continue; + } + old_mem_size = + mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y(); + new_mem_block.set_x(std::max(mem_blocks_[idle_mem_id].x(), + op_mem_block.x())); + new_mem_block.set_y(std::max(mem_blocks_[idle_mem_id].y(), + op_mem_block.y())); + new_mem_size = new_mem_block.x() * new_mem_block.y(); + } else { + old_mem_size = mem_blocks_[idle_mem_id].x(); + new_mem_size = std::max(op_mem_size, old_mem_size); + new_mem_block.set_x(new_mem_size); + } + int64_t added_mem_size = new_mem_size - old_mem_size; + int64_t wasted_mem_size = new_mem_size - op_mem_size; + // minimize add_mem_size; if best_mem_add_size is 0, + // then minimize waste_mem_size + if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size) + || (best_added_mem_size == 0 && + wasted_mem_size < best_wasted_mem_size)) { + best_mem_id = idle_mem_id; + best_added_mem_size = added_mem_size; + best_wasted_mem_size = wasted_mem_size; + best_mem_block = new_mem_block; + } + } + } + + if (best_added_mem_size <= op_mem_size) { + best_mem_block.set_mem_id(best_mem_id); + best_mem_block.set_data_type(dt); + best_mem_block.set_mem_type(mem_type); + mem_blocks_[best_mem_id] = best_mem_block; + idle_blocks_.erase(best_mem_id); + } else { + best_mem_id = static_cast(mem_blocks_.size()); + best_mem_block.set_mem_id(best_mem_id); + best_mem_block.set_data_type(dt); + best_mem_block.set_mem_type(mem_type); + best_mem_block.set_x(op_mem_block.x()); + best_mem_block.set_y(op_mem_block.y()); + mem_blocks_.push_back(best_mem_block); + } + } + + if (best_mem_id != -1) { + if (mem_ref_count_.count(best_mem_id) == 1) { + mem_ref_count_[best_mem_id] += 1; + } else { + mem_ref_count_[best_mem_id] = 1; + } + tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt); + } + } + + // de-refer input tensors + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + auto &input_name = op_def->input(i); + if (tensor_ref_count_.count(input_name) == 1) { + tensor_ref_count_[input_name] -= 1; + if (tensor_ref_count_.at(input_name) == 0 && + tensor_mem_map_.count(input_name) == 1) { + int mem_id = tensor_mem_map_.at(input_name).first; + mem_ref_count_[mem_id] -= 1; + if (mem_ref_count_.at(mem_id) == 0) { + idle_blocks_.insert(mem_id); + } + } else { + MACE_CHECK(tensor_ref_count_.at(input_name) >= 0); + } + } + } +} + +const std::vector& MemoryOptimizer::mem_blocks() const { + return mem_blocks_; +} + +const std::unordered_map>& + MemoryOptimizer::tensor_mem_map() const { + return tensor_mem_map_; +} + +std::string MemoryOptimizer::DebugInfo() const { + auto memory_type_to_str = [](const MemoryType type) -> std::string { + if (type == MemoryType::CPU_BUFFER) { + return "CPU_BUFFER"; + } else if (type == MemoryType::GPU_BUFFER) { + return "GPU_BUFFER"; + } else if (type == MemoryType::GPU_IMAGE) { + return "GPU_IMAGE"; + } else { + return "UNKNOWN"; + } + }; + 
std::stringstream sstream; + sstream << "\n"; + size_t block_size = mem_blocks_.size(); + for (size_t i = 0; i < block_size; ++i) { + sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type()) + << " "; + if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) { + sstream << DataTypeToString(mem_blocks_[i].data_type()) << " " + "[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]"; + } else { + sstream << "[" << mem_blocks_[i].x() << "]"; + } + sstream << "\n"; + } + + return sstream.str(); +} + +} // namespace mace diff --git a/mace/core/memory_optimizer.h b/mace/core/memory_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..fa24a206f1e42a2c2f3d5e7dde16b648dc19e2b0 --- /dev/null +++ b/mace/core/memory_optimizer.h @@ -0,0 +1,110 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_ +#define MACE_CORE_MEMORY_OPTIMIZER_H_ + +#include +#include +#include +#include +#include + +#include "mace/proto/mace.pb.h" +#include "mace/core/types.h" + +namespace mace { + +class MemoryBlock { + public: + inline void set_mem_id(int mem_id) { + mem_id_ = mem_id; + } + + inline int mem_id() const { + return mem_id_; + } + + inline void set_data_type(DataType data_type) { + data_type_ = data_type; + } + + inline DataType data_type() const { + return data_type_; + } + + inline void set_mem_type(MemoryType mem_type) { + mem_type_ = mem_type; + } + + inline MemoryType mem_type() const { + return mem_type_; + } + + inline void set_x(int64_t x) { + x_ = x; + } + + inline int64_t x() const { + return x_; + } + + inline void set_y(int64_t y) { + y_ = y; + } + + inline int64_t y() const { + return y_; + } + + private: + int mem_id_; + DataType data_type_; + MemoryType mem_type_; + int64_t x_; + int64_t y_; +}; + +class MemoryOptimizer { + public: + static bool IsMemoryReuseOp(const std::string &op_type); + void UpdateTensorRef(const std::string &tensor_name); + void UpdateTensorRef(const OperatorDef *op_def); + void Optimize(const OperatorDef *op_def, + const std::unordered_map &mem_types); + + const std::vector &mem_blocks() const; + + const std::unordered_map> &tensor_mem_map() const; + + std::string DebugInfo() const; + + private: + MemoryBlock CreateMemoryBlock(std::vector shape, + DataType dt, + MemoryType mem_type); + + private: + std::unordered_map tensor_ref_count_; + std::vector mem_blocks_; + // tensor name : + // Buffer Memory do not different data type, so store the data type. 
+ std::unordered_map> tensor_mem_map_; + std::unordered_map mem_ref_count_; + std::set idle_blocks_; +}; + +} // namespace mace +#endif // MACE_CORE_MEMORY_OPTIMIZER_H_ diff --git a/mace/core/net.cc b/mace/core/net.cc index 1fe5b0e947fb4d7f9f5d82b91e48f6096cdfcb8b..279724f6e791623923e8772b5db88a4bb8293413 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -18,6 +18,7 @@ #include "mace/core/future.h" #include "mace/core/macros.h" +#include "mace/core/memory_optimizer.h" #include "mace/core/net.h" #include "mace/core/op_context.h" #include "mace/public/mace.h" @@ -25,13 +26,94 @@ #include "mace/utils/timer.h" #include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL + namespace mace { +namespace { +struct InternalOutputInfo { + InternalOutputInfo(const MemoryType mem_type, + const DataType dtype, + const std::vector &shape, + int op_idx) + : mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {} + + MemoryType mem_type; // transformed memory type + DataType dtype; + std::vector shape; // tensor shape + int op_idx; // operation which generate the tensor +}; + +#ifdef MACE_ENABLE_OPENCL +std::string TransformedName(const std::string &input_name, + const mace::MemoryType mem_type) { + std::stringstream ss; + ss << input_name << "_mem_type_" << mem_type; + return ss.str(); +} +#endif // MACE_ENABLE_OPENCL + +} // namespace + +std::unique_ptr SerialNet::CreateOperation( + const OpRegistryBase *op_registry, + OpConstructContext *construct_context, + std::shared_ptr op_def, + DataFormat data_format_flag, + bool is_quantize_model) { + // Create the Operation + DeviceType target_device_type = target_device_->device_type(); + // Get available devices + auto available_devices = op_registry->AvailableDevices(op_def->type()); + // Find the device type to run the op. + // If the target_device_type in available devices, use target_device_type, + // otherwise, fallback to CPU device. 
+ DeviceType device_type = DeviceType::CPU; + construct_context->set_device(cpu_device_); + construct_context->set_output_mem_type(MemoryType::CPU_BUFFER); + for (auto device : available_devices) { + if (device == target_device_type) { + device_type = target_device_type; + construct_context->set_device(target_device_); + if (target_device_->device_type() == DeviceType::GPU) { + construct_context->set_output_mem_type(MemoryType::GPU_IMAGE); + } + break; + } + } + op_def->set_device_type(device_type); + // transpose output shape if run on CPU (default format is NHWC) + if (!is_quantize_model && device_type == DeviceType::CPU && + op_def->output_shape_size() == op_def->output_size()) { + for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { + if (data_format_flag == NHWC && + op_def->output_shape(out_idx).dims_size() == 4) { + // NHWC -> NCHW + std::vector output_shape = + TransposeShape( + std::vector( + op_def->output_shape(out_idx).dims().begin(), + op_def->output_shape(out_idx).dims().end()), + {0, 3, 1, 2}); + for (int i = 0; i < 4; ++i) { + op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]); + } + } + } + } + construct_context->set_operator_def(op_def); + std::unique_ptr op( + op_registry->CreateOperation(construct_context, device_type)); + return std::move(op); +} + SerialNet::SerialNet(const OpRegistryBase *op_registry, const NetDef *net_def, Workspace *ws, Device *target_device, - const NetMode mode) + MemoryOptimizer *mem_optimizer) : NetBase(), ws_(ws), target_device_(target_device), @@ -40,49 +122,213 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, target_device->cpu_runtime()->policy(), target_device->cpu_runtime()->use_gemmlowp())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - // Create Operations - DeviceType target_device_type = target_device_->device_type(); + // output tensor : related information + std::unordered_map output_map; + // used for memory optimization + std::unordered_map output_mem_map; + std::unordered_map transformed_map; + // add input information + MemoryType target_mem_type; + // quantize model flag + bool is_quantize_model = IsQuantizedModel(*net_def); + // + DataFormat data_format_flag = NHWC; + if (target_device_->device_type() == DeviceType::CPU) { + target_mem_type = MemoryType::CPU_BUFFER; + for (auto &input_info : net_def->input_info()) { + std::vector input_shape = + std::vector(input_info.dims().begin(), + input_info.dims().end()); + // Only could be NONE or NHWC + auto input_data_format = static_cast( + input_info.data_format()); + if (!is_quantize_model && + input_data_format == NHWC && + input_info.dims_size() == 4) { + // NHWC -> NCHW + input_shape = + TransposeShape(input_shape, {0, 3, 1, 2}); + } else if (input_data_format == DataFormat::DF_NONE) { + data_format_flag = DataFormat::DF_NONE; + } + output_map.emplace(input_info.name(), InternalOutputInfo( + target_mem_type, DataType::DT_FLOAT, input_shape, -1)); + } + } +#ifdef MACE_ENABLE_OPENCL + else { // GPU NOLINT[readability/braces] + target_mem_type = MemoryType::GPU_BUFFER; + for (auto &input_info : net_def->input_info()) { + std::vector input_shape = + std::vector(input_info.dims().begin(), + input_info.dims().end()); + output_map.emplace(input_info.name(), InternalOutputInfo( + target_mem_type, DataType::DT_FLOAT, input_shape, -1)); + } + } +#endif // MACE_ENABLE_OPENCL + OpConstructContext construct_context(ws_); for (int idx = 0; idx < net_def->op_size(); ++idx) { - const auto &operator_def = net_def->op(idx); - // Create the 
Operation - const int op_device = - ProtoArgHelper::GetOptionalArg( - operator_def, "device", static_cast(target_device_type)); - if (op_device == target_device_type) { - // Get available devices (sorted based on priority) - OperatorDef temp_def(operator_def); - auto available_devices = op_registry->AvailableDevices(temp_def.type()); - // Find the device type to run the op. - // If the target_device_type in available devices, use target_device_type, - // otherwise, fallback to CPU device. - DeviceType device_type = DeviceType::CPU; - construct_context.set_device(cpu_device_); - for (auto device : available_devices) { - if (device == target_device_type) { - device_type = target_device_type; - construct_context.set_device(target_device_); - break; + std::shared_ptr op_def(new OperatorDef(net_def->op(idx))); + // Create operation + auto op = CreateOperation(op_registry, + &construct_context, + op_def, + data_format_flag, + is_quantize_model); +#ifdef MACE_ENABLE_OPENCL + // Add input transform operation if necessary + if (target_device_->device_type() == DeviceType::GPU) { + const DataType dt = + static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "T", static_cast(DataType::DT_FLOAT))); + // the outputs' memory type of the operation + MemoryType out_mem_type = construct_context.output_mem_type(); + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (output_map.count(op_def->input(i)) == 1) { + // if op is memory-reuse op, no transformation + if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) { + out_mem_type = output_map.at(op_def->input(i)).mem_type; + break; + } + // check whether is the output tensor of other operation + if (output_map.at(op_def->input(i)).mem_type != out_mem_type || + output_map.at(op_def->input(i)).dtype != dt) { + auto key = TransformedName(op_def->input(i), out_mem_type); + auto &output_info = output_map.at(op_def->input(i)); + // check whether the tensor has been transformed + if (transformed_map.count(key) == 0) { + VLOG(1) << "Add Transform operation to transform tensor '" + << op_def->input(i) << "', from memory type " + << output_info.mem_type << " to " << out_mem_type + << ", from Data Type " << output_info.dtype << " to " + << dt; + std::string input_name = op_def->input(i); + std::string t_input_name = + TransformedName(input_name, + out_mem_type); + op_def->set_input(i, t_input_name); + auto input_shape = output_info.shape; + if (output_info.mem_type == MemoryType::CPU_BUFFER && + input_shape.size() == 4) { + // NCHW -> NHWC + input_shape = + TransposeShape(input_shape, + {0, 2, 3, 1}); + } + auto transform_op_def = OpenCLUtil::CreateTransformOpDef( + input_name, input_shape, t_input_name, + dt, out_mem_type); + auto transform_op = CreateOperation( + op_registry, + &construct_context, + transform_op_def, + data_format_flag); + operators_.emplace_back(std::move(transform_op)); + transformed_map.emplace(key, t_input_name); + output_mem_map[t_input_name] = out_mem_type; + // where to do graph reference count. 
+ mem_optimizer->UpdateTensorRef(transform_op_def.get()); + } else { + op_def->set_input(i, transformed_map[key]); + } + } + } else { + MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr + && ws_->GetTensor(op_def->input(i))->is_weight(), + "Tensor ", op_def->input(i), " of ", + op_def->name(), " not allocated"); } } - temp_def.set_device_type(device_type); - construct_context.set_operator_def(&temp_def); - std::unique_ptr op( - op_registry->CreateOperation(&construct_context, device_type, mode)); - if (op) { - operators_.emplace_back(std::move(op)); + // update the map : output_tensor -> Operation + for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { + output_mem_map[op_def->output(out_idx)] = out_mem_type; + output_map.emplace( + op_def->output(out_idx), + InternalOutputInfo( + out_mem_type, + dt, + op_def->output_shape().empty() ? + std::vector() : + std::vector( + op_def->output_shape(out_idx).dims().begin(), + op_def->output_shape(out_idx).dims().end()), + static_cast(operators_.size()))); + } + } +#endif // MACE_ENABLE_OPENCL + operators_.emplace_back(std::move(op)); + // where to do graph reference count. + mem_optimizer->UpdateTensorRef(op_def.get()); + } + +#ifdef MACE_ENABLE_OPENCL + // Transform the output tensor if necessary + if (target_device_->device_type() == DeviceType::GPU) { + for (auto &output_info : net_def->output_info()) { + auto &internal_output_info = output_map.at(output_info.name()); + if ((internal_output_info.mem_type != target_mem_type && + internal_output_info.mem_type != MemoryType::CPU_BUFFER) || + internal_output_info.dtype != DataType::DT_FLOAT) { + VLOG(1) << "Add Transform operation to transform output tensor '" + << output_info.name() << "', from memory type " + << internal_output_info.mem_type + << " to " << target_mem_type + << ", from Data Type " << internal_output_info.dtype + << " to " << DataType::DT_FLOAT; + std::string t_output_name = TransformedName(output_info.name(), + target_mem_type); + auto output_op_def = + operators_[internal_output_info.op_idx]->operator_def(); + int output_size = output_op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (output_op_def->output(i) == output_info.name()) { + output_op_def->set_output(i, t_output_name); + // update the output : mem_type map + output_mem_map[t_output_name] = output_mem_map[output_info.name()]; + output_mem_map[output_info.name()] = target_mem_type; + } + } + auto output_data_format = + static_cast(output_info.data_format()); + auto transform_op_def = OpenCLUtil::CreateTransformOpDef( + t_output_name, + internal_output_info.shape, + output_info.name(), + DataType::DT_FLOAT, + target_mem_type); + auto transform_op = CreateOperation( + op_registry, + &construct_context, + transform_op_def, + output_data_format); + operators_.emplace_back(std::move(transform_op)); + // where to do graph reference count. 
+ mem_optimizer->UpdateTensorRef(transform_op_def.get()); } } } +#endif // MACE_ENABLE_OPENCL + // Update output tensor reference + for (auto &output_info : net_def->output_info()) { + mem_optimizer->UpdateTensorRef(output_info.name()); + } + + // Do memory optimization + for (auto &op : operators_) { + VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type() + << ", " << op->debug_def().type() << ">"; + mem_optimizer->Optimize(op->operator_def().get(), output_mem_map); + } + VLOG(1) << mem_optimizer->DebugInfo(); } MaceStatus SerialNet::Init() { MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); OpInitContext init_context(ws_); - // TODO(liuqi): where to do memory reuse. - if (target_device_->device_type() == DeviceType::GPU) { - - } for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); @@ -98,18 +344,18 @@ MaceStatus SerialNet::Init() { } MaceStatus SerialNet::Run(RunMetadata *run_metadata) { - // TODO(liuqi): In/Out Buffer Transform MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); OpContext context(ws_, cpu_device_); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); - MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), - "<", device_type, ", ", op->debug_def().type(), ">", - ". mem_id: ", - MakeListString(op->debug_def().mem_id().data(), - op->debug_def().mem_id().size())); + MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(), + "<", device_type, ", ", op->debug_def().type(), + ", ", + ProtoArgHelper::GetOptionalArg( + op->debug_def(), "T", static_cast(DT_FLOAT)), + ">"); if (device_type == target_device_->device_type()) { context.set_device(target_device_); } else { @@ -176,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { float max_v = std::numeric_limits::lowest(); float min_v = std::numeric_limits::max(); Tensor::MappingGuard guard(op->Output(i)); - const float *output_data = op->Output(i)->data(); + auto *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { max_v = std::max(max_v, output_data[j]); min_v = std::min(min_v, output_data[j]); @@ -192,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { std::vector bin_distribution(bin_size, 0); float bin_v = (max_v - min_v) / bin_size; Tensor::MappingGuard guard(op->Output(i)); - const float *output_data = op->Output(i)->data(); + auto *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { - int ind = static_cast((output_data[j] - min_v) / bin_v); - if (ind < 0) - ind = 0; - else if (ind > bin_size-1) - ind = bin_size-1; - bin_distribution[ind]++; + int index = static_cast((output_data[j] - min_v) / bin_v); + if (index < 0) + index = 0; + else if (index > bin_size-1) + index = bin_size-1; + bin_distribution[index]++; } LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) << "@@" << min_v << "," << max_v<< "@@" diff --git a/mace/core/net.h b/mace/core/net.h index d5a6725f7265f86feabf7d6f4c82874c8394c7e0..10577a572f5a0629ae515d9b330befbaa639016e 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -27,6 +27,7 @@ namespace mace { class RunMetadata; class Workspace; +class MemoryOptimizer; class NetBase { public: @@ -47,12 +48,20 @@ class SerialNet : public NetBase { const NetDef *net_def, Workspace *ws, Device *target_device, - const NetMode mode = NetMode::NORMAL); + MemoryOptimizer * 
mem_optimizer); MaceStatus Init() override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override; + private: + std::unique_ptr CreateOperation( + const OpRegistryBase *op_registry, + OpConstructContext *construct_context, + std::shared_ptr op_def, + DataFormat input_format, + bool is_quantize_model = false); + protected: Workspace *ws_; Device *target_device_; diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 9a1da4c81170f76f5b18325f3149e5836c5f75cd..6a437f884c506af231db882a500560bdd8dc67ec 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -23,16 +23,12 @@ namespace mace { OpConstructContext::OpConstructContext(Workspace *ws) : operator_def_(nullptr), ws_(ws), device_(nullptr) {} -OpConstructContext::OpConstructContext(OperatorDef *operator_def, - Workspace *ws, - Device *device) - : operator_def_(operator_def), ws_(ws), device_(device) {} OpInitContext::OpInitContext(Workspace *ws, Device *device) : ws_(ws), device_(device) {} Operation::Operation(OpConstructContext *context) - : operator_def_(std::make_shared(*(context->operator_def()))) + : operator_def_(context->operator_def()) {} MaceStatus Operation::Init(OpInitContext *context) { @@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) { ": Encountered a non-existing input tensor: ", input_str); inputs_.push_back(tensor); } - // TODO(liuqi): filter transform for (int i = 0; i < operator_def_->output_size(); ++i) { const std::string output_str = operator_def_->output(i); if (ws->HasTensor(output_str)) { - // TODO(liuqi): Workspace should pre-allocate all of the output tensors outputs_.push_back(ws->GetTensor(output_str)); } else { MACE_CHECK( @@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) { } outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( output_str, context->device()->allocator(), output_type))); - - if (i < operator_def_->output_shape_size()) { - std::vector - shape_configured(operator_def_->output_shape(i).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = operator_def_->output_shape(i).dims(dim); - } - ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); + } + if (i < operator_def_->output_shape_size()) { + std::vector + shape_configured(operator_def_->output_shape(i).dims_size()); + for (size_t dim = 0; dim < shape_configured.size(); ++dim) { + shape_configured[dim] = operator_def_->output_shape(i).dims(dim); } + ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); } } return MaceStatus::MACE_SUCCESS; @@ -164,33 +157,34 @@ const std::set OpRegistryBase::AvailableDevices( std::unique_ptr OpRegistryBase::CreateOperation( OpConstructContext *context, - DeviceType device_type, - const NetMode mode) const { - OperatorDef *operator_def = context->operator_def(); - const DataType dtype = static_cast( + DeviceType device_type) const { + auto operator_def = context->operator_def(); + DataType dtype = static_cast( ProtoArgHelper::GetOptionalArg( *operator_def, "T", static_cast(DT_FLOAT))); - const int op_mode_i = ProtoArgHelper::GetOptionalArg( - *operator_def, "mode", static_cast(NetMode::NORMAL)); - const NetMode op_mode = static_cast(op_mode_i); - VLOG(3) << "Creating operator " << operator_def->name() << "(" + if (device_type == DeviceType::CPU && dtype == DT_HALF) { + int arg_size = operator_def->arg_size(); + for (int i = 0; i < arg_size; ++i) { + if (operator_def->arg(i).name() == "T") { + operator_def->mutable_arg(i)->set_i(DT_FLOAT); + } + } + dtype = DT_FLOAT; + } + 
VLOG(1) << "Creating operator " << operator_def->name() << "(" << operator_def->type() << "<" << dtype << ">" << ") on " << device_type; - if (op_mode == mode) { - const std::string op_type = context->operator_def()->type(); - MACE_CHECK(registry_.count(op_type) != 0, - op_type, " operation is not registered."); - - std::string key = OpKeyBuilder(op_type) - .Device(device_type) - .TypeConstraint("T", dtype) - .Build(); - if (registry_.at(op_type)->creators.count(key) == 0) { - LOG(FATAL) << "Key not registered: " << key; - } - return registry_.at(op_type)->creators.at(key)(context); - } else { - return nullptr; + const std::string op_type = context->operator_def()->type(); + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + + std::string key = OpKeyBuilder(op_type) + .Device(device_type) + .TypeConstraint("T", dtype) + .Build(); + if (registry_.at(op_type)->creators.count(key) == 0) { + LOG(FATAL) << "Key not registered: " << key; } + return registry_.at(op_type)->creators.at(key)(context); } } // namespace mace diff --git a/mace/core/operator.h b/mace/core/operator.h index 7017240c8194e9bbe2cac7fe06c85b534683e7f2..8d3e1557bd5673ea07ddc4b3008711e43a8e27c2 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -33,14 +33,13 @@ namespace mace { class OpConstructContext { public: explicit OpConstructContext(Workspace *ws); - OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device); ~OpConstructContext() = default; - inline void set_operator_def(OperatorDef *operator_def) { + inline void set_operator_def(std::shared_ptr operator_def) { operator_def_ = operator_def; } - inline OperatorDef *operator_def() const { + inline std::shared_ptr operator_def() const { return operator_def_; } @@ -56,10 +55,19 @@ class OpConstructContext { return device_; } + inline void set_output_mem_type(MemoryType type) { + output_mem_type_ = type; + } + + inline MemoryType output_mem_type() const { + return output_mem_type_; + } + private: - OperatorDef *operator_def_; + std::shared_ptr operator_def_; Workspace *ws_; Device *device_; + MemoryType output_mem_type_; // used for transform memory }; // memory_optimizer, device @@ -137,6 +145,10 @@ class Operation { inline bool has_debug_def() const { return operator_def_ != nullptr; } + inline std::shared_ptr operator_def() { + return operator_def_; + } + protected: std::shared_ptr operator_def_; std::vector inputs_; @@ -190,8 +202,7 @@ class OpRegistryBase { std::unique_ptr CreateOperation( OpConstructContext *context, - DeviceType device_type, - const NetMode mode) const; + DeviceType device_type) const; template static std::unique_ptr DefaultCreator( diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 18840b7107619adf94aca7ae739caa3358d33fd3..ee4eae5961cc2d2368c8a1aa41ebac40ddc7187f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime( is_profiling_enabled_(false), opencl_version_(CL_VER_UNKNOWN), gpu_type_(UNKNOWN), - mem_type_(MemoryType::GPU_IMAGE) { + mem_type_(MemoryType::GPU_IMAGE), + scratch_image_manager_(new ScratchImageManager) { std::vector all_platforms; cl::Platform::get(&all_platforms); if (all_platforms.size() == 0) { @@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const { return is_profiling_enabled_; } +ScratchImageManager* OpenCLRuntime::scratch_image_manager() const { + return scratch_image_manager_.get(); +} + } 
// namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 3d182a9e0def86d6ecf73cd5140751b0d1702d31..ef1d50e1b9e1a47856f57bbdbb456c118c2c9dbf 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -25,6 +25,7 @@ #include "mace/core/file_storage.h" #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/scratch_image.h" #include "mace/proto/mace.pb.h" #include "mace/utils/string_util.h" #include "mace/utils/timer.h" @@ -82,6 +83,7 @@ class OpenCLRuntime { uint64_t device_global_mem_cache_size() const; uint32_t device_compute_units() const; Tuner *tuner(); + ScratchImageManager *scratch_image_manager() const; bool is_opencl_avaliable(); // TODO(liuqi): remove this function in the future, make decision at runtime. bool UseImageMemory(); @@ -134,6 +136,7 @@ class OpenCLRuntime { OpenCLVersion opencl_version_; GPUType gpu_type_; MemoryType mem_type_; + std::unique_ptr scratch_image_manager_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. std::shared_ptr context_; diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..02ffc8e02222492e9ec9f8d7a0688c9e3c49c5e7 --- /dev/null +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -0,0 +1,181 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/runtime/opencl/opencl_util.h" + +#include + +#include "mace/utils/logging.h" + +namespace mace { + +namespace { +// [(C + 3) / 4 * W, N * H] +void CalInOutputImageShape(const std::vector &shape, /* NHWC */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2]; + (*image_shape)[1] = shape[0] * shape[1]; +} + +// [Ic, H * W * (Oc + 3) / 4] +void CalConv2dFilterImageShape(const std::vector &shape, /* OIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[1]; + (*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]); +} + +// [H * W * M, (Ic + 3) / 4] +void CalDepthwiseConv2dFilterImageShape( + const std::vector &shape, /* MIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[0] * shape[2] * shape[3]; + (*image_shape)[1] = RoundUpDiv4(shape[1]); +} + +// [(size + 3) / 4, 1] +void CalArgImageShape(const std::vector &shape, + std::vector *image_shape) { + MACE_CHECK(shape.size() == 1); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[0]); + (*image_shape)[1] = 1; +} + +// Only support 3x3 now +// [ (Ic + 3) / 4, 16 * Oc] +void CalWinogradFilterImageShape( + const std::vector &shape, /* Oc, Ic, H, W*/ + std::vector *image_shape, + const int blk_size) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[1]); + (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2)); +} + + +// [W * C, N * RoundUp<4>(H)] +void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[2] * shape[3]; + (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]); +} + +// [RoundUp<4>(W) * C, N * H] +void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3]; + (*image_shape)[1] = shape[0] * shape[1]; +} + +// [Ic * H * W, (Oc + 3) / 4] +void CalWeightHeightImageShape(const std::vector &shape, /* OIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[1] * shape[2] * shape[3]; + (*image_shape)[1] = RoundUpDiv4(shape[0]); +} + +// [(Ic + 3) / 4 * H * W, Oc] +void CalWeightWidthImageShape(const std::vector &shape, /* OIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3]; + (*image_shape)[1] = shape[0]; +} +} // namespace + +void OpenCLUtil::CalImage2DShape(const std::vector &shape, /* NHWC */ + const OpenCLBufferType type, + std::vector *image_shape, + const int wino_block_size) { + MACE_CHECK_NOTNULL(image_shape); + switch (type) { + case CONV2D_FILTER: + CalConv2dFilterImageShape(shape, image_shape); + break; + case DW_CONV2D_FILTER: + CalDepthwiseConv2dFilterImageShape(shape, image_shape); + break; + case IN_OUT_CHANNEL: + CalInOutputImageShape(shape, image_shape); + break; + case ARGUMENT: + CalArgImageShape(shape, image_shape); + break; + case IN_OUT_HEIGHT: + CalInOutHeightImageShape(shape, image_shape); + break; + case IN_OUT_WIDTH: + CalInOutWidthImageShape(shape, image_shape); + break; + case WINOGRAD_FILTER: + CalWinogradFilterImageShape(shape, 
image_shape, wino_block_size); + break; + case WEIGHT_HEIGHT: + CalWeightHeightImageShape(shape, image_shape); + break; + case WEIGHT_WIDTH: + CalWeightWidthImageShape(shape, image_shape); + break; + default: + LOG(FATAL) << "Mace not supported yet."; + } +} + +std::shared_ptr OpenCLUtil::CreateTransformOpDef( + const std::string &input_name, + const std::vector &input_shape, + const std::string &output_name, + const mace::DataType dt, + const mace::MemoryType mem_type) { + std::unique_ptr op(new OperatorDef); + std::string op_name = "mace_node_" + output_name; + op->set_name(op_name); + op->set_type("BufferTransform"); + op->add_input(input_name); + op->add_output(output_name); + Argument *arg = op->add_arg(); + arg->set_name("buffer_type"); + arg->set_i(static_cast(OpenCLBufferType::IN_OUT_CHANNEL)); + arg = op->add_arg(); + arg->set_name("mem_type"); + arg->set_i(static_cast(mem_type)); + arg = op->add_arg(); + arg->set_name("T"); + arg->set_i(static_cast(dt)); + arg = op->add_arg(); + arg->set_name("device"); + arg->set_i(DeviceType::GPU); + if (!input_shape.empty()) { + OutputShape *shape = op->add_output_shape(); + for (auto value : input_shape) { + shape->add_dims(value); + } + } + return std::move(op); +} +} // namespace mace diff --git a/mace/core/runtime/opencl/opencl_util.h b/mace/core/runtime/opencl/opencl_util.h new file mode 100644 index 0000000000000000000000000000000000000000..eb518317455dccebb6e05a7456765fbd0700f566 --- /dev/null +++ b/mace/core/runtime/opencl/opencl_util.h @@ -0,0 +1,54 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_ +#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_ + +#include +#include +#include + +#include "mace/core/types.h" + +namespace mace { +enum OpenCLBufferType { + CONV2D_FILTER = 0, + IN_OUT_CHANNEL = 1, + ARGUMENT = 2, + IN_OUT_HEIGHT = 3, + IN_OUT_WIDTH = 4, + WINOGRAD_FILTER = 5, + DW_CONV2D_FILTER = 6, + WEIGHT_HEIGHT = 7, + WEIGHT_WIDTH = 8, +}; + + +class OpenCLUtil { + public: + static void CalImage2DShape(const std::vector &shape, /* NHWC */ + const OpenCLBufferType type, + std::vector *image_shape, + const int wino_blk_size = 2); + + static std::shared_ptr CreateTransformOpDef( + const std::string &input_name, + const std::vector &input_shape, + const std::string &output_name, + const mace::DataType dt, + const MemoryType mem_type); +}; + +} // namespace mace +#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_ diff --git a/mace/core/runtime/opencl/scratch_image.cc b/mace/core/runtime/opencl/scratch_image.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2d4dcfebca536e2ef99e37ac90cdd6194053108 --- /dev/null +++ b/mace/core/runtime/opencl/scratch_image.cc @@ -0,0 +1,84 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/opencl/scratch_image.h" + +#include +#include + +namespace mace { + +ScratchImageManager::ScratchImageManager() = default; +ScratchImageManager::~ScratchImageManager() = default; + +Image *ScratchImageManager::Spawn( + Allocator *allocator, + const std::vector &shape, + const DataType dt, + int *id) { + // TODO(liuqi): not optimal memory reuse strategy + int found_image_idx = -1; + int image_count = static_cast(reference_count_.size()); + for (int i = 0; i < image_count; ++i) { + int count = reference_count_[i]; + if (count == 0 && images_.at(count)->dtype() == dt) { + auto image_shape = images_.at(count)->image_shape(); + if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) { + found_image_idx = i; + break; + } + } + } + // if not found + if (found_image_idx == -1) { + reference_count_.push_back(0); + images_[image_count] = + std::move(std::unique_ptr(new Image(allocator))); + if (images_.at(image_count)->Allocate(shape, dt) != + MaceStatus::MACE_SUCCESS) { + return nullptr; + } + found_image_idx = image_count; + VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape) + << "<" << dt << ">"; + } + reference_count_[found_image_idx] += 1; + *id = found_image_idx; + return images_.at(found_image_idx).get(); +} + +void ScratchImageManager::Deactive(int id) { + MACE_CHECK(reference_count_.size() > static_cast(id) + && reference_count_[id] > 0, + "Image id ", id, " exceed the vector size ", + reference_count_.size()); + reference_count_[id] -= 1; +} + +ScratchImage::ScratchImage(mace::ScratchImageManager *manager) + : manager_(manager), id_(-1) {} + +ScratchImage::~ScratchImage() { + if (id_ >= 0) { + manager_->Deactive(id_); + } +} + +Image* ScratchImage::Scratch(Allocator *allocator, + const std::vector &shape, + const mace::DataType dt) { + return manager_->Spawn(allocator, shape, dt, &id_); +} + +} // namespace mace diff --git a/mace/core/runtime/opencl/scratch_image.h b/mace/core/runtime/opencl/scratch_image.h new file mode 100644 index 0000000000000000000000000000000000000000..adfe208f8a376878fa1319a4fd935ae4ec8a6102 --- /dev/null +++ b/mace/core/runtime/opencl/scratch_image.h @@ -0,0 +1,58 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_ +#define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_ + +#include +#include +#include + +#include "mace/core/buffer.h" + +namespace mace { + +class ScratchImageManager { + public: + ScratchImageManager(); + ~ScratchImageManager(); + + Image *Spawn(Allocator *allocator, + const std::vector &shape, + const DataType dt, + int *id); + + void Deactive(int id); + + private: + std::unordered_map> images_; + std::vector reference_count_; +}; + +class ScratchImage { + public: + explicit ScratchImage(ScratchImageManager *); + ~ScratchImage(); + + Image *Scratch(Allocator *allocator, + const std::vector &shape, + const DataType dt); + + private: + ScratchImageManager *manager_; + int id_; +}; + +} // namespace mace +#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index f217bee42f7b6615453704c375e79d08cb1c4666..22d5f77270fc030c6915805c850ef2bb379ee489 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { } } // namespace numerical_chars -enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 }; +enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 }; class Tensor { public: @@ -223,7 +223,7 @@ class Tensor { } inline MemoryType memory_type() const { - MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" ); + MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty"); if (buffer_->OnHost()) { return MemoryType::CPU_BUFFER; } else if (typeid(*buffer_) == typeid(Image)) { @@ -233,6 +233,14 @@ class Tensor { } } + inline void set_data_format(DataFormat data_format) { + data_format_ = data_format; + } + + inline DataFormat data_format() const { + return data_format_; + } + #ifdef MACE_ENABLE_OPENCL inline cl::Image *opencl_image() const { MACE_CHECK(has_opencl_image(), name_, " do not have image"); @@ -499,6 +507,7 @@ class Tensor { int32_t zero_point_; float minval_; float maxval_; + DataFormat data_format_; // used for 4D input/output tensor MACE_DISABLE_COPY_AND_ASSIGN(Tensor); }; diff --git a/mace/core/transformer.h b/mace/core/transformer.h deleted file mode 100644 index 09f56009e0114dd5de9f017a3dbeb66dbff2eea3..0000000000000000000000000000000000000000 --- a/mace/core/transformer.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_CORE_TRANSFORMER_H_ -#define MACE_CORE_TRANSFORMER_H_ - -#include "mace/proto/mace.pb.h" - -namespace mace { - -class TransformerBase { - public: - // Construct transform operation. 
- virtual std::vector> ConstructTranformOp( - OperatorDef *op_def, - bool transform_filter = true) = 0; -}; - -} // namespace mace - -#endif // MACE_CORE_TRANSFORMER_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 766e125e20830cfc61891aaaf9f57dfd6eef8244..e98387eb31b9ee9f58923463ada94d7151753734 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -18,6 +18,7 @@ #include #include "mace/core/arg_helper.h" +#include "mace/core/memory_optimizer.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL @@ -27,13 +28,6 @@ namespace mace { namespace { -bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { - static const std::unordered_set reuse_buffer_ops { - "Reshape", "Identity", "Squeeze" - }; - return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end(); -} - bool HasQuantizeOp(const NetDef &net_def) { for (auto &op : net_def.op()) { if (op.type() == "Quantize") { @@ -48,13 +42,14 @@ Workspace::Workspace() = default; Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, - DataType type) { + DataType type, + bool is_weight) { if (HasTensor(name)) { VLOG(3) << "Tensor " << name << " already exists. Skipping."; } else { VLOG(3) << "Creating Tensor " << name; tensor_map_[name] = std::unique_ptr(new Tensor(alloc, type, - false, name)); + is_weight, name)); } return GetTensor(name); } @@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, fused_buffer_ = true; } } + return MaceStatus::MACE_SUCCESS; +} - if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) { - MaceStatus status = CreateOutputTensorBuffer(net_def, device); - if (status != MaceStatus::MACE_SUCCESS) return status; +MaceStatus Workspace::PreallocateOutputTensor( + const mace::NetDef &net_def, + const mace::MemoryOptimizer *mem_optimizer, + Device *device) { + auto &mem_blocks = mem_optimizer->mem_blocks(); + for (auto &mem_block : mem_blocks) { + VLOG(3) << "Preallocate memory block. 
id: " << mem_block.mem_id() + << ", memory type: " << mem_block.mem_type() + << ", size: " << mem_block.x() << "x" << mem_block.y(); + if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(GetCPUAllocator())); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { + std::unique_ptr image_buf( + new Image(device->allocator())); + MACE_RETURN_IF_ERROR(image_buf->Allocate( + {static_cast(mem_block.x()), + static_cast(mem_block.y())}, mem_block.data_type())); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(image_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(device->allocator())); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } + } + VLOG(1) << "Preallocate buffer to tensors"; + bool is_quantize_model = IsQuantizedModel(net_def); + for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { + std::unique_ptr tensor + (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first), + tensor_mem.second.second, + false, tensor_mem.first)); + if (mem_blocks[tensor_mem.second.first].mem_type() + == MemoryType::GPU_IMAGE) { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.first + << " Data type: " << tensor->dtype() + << " Image shape: " + << dynamic_cast(tensor->UnderlyingBuffer()) + ->image_shape()[0] + << ", " + << dynamic_cast(tensor->UnderlyingBuffer()) + ->image_shape()[1]; + tensor->set_data_format(DataFormat::NHWC); + } else { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.first + << " Data type: " << tensor->dtype() + << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); + if (mem_blocks[tensor_mem.second.first].mem_type() + == MemoryType::GPU_BUFFER || + is_quantize_model) { + tensor->set_data_format(DataFormat::NHWC); + } else { + tensor->set_data_format(DataFormat::NCHW); + } + } + tensor_map_[tensor_mem.first] = std::move(tensor); } - if (device_type == DeviceType::CPU) { + // add quantize info for output tensors. + if (device->device_type() == DeviceType::CPU) { for (const auto &op : net_def.op()) { VLOG(2) << "Add quantize info for op: " << op.name(); MACE_CHECK(op.quantize_info().empty() @@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, return MaceStatus::MACE_SUCCESS; } -MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, - Device *device) { - DeviceType device_type = device->device_type(); - DataType dtype = DataType::DT_INVALID; - if (net_def.mem_arena().mem_block_size() > 0) { - // We use the data type of the first op with mem id, - // as CPU&GPU have consistent data type for each layer for now. - // As DSP may have different data output type for each op, - // we stick to the same concept. 
- for (auto &op : net_def.op()) { - // TODO(liuqi): refactor to add device_type to OperatorDef - const int op_device = - ProtoArgHelper::GetOptionalArg( - op, "device", static_cast(device_type)); - if (op_device == device_type && !op.mem_id().empty()) { - const DataType op_dtype = static_cast( - ProtoArgHelper::GetOptionalArg( - op, "T", static_cast(DT_FLOAT))); - if (op_dtype != DataType::DT_INVALID) { - dtype = op_dtype; - // find first valid data type, break - break; - } - } - } - MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); - } - // TODO(liyin): memory block should not have concept of type, but to be - // consistent with gpu, all memory block use float/half as unit - for (auto &mem_block : net_def.mem_arena().mem_block()) { - if (mem_block.device_type() == device_type) { - VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id() - << ", device type: " << mem_block.device_type() - << ", memory type: " << mem_block.mem_type(); - if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { - std::unique_ptr tensor_buf( - new Buffer(GetCPUAllocator())); - MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(tensor_buf)); - } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { - std::unique_ptr image_buf( - new Image(device->allocator())); - MACE_RETURN_IF_ERROR(image_buf->Allocate( - {mem_block.x(), mem_block.y()}, dtype)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(image_buf)); - } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { - std::unique_ptr tensor_buf( - new Buffer(device->allocator())); - MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype) - + MACE_EXTRA_BUFFER_PAD_SIZE)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(tensor_buf)); - } - } - } - VLOG(3) << "Preallocate buffer to tensors"; - for (auto &op : net_def.op()) { - // TODO(liuqi): refactor to add device_type to OperatorDef - const int op_device = - ProtoArgHelper::GetOptionalArg( - op, "device", static_cast(device_type)); - if (op_device == device_type) { - if (!op.mem_id().empty() - && ShouldPreallocateMemoryForOp(op)) { - auto mem_ids = op.mem_id(); - int count = mem_ids.size(); - for (int i = 0; i < count; ++i) { - DataType output_type; - if (i < op.output_type_size()) { - output_type = op.output_type(i); - } else { - output_type = dtype; - } - std::unique_ptr tensor - (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), - output_type, false, op.output(i))); - if (device_type == DeviceType::GPU && tensor->has_opencl_image()) { - VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" - << " Mem: " << mem_ids[i] - << " Image shape: " - << dynamic_cast(tensor->UnderlyingBuffer()) - ->image_shape()[0] - << ", " - << dynamic_cast(tensor->UnderlyingBuffer()) - ->image_shape()[1]; - } else { - VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" - << " Mem: " << mem_ids[i] - << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); - } - tensor_map_[op.output(i)] = std::move(tensor); - } - } else { - for (int i = 0; i < op.output().size(); ++i) { - MACE_CHECK( - op.output_type_size() == 0 - || op.output_size() - == op.output_type_size(), - "operator output size != operator output type size", - op.output_size(), - op.output_type_size()); - DataType output_type; - if (i < op.output_type_size()) { - output_type = op.output_type(i); - } else { - output_type = 
static_cast(ProtoArgHelper::GetOptionalArg( - op, "T", static_cast(DT_FLOAT))); - } - CreateTensor(op.output(i), - device->allocator(), - output_type); - } - } - - for (int output_idx = 0; output_idx < op.output_shape_size(); - ++output_idx) { - std::vector - shape_configured(op.output_shape(output_idx).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = op.output_shape(output_idx).dims(dim); - } - tensor_map_[op.output(output_idx)]->SetShapeConfigured( - shape_configured); - } - } - } - return MaceStatus::MACE_SUCCESS; -} - void Workspace::RemoveUnusedBuffer() { auto iter = tensor_map_.begin(); auto end_iter = tensor_map_.end(); @@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, tensor_buffer_.reset(nullptr); } +void Workspace::RemoveTensor(const std::string &name) { + auto iter = tensor_map_.find(name); + if (iter != tensor_map_.end()) { + tensor_map_.erase(iter); + } +} + } // namespace mace diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 2a8089370c01c4341d6cd94a775ee6eaf1443910..e1d0a9829429cec5bd20c6b9d94aa73a574167a3 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -27,6 +27,8 @@ namespace mace { +class MemoryOptimizer; + class Workspace { public: typedef std::map> TensorMap; @@ -36,7 +38,8 @@ class Workspace { Tensor *CreateTensor(const std::string &name, Allocator *alloc, - DataType type); + DataType type, + bool is_weight = false); inline bool HasTensor(const std::string &name) const { return tensor_map_.find(name) != tensor_map_.end(); @@ -52,12 +55,19 @@ class Workspace { Device *device, const unsigned char *model_data); + MaceStatus PreallocateOutputTensor(const NetDef &net_def, + const MemoryOptimizer *mem_optimizer, + Device *device); + void RemoveUnusedBuffer(); void RemoveAndReloadBuffer(const NetDef &net_def, const unsigned char *model_data, Allocator *alloc); + void RemoveTensor(const std::string &name); + + private: MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, Device *device); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index c4d65f7bd5496ef5e3e8afbe816c38dbe5cd12ef..bd94886bf728507ffc0f3d22b910f5c0b5bf5198 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -20,9 +20,11 @@ #include -#include "mace/core/net.h" #include "mace/core/device_context.h" +#include "mace/core/memory_optimizer.h" +#include "mace/core/net.h" #include "mace/ops/ops_registry.h" +#include "mace/ops/transpose.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { // Check OpenCL avaliable auto runtime = device->opencl_runtime(); if (!runtime->is_opencl_avaliable()) { + LOG(WARNING) << "The device does not support OpenCL"; return MaceStatus::MACE_OUT_OF_RESOURCES; } @@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { const MemoryType mem_type = static_cast(mem_type_i); runtime->set_mem_type(mem_type); - if (mem_type == MemoryType::GPU_IMAGE) { - if (!runtime->IsImageSupport()) { - return MaceStatus::MACE_OUT_OF_RESOURCES; - } - - auto opencl_max_image_size = runtime->GetMaxImage2DSize(); - if (opencl_max_image_size.empty()) { - return MaceStatus::MACE_OUT_OF_RESOURCES; - } - - const std::vector net_max_image_size = - ProtoArgHelper::GetRepeatedArgs( - *net_def, "opencl_max_image_size", {0, 0}); - - if (static_cast(net_max_image_size[0]) > opencl_max_image_size[0] - || static_cast(net_max_image_size[1]) - > 
opencl_max_image_size[1]) { - LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size) - << " vs " << MakeString(net_max_image_size); - return MaceStatus::MACE_OUT_OF_RESOURCES; - } - } return MaceStatus::MACE_SUCCESS; } @@ -288,14 +269,17 @@ class MaceTensor::Impl { public: std::vector shape; std::shared_ptr data; + DataFormat format; }; MaceTensor::MaceTensor(const std::vector &shape, - std::shared_ptr data) { + std::shared_ptr data, + const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); impl_ = std::unique_ptr(new MaceTensor::Impl()); impl_->shape = shape; impl_->data = data; + impl_->format = format; } MaceTensor::MaceTensor() { @@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) { impl_ = std::unique_ptr(new MaceTensor::Impl()); impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); } MaceTensor::MaceTensor(const MaceTensor &&other) { impl_ = std::unique_ptr(new MaceTensor::Impl()); impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); } MaceTensor &MaceTensor::operator=(const MaceTensor &other) { impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); return *this; } MaceTensor &MaceTensor::operator=(const MaceTensor &&other) { impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); return *this; } @@ -334,6 +322,10 @@ const std::shared_ptr MaceTensor::data() const { return impl_->data; } std::shared_ptr MaceTensor::data() { return impl_->data; } +DataFormat MaceTensor::data_format() const { + return impl_->format; +} + // Mace Engine class MaceEngine::Impl { public: @@ -355,6 +347,14 @@ class MaceEngine::Impl { std::map *outputs, RunMetadata *run_metadata); + private: + MaceStatus TransposeInput( + const std::pair &input, + Tensor *input_tensor); + + MaceStatus TransposeOutput(const Tensor *output_tensor, + std::pair *output); + private: const unsigned char *model_data_; size_t model_data_size_; @@ -363,11 +363,12 @@ class MaceEngine::Impl { std::unique_ptr device_; std::unique_ptr ws_; std::unique_ptr net_; - std::map input_info_map_; - std::map output_info_map_; + bool is_quantized_model_; #ifdef MACE_ENABLE_HEXAGON std::unique_ptr hexagon_controller_; #endif + std::map input_info_map_; + std::map output_info_map_; MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; @@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) device_type_(config.impl_->device_type()), device_(nullptr), ws_(new Workspace()), - net_(nullptr) + net_(nullptr), + is_quantized_model_(false) #ifdef MACE_ENABLE_HEXAGON , hexagon_controller_(nullptr) #endif @@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init( MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get())); } #endif + // mark quantized model flag + is_quantized_model_ = IsQuantizedModel(*net_def); // Get input and output information. 
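// [Editor's sketch, not part of the patch] With the DataFormat field added to
// MaceTensor earlier in this file, callers can state how their host buffers
// are laid out instead of relying on the previous implicit NHWC assumption;
// Run() then converts via TransposeInput/TransposeOutput below. The shape,
// buffer, and names used here are hypothetical.
#include <memory>
#include <vector>
#include "mace/public/mace.h"

static mace::MaceTensor MakeNchwInputExample() {
  std::vector<int64_t> shape = {1, 3, 224, 224};   // data laid out as NCHW
  int64_t size = 1 * 3 * 224 * 224;
  auto buffer = std::shared_ptr<float>(new float[size],
                                       std::default_delete<float[]>());
  // Declaring the layout explicitly lets the engine decide whether a
  // transpose to the runtime's preferred format is required.
  return mace::MaceTensor(shape, buffer, mace::DataFormat::NCHW);
}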
for (auto &input_info : net_def->input_info()) { input_info_map_[input_info.name()] = input_info; @@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's inputs: " << MakeString(MapKeys(input_info_map_)); } - ws_->CreateTensor(MakeString("mace_input_node_", input_name), - device_->allocator(), DT_FLOAT); + ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's outputs " << MakeString(MapKeys(output_info_map_)); } - ws_->CreateTensor(MakeString("mace_output_node_", output_name), - device_->allocator(), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON if (device_type_ == HEXAGON) { @@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init( device_.get(), model_data)); + MemoryOptimizer mem_optimizer; // Init model - auto net = std::unique_ptr(new SerialNet( - op_registry_.get(), - net_def, - ws_.get(), - device_.get(), - NetMode::INIT)); - MACE_RETURN_IF_ERROR(net->Init()); - MACE_RETURN_IF_ERROR(net->Run()); net_ = std::unique_ptr(new SerialNet(op_registry_.get(), net_def, ws_.get(), - device_.get())); + device_.get(), + &mem_optimizer)); + + // Preallocate all output tensors of ops + MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, + &mem_optimizer, + device_.get())); + MACE_RETURN_IF_ERROR(net_->Init()); #ifdef MACE_ENABLE_HEXAGON } @@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() { #endif } +MaceStatus MaceEngine::Impl::TransposeInput( + const std::pair &input, + Tensor *input_tensor) { + if (device_->device_type() == DeviceType::CPU && + input.second.shape().size() == 4 && + input.second.data_format() == NHWC && + !is_quantized_model_) { + VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; + input_tensor->set_data_format(DataFormat::NCHW); + std::vector dst_dims = {0, 3, 1, 2}; + std::vector output_shape = + TransposeShape(input.second.shape(), dst_dims); + MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); + } else if ( + (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && + input.second.shape().size() == 4 && + input.second.data_format() == DataFormat::NCHW) { + VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; + std::vector dst_dims = {0, 2, 3, 1}; + input_tensor->set_data_format(DataFormat::NHWC); + std::vector output_shape = + TransposeShape(input.second.shape(), dst_dims); + MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); + } else { + input_tensor->set_data_format(input.second.data_format()); + MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + memcpy(input_data, input.second.data().get(), + input_tensor->size() * sizeof(float)); + return MaceStatus::MACE_SUCCESS; + } +} + +MaceStatus MaceEngine::Impl::TransposeOutput( + const mace::Tensor *output_tensor, + std::pair *output) { + // save output + if (output_tensor != nullptr && output->second.data() != 
nullptr) { + if (device_->device_type() == DeviceType::CPU && + output->second.shape().size() == 4 && + output->second.data_format() != output_tensor->data_format()) { + MACE_CHECK(output_tensor->data_format() == NCHW); + VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC"; + std::vector dst_dims = {0, 2, 3, 1}; + std::vector shape = + TransposeShape(output_tensor->shape(), + dst_dims); + MACE_CHECK(shape == output->second.shape()) + << "Output shape mismatch: " + << MakeString(shape) << " != " + << MakeString(output->second.shape()); + Tensor::MappingGuard output_guard(output_tensor); + const float *output_data = output_tensor->data(); + return ops::Transpose(output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); + } else if (device_->device_type() == DeviceType::GPU && + output->second.shape().size() == 4 && + output->second.data_format() != output_tensor->data_format()) { + VLOG(1) << "Transform output " << output->first << " from " + << output_tensor->data_format() << " to " + << output->second.data_format(); + std::vector dst_dims = {0, 3, 1, 2}; + if (output_tensor->data_format() == NCHW) { + dst_dims = {0, 2, 3, 1}; + } + std::vector shape = + TransposeShape(output_tensor->shape(), + dst_dims); + MACE_CHECK(shape == output->second.shape()) + << "Output shape mismatch: " + << MakeString(shape) << " != " + << MakeString(output->second.shape()); + Tensor::MappingGuard output_guard(output_tensor); + const float *output_data = output_tensor->data(); + return ops::Transpose(output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); + } else { + Tensor::MappingGuard output_guard(output_tensor); + auto shape = output_tensor->shape(); + int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + MACE_CHECK(shape == output->second.shape()) + << "Output shape mismatch: " + << MakeString(shape) << " != " + << MakeString(output->second.shape()); + std::memcpy(output->second.data().get(), output_tensor->data(), + output_size * sizeof(float)); + return MaceStatus::MACE_SUCCESS; + } + } else { + return MaceStatus::MACE_INVALID_ARGS; + } +} + MaceStatus MaceEngine::Impl::Run( const std::map &inputs, std::map *outputs, @@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run( << "' does not belong to model's inputs: " << MakeString(MapKeys(input_info_map_)); } - Tensor *input_tensor = - ws_->GetTensor(MakeString("mace_input_node_", input.first)); - MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); - { - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - memcpy(input_data, input.second.data().get(), - input_tensor->size() * sizeof(float)); - } + Tensor *input_tensor = ws_->GetTensor(input.first); + MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor)); input_tensors.push_back(input_tensor); } for (auto &output : *outputs) { @@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run( << "' does not belong to model's outputs: " << MakeString(MapKeys(output_info_map_)); } - Tensor *output_tensor = - ws_->GetTensor(MakeString("mace_output_node_", output.first)); + Tensor *output_tensor = ws_->GetTensor(output.first); output_tensors.push_back(output_tensor); } #ifdef MACE_ENABLE_HEXAGON @@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run( } #endif for (auto &output : *outputs) { - Tensor *output_tensor = - ws_->GetTensor(MakeString("mace_output_node_", output.first)); + Tensor *output_tensor = ws_->GetTensor(output.first); // save output - 
if (output_tensor != nullptr && output.second.data() != nullptr) { - Tensor::MappingGuard output_guard(output_tensor); - auto shape = output_tensor->shape(); - int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1, - std::multiplies()); - MACE_CHECK(shape == output.second.shape()) - << "Output shape mismatch: " - << MakeString(output.second.shape()) - << " != " << MakeString(shape); - std::memcpy(output.second.data().get(), output_tensor->data(), - output_size * sizeof(float)); - } else { - return MaceStatus::MACE_INVALID_ARGS; - } + MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output)); } return MaceStatus::MACE_SUCCESS; } diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds index 04e88455f67c209c0e6c7d70cce12167a81fbad5..9b7d34538ad20417e59051420048e98998c5afd7 100644 --- a/mace/libmace/mace_version_script.lds +++ b/mace/libmace/mace_version_script.lds @@ -14,7 +14,6 @@ mace { *mace*NetDef*; *mace*MemoryType*; *mace*DataType*; - *mace*MemoryArena*; *mace*InputInfo*; *mace*OutputInfo*; *mace*OutputShape*; diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 5fb0683cf1be1d8936ec411877f4f3492ac1f960..99a50f7fe1a3cf92149623eb3395271cca580ad7 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -30,10 +30,8 @@ cc_library( "arm/*_test.cc", "ops_registry.cc", "ops_test_util.cc", - "buffer_inverse_transform.cc", - "buffer_transformer.cc", + "buffer_transform.cc", "lstm_cell.cc", - "winograd_transform.cc", "quantize.cc", ], ) + if_opencl_enabled(glob( @@ -41,10 +39,8 @@ cc_library( "opencl/*.cc", "opencl/image/*.cc", "opencl/buffer/*.cc", - "buffer_inverse_transform.cc", - "buffer_transformer.cc", + "buffer_transform.cc", "lstm_cell.cc", - "winograd_transform.cc", ], exclude = [ "opencl/*_test.cc", diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 19b3289fcb70b16344edd4dcd8f80552ba6f389a..b904b5c275373e48f59358b8a238f61dd6917bf6 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -90,7 +90,7 @@ class ActivationOp : public Operation { } if (type == ActivationType::PRELU) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index bd766047169bdf67c9cba4dfdb186c3883655e01..76447e9b6134229a002ac94bb09f58b2f857d038 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Activation", "ReluBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } + OpDefBuilder("Activation", "ReluBM") + 
.Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluxBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6.0) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Activation", "ReluxBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6.0) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Activation", "ReluxBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6.0) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; } - net.AddRandomInput("Alpha", {channels}); + net.AddRandomInput("Alpha", {channels}, true); - if (D == DeviceType::CPU) { - OpDefBuilder("Activation", "PreluBM") - .Input("Input") - .Input("Alpha") - .Output("Output") - .AddStringArg("activation", "PRELU") - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Alpha", "AlphaImage", - ops::BufferType::ARGUMENT); - - OpDefBuilder("Activation", "PreluBM") - .Input("InputImage") - .Input("AlphaImage") - .Output("Output") - .AddStringArg("activation", "PRELU") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } + OpDefBuilder("Activation", "PreluBM") + .Input("Input") + .Input("Alpha") + .Output("Output") + .AddStringArg("activation", "PRELU") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "TanhBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); - } else { - 
OpDefBuilder("Activation", "TanhBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Activation", "TanhBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "TANH") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -310,27 +262,17 @@ void SigmoidBenchmark( // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "SigmoidBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Activation", "SigmoidBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Activation", "SigmoidBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f127be425f9be4478f0fdf7fbadcaeb2ff6bc0a8..4cd63ab6b070d36d45c6547ee7bbe4ea9c2ebf0e 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -30,32 +30,14 @@ void TestSimpleRelu() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); + OpDefBuilder("Activation", "ReluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); @@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() { // Add input data net.AddInputFromArray("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "ReluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); 
- // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); @@ -129,34 +93,15 @@ void TestSimpleRelux() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluxTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "ReluxTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluxTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); @@ -179,34 +124,15 @@ void TestSimpleReluRelux() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluxTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluxTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "ReluxTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); @@ -232,43 +158,36 @@ void TestSimplePrelu() { net.AddInputFromArray( "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); - net.AddInputFromArray("Alpha", {2}, {2.0, 3.0}); + net.AddInputFromArray("Alpha", {2}, {2.0, 3.0}, true); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Activation", "PreluTest") - .Input("InputImage") + .Input("Input") .Input("Alpha") - .Output("OutputImage") + .Output("Output") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { + net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); OpDefBuilder("Activation", "PreluTest") - .Input("Input") + .Input("InputNCHW") .Input("Alpha") - .Output("Output") + .Output("OutputNCHW") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); + net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } - if (D == DeviceType::CPU) { - auto expected = net.CreateTensor( - {2, 2, 2, 2}, - {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 
0}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); - } + auto expected = net.CreateTensor( + {2, 2, 2, 2}, + {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -288,32 +207,14 @@ void TestSimpleTanh() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "TanhTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "TanhTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "TanhTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "TANH") + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, @@ -341,32 +242,14 @@ void TestSimpleSigmoid() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "SigmoidTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "SigmoidTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "SigmoidTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 5db2bda4bd67a818019b6a163ec912cb80799151..f5e11740d79597bc02e9f2fba3c55a6e286b8a7c 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { net.AddRandomInput(MakeString("Input", i).c_str(), {n, h, w, c}); } - if (D == DeviceType::GPU) { - for (int i = 0; i < inputs; ++i) { - BufferToImage(&net, MakeString("Input", i).c_str(), - MakeString("InputImage", i).c_str(), - ops::BufferType::IN_OUT_CHANNEL); - } - OpDefBuilder op_def_builder("AddN", "AddNBM"); - for (int i = 0; i < inputs; ++i) { - op_def_builder.Input(MakeString("InputImage", i).c_str()); - } - op_def_builder.Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder op_def_builder("AddN", "AddNBM"); - for (int i = 0; i < inputs; ++i) { - op_def_builder.Input(MakeString("Input", i).c_str()); - } - op_def_builder.Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + OpDefBuilder op_def_builder("AddN", "AddNBM"); + for (int i = 0; i < inputs; ++i) { + op_def_builder.Input(MakeString("Input", i).c_str()); } + op_def_builder.Output("Output") + .AddIntArg("T", 
static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 865fdd7f95159f5f9f1030376583fbdb5f40e1e1..f006570c49627de85ab61e4b55d74f9d825c12ac 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -62,39 +62,15 @@ void SimpleAdd3() { net.AddInputFromArray("Input3", {1, 2, 3, 1}, {-0.1582, 2, 3, 4, 5, 6}); - const int input_num = 4; - if (D == DeviceType::GPU) { - // run on gpu - for (int i = 0; i < input_num; ++i) { - BufferToImage(&net, MakeString("Input", i), - MakeString("InputImage", i), - ops::BufferType::IN_OUT_CHANNEL); - } - - auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); - for (int i = 0; i < input_num; ++i) { - op_def_cl.Input(MakeString("InputImage", i)); - } - op_def_cl.Output("OutputImage") - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); - - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("AddN", "AddNTest") - .Input("Input0") - .Input("Input1") - .Input("Input2") - .Input("Input3") - .Output("Output") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + OpDefBuilder("AddN", "AddNTest") + .Input("Input0") + .Input("Input1") + .Input("Input2") + .Input("Input3") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); auto expected = net.CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); @@ -138,28 +114,10 @@ void RandomTest() { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - // run on gpu - for (int i = 0; i < input_num; ++i) { - BufferToImage(&net, MakeString("Input", i), - MakeString("InputImage", i), - ops::BufferType::IN_OUT_CHANNEL); - } - - auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); - for (int i = 0; i < input_num; ++i) { - op_def_cl.Input(MakeString("InputImage", i)); - } - op_def_cl.Output("OutputImage") - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); - - // Run on device + // run on device net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index cf022d6ae7a2e9cee8bf4368869f8e8eab9faf68..1758f79b799a11df6b075222ffb022be5a71b615 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -158,14 +158,16 @@ class BatchNormOp : public Operation { } // Transform filters int input_size = operator_def_->input_size(); - for (int i = 0; i < input_size; ++i) { + for (int i = 1; i < input_size; ++i) { const Tensor *input_tensor = context->workspace()->GetTensor( operator_def_->input(i)); - if (input_tensor != nullptr && input_tensor->is_weight()) { - MACE_CHECK(TransformFilter( - context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); - } + MACE_CHECK(input_tensor != nullptr); + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + i, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 814b631e66a25a26d4faf45b23308c2476144319..d3467e769f32a69732b366e2d077f5fb6c8959e8 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ 
b/mace/ops/batch_norm_benchmark.cc @@ -36,13 +36,12 @@ void BatchNorm( } else { MACE_NOT_IMPLEMENTED; } - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}, true); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true, true); - if (D == DeviceType::CPU) { - OpDefBuilder("BatchNorm", "BatchNormBM") + OpDefBuilder("BatchNorm", "BatchNormBM") .Input("Input") .Input("Scale") .Input("Offset") @@ -50,30 +49,8 @@ void BatchNorm( .Input("Var") .AddFloatArg("epsilon", 1e-3) .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormBM") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // tuning setenv("MACE_TUNING", "1", 1); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 214fd5075c328af4435563c52c3fadb120f39651..d7c4903e1449371a14830d057e8ede4c03cf0cea 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -28,10 +28,10 @@ void Simple() { // Add input data net.AddInputFromArray("Input", {1, 6, 2, 1}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); - net.AddInputFromArray("Scale", {1}, {4.0f}); - net.AddInputFromArray("Offset", {1}, {2.0}); - net.AddInputFromArray("Mean", {1}, {10}); - net.AddInputFromArray("Var", {1}, {11.67f}); + net.AddInputFromArray("Scale", {1}, {4.0f}, true); + net.AddInputFromArray("Offset", {1}, {2.0}, true); + net.AddInputFromArray("Mean", {1}, {10}, true); + net.AddInputFromArray("Var", {1}, {11.67f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -49,32 +49,17 @@ void Simple() { net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } // Check @@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Add input data net.AddRandomInput("Input", 
{batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") + .Output("Output") .Finalize(net.NewOperatorDef()); // Tuning @@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Run on opencl net.RunOp(DeviceType::GPU); net.Sync(); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-1) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-1, 1e-2); } @@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, 
ComplexRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") + .Output("Output") .Finalize(net.NewOperatorDef()); // tuning @@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-1) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-1, 1e-2); } diff 
--git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index 7ea19f6bcdb57e270092ecb21e497dd4b7ee7e3c..9664a917e6256687a7c0bba75a3c5cb52732071e 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -32,23 +32,13 @@ void BMBatchToSpace( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::CPU) { - OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("Input") - .Output("Output") - .AddIntsArg("crops", {0, 0, 0, 0}) - .AddIntsArg("block_shape", {arg, arg}) - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("crops", {0, 0, 0, 0}) - .AddIntsArg("block_shape", {arg, arg}) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") + .Input("Input") + .Output("Output") + .AddIntsArg("crops", {0, 0, 0, 0}) + .AddIntsArg("block_shape", {arg, arg}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 0b406dd1269b2f2ad6925b232b2d845566836c9b..59579fa518bd613700251ee74b2265025337d58d 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -108,7 +108,7 @@ class BiasAddOp : public Operation { MACE_NOT_IMPLEMENTED; } MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index dce361e94130ebb3fb9c55a50c822306228cbcf7..9026ffb2b2142b4b7d9d99c303401fc759ca0e05 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { OpsTestNet net; // Add input data + DataFormat data_format = NHWC; if (D == DeviceType::CPU) { + data_format = NCHW; net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; } - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true, true); - if (D == DeviceType::CPU) { - OpDefBuilder("BiasAdd", "BiasAddBM") + OpDefBuilder("BiasAdd", "BiasAddBM") .Input("Input") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("data_format", data_format) .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BiasAdd", "BiasAddBM") - .Input("InputImage") - .Input("BiasImage") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index ba31ccec30e53b54f02e42890fc5060e6c7437b7..77c6e7c4a14b8fcf5be00805877b4717770cc732 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -28,7 +28,7 @@ void BiasAddSimple() { // Add 
input data net.AddInputFromArray("Input", {1, 6, 2, 1}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); - net.AddInputFromArray("Bias", {1}, {0.5f}); + net.AddInputFromArray("Bias", {1}, {0.5f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -44,22 +44,13 @@ void BiasAddSimple() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BiasAdd", "BiasAddTest") - .Input("InputImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - + // Run on gpu OpDefBuilder("BiasAdd", "BiasAddTest") - .Input("InputImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { @@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - + // Run on gpu OpDefBuilder("BiasAdd", "BiasAddTest") - .Input("InputImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace test diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc deleted file mode 100644 index 8482e2552a55c7e7d681a4e5239d510cc4f2bdfb..0000000000000000000000000000000000000000 --- a/mace/ops/buffer_inverse_transform.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "mace/core/operator.h" -#include "mace/ops/opencl/buffer/buffer_inverse_transform.h" -#include "mace/ops/opencl/image/image_to_buffer.h" - -namespace mace { -namespace ops { - -template -class BufferInverseTransformOp; - -template -class BufferInverseTransformOp : public Operation { - public: - explicit BufferInverseTransformOp(OpConstructContext *context) - : Operation(context), - wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ImageToBuffer); - } else { - kernel_.reset(new opencl::buffer::BufferInverseTransform); - } - } - - MaceStatus Run(OpContext *context) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - - ops::BufferType type = - static_cast(Operation::GetOptionalArg( - "buffer_type", static_cast(ops::CONV2D_FILTER))); - - return kernel_->Compute(context, input, type, - wino_blk_size_, output); - } - - private: - const int wino_blk_size_; - std::unique_ptr kernel_; -}; - - -void RegisterBufferInverseTransform(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "BufferInverseTransform", - BufferInverseTransformOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BufferInverseTransform", - BufferInverseTransformOp, DeviceType::GPU, half); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 825ba1053361c3b897c3bf6e7a93b7918a7f7acf..f5f1df413258fc1a1a66729b7af7d39604281039 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -14,6 +14,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/ops_test_util.h" namespace mace { @@ -28,26 +29,36 @@ void FilterBufferToImage(int iters, mace::testing::StopTiming(); OpsTestNet net; + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", {out_channel, in_channel, height, width}); + // Create output + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - OpDefBuilder("BufferToImage", "BufferToImageBM") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); + auto transform_func = [&]() { + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, + net.ws()->GetTensor("Input"), + OpenCLBufferType::IN_OUT_CHANNEL, + MemoryType::GPU_IMAGE, + 0, + b2i_output); + }; // Warm-up net.Setup(D); for (int i = 0; i < 5; ++i) { - net.Run(); + transform_func(); } net.Sync(); mace::testing::StartTiming(); while (iters--) { - net.Run(); + transform_func(); } net.Sync(); } diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index 
fcf7e37015cf3ff0d2db7f3c48c392ede4b452f2..e6a65aa258fa8c76328c5be88a99e04e0bb1f074 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -14,6 +14,7 @@ #include "gtest/gtest.h" #include "mace/ops/ops_test_util.h" +#include "mace/ops/opencl/buffer_transformer.h" namespace mace { namespace ops { @@ -21,31 +22,27 @@ namespace test { namespace { template -void TestBidirectionTransform(const int type, +void TestBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", input_shape); + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - // Run - net.RunOp(D); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_IMAGE, 0, b2i_output); - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("I2BOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); + // Inverse Transform + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type, } // namespace TEST(BufferToImageTest, ArgSmall) { - TestBidirectionTransform(ops::ARGUMENT, {1}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {1}); } TEST(BufferToImageTest, ArgHalfSmall) { - TestBidirectionTransform(ops::ARGUMENT, {11}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {11}); } TEST(BufferToImageTest, ArgMedium) { - TestBidirectionTransform(ops::ARGUMENT, {11}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {11}); } TEST(BufferToImageTest, ArgLarge) { - TestBidirectionTransform(ops::ARGUMENT, {256}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {256}); } TEST(BufferToImageTest, InputSmallSingleChannel) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {1, 2, 3, 1}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1}); } TEST(BufferToImageTest, InputSmallMultipleChannel) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {1, 2, 3, 3}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3}); } TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {3, 2, 3, 3}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3}); } TEST(BufferToImageTest, InputMedium) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {3, 13, 17, 128}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128}); } TEST(BufferToImageTest, InputLarge) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {3, 64, 64, 256}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256}); } TEST(BufferToImageTest, 
Filter1x1Small) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {5, 3, 1, 1}); } TEST(BufferToImageTest, Filter1x1Medium) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {13, 17, 1, 1}); } TEST(BufferToImageTest, Filter1x1Large) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {512, 128, 1, 1}); } TEST(BufferToImageTest, Filter3x3Small) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {3, 5, 3, 3}); } TEST(BufferToImageTest, Filter3x3Medium) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {17, 13, 3, 3}); } TEST(BufferToImageTest, Filter3x3Large) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {256, 128, 3, 3}); } TEST(BufferToImageTest, WeightWidthSmall) { - TestBidirectionTransform(ops::WEIGHT_WIDTH, - {1, 3, 3, 3}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_WIDTH, + {1, 3, 3, 3}); } TEST(BufferToImageTest, WeightWidthMedium) { - TestBidirectionTransform(ops::WEIGHT_WIDTH, - {11, 13, 13, 17}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_WIDTH, + {11, 13, 13, 17}); } TEST(BufferToImageTest, WeightWidthLarge) { - TestBidirectionTransform(ops::WEIGHT_WIDTH, - {64, 64, 11, 13}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_WIDTH, + {64, 64, 11, 13}); } TEST(BufferToImageTest, WeightHeightSmall) { - TestBidirectionTransform(ops::WEIGHT_HEIGHT, - {2, 1, 1, 1}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_HEIGHT, + {2, 1, 1, 1}); } TEST(BufferToImageTest, WeightHeightMedium) { - TestBidirectionTransform(ops::WEIGHT_HEIGHT, - {11, 13, 13, 17}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_HEIGHT, + {11, 13, 13, 17}); } TEST(BufferToImageTest, WeightHeightLarge) { - TestBidirectionTransform(ops::WEIGHT_HEIGHT, - {64, 16, 11, 13}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_HEIGHT, + {64, 16, 11, 13}); } namespace { template -void TestDiffTypeBidirectionTransform(const int type, +void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", input_shape); + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - // Run - net.RunOp(D); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("I2BOutput") - .AddIntArg("buffer_type", type) - .Finalize(net.NewOperatorDef()); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_IMAGE, 0, b2i_output); - // Run - net.RunOp(D); + // Inverse Transform + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DT_FLOAT); + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int 
type, } // namespace TEST(BufferToImageTest, ArgFloatToHalfSmall) { - TestDiffTypeBidirectionTransform(ops::ARGUMENT, - {11}); + TestDiffTypeBidirectionTransform( + OpenCLBufferType::ARGUMENT, + {11}); } namespace { template -void TestStringHalfBidirectionTransform(const int type, +void TestStringHalfBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape, const unsigned char *input_data) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); + // Add input data const half *h_data = reinterpret_cast(input_data); - net.AddInputFromArray("Input", input_shape, std::vector(h_data, h_data + 2)); + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - // Run - net.RunOp(D); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("I2BOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + // Transform + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_IMAGE, 0, b2i_output); - // Run - net.RunOp(D); + // Inverse Transform + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { const unsigned char input_data[] = { 0xCD, 0x3C, 0x33, 0x40, }; - TestStringHalfBidirectionTransform(ops::ARGUMENT, - {2}, input_data); + TestStringHalfBidirectionTransform( + OpenCLBufferType::ARGUMENT, {2}, input_data); } } // namespace test diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index 1accbe213585ddb6d8c0058fee076fd191d87f2f..088b149d894b76439b32087134d3958da4ad0af4 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -28,34 +28,27 @@ class BufferTransformOp : public Operation { public: explicit BufferTransformOp(OpConstructContext *context) : Operation(context), - wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)), - out_mem_type_(MemoryType::GPU_BUFFER), - transformer_(nullptr) { - MemoryType in_mem_type = context->workspace()->GetTensor( - operator_def_->input(0))->memory_type(); - if (context->device()->opencl_runtime()->UseImageMemory()) { - out_mem_type_ = MemoryType::GPU_IMAGE; - } - transformer_.reset(new OpenCLBufferTransformer(in_mem_type, - out_mem_type_)); - } + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 0)), + out_mem_type_(static_cast(Operation::GetOptionalArg( + "mem_type", static_cast(MemoryType::GPU_IMAGE)))) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); Tensor *output = this->Output(0); - ops::BufferType type = - static_cast(Operation::GetOptionalArg( - "buffer_type", static_cast(ops::CONV2D_FILTER))); + auto type = + static_cast(Operation::GetOptionalArg( + "buffer_type", static_cast(CONV2D_FILTER))); - return transformer_->Transform( - context, input, type, 
wino_blk_size_, out_mem_type_, output); + MemoryType in_mem_type = context->workspace()->GetTensor( + operator_def_->input(0))->memory_type(); + return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( + context, input, type, out_mem_type_, wino_blk_size_, output); } private: const int wino_blk_size_; MemoryType out_mem_type_; - std::unique_ptr> transformer_; }; diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc index c768d671c3b7cd1da70b673054207306e75f56dc..c18e81cf99f4b8d6d1fef29ba3d95aa8873292f2 100644 --- a/mace/ops/buffer_transform_test.cc +++ b/mace/ops/buffer_transform_test.cc @@ -15,6 +15,7 @@ #include #include "gtest/gtest.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/ops_test_util.h" namespace mace { @@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase { namespace { template -void TestBidirectionTransform(const int type, +void TestBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("TransformedOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", input_shape); - - // Run - net.RunOp(DeviceType::GPU); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("TransformedOutput") - .Output("Output") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(DeviceType::GPU); + Tensor *bt_output = net.ws()->CreateTensor( + "BtOutput", context.device()->allocator(), + DataTypeToEnum::value); + + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, + MemoryType::GPU_BUFFER) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_BUFFER, 0, bt_output); + + // Inverse Transform + Tensor *output = net.ws()->CreateTensor( + "Output", context.device()->allocator(), + DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, + MemoryType::GPU_BUFFER) + .Transform(&context, bt_output, + type, MemoryType::GPU_BUFFER, 0, output); if (DataTypeToEnum::value == DataTypeToEnum::value) { EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), @@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type, } // namespace TEST_F(BufferTransformTest, FloatToHalf) { - TestBidirectionTransform(ops::BufferType::IN_OUT_CHANNEL, + TestBidirectionTransform(OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 4}); } -TEST_F(BufferTransformTest, HalfToHalf) { - TestBidirectionTransform(ops::BufferType::IN_OUT_CHANNEL, - {1, 2, 3, 4}); -} - namespace { template void TestArgumentTransform(const index_t input_size) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("Output") - .AddIntArg("buffer_type", ops::BufferType::ARGUMENT) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", {input_size}); // Run - net.RunOp(DeviceType::GPU); + Tensor *output = net.ws()->CreateTensor( + "Output", context.device()->allocator(), + DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, + MemoryType::GPU_BUFFER) + .Transform(&context, net.ws()->GetTensor("Input"), + 
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, + 0, output); - auto output_tensor = net.GetOutput("Output"); index_t expected_size = RoundUp(input_size, 4); - EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]); + EXPECT_EQ(expected_size, output->buffer_shape()[0]); // Check - ExpectTensorNear(*net.GetTensor("Input"), *output_tensor, + ExpectTensorNear(*net.GetTensor("Input"), *output, 1e-3, 1e-4); } } // namespace diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 6707a5c950aab3c46b90094254f26ef20dde7e84..db5f8494af4d2f0bfceb1288d250572d1e15a830 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -36,23 +36,11 @@ void ChannelShuffle( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Softmax", "SoftmaxBM") + OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") .Input("Input") .Output("Output") + .AddIntArg("group", group) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") - .Input("InputImage") - .Output("Output") - .AddIntArg("group", group) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index ca301a1f3433daefe95480fbfba5991dd25d60b3..1afcc41f55aa6bf45ca4b10ac3180ea8d0d6188c 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { "Input", {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("group", 4) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - // Check auto expected = net.CreateTensor( {1, 1, 2, 16}, diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index f4c7ebbefac649e87af26a1a295f5613e171a4a7..eec11e0bb132055238d0dee95091d088729799bc 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -28,7 +28,8 @@ class ConcatOpBase : public Operation { public: explicit ConcatOpBase(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 3)) {} + axis_(Operation::GetOptionalArg("axis", 3)), + checked_(false) {} protected: void Validate() { @@ -42,6 +43,7 @@ class ConcatOpBase : public Operation { protected: int axis_; + bool checked_; }; template @@ -55,7 +57,15 @@ class ConcatOp : public ConcatOpBase { MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - Validate(); + if (!checked_) { + Validate(); + if (this->Input(0)->dim_size() == 4) { + if (axis_ == 3) axis_ = 1; + else if (axis_ == 2) axis_ = 3; + else if (axis_ == 1) axis_ = 2; + } + checked_ = true; + } const std::vector &inputs = this->Inputs(); Tensor *output = this->Output(0); const Tensor *input0 = inputs.front(); diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 02411591b558503c78508908b0a312e611eb8ca7..a43fc3084f880754612e50d75753d353d09dd04f 100644 --- a/mace/ops/concat_benchmark.cc +++ 
b/mace/ops/concat_benchmark.cc @@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128); namespace { template -void OpenclConcatHelper(int iters, +void OpenCLConcatHelper(int iters, const std::vector &shape0, const std::vector &shape1, int concat_dim) { @@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters, net.AddRandomInput("Input0", shape0); net.AddRandomInput("Input1", shape1); - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Concat", "ConcatBM") - .Input("InputImage0") - .Input("InputImage1") + .Input("Input0") + .Input("Input1") .AddIntArg("axis", concat_dim) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); @@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters, #define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\ std::vector shape = {N, H, W, C}; \ - OpenclConcatHelper(iters, shape, shape, 3); \ + OpenCLConcatHelper(iters, shape, shape, 3); \ } \ MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 83307e781a29eccbaa065588c9e2b554219ed2b0..a1b38898f9cf05919edf4433a7d502d3ae1626c7 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) { static unsigned int seed = time(NULL); int dim = 5; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = rand_r(&seed) % dim; + int axis = 1; // Construct graph OpsTestNet net; auto builder = OpDefBuilder("Concat", "ConcatTest"); @@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { static unsigned int seed = time(NULL); int dim = 4; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = rand_r(&seed) % dim; + int axis = 1; + int axis_arg = 3; // NHWC // Construct graph OpsTestNet net; @@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { std::vector output_shape = input_shapes[0]; output_shape[axis] = concat_axis_size; net.AddRandomInput( - "Output", output_shape, true, true); + "Output", output_shape, false, true, true); auto builder = OpDefBuilder("Concat", "ConcatTest"); for (int i = 0; i < num_inputs; ++i) { builder = builder.Input(MakeString("Input", i)); } - builder.AddIntArg("axis", axis) + builder.AddIntArg("axis", axis_arg) .Output("Output") .Finalize(net.NewOperatorDef()); @@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { net.RunOp(); net.AddRandomInput( - "QuantizedOutput", output_shape, true, true); + "QuantizedOutput", output_shape, false, true, true); auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest"); for (int i = 0; i < num_inputs; ++i) { q_builder = q_builder.Input(MakeString("QuantizedInput", i)); @@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector> &shapes, OpsTestNet net; for (int i = 0; i < num_inputs; ++i) { const std::string input_name = MakeString("Input", i); - const std::string image_name = MakeString("InputImage", i); concat_axis_size += shapes[i][axis]; GenerateRandomRealTypeData(shapes[i], &inputs[i]); input_ptrs[i] = inputs[i].data(); net.AddInputFromArray(input_name, shapes[i], inputs[i]); - BufferToImage(&net, input_name, image_name, - ops::BufferType::IN_OUT_CHANNEL); } auto builder = OpDefBuilder("Concat", "ConcatTest"); for (int i = 0; i < num_inputs; ++i) { - const std::string image_name = MakeString("InputImage", i); + const 
std::string image_name = MakeString("Input", i); builder = builder.Input(image_name); } builder.AddIntArg("axis", axis) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - // Check auto output = net.GetOutput("Output"); diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index bf5ebaa0c07abd30cc7884bb1b896621d6e67e09..a5cbec7411aaa47f82717e50a71ee1cf3d4d87e6 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -959,8 +959,9 @@ class Conv2dOp : public ConvPool2dOpBase { : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) { + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), + wino_block_size_(Operation::GetOptionalArg("wino_block_size", 0)) { MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; @@ -969,13 +970,32 @@ class Conv2dOp : public ConvPool2dOpBase { mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::Conv2dKernel); } + context->set_output_mem_type(mem_type); // Transform filter tensor to target format - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); + if ((wino_block_size_ == 2 || wino_block_size_ == 4) && + (kernel_->CheckUseWinograd( + context->device()->opencl_runtime(), + context->workspace()->GetTensor( + operator_def_->input(1))->shape(), + std::vector(operator_def_->output_shape(0).dims().begin(), + operator_def_->output_shape(0).dims().end()), + strides_.data(), + dilations_.data(), + &wino_block_size_))) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) + == MaceStatus::MACE_SUCCESS); + } else { + wino_block_size_ = 0; + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + } if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } @@ -987,13 +1007,14 @@ class Conv2dOp : public ConvPool2dOpBase { return kernel_->Compute(context, input, filter, bias, strides_.data(), padding_type_, paddings_, dilations_.data(), activation_, relux_max_limit_, - output); + wino_block_size_, output); } private: const ActivationType activation_; const float relux_max_limit_; std::unique_ptr kernel_; + int wino_block_size_; private: MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 96be2902a3b07d5e1a7bf0a9e25587af6de3c2cb..91efff7974df9e159f531fb4fcd104751e5ed0f4 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -49,11 +49,10 @@ void Conv2d(int iters, } net.AddRandomInput("Filter", {output_channels, channels, kernel_h, - kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); - if (D == DeviceType::CPU) { - OpDefBuilder("Conv2D", "Conv2dTest") + OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") .Input("Bias") @@ -63,26 
+62,6 @@ void Conv2d(int iters, .AddIntsArg("dilations", {dilation, dilation}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {dilation, dilation}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } net.Setup(D); @@ -123,9 +102,9 @@ void Conv2d(int iters, "Input", {batch, height, width, channels}); net.GetTensor("Input")->SetScale(0.1); net.AddRandomInput( - "Filter", {output_channels, kernel_h, kernel_w, channels}); + "Filter", {output_channels, kernel_h, kernel_w, channels}, true); net.GetTensor("Filter")->SetScale(0.1); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Bias", {output_channels}, true); OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index db7f0458fb9327c8cffa06a773c102a3421aa5ea..eb21ef2c3e596ba28ce4178574dcb74db59a434f 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -31,7 +31,7 @@ class Conv2dOpTest : public OpsTestBase { namespace { template -void TestNHWCSimple3x3VALID() { +void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { OpsTestNet net; // Add input data net.AddInputFromArray( @@ -40,8 +40,9 @@ void TestNHWCSimple3x3VALID() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + net.AddInputFromArray("Bias", {1}, {0.1f}, true); + const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -60,34 +61,25 @@ void TestNHWCSimple3x3VALID() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } - auto expected = net.CreateTensor({1, 1, 1, 1}, {18.1f}); + auto expected = net.CreateTensor(output_shape, {18.1f}); if (DataTypeToEnum::value == DataType::DT_FLOAT) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { @@ -96,7 +88,7 @@ void TestNHWCSimple3x3VALID() { } 
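A note on the mace/ops/concat.cc hunk a little earlier: the `axis` argument reaches the op in NHWC terms (channels = 3), but the CPU ConcatOp now runs on NCHW tensors, so the first Run() remaps a 4-D input's axis once (3 -> 1 for channels, 2 -> 3 for width, 1 -> 2 for height). That is also why concat_test.cc now builds the op with axis 3 while checking shapes along axis 1. A condensed restatement of the added logic, for readability only (a switch is used here in place of the hunk's if/else chain):

// Remap an NHWC axis argument to the NCHW layout used at runtime.
// N stays at 0; H: 1 -> 2, W: 2 -> 3, C: 3 -> 1.
if (!checked_) {
  Validate();
  if (this->Input(0)->dim_size() == 4) {
    switch (axis_) {
      case 3: axis_ = 1; break;  // channels
      case 2: axis_ = 3; break;  // width
      case 1: axis_ = 2; break;  // height
      default: break;            // batch axis is unchanged
    }
  }
  checked_ = true;
}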
template -void TestNHWCSimple3x3SAME() { +void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { OpsTestNet net; // Add input data @@ -106,8 +98,9 @@ void TestNHWCSimple3x3SAME() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + net.AddInputFromArray("Bias", {1}, {0.1f}, true); + const std::vector output_shape = {1, 3, 3, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -126,35 +119,26 @@ void TestNHWCSimple3x3SAME() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } auto expected = net.CreateTensor( - {1, 3, 3, 1}, + output_shape, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); if (DataTypeToEnum::value == DataType::DT_FLOAT) { @@ -180,6 +164,14 @@ TEST_F(Conv2dOpTest, OPENCLHalfSimple) { TestNHWCSimple3x3SAME(); } +TEST_F(Conv2dOpTest, OPENCLSimpleWinograd) { + TestNHWCSimple3x3SAME(4); + TestNHWCSimple3x3VALID(2); + TestNHWCSimple3x3VALID(2); + // TODO(liutuo): the precision of the last value is not normal. 
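With wino_blk_size plumbed through, the GPU branch of these simple conv tests now builds the operator directly on buffer tensors and passes both an output shape hint and the requested Winograd block size. Roughly, with illustrative tensor names, the shapes used above, and the template arguments written out:

OpDefBuilder("Conv2D", "Conv2dTest")
    .Input("Input")
    .Input("Filter")
    .Input("Bias")
    .Output("Output")
    .OutputShape({1, 3, 3, 1})          // lets the op check Winograd eligibility
    .AddIntsArg("strides", {1, 1})
    .AddIntArg("padding", Padding::SAME)
    .AddIntsArg("dilations", {1, 1})
    .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
    .AddIntArg("wino_block_size", 2)    // 0 = direct kernel, 2 or 4 = Winograd
    .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU);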
+// TestNHWCSimple3x3SAME(4); +} + namespace { template void TestNHWCSimple3x3WithoutBias() { @@ -192,7 +184,7 @@ void TestNHWCSimple3x3WithoutBias() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -212,15 +204,10 @@ void TestNHWCSimple3x3WithoutBias() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Output("Output") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) @@ -228,9 +215,6 @@ void TestNHWCSimple3x3WithoutBias() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -265,8 +249,9 @@ void TestNHWCCombined3x3() { "Filter", {2, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}, + true); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -286,18 +271,11 @@ void TestNHWCCombined3x3() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) @@ -305,9 +283,6 @@ void TestNHWCCombined3x3() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -330,7 +305,7 @@ TEST_F(Conv2dOpTest, OPENCLStride2) { namespace { template -void TestFusedNHWCSimple3x3VALID() { +void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { OpsTestNet net; // Add input data net.AddInputFromArray( @@ -339,8 +314,9 @@ void TestFusedNHWCSimple3x3VALID() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {-0.1f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + net.AddInputFromArray("Bias", {1}, {-0.1f}, true); + const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -361,39 +337,30 @@ void TestFusedNHWCSimple3x3VALID() { 
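A recurring detail in these test hunks is the new trailing `true` passed to AddInputFromArray/AddRandomInput for filters and biases. It appears to mark the tensor as a constant weight so the op constructor can pre-transform it to the GPU layout, which is why the explicit BufferToImage/ImageToBuffer steps disappear from the GPU branches. When a test still wants a manual conversion it calls OpenCLBufferTransformer directly, as the buffer_to_image tests earlier in this diff do. A sketch of both pieces, with illustrative names and assumed template arguments:

OpsTestNet net;
OpContext context(net.ws(),
                  OpTestContext::Get()->GetDevice(DeviceType::GPU));
// The trailing `true` flags Filter/Bias as constant (weight) tensors.
net.AddRandomInput<DeviceType::GPU, float>("Input", {1, 3, 3, 2});
net.AddRandomInput<DeviceType::GPU, float>("Filter", {1, 2, 3, 3}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {1}, true);

// Explicit buffer -> image conversion, where a test still needs one.
Tensor *image = net.ws()->CreateTensor(
    "InputImage", context.device()->allocator(), DataTypeToEnum<float>::value);
OpenCLBufferTransformer<float>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
    .Transform(&context, net.ws()->GetTensor("Input"),
               OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE,
               /*wino_blk_size=*/0, image);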
net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddStringArg("activation", "RELU") + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } - auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor(output_shape, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } template -void TestFusedNHWCSimple3x3WithoutBias() { +void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { OpsTestNet net; // Add input data @@ -403,7 +370,8 @@ void TestFusedNHWCSimple3x3WithoutBias() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -424,32 +392,26 @@ void TestFusedNHWCSimple3x3WithoutBias() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddStringArg("activation", "RELU") + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } // Check - auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor(output_shape, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } @@ -466,6 +428,13 @@ TEST_F(Conv2dOpTest, FusedOPENCLSimple) { TestFusedNHWCSimple3x3WithoutBias(); } +TEST_F(Conv2dOpTest, FusedOPENCLSimpleWinograd) { + TestFusedNHWCSimple3x3VALID(2); + TestFusedNHWCSimple3x3WithoutBias(2); + TestFusedNHWCSimple3x3VALID(4); + TestFusedNHWCSimple3x3WithoutBias(4); +} + namespace { template void TestConv1x1() { @@ -484,8 +453,8 @@ void TestConv1x1() { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( "Filter", {2, 5, 1, 1}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, true); + 
net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -504,27 +473,17 @@ void TestConv1x1() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -550,7 +509,8 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } namespace { template void TestComplexConvNxNS12(const std::vector &shape, - const int stride) { + const int stride, + const int wino_blk_size = 0) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type) { @@ -559,16 +519,16 @@ void TestComplexConvNxNS12(const std::vector &shape, index_t batch = 3 + (rand_r(&seed) % 10); index_t height = shape[0]; index_t width = shape[1]; - index_t input_channels = shape[2] + (rand_r(&seed) % 10); - index_t output_channels = shape[3] + (rand_r(&seed) % 10); + index_t input_channels = shape[2]; + index_t output_channels = shape[3]; OpsTestNet net; // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -595,28 +555,20 @@ void TestComplexConvNxNS12(const std::vector &shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") + .OutputShape(expected->shape()) .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -633,11 +585,21 @@ TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TestComplexConvNxNS12({32, 16, 16, 32}, 2); } +TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNWinograd) { + TestComplexConvNxNS12({32, 16, 16, 32}, 1, 2); + TestComplexConvNxNS12({32, 16, 16, 32}, 1, 4); +} + TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TestComplexConvNxNS12({17, 113, 5, 7}, 1); 
TestComplexConvNxNS12({17, 113, 5, 7}, 2); } +TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNWinograd) { + TestComplexConvNxNS12({17, 113, 5, 7}, 1, 4); + TestComplexConvNxNS12({17, 113, 5, 7}, 1, 2); +} + TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { TestComplexConvNxNS12({31, 113, 13, 17}, 3); TestComplexConvNxNS12({32, 32, 13, 17}, 4); @@ -647,13 +609,14 @@ namespace { template void TestHalfComplexConvNxNS12(const std::vector &input_shape, const std::vector &filter_shape, - const std::vector &dilations) { + const std::vector &dilations, + const int wino_blk_size = 0) { testing::internal::LogToStderr(); srand(time(NULL)); - auto func = [&](int stride_h, int stride_w, Padding padding) { + auto func = [&](index_t batch, int stride_h, int stride_w, Padding padding) { // generate random input - index_t batch = 1; + static unsigned int seed = time(NULL); index_t height = input_shape[0]; index_t width = input_shape[1]; index_t kernel_h = filter_shape[0]; @@ -677,8 +640,11 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, "Input", {batch, height, width, input_channels}, float_input_data); net.AddInputFromArray( "Filter", {output_channels, input_channels, kernel_h, kernel_w}, - float_filter_data); - net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + float_filter_data, true); + net.AddInputFromArray("Bias", + {output_channels}, + float_bias_data, + true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -704,38 +670,31 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") + .OutputShape(expected->shape()) .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {dilations[0], dilations[1]}) .AddIntArg("T", static_cast(DataType::DT_HALF)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); }; - func(1, 1, VALID); - func(1, 1, SAME); - if (dilations[0] == 1) { - func(2, 2, VALID); - func(2, 2, SAME); + for (auto batch : {1, 5}) { + func(batch, 1, 1, VALID); + func(batch, 1, 1, SAME); + if (dilations[0] == 1 && wino_blk_size == 0) { + func(batch, 2, 2, VALID); + func(batch, 2, 2, SAME); + } } } } // namespace @@ -748,6 +707,14 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, {1, 1}); } +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3Winograd) { + TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, + {1, 1}, 2); +// TODO(liutuo) : the precision error is large. 
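The commented-out block-size-4 half cases above are skipped for the precision reason given in the TODO; the cases that remain compare the GPU half result against a float CPU reference with relaxed tolerances. The dtype-dependent check added further down for TestArbitraryPadConvNxN shows the pattern:

// Half outputs accumulate more rounding error, so loosen the bounds.
if (DataTypeToEnum<T>::value == DT_HALF) {
  ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                          1e-1, 1e-2);
} else {
  ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                          1e-4, 1e-4);
}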
+// TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, +// {1, 1}, 4); +} + TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv5x5S12) { TestHalfComplexConvNxNS12({32, 32}, {5, 5, 3, 64}, {1, 1}); TestHalfComplexConvNxNS12({32, 32}, {5, 5, 3, 63}, {1, 1}); @@ -795,6 +762,14 @@ TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, {1, 1}); } +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3Winograd) { + // TODO(liutuo) : the precision error is large. +// TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, +// {1, 1}, 4); + TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, + {1, 1}, 2); +} + TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { TestHalfComplexConvNxNS12({64, 64}, {5, 5, 16, 16}, {2, 2}); } @@ -828,8 +803,8 @@ void TestDilationConvNxN(const std::vector &shape, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -856,18 +831,11 @@ void TestDilationConvNxN(const std::vector &shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {dilation_rate, dilation_rate}) @@ -875,9 +843,6 @@ void TestDilationConvNxN(const std::vector &shape, .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -927,8 +892,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -953,18 +918,11 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", dilations) @@ -973,8 +931,6 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, // Run on device net.RunOp(D); - 
ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -996,7 +952,8 @@ TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) { namespace { template void TestArbitraryPadConvNxN(const std::vector &shape, - const std::vector &paddings) { + const std::vector &paddings, + const int wino_blk_size = 0) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) { srand(time(NULL)); @@ -1011,10 +968,11 @@ void TestArbitraryPadConvNxN(const std::vector &shape, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Input", + {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -1026,7 +984,6 @@ void TestArbitraryPadConvNxN(const std::vector &shape, .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("padding_values", paddings) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // run on cpu @@ -1040,34 +997,35 @@ void TestArbitraryPadConvNxN(const std::vector &shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") + .OutputShape(expected->shape()) .AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("padding_values", paddings) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, - 1e-4); + if (DataTypeToEnum::value == DT_HALF) { + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, + 1e-2); + } else { + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, + 1e-4); + } }; - for (int kernel_size : {3, 5, 7}) { - for (int stride : {2, 3}) { - func(kernel_size, kernel_size, stride, stride); + if (wino_blk_size != 0) { + func(3, 3, 1, 1); + } else { + for (int kernel_size : {3, 5, 7}) { + for (int stride : {2, 3}) { + func(kernel_size, kernel_size, stride, stride); + } } } } @@ -1081,8 +1039,24 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { TestArbitraryPadConvNxN({128, 128, 16, 16}, {2, 2}); } +TEST_F(Conv2dOpTest, OPENCLAlignedPad2Winograd) { + TestArbitraryPadConvNxN({128, 128, 16, 16}, + {2, 2}, 2); + TestArbitraryPadConvNxN({128, 128, 16, 16}, + {2, 2}, 4); +} + TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}); + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}); +} + +TEST_F(Conv2dOpTest, OPENCLUnalignedPad4Winograd) { + TestArbitraryPadConvNxN({107, 113, 5, 7}, {1, 1}, 2); + 
TestArbitraryPadConvNxN({107, 113, 5, 7}, {1, 1}, 2); + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}, 4); + // TODO(liutuo) : the precision error is large. + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}, 4); } namespace { @@ -1094,13 +1068,13 @@ void TestQuantSimple3x3() { net.AddInputFromArray( "Filter", {1, 3, 3, 2}, {102, 150, 123, 135, 1, 216, 137, 47, 53, 75, 145, 130, 171, 62, 255, - 122, 72, 211}, 0.0226, 127); + 122, 72, 211}, true, 0.0226, 127); net.AddInputFromArray( "Input", {1, 3, 3, 2}, {1, 75, 117, 161, 127, 119, 94, 151, 203, 151, 84, 61, 55, 142, 113, 139, - 3, 255}, 0.0204, 93); + 3, 255}, false, 0.0204, 93); - net.AddInputFromArray("Bias", {1}, {2}); + net.AddInputFromArray("Bias", {1}, {2}, true); OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") @@ -1136,12 +1110,14 @@ void TestQuant(const index_t batch, net.AddRandomInput("Input", {batch, in_height, in_width, in_channels}); net.AddRandomInput("Filter", {out_channels, k_height, k_width, - in_channels}); - net.AddRandomInput("Bias", {out_channels}); + in_channels}, true); + net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", OHWI, "FilterOIHW", - OIHW); + net.TransformFilterDataFormat("Filter", + OHWI, + "FilterOIHW", + OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -1193,7 +1169,7 @@ void TestQuant(const index_t batch, bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0, q_bias.data()); net.AddInputFromArray("QuantizedBias", - {out_channels}, q_bias); + {out_channels}, q_bias, true); OpDefBuilder("Conv2D", "QuantizeConv2dTest") .Input("QuantizedInput") .Input("QuantizedFilter") diff --git a/mace/ops/conv_pool_2d_util.cc b/mace/ops/conv_pool_2d_util.cc index a056743e85af91b562781d9821aebad87115221d..fcc44e789dbeb55f6455655420566e514f0fa1a3 100644 --- a/mace/ops/conv_pool_2d_util.cc +++ b/mace/ops/conv_pool_2d_util.cc @@ -24,7 +24,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/conv_pool_2d_util.h b/mace/ops/conv_pool_2d_util.h index 0e45c31e4be04938adc3f4e4271b8c6140106fb0..78333717e098903e46704d870d4f93a41f52b018 100644 --- a/mace/ops/conv_pool_2d_util.h +++ b/mace/ops/conv_pool_2d_util.h @@ -35,7 +35,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc deleted file mode 100644 index 
3e3185b34ada65c40fb03e398a2033cb020217f9..0000000000000000000000000000000000000000 --- a/mace/ops/core_test.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -TEST(CoreTest, INIT_MODE) { - std::vector op_defs; - - Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); - std::unique_ptr> tuner; - Workspace ws; - - op_defs.emplace_back(OperatorDef()); - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER) - .AddIntArg("mode", static_cast(NetMode::INIT)) - .Finalize(&op_defs[op_defs.size() - 1]); - - Tensor *input = ws.CreateTensor("Input", device->allocator(), - DataTypeToEnum::v()); - input->Resize({1, 3, 3, 3}); - { - Tensor::MappingGuard input_mapper(input); - float *input_data = input->mutable_data(); - std::fill(input_data, input_data + input->size(), 1); - } - - op_defs.emplace_back(OperatorDef()); - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("Output") - .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER) - .Finalize(&op_defs[op_defs.size() - 1]); - - NetDef net_def; - for (auto &op_def : op_defs) { - net_def.add_op()->CopyFrom(op_def); - } - std::shared_ptr op_registry(new OpRegistry()); - auto net = std::unique_ptr(new SerialNet( - op_registry.get(), &net_def, &ws, device, - NetMode::INIT)); - MaceStatus status = net->Init(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - status = net->Run(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - - EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); - EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - net = std::unique_ptr(new SerialNet( - op_registry.get(), &net_def, &ws, device)); - status = net->Init(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - status = net->Run(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - EXPECT_TRUE(ws.GetTensor("Output") != nullptr); - - ExpectTensorNear(*ws.GetTensor("Input"), *ws.GetTensor("Output"), - 1e-5); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index b186cecc8437773c96a494fa8ad3066cf8027625..aad6f93d610e8ac6eed96bd0aef9bcbcbf27cdca 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6); namespace { template -void OpenclCropHelper(int iters, +void OpenCLCropHelper(int iters, const std::vector &shape0, const std::vector &shape1, int crop_axis, @@ -79,16 +79,12 @@ void OpenclCropHelper(int iters, net.AddRandomInput("Input0", shape0); net.AddRandomInput("Input1", shape1); - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Crop", "CropBM") 
- .Input("InputImage0") - .Input("InputImage1") + .Input("Input0") + .Input("Input1") .AddIntArg("axis", crop_axis) .AddIntsArg("offset", {offset}) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); @@ -114,7 +110,7 @@ void OpenclCropHelper(int iters, _##TYPE(int iters) { \ std::vector shape0 = {N, H, W, C}; \ std::vector shape1 = {N / 2, H / 2, W / 2, C / 2}; \ - OpenclCropHelper(iters, shape0, shape1, AXIS, OFFSET); \ + OpenCLCropHelper(iters, shape0, shape1, AXIS, OFFSET); \ } \ MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ ##_##TYPE) diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index efada981b70a3316bce239d39912845484e85c5a..b757946c4c933bab9d6bf241fc589c5afa063566 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -34,14 +34,10 @@ void RunCrop(const std::vector &input_shape, net.AddRandomInput("Input1", input_shape2); if (D == GPU) { - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Crop", "CropTest") - .Input("InputImage0") - .Input("InputImage1") - .Output("OutputImage") + .Input("Input0") + .Input("Input1") + .Output("Output") .AddIntsArg("offset", offset) .AddIntArg("axis", axis) .Finalize(net.NewOperatorDef()); @@ -66,10 +62,7 @@ void RunCrop(const std::vector &input_shape, // Run net.RunOp(D); - if (D == GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else if (D == CPU) { + if (D == CPU) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index c9113439536746e9ce05d33e4b20feb35a075060..0b11667e39843378d7b58e86abefb15fa76fae89 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -30,6 +30,7 @@ #include "mace/ops/arm/deconv_2d_neon.h" #include "mace/utils/utils.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/deconv_2d.h" #endif // MACE_ENABLE_OPENCL @@ -358,11 +359,27 @@ class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { + MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::Deconv2dKernel); } else { MACE_NOT_IMPLEMENTED; } + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (model_type_ == FrameworkType::CAFFE) { + if (operator_def_->input_size() >= 3) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, + OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); + } + } else if (operator_def_->input_size() >= 4) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 175feacabcd514408a76434f9ee84f0fdbecdfe2..81be17c092ad0d6e91bbdf0514a4c0d94e641b10 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -47,40 +47,21 @@ static void Deconv2d(int iters, } net.AddRandomInput("Filter", {output_channels, channels, kernel_h, - kernel_w}); - net.AddRandomInput("Bias", 
{output_channels}); + kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.AddInputFromArray("OutputShape", {4}, - {batch, out_h, out_w, output_channels}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("OutputShape") - .Input("BiasImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("Input") - .Input("Filter") - .Input("OutputShape") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } - + {batch, out_h, out_w, output_channels}, + true); + OpDefBuilder("Deconv2D", "Deconv2dTest") + .Input("Input") + .Input("Filter") + .Input("OutputShape") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.Setup(D); // Warm-up diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 9aadd42c0c345da3bac268f1c645639850fafc80..1847c9432bb642175facf3a19f757ad4dd653e4b 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -41,40 +41,34 @@ void RunTestSimple(const std::vector &input_shape, ops::FrameworkType model_type) { OpsTestNet net; // Add input data - const index_t batch = input_shape[0]; const index_t out_channels = filter_shape[2]; net.AddInputFromArray("Input", input_shape, input_data); - net.AddInputFromArray("Filter", filter_shape, filter_data); - net.AddInputFromArray("Bias", {out_channels}, bias_data); - net.TransformDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.AddInputFromArray("Filter", filter_shape, filter_data, true); + net.AddInputFromArray("Bias", {out_channels}, bias_data, true); + // TODO(liutuo): remove the unused transform + net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "FilterOIHW", "FilterImage", - ops::BufferType::CONV2D_FILTER); if (model_type == ops::FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("FilterOIHW") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", padding) .AddIntsArg("padding_values", padding_size) .AddIntArg("framework_type", model_type) .Finalize(net.NewOperatorDef()); } else { - net.AddInputFromArray("OutputShape", {4}, output_shape); + net.AddInputFromArray("OutputShape", {4}, output_shape, true); OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") + .Input("Input") + .Input("FilterOIHW") .Input("OutputShape") - .Input("BiasImage") - .Output("OutputImage") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) 
.AddIntArg("padding", padding) .AddIntsArg("padding_values", padding_size) @@ -82,10 +76,6 @@ void RunTestSimple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); } net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -102,7 +92,7 @@ void RunTestSimple(const std::vector &input_shape, .AddIntArg("framework_type", model_type) .Finalize(net.NewOperatorDef()); } else { - net.AddInputFromArray("OutputShape", {4}, output_shape); + net.AddInputFromArray("OutputShape", {4}, output_shape, true); OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") @@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); int out_h = 0; @@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch, output_shape.push_back(out_h); output_shape.push_back(out_w); output_shape.push_back(output_channels); - net.AddInputFromArray("OutputShape", {4}, output_shape); + net.AddInputFromArray("OutputShape", {4}, output_shape, true); } else { paddings.push_back(padding); paddings.push_back(padding); @@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - if (model_type == ops::FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("padding_values", paddings) .AddIntArg("framework_type", model_type) @@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch, .Finalize(net.NewOperatorDef()); } else { OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") + .Input("Input") + .Input("Filter") .Input("OutputShape") - .Input("BiasImage") - .Output("OutputImage") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntArg("framework_type", model_type) @@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch, // Run on device net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-4, 1e-4); }; diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index 822bf8f018793c3d12e3ccd7820cb8d9196d044d..c9c6dd4016b97869289388ecbfbe200347846269 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -36,23 +36,12 @@ void DepthToSpace( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("DepthToSpace", "DepthToSpaceBM") + OpDefBuilder("DepthToSpace", "DepthToSpaceBM") .Input("Input") 
.Output("Output") + .AddIntArg("block_size", block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("DepthToSpace", "DepthToSpaceBM") - .Input("InputImage") - .Output("Output") - .AddIntArg("block_size", block_size) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index aa9b9c28d83020c0acdcf85e92f5647e4c84d678..c369bd67f4d034ba9a9e9468be73459ec002f19a 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector &input_shape, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("block_size", block_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - if (D == DeviceType::GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -134,28 +128,23 @@ void RandomTest(const int block_size, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("DepthToSpace", "DepthToSpaceTest") - .Input("InputImg") + .Input("Input") .AddIntArg("block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") + .Output("GPUOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), - *net.GetOutput("OPENCLOutput"), 1e-5); + *net.GetOutput("GPUOutput"), 1e-5); } else { ExpectTensorNear(*net.GetTensor("Output"), - *net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); + *net.GetOutput("GPUOutput"), 1e-3, 1e-4); } } } // namespace diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 47f45e4a35277726aefe948a7fa5079b0616c2c2..8a85ab464ca0911b95a3ea4f039e1c61eb60da17 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -499,13 +499,17 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); } + context->set_output_mem_type(mem_type); // Transform filter tensor to target format MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); + context, + operator_def_.get(), + 1, + OpenCLBufferType::DW_CONV2D_FILTER, + mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 54f3e8b739305e748dbf38fb76fd41c6c72c4fb1..4d44a9bc136b59fc5e29dd93343638f65b58db88 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ 
b/mace/ops/depthwise_conv2d_benchmark.cc @@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters, } if (DataTypeToEnum::value != DT_UINT8) { net.AddRandomInput( - "Filter", {multiplier, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {input_channels * multiplier}); + "Filter", {multiplier, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {input_channels * multiplier}, true); } else { net.AddRandomInput( - "Filter", {kernel_h, kernel_w, input_channels, multiplier}); + "Filter", {kernel_h, kernel_w, input_channels, multiplier}, true); net.GetTensor("Filter")->SetScale(0.1); net.AddRandomInput( - "Bias", {input_channels * multiplier}); + "Bias", {input_channels * multiplier}, true); } - if (D == DeviceType::CPU) { - OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") .Input("Input") .Input("Filter") .Input("Bias") @@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters, .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } net.Setup(D); diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index dfb76b44dea241f9aa44fa0e9a1f5c3f5e088d3c..d757bf097b1b18720f5e79ad053c911fa0a6d609 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -33,8 +33,11 @@ void SimpleValidTest() { "Input", {1, 3, 3, 2}, {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); net.AddInputFromArray( - "Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}); - net.AddInputFromArray("Bias", {2}, {.1f, .2f}); + "Filter", + {1, 2, 2, 2}, + {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}, + true); + net.AddInputFromArray("Bias", {2}, {.1f, .2f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -52,17 +55,11 @@ void SimpleValidTest() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) @@ -70,11 +67,6 @@ void SimpleValidTest() { .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } @@ -126,10 +118,13 @@ void ComplexValidTest(index_t batch, 
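// Throughout these test and benchmark hunks, constant tensors (filters,
// biases, output-shape tensors) gain an extra boolean argument when they are
// added to the net. This presumably marks them as weights so that the
// constructor-time TransformFilter calls can find and pre-transform them;
// the parameter name below is an assumption, the call pattern is taken from
// the hunks themselves (template arguments reconstructed):
net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channel});          // activation: not a weight
net.AddRandomInput<DeviceType::GPU, float>(
    "Filter", {multiplier, channel, kernel_h, kernel_w},
    /*is_weight=*/true);                                 // constant: flagged
net.AddRandomInput<DeviceType::GPU, float>(
    "Bias", {multiplier * channel}, /*is_weight=*/true);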
GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, &filter_data); net.AddInputFromArray( - "Filter", {multiplier, channel, kernel, kernel}, filter_data); + "Filter", {multiplier, channel, kernel, kernel}, filter_data, true); std::vector bias_data(channel * multiplier); GenerateRandomRealTypeData({channel * multiplier}, &bias_data); - net.AddInputFromArray("Bias", {channel * multiplier}, bias_data); + net.AddInputFromArray("Bias", + {channel * multiplier}, + bias_data, + true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -149,17 +144,11 @@ void ComplexValidTest(index_t batch, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) @@ -167,11 +156,6 @@ void ComplexValidTest(index_t batch, .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } @@ -182,7 +166,7 @@ void ComplexValidTest(index_t batch, index_t pad_top = ((out_height - 1) * stride + kernel - height) >> 1; index_t pad_left = ((out_width - 1) * stride + kernel - width) >> 1; index_t out_channels = channel * multiplier; - std::vector expect(batch * out_height * out_width * out_channels); + std::vector expect(batch * out_height * out_width * out_channels); for (index_t b = 0; b < batch; ++b) { for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { @@ -212,12 +196,12 @@ void ComplexValidTest(index_t batch, } auto expected = - net.CreateTensor({1, out_height, out_width, out_channels}, expect); + net.CreateTensor({1, out_height, out_width, out_channels}, expect); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } } } // namespace @@ -260,9 +244,10 @@ void TestNxNS12(const index_t height, const index_t width) { net.AddRandomInput( "Input", {batch, height, width, channel}); net.AddRandomInput( - "Filter", {multiplier, channel, kernel_h, kernel_w}); + "Filter", {multiplier, channel, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", - {multiplier * channel}); + {multiplier * channel}, + true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -289,17 +274,11 @@ void TestNxNS12(const index_t height, const index_t width) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - 
.Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) @@ -309,17 +288,12 @@ void TestNxNS12(const index_t height, const index_t width) { .Finalize(net.NewOperatorDef()); net.RunOp(DeviceType::GPU); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "DeviceOutput", - ops::BufferType::IN_OUT_CHANNEL); - // Check if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } else { - ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } }; @@ -366,12 +340,12 @@ void QuantSimpleValidTest() { net.AddInputFromArray( "Input", {1, 3, 3, 2}, {31, 98, 1, 54, 197, 172, 70, 146, 255, 71, 24, 182, 28, 78, 85, 96, 180, - 59}, 0.00735299, 86); + 59}, false, 0.00735299, 86); net.AddInputFromArray( "Filter", {3, 3, 2, 1}, {212, 239, 110, 170, 216, 91, 162, 161, 255, 2, 10, 120, 183, 101, 100, - 33, 137, 51}, 0.0137587, 120); - net.AddInputFromArray("Bias", {2}, {2, 2}); + 33, 137, 51}, true, 0.0137587, 120); + net.AddInputFromArray("Bias", {2}, {2, 2}, true); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") .Input("Filter") @@ -408,13 +382,13 @@ void TestQuant(const index_t batch, OpsTestNet net; const index_t out_channels = multiplier * in_channels; net.AddRandomInput( - "Input", {batch, in_height, in_width, in_channels}, false); + "Input", {batch, in_height, in_width, in_channels}, false, false); net.AddRandomInput( - "Filter", {k_height, k_width, in_channels, multiplier}, false); - net.AddRandomInput("Bias", {out_channels}); + "Filter", {k_height, k_width, in_channels, multiplier}, true, false); + net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat( "Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat( + net.TransformFilterDataFormat( "Filter", HWIO, "FilterOIHW", OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") @@ -467,7 +441,7 @@ void TestQuant(const index_t batch, bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0, q_bias.data()); net.AddInputFromArray( - "QuantizedBias", {out_channels}, q_bias); + "QuantizedBias", {out_channels}, q_bias, true); OpDefBuilder("DepthwiseConv2d", "QuantizedDepthwiseConv2DTest") .Input("QuantizedInput") .Input("QuantizedFilter") diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index e3dcb1e00bcdb24208fd2e501c9a12439677f8cb..3f10a514cec8712b583b1f0fcae2166fe747da46 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -29,6 +29,7 @@ #include "mace/utils/utils.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h" #endif // MACE_ENABLE_OPENCL @@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { + MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel); } else { MACE_NOT_IMPLEMENTED; } + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::DW_CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() >= 
3) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, + OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc index 3e3da26fe31bda1f5c3873bbf5f143309bd0247e..081e10d27ce6748d397f635d53b9f74673a15c20 100644 --- a/mace/ops/depthwise_deconv2d_benchmark.cc +++ b/mace/ops/depthwise_deconv2d_benchmark.cc @@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters, } net.AddRandomInput("Filter", {1, channels, kernel_h, - kernel_w}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntArg("group", channels) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("Input") - .Input("Filter") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntArg("group", channels) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } + kernel_w}, true); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("group", channels) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.Setup(D); diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index b1f36845c7d351bfad879be4b0895071b9842a68..fe3b0b18a06c076d87e130b2dc1b17f1599577b1 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -38,33 +38,23 @@ void RunTestSimple(const int group, OpsTestNet net; // Add input data net.AddInputFromArray("Input", input_shape, input_data); - net.AddInputFromArray("Filter", filter_shape, filter_data); - net.TransformDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.AddInputFromArray("Filter", filter_shape, filter_data, true); + net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); const index_t out_channels = expected_shape[3]; - net.AddInputFromArray("Bias", {out_channels}, bias_data); + net.AddInputFromArray("Bias", {out_channels}, bias_data, true); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "FilterOIHW", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("FilterOIHW") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntArg("group", group) .AddIntsArg("padding_values", paddings) .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ 
-161,22 +151,22 @@ TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Depthwise) { } TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Group) { -TestNHWCSimple3x3_Group(); + TestNHWCSimple3x3_Group(); } TEST_F(DepthwiseDeconv2dOpTest, GPUSimple3X3Depthwise) { -TestNHWCSimple3x3_DW(); + TestNHWCSimple3x3_DW(); } namespace { template void RandomTest(index_t batch, - index_t channel, - index_t height, - index_t width, - index_t kernel, - int stride, - int padding) { + index_t channel, + index_t height, + index_t width, + index_t kernel, + int stride, + int padding) { testing::internal::LogToStderr(); // Construct graph OpsTestNet net; @@ -195,12 +185,12 @@ void RandomTest(index_t batch, GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, &filter_data); net.AddInputFromArray( - "Filter", {multiplier, channel, kernel, kernel}, filter_data); + "Filter", {multiplier, channel, kernel, kernel}, filter_data, true); std::vector bias_data(channel * multiplier); GenerateRandomRealTypeData({channel * multiplier}, &bias_data); net.AddInputFromArray("Bias", {channel * multiplier}, - bias_data); + bias_data, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -226,17 +216,11 @@ void RandomTest(index_t batch, expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntsArg("padding_values", {padding, padding}) .AddIntArg("group", channel) @@ -245,14 +229,10 @@ void RandomTest(index_t batch, net.RunOp(DeviceType::GPU); - // Transfer output - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } } diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 96384cde4fac981064952a7cc7f916e671b63ab6..863b69edc2033e54866f5935b097d4f93c968395 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1097,13 +1097,16 @@ class EltwiseOp : public Operation { } // Transform filters int input_size = operator_def_->input_size(); + Workspace *ws = context->workspace(); for (int i = 0; i < input_size; ++i) { - const Tensor *input_tensor = context->workspace()->GetTensor( - operator_def_->input(i)); - if (input_tensor != nullptr && input_tensor->is_weight()) { + if (ws->HasTensor(operator_def_->input(i)) && + ws->GetTensor(operator_def_->input(i))->is_weight()) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); + context, + operator_def_.get(), + i, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); } } } diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 82fbc63f25b3b587d1d839d7e4c69d5090038a89..95808bc336a46231d920a7c409e846b89725e2ed 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -30,37 +30,23 @@ void 
EltwiseBenchmark( OpsTestNet net; // Add input data - net.AddRandomInput("Input0", {n, h, w, c}); - net.AddRandomInput("Input1", {n, h, w, c}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg0") - .Input("InputImg1") - .AddIntArg("type", static_cast(type)) - .AddFloatsArg("coeff", {1.2, 2.1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); } else { - net.TransformDataFormat("Input0", NHWC, - "TInput0", NCHW); - net.TransformDataFormat("Input1", NHWC, - "TInput1", NCHW); - OpDefBuilder("Eltwise", "EltwiseTest") - .Input("TInput0") - .Input("TInput1") - .AddIntArg("type", static_cast(type)) - .AddFloatsArg("coeff", {1.2, 2.1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("Output") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input0", {n, c, h, w}); + net.AddRandomInput("Input1", {n, c, h, w}); } + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("Input0") + .Input("Input1") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", {1.2, 2.1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index ac920ac00f1150ca3336e78e4087bd2ec0ce545a..a6d7ea21d313ca9a979f3473285daa91d25b0c08 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -82,20 +82,15 @@ void SimpleTensorScalar(const ops::EltwiseType type, net.RunOp(D); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg") + .Input("Input") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", x) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } auto expected = net.CreateTensor(shape, output); @@ -145,23 +140,16 @@ void SimpleTensorEltwise(const ops::EltwiseType type, net.RunOp(D); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); } else { - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg0") - .Input("InputImg1") + .Input("Input0") + .Input("Input1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } std::vector output_shape = shape0; @@ -204,26 +192,19 @@ void TensorGeneralBroadcastEltwise(const ops::EltwiseType type, // Run net.RunOp(D); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest") .AddIntArg("T", DataTypeToEnum::v()) - .Input("InputImage0") - .Input("InputImage1") + .Input("Input0") + 
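// In the eltwise.cc hunk further above, the constructor no longer transforms
// every input blindly: it asks the workspace whether the input exists and is
// a weight, and only then converts it to an ARGUMENT buffer, leaving runtime
// activations to the normal input-transform path. Sketch (<T> reconstructed):
int input_size = operator_def_->input_size();
Workspace *ws = context->workspace();
for (int i = 0; i < input_size; ++i) {
  if (ws->HasTensor(operator_def_->input(i)) &&
      ws->GetTensor(operator_def_->input(i))->is_weight()) {
    MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), i,
                                  OpenCLBufferType::ARGUMENT, mem_type)
               == MaceStatus::MACE_SUCCESS);
  }
}
// The eltwise benchmark above also stops calling TransformDataFormat and
// instead allocates its random inputs directly in the device's native
// layout: NHWC ({n, h, w, c}) for GPU, NCHW ({n, c, h, w}) for CPU.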
.Input("Input1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) - .Output("OutputImage"); + .Output("Output"); op_builder.Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -483,7 +464,7 @@ void RandomTensorScalar(const ops::EltwiseType type, OpsTestNet net; // Add input data - net.AddRandomInput("Input", shape, true, true); + net.AddRandomInput("Input", shape, false, true, true); net.TransformDataFormat("Input", NHWC, "TInput", NCHW); @@ -501,26 +482,21 @@ void RandomTensorScalar(const ops::EltwiseType type, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg") + .Input("Input") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", 0.1) - .Output("OutputImg") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImg", "GPUOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } @@ -533,8 +509,16 @@ void RandomTensorEltwise(const ops::EltwiseType type, OpsTestNet net; // Add input data - net.AddRandomInput("Input0", shape0, true, true); - net.AddRandomInput("Input1", shape1, true, true); + net.AddRandomInput("Input0", + shape0, + false, + true, + true); + net.AddRandomInput("Input1", + shape1, + false, + true, + true); net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); @@ -556,29 +540,22 @@ void RandomTensorEltwise(const ops::EltwiseType type, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg0") - .Input("InputImg1") + .Input("Input0") + .Input("Input1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImg", "GPUOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } @@ -587,8 +564,16 @@ void QuantizedSum(const std::vector &shape) { OpsTestNet net; // Add input data - net.AddRandomInput("Input0", shape, true, true); - net.AddRandomInput("Input1", shape, true, true); + net.AddRandomInput("Input0", + shape, + false, + true, + true); + net.AddRandomInput("Input1", + shape, + false, + true, + true); net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); diff --git a/mace/ops/folded_batch_norm_test.cc 
b/mace/ops/folded_batch_norm_test.cc index 35e69cd159942d328ab46d9ef54fb98acef66e4e..e645eb4e8b7fb665e821a6affc07433953702421 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -45,8 +45,8 @@ void Simple() { std::vector scale(1); std::vector offset(1); CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, &scale, &offset); - net.AddInputFromArray("Scale", {1}, scale); - net.AddInputFromArray("Offset", {1}, offset); + net.AddInputFromArray("Scale", {1}, scale, true); + net.AddInputFromArray("Offset", {1}, offset, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -60,25 +60,14 @@ void Simple() { net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } // Check @@ -108,8 +97,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -132,27 +121,16 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - net.Sync(); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -170,8 +148,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -194,18 +172,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - 
.Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -213,9 +184,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } @@ -233,8 +202,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -257,26 +226,17 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -318,27 +278,18 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index cea80bf51e998300b53e5c8729a66aa82147fc0b..ef919d9292bab8b2474a40ab30053b587bd79d96 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -202,11 +202,14 @@ class FullyConnectedOp : public FullyConnectedOpBase { } // Transform filter tensor to target format MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type) - == MaceStatus::MACE_SUCCESS); + context, + operator_def_.get(), + 1, + OpenCLBufferType::WEIGHT_WIDTH, + mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + 
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index 6b75e60dd93648045af1719947708735ab1226c3..bb27c97dcdf2197c6f1e60ef59589b4d7a39b429 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -30,42 +30,25 @@ void FCBenchmark( OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channel}); + if (D == DeviceType::GPU) { + net.AddRandomInput("Input", {batch, height, width, channel}); + } else { + net.AddRandomInput("Input", {batch, channel, height, width}); + } + net.AddRandomInput("Weight", - {out_channel, channel, height, width}); - net.AddRandomInput("Bias", {out_channel}); - - if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputNCHW") + {out_channel, channel, height, width}, true); + net.AddRandomInput("Bias", {out_channel}, true); + + OpenCLBufferType weight_type = OpenCLBufferType::WEIGHT_WIDTH; + OpDefBuilder("FullyConnected", "FullyConnectedTest") + .Input("Input") .Input("Weight") .Input("Bias") .Output("Output") + .AddIntArg("weight_type", static_cast(weight_type)) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - ops::BufferType weight_type = ops::BufferType::WEIGHT_WIDTH; - BufferToImage(&net, "Weight", "WeightImage", - weight_type); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntArg("weight_type", static_cast(weight_type)) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 0fd98848d323198dfdf108b7f25ad51667dd6ada..26134bb5e140e4b01aecb3c87a63dcb95bcf6aff 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -36,8 +36,8 @@ void Simple(const std::vector &input_shape, // Add input data net.AddInputFromArray("Input", input_shape, input_value); - net.AddInputFromArray("Weight", weight_shape, weight_value); - net.AddInputFromArray("Bias", bias_shape, bias_value); + net.AddInputFromArray("Weight", weight_shape, weight_value, true); + net.AddInputFromArray("Bias", bias_shape, bias_value, true); if (D == DeviceType::CPU) { OpDefBuilder("FullyConnected", "FullyConnectedTest") @@ -50,25 +50,14 @@ void Simple(const std::vector &input_shape, net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Weight") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - 
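// FullyConnected follows the same constructor-transform pattern, but its
// weight (input 1) uses the WEIGHT_WIDTH image layout rather than
// CONV2D_FILTER, and the bias (input 2) becomes an ARGUMENT buffer. The
// benchmark now records that choice through an explicit "weight_type" op
// argument instead of calling BufferToImage itself; roughly (the casts are
// reconstructed, since extraction dropped the angle brackets):
OpenCLBufferType weight_type = OpenCLBufferType::WEIGHT_WIDTH;
OpDefBuilder("FullyConnected", "FullyConnectedTest")
    .Input("Input")
    .Input("Weight")
    .Input("Bias")
    .Output("Output")
    .AddIntArg("weight_type", static_cast<int>(weight_type))
    .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
    .Finalize(net.NewOperatorDef());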
ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -136,8 +125,8 @@ void Random(const index_t batch, net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput( - "Weight", {out_channel, channels, height, width}); - net.AddRandomInput("Bias", {out_channel}); + "Weight", {out_channel, channels, height, width}, true); + net.AddRandomInput("Bias", {out_channel}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -158,31 +147,22 @@ void Random(const index_t batch, expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Weight") + .Input("Bias") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-1, 1e-1); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-3); } } @@ -228,10 +208,10 @@ void QuantRandom(const index_t batch, net.AddRandomInput( "Input", {batch, height, width, channels}); net.AddRandomInput( - "Weight", {out_channel, height, width, channels}); - net.AddRandomInput("Bias", {out_channel}); + "Weight", {out_channel, height, width, channels}, true); + net.AddRandomInput("Bias", {out_channel}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Weight", OHWI, "WeightOIHW", OIHW); + net.TransformFilterDataFormat("Weight", OHWI, "WeightOIHW", OIHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index 19abafe098abbbe97fb93e46c13c22af30f6bb0d..dfbfa155a31377dbbbd20cbd7d6c6ebe5df48838 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -16,6 +16,7 @@ #include #include "mace/core/operator.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/lstm_cell.h" namespace mace { @@ -30,13 +31,43 @@ class LSTMCellOp : public Operation { explicit LSTMCellOp(OpConstructContext *context) : Operation(context) { T forget_bias = static_cast( - Operation::GetOptionalArg("scalar_input", - 0.0)); + Operation::GetOptionalArg("scalar_input", + 0.0)); + MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); } else { MACE_NOT_IMPLEMENTED; } + // Transform filters + const Tensor *pre_output = context->workspace()->GetTensor( + operator_def_->input(1)); + if (pre_output->is_weight()) { + MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 1, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + } + MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 2, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + 
MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 3, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); + const Tensor *pre_cell = context->workspace()->GetTensor( + operator_def_->input(4)); + if (pre_cell->is_weight()) { + MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 4, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc index b8840bba42a1617380a764bae1431ac1e78d24fd..6568025a1a169ed856cf3df8704f635bb9824b2b 100644 --- a/mace/ops/lstmcell_benchmark.cc +++ b/mace/ops/lstmcell_benchmark.cc @@ -29,11 +29,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) { // Add input data net.AddRandomInput("Input", {batch, input_size}); - net.AddRandomInput("PreOutput", {batch, hidden_units}); + net.AddRandomInput("PreOutput", {batch, hidden_units}, true); net.AddRandomInput("Weight", {input_size + hidden_units, - 4 * hidden_units}); - net.AddRandomInput("Bias", {4 * hidden_units}); - net.AddRandomInput("PreCell", {batch, hidden_units}); + 4 * hidden_units}, true); + net.AddRandomInput("Bias", {4 * hidden_units}, true); + net.AddRandomInput("PreCell", {batch, hidden_units}, true); const float &forget_add = 0.0f; @@ -45,28 +45,17 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) { net.CopyData("PreCell", "PreCellCPU"); LSTMCellCPU(&net, "InputCPU", "PreOutputCPU", "WeightCPU", "BiasCPU", - "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); + "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "PreOutput", "PreOutputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "PreCell", "PreCellImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("LSTMCell", "LSTMCellTest") - .Input("InputImage") - .Input("PreOutputImage") - .Input("WeightImage") - .Input("BiasImage") - .Input("PreCellImage") + .Input("Input") + .Input("PreOutput") + .Input("Weight") + .Input("Bias") + .Input("PreCell") .AddFloatArg("scalar_input", forget_add) - .Output("CellImage") - .Output("OutputImage") + .Output("Cell") + .Output("Output") .Finalize(net.NewOperatorDef()); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/lstmcell_test.cc b/mace/ops/lstmcell_test.cc index 99dea59c10827832d21cdc699c6105446fc4fc7b..2d1affeeb41840d4d25d58041bb77685f60c1066 100644 --- a/mace/ops/lstmcell_test.cc +++ b/mace/ops/lstmcell_test.cc @@ -32,11 +32,11 @@ void TestLSTMCell(const uint32_t &batch, OpsTestNet net; net.AddRandomInput("Input", {batch, input_size}); - net.AddRandomInput("PreOutput", {batch, hidden_units}); + net.AddRandomInput("PreOutput", {batch, hidden_units}, true); net.AddRandomInput("Weight", {input_size + hidden_units, - 4 * hidden_units}); - net.AddRandomInput("Bias", {4 * hidden_units}); - net.AddRandomInput("PreCell", {batch, hidden_units}); + 4 * hidden_units}, true); + net.AddRandomInput("Bias", {4 * hidden_units}, true); + net.AddRandomInput("PreCell", {batch, hidden_units}, true); net.CopyData("Input", "InputCPU"); net.CopyData("PreOutput", "PreOutputCPU"); @@ -46,42 +46,25 @@ void TestLSTMCell(const uint32_t &batch, // Run on CPU LSTMCellCPU(&net, "InputCPU", "PreOutputCPU", 
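// The lstm_cell.cc hunk above applies the same idea to LSTMCell: the weight
// (input 2) and bias (input 3) are always constants and are transformed
// unconditionally, while PreOutput (input 1) and PreCell (input 4) are only
// transformed when the workspace reports them as weights, which is how the
// tests above now feed them. Condensed (<T> reconstructed):
if (context->workspace()->GetTensor(operator_def_->input(1))->is_weight()) {
  MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 1,
                                OpenCLBufferType::IN_OUT_CHANNEL, mem_type)
             == MaceStatus::MACE_SUCCESS);
}
MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 2,
                              OpenCLBufferType::IN_OUT_CHANNEL, mem_type)
           == MaceStatus::MACE_SUCCESS);
MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 3,
                              OpenCLBufferType::ARGUMENT, mem_type)
           == MaceStatus::MACE_SUCCESS);
if (context->workspace()->GetTensor(operator_def_->input(4))->is_weight()) {
  MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 4,
                                OpenCLBufferType::IN_OUT_CHANNEL, mem_type)
             == MaceStatus::MACE_SUCCESS);
}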
"WeightCPU", "BiasCPU", - "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); + "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); // Run net.RunOp(DeviceType::CPU); // Run on GPU - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "PreOutput", "PreOutputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "PreCell", "PreCellImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("LSTMCell", "LSTMCellTest") - .Input("InputImage") - .Input("PreOutputImage") - .Input("WeightImage") - .Input("BiasImage") - .Input("PreCellImage") + .Input("Input") + .Input("PreOutput") + .Input("Weight") + .Input("Bias") + .Input("PreCell") .AddFloatArg("scalar_input", forget_add) - .Output("CellImage") - .Output("OutputImage") + .Output("Cell") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - ImageToBuffer(&net, "CellImage", "Cell", - ops::BufferType::IN_OUT_CHANNEL); - - Tensor expected_cell, expected_output; expected_cell.Copy(*net.GetOutput("CellCPU")); expected_output.Copy(*net.GetOutput("OutputCPU")); diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 614788d8100ae5080642ab6202fa7b53535a5b75..411f0f16edae58548441d1fc696c3802e6e3bf20 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -31,6 +31,7 @@ #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/matmul.h" #endif // MACE_ENABLE_OPENCL @@ -351,11 +352,8 @@ class MatMulOp : public MatMulOpBase { public: explicit MatMulOp(OpConstructContext *context) : MatMulOpBase(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::MatMulKernel); - } else { - MACE_NOT_IMPLEMENTED; - } + MACE_UNUSED(context); + MACE_NOT_IMPLEMENTED; } MaceStatus Run(OpContext *context) override { Validate(); diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index 2d62d86a8690485df773616d65a122eaf7ac77fa..f118e63f4680b68f0f77bc55697cf318f729caaa 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -301,26 +301,12 @@ void MatMulBenchmark( net.GetTensor("A")->SetScale(0.1); net.GetTensor("B")->SetScale(0.1); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "A", "AImage", - ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulBM") - .Input("AImage") - .Input("BImage") - .Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("MatMul", "MatMulBM") - .Input("A") - .Input("B") - .Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("MatMul", "MatMulBM") + .Input("A") + .Input("B") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.Setup(D); if (DataTypeToEnum::value == DT_UINT8) { @@ -401,8 +387,6 @@ void MatMulTransposeBenchmark( #define MACE_BM_MATMUL_OP(N, H, C, W) \ MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ - MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ - MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU); \ MACE_BM_MATMUL_MACRO(N, H, C, W, uint8_t, CPU); #define 
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \ diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 82187b8b1903d2e1b7137be680f5ff4ab1b4e4a8..f2ed8478cdf5801c4d76827cd3e0abc699090cc9 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -36,32 +36,13 @@ void Simple(const std::vector &A_shape, net.AddInputFromArray("A", A_shape, A_value); net.AddInputFromArray("B", B_shape, B_value); - if (D == DeviceType::GPU) { - BufferToImage(&net, "A", "AImage", - ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulTest") - .Input("AImage") - .Input("BImage") - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_HEIGHT); - } else { - OpDefBuilder("MatMul", "MatMulTest") - .Input("A") - .Input("B") - .Output("Output") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + OpDefBuilder("MatMul", "MatMulTest") + .Input("A") + .Input("B") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); // Check auto expected = net.CreateTensor(C_shape, C_value); @@ -89,129 +70,6 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) { {2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64}); } -TEST_F(MatMulOpTest, SimpleOPENCL) { - Simple({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2}, - {1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64}); - Simple( - {1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, - {1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, - {1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610, - 650, 765, 830, 895, 960, 1025, 1040, 1130, 1220, - 1310, 1400, 1315, 1430, 1545, 1660, 1775}); -} - -TEST_F(MatMulOpTest, SimpleGPUWithBatch) { - Simple({2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, - {2, 3, 2}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, - {2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64}); -} - -namespace { -template -void Complex(const std::vector &batch, - const index_t height, - const index_t channels, - const index_t out_width) { - srand(time(NULL)); - - // Construct graph - OpsTestNet net; - - // Add input data - index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1, - std::multiplies()); - net.AddRandomInput("A", - {batch_count, height, channels}); - net.AddRandomInput( - "B", {batch_count, channels, out_width}); - - // Run on opencl - BufferToImage(&net, "A", "AImage", - ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulTest") - .Input("AImage") - .Input("BImage") - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - net.RunOp(DeviceType::GPU); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_HEIGHT); - - // run cpu - std::vector shape_a = batch; - shape_a.push_back(height); - shape_a.push_back(channels); - std::vector shape_b = batch; - shape_b.push_back(channels); - shape_b.push_back(out_width); - std::vector expected_output_shape = batch; - expected_output_shape.push_back(height); - expected_output_shape.push_back(out_width); - - net.GetTensor("A")->Reshape(shape_a); - net.GetTensor("B")->Reshape(shape_b); - - OpDefBuilder("MatMul", "MatMulTest") - .Input("A") - .Input("B") - .Output("Output") - .Finalize(net.NewOperatorDef()); - 
- net.RunOp(); - - // Check - EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); - - auto expected = net.CreateTensor(); - expected->Copy(*net.GetOutput("Output")); - expected->Reshape({batch_count, height, out_width}); - - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, - 1e-1); - } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5, - 1e-5); - } -} -} // namespace - -TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) { - Complex({1}, 64, 128, 32); - Complex({1}, 64, 32, 128); - Complex({2, 3}, 64, 32, 128); -} -TEST_F(MatMulOpTest, OPENCLUnAlignedWithoutBatch) { - Complex({1}, 31, 113, 61); - Complex({1}, 113, 31, 73); - Complex({2, 3}, 113, 31, 73); -} -TEST_F(MatMulOpTest, OPENCLUnAlignedWithBatch) { - Complex({2}, 3, 3, 3); - Complex({16}, 31, 61, 67); - Complex({31}, 31, 61, 67); - Complex({2, 3}, 31, 61, 67); -} -TEST_F(MatMulOpTest, OPENCLHalfAlignedWithoutBatch) { - Complex({1}, 64, 128, 32); - Complex({1}, 64, 32, 128); - Complex({2, 3}, 64, 32, 128); -} -TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) { - Complex({2}, 31, 113, 61); - Complex({16}, 32, 64, 64); - Complex({31}, 31, 61, 67); - Complex({2, 3}, 31, 61, 67); -} - namespace { void QuantOutputUint8(const std::vector &batch, const index_t height, diff --git a/mace/ops/opencl/buffer/buffer_inverse_transform.h b/mace/ops/opencl/buffer/buffer_inverse_transform.h deleted file mode 100644 index 8b05bf5f0c34e801d501b390b05f64cb4b7e29c8..0000000000000000000000000000000000000000 --- a/mace/ops/opencl/buffer/buffer_inverse_transform.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ - -#include "mace/ops/opencl/buffer_transformer.h" - -#include "mace/core/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" - -namespace mace { -namespace ops { -namespace opencl { -namespace buffer { - -MaceStatus BufferTypeTransform( - OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const DataType dt, - Tensor *output); - -template -class BufferInverseTransform: public OpenCLBufferTransformKernel { - public: - MaceStatus Compute(OpContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output) override; - private: - cl::Kernel kernel_; -}; - -template -MaceStatus BufferInverseTransform::Compute(OpContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output) { - MACE_UNUSED(type); - MACE_UNUSED(wino_blk_size); - const DataType dt = DataTypeToEnum::value; - if (input->dtype() != output->dtype()) { - return BufferTypeTransform(context, &kernel_, input, dt, output); - } else { - SetFutureDefaultWaitFn(context->future()); - output->ReuseTensorBuffer(*input); - return MaceStatus::MACE_SUCCESS; - } -} - -} // namespace buffer -} // namespace opencl -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc index 29f467e2d0d3292508eb5fa4997492b61176642f..9ba3f81d1e7b59bd1c7b0b015616da1cec775ac7 100644 --- a/mace/ops/opencl/buffer/buffer_transform.cc +++ b/mace/ops/opencl/buffer/buffer_transform.cc @@ -91,8 +91,6 @@ MaceStatus TransformConv2DFilter( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION - // Mark the buffer unused. - const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } @@ -159,8 +157,6 @@ MaceStatus TransformDWConv2DFilter( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION - // Mark the buffer unused. - const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } @@ -230,8 +226,6 @@ MaceStatus TransformArgument( } }; } - // Mark the buffer unused. 
- const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h index c9e31cfa04432d3b2758a13993e0850224d7cf43..7f9eae2125be87790151a26f404cb4119890ecd2 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -63,7 +63,7 @@ class BufferTransform: public OpenCLBufferTransformKernel { MaceStatus Compute( OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) override; @@ -75,7 +75,7 @@ class BufferTransform: public OpenCLBufferTransformKernel { template MaceStatus BufferTransform::Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) { MACE_UNUSED(type); @@ -92,8 +92,8 @@ MaceStatus BufferTransform::Compute(OpContext *context, if (input->dtype() != dt) { return BufferTypeTransform(context, &kernel_, input, dt, output); } else { - SetFutureDefaultWaitFn(context->future()); - output->ReuseTensorBuffer(*input); + LOG(FATAL) << "Should not reach here. " << input->name() + << "<" << type << "> to " << output->name(); return MaceStatus::MACE_SUCCESS; } } diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc index d1d52fe4152e7033755517c958cff35b659eebfd..ce405e9f3da2865c4a2547389f15cdb9434f6996 100644 --- a/mace/ops/opencl/buffer/buffer_type_transform.cc +++ b/mace/ops/opencl/buffer/buffer_type_transform.cc @@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform( } }; } - // Mark the buffer unused. - const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index e50d8e5c2ad77b1c4d64ce371f5f6770a4f562ee..dca574047aa79575cd9c7b6b2cabc18f779cb330 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -62,6 +62,14 @@ class Conv2dKernel : public OpenCLConv2dKernel { public: Conv2dKernel() : old_scratch_size_(0) {} + bool CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) override; + MaceStatus Compute( OpContext *context, const Tensor *input, @@ -73,6 +81,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, + const int winograd_blk_size, Tensor *output) override; private: @@ -82,6 +91,23 @@ class Conv2dKernel : public OpenCLConv2dKernel { std::vector input_shape_; }; + +template +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) { + MACE_UNUSED(runtime); + MACE_UNUSED(output_shape); + MACE_UNUSED(wino_block_size); + return (filter_shape[2] == 3 && filter_shape[3] == 3 && + strides[0] == 1 && strides[1] == 1 && + dilations[0] == 1 && dilations[1] == 1); +} + template MaceStatus Conv2dKernel::Compute( OpContext *context, @@ -94,7 +120,9 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, + const int winograd_blk_size, Tensor *output) { + MACE_UNUSED(winograd_blk_size); StatsFuture pad_future, conv_future; index_t filter_h = filter->dim(2); index_t filter_w = filter->dim(3); 
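// --- Hypothetical call-site sketch, not part of this patch: how a conv op
// could consult the new OpenCLConv2dKernel::CheckUseWinograd() before calling
// Compute(). The member names kernel_/strides_/dilations_, the preferred
// block size of 4, and setting 0 on failure are assumptions of this sketch;
// a non-zero block size is what selects the winograd path in the image
// kernel's Compute(). ---
int wino_blk_size = 4;  // preferred block size; the image kernel may lower it to 2
if (!kernel_->CheckUseWinograd(context->device()->opencl_runtime(),
                               filter->shape(),
                               output_shape,
                               strides_.data(),
                               dilations_.data(),
                               &wino_blk_size)) {
  wino_blk_size = 0;  // fall back to the direct 1x1 / 3x3 / general kernels
}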
diff --git a/mace/ops/opencl/buffer_transform_kernel.h b/mace/ops/opencl/buffer_transform_kernel.h index 5d4ff09448cfee8f70af71f2365e43525a9e3087..83159eeaa29db37162981d4752c79adc848be20c 100644 --- a/mace/ops/opencl/buffer_transform_kernel.h +++ b/mace/ops/opencl/buffer_transform_kernel.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ #define MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ -#include "mace/ops/opencl/common.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" @@ -27,10 +27,10 @@ class OpenCLBufferTransformKernel { public: virtual MaceStatus Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) }; } // namespace ops } // namespace mace diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc index e3b1b67b417b83879b1949874afa2624795d31f0..1176df1303cfb552aa1e880d855dfb9065e2d245 100644 --- a/mace/ops/opencl/buffer_transformer.cc +++ b/mace/ops/opencl/buffer_transformer.cc @@ -17,7 +17,7 @@ namespace mace { namespace ops { -std::string TransformedName(const std::string &name) { +std::string TransformedFilterName(const std::string &name) { // TODO(liuqi): This may create a conflict. const char *postfix = "_mace_identity_transformed"; return name + postfix; diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index 78f82177ee3fd091cf55bfcd79a815f1ebaa925d..7acc39a90d7ffb7c89f7d3407402cd27ab19efb6 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -15,11 +15,15 @@ #ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ #define MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ +#include +#include +#include + #include "mace/core/operator.h" -#include "mace/ops/opencl/common.h" #include "mace/ops/opencl/image/buffer_to_image.h" #include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/buffer/buffer_transform.h" +#include "mace/ops/transpose.h" namespace mace { namespace ops { @@ -28,10 +32,10 @@ template class OpenCLBufferTransformer { public: OpenCLBufferTransformer(const MemoryType in_mem_type, - const MemoryType out_mem_type) { + const MemoryType out_mem_type) { if (out_mem_type == MemoryType::GPU_IMAGE) { kernel_.reset(new opencl::image::BufferToImage); - } else if (in_mem_type == MemoryType::GPU_IMAGE){ + } else if (in_mem_type == MemoryType::GPU_IMAGE) { kernel_.reset(new opencl::image::ImageToBuffer); } else { kernel_.reset(new opencl::buffer::BufferTransform); @@ -40,9 +44,9 @@ class OpenCLBufferTransformer { MaceStatus Transform(OpContext *context, const Tensor *input, - const BufferType type, - const int wino_blk_size, + const OpenCLBufferType type, const MemoryType out_mem_type, + const int wino_blk_size, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -54,39 +58,81 @@ class OpenCLBufferTransformer { context, input, type, wino_blk_size, output); } else { // convert to the GPU Buffer with the input's data type. + // 1. 
CPU buffer to GPU Buffer Tensor *internal_tensor = ws->CreateTensor( InternalTransformedName(input->name()), context->device()->allocator(), input->dtype()); - output->Resize(input->shape()); - const uint8_t *input_ptr = input->data(); - Tensor::MappingGuard guard(internal_tensor); - uint8_t *internal_ptr = internal_tensor->mutable_data(); - memcpy(internal_ptr, input_ptr, input->raw_size()); - // convert the internal GPU Buffer to output. + VLOG(2) << "Transform CPU Buffer " << input->name() + << " to GPU Buffer " << internal_tensor->name() + << " with data type " << dt; + if (input->shape().size() == 4) { + // 1. (NCHW -> NHWC) + std::vector dst_dims = {0, 2, 3, 1}; + std::vector output_shape = + TransposeShape(input->shape(), + dst_dims); + internal_tensor->Resize(output_shape); + // TODO(liuqi): Only support float now + const float *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + float *internal_ptr = internal_tensor->mutable_data(); + MACE_RETURN_IF_ERROR(ops::Transpose(input_ptr, + input->shape(), + dst_dims, + internal_ptr)); + } else { + internal_tensor->Resize(input->shape()); + const uint8_t *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + uint8_t *internal_ptr = internal_tensor->mutable_data(); + memcpy(internal_ptr, input_ptr, input->raw_size()); + } + // 2. convert the internal GPU Buffer to output. return kernel_->Compute( context, internal_tensor, type, wino_blk_size, output); } - } else { // out_mem_type == MemoryType::CPU_BUFFER - // convert to the GPU Buffer with the output's data type. + } else if (out_mem_type == MemoryType::CPU_BUFFER) { + // 1. convert to the GPU Buffer with the output's data type. Tensor internal_tensor(context->device()->allocator(), dt, false, InternalTransformedName(input->name())); MACE_RETURN_IF_ERROR(kernel_->Compute( context, input, type, wino_blk_size, &internal_tensor)); - // convert the internal GPU Buffer to output. - Tensor::MappingGuard guard(&internal_tensor); - const T *internal_ptr = internal_tensor.data(); - output->Resize(internal_tensor.shape()); - T *output_ptr = output->mutable_data(); - memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + // 2. convert the internal GPU Buffer to output. + VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() + << " to CPU Buffer " << output->name() + << " with data type " << dt; + if (internal_tensor.shape().size() == 4) { + // NHWC -> NCHW + std::vector dst_dims = {0, 3, 1, 2}; + std::vector output_shape = + TransposeShape(internal_tensor.shape(), + dst_dims); + Tensor::MappingGuard guard(&internal_tensor); + const float *internal_ptr = internal_tensor.data(); + output->Resize(output_shape); + float *output_ptr = output->mutable_data(); + return ops::Transpose(internal_ptr, + internal_tensor.shape(), + dst_dims, + output_ptr); + } else { + Tensor::MappingGuard guard(&internal_tensor); + const T *internal_ptr = internal_tensor.data(); + output->Resize(internal_tensor.shape()); + T *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + return MaceStatus::MACE_SUCCESS; + } + } else { + LOG(FATAL) << "Unexpected error: " << out_mem_type; return MaceStatus::MACE_SUCCESS; } } private: std::string InternalTransformedName(const std::string &name) { - // TODO(liuqi): This may create a conflict. 
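// --- Illustrative sketch, not part of this patch: the effect of the
// {0, 2, 3, 1} permutation used by Transform() when copying a CPU buffer to a
// GPU buffer. The assumption here is that TransposeShape()/Transpose() place
// input dimension dst_dims[i] at output dimension i. ---
const std::vector<index_t> nchw_shape = {1, 32, 56, 48};  // N, C, H, W
const std::vector<int> to_nhwc = {0, 2, 3, 1};
std::vector<index_t> nhwc_shape(4);
for (size_t i = 0; i < 4; ++i) {
  nhwc_shape[i] = nchw_shape[to_nhwc[i]];                 // {1, 56, 48, 32}
}
// The inverse permutation {0, 3, 1, 2} used on the GPU-to-CPU path maps the
// NHWC shape back to NCHW.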
const char *postfix = "_mace_identity_internal"; return name + postfix; } @@ -95,29 +141,31 @@ class OpenCLBufferTransformer { std::unique_ptr kernel_; }; -std::string TransformedName(const std::string &name); +std::string TransformedFilterName(const std::string &name); template MaceStatus TransformFilter( mace::OpConstructContext *context, OperatorDef *op_def, const int input_idx, - const BufferType buffer_type, - const MemoryType mem_type) { + const OpenCLBufferType buffer_type, + const MemoryType mem_type, + const int wino_blk_size = 0) { const DataType dt = DataTypeToEnum::value; OpContext op_context(context->workspace(), context->device()); Workspace *ws = context->workspace(); std::string input_name = op_def->input(input_idx); Tensor *input = ws->GetTensor(input_name); - std::string output_name = TransformedName(input_name); + std::string output_name = TransformedFilterName(input_name); Tensor *output = - ws->CreateTensor(output_name, context->device()->allocator(), dt); + ws->CreateTensor(output_name, context->device()->allocator(), dt, true); // update the information op_def->set_input(input_idx, output_name); input->MarkUnused(); return OpenCLBufferTransformer(input->memory_type(), mem_type). - Transform(&op_context, input, buffer_type, 0, mem_type, output); + Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, + output); } } // namespace ops diff --git a/mace/ops/opencl/conv_2d.h b/mace/ops/opencl/conv_2d.h index cf0911f79ee6cff726383f804a590bf42ed2b229..03f2cd49861fec380d2effc6fc88a2e8e6d580de 100644 --- a/mace/ops/opencl/conv_2d.h +++ b/mace/ops/opencl/conv_2d.h @@ -26,6 +26,14 @@ class OpContext; namespace ops { class OpenCLConv2dKernel { public: + virtual bool CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) = 0; + virtual MaceStatus Compute( OpContext *context, const Tensor *input, @@ -37,6 +45,7 @@ class OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, + const int winograd_blk_size, Tensor *output) = 0; MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConv2dKernel); }; diff --git a/mace/ops/opencl/depthwise_deconv2d.h b/mace/ops/opencl/depthwise_deconv2d.h index 994c98a21f012dd6dbc72f8c39ad0ef0e2e839d3..4238f0d279e746ab5a7efb564878181b68ddae8e 100644 --- a/mace/ops/opencl/depthwise_deconv2d.h +++ b/mace/ops/opencl/depthwise_deconv2d.h @@ -15,6 +15,7 @@ #ifndef MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ #define MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ +#include #include #include "mace/ops/activation.h" diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 7eb392a8ea05569ba266c660f89328325fdb3c8e..11487b14446c08b3a086c2e8a0284f8ec28ccf24 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/ops/opencl/helper.cc @@ -24,136 +24,9 @@ namespace mace { namespace ops { -namespace { -// [(C + 3) / 4 * W, N * H] -void CalInOutputImageShape(const std::vector &shape, /* NHWC */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2]; - (*image_shape)[1] = shape[0] * shape[1]; -} - -// [Ic, H * W * (Oc + 3) / 4] -void CalConv2dFilterImageShape(const std::vector &shape, /* OIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[1]; - (*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]); -} - -// [H * W * M, (Ic + 3) / 4] 
-void CalDepthwiseConv2dFilterImageShape( - const std::vector &shape, /* MIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[0] * shape[2] * shape[3]; - (*image_shape)[1] = RoundUpDiv4(shape[1]); -} - -// [(size + 3) / 4, 1] -void CalArgImageShape(const std::vector &shape, - std::vector *image_shape) { - MACE_CHECK(shape.size() == 1); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[0]); - (*image_shape)[1] = 1; -} - -// Only support 3x3 now -// [ (Ic + 3) / 4, 16 * Oc] -void CalWinogradFilterImageShape( - const std::vector &shape, /* Oc, Ic, H, W*/ - std::vector *image_shape, - const int blk_size) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[1]); - (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2)); -} - - -// [W * C, N * RoundUp<4>(H)] -void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[2] * shape[3]; - (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]); -} - -// [RoundUp<4>(W) * C, N * H] -void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3]; - (*image_shape)[1] = shape[0] * shape[1]; -} - -// [Ic * H * W, (Oc + 3) / 4] -void CalWeightHeightImageShape(const std::vector &shape, /* OIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[1] * shape[2] * shape[3]; - (*image_shape)[1] = RoundUpDiv4(shape[0]); -} - -// [(Ic + 3) / 4 * H * W, Oc] -void CalWeightWidthImageShape(const std::vector &shape, /* OIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3]; - (*image_shape)[1] = shape[0]; -} -} // namespace - -void CalImage2DShape(const std::vector &shape, /* NHWC */ - const BufferType type, - std::vector *image_shape, - const int wino_block_size) { - MACE_CHECK_NOTNULL(image_shape); - switch (type) { - case CONV2D_FILTER: - CalConv2dFilterImageShape(shape, image_shape); - break; - case DW_CONV2D_FILTER: - CalDepthwiseConv2dFilterImageShape(shape, image_shape); - break; - case IN_OUT_CHANNEL: - CalInOutputImageShape(shape, image_shape); - break; - case ARGUMENT: - CalArgImageShape(shape, image_shape); - break; - case IN_OUT_HEIGHT: - CalInOutHeightImageShape(shape, image_shape); - break; - case IN_OUT_WIDTH: - CalInOutWidthImageShape(shape, image_shape); - break; - case WINOGRAD_FILTER: - CalWinogradFilterImageShape(shape, image_shape, wino_block_size); - break; - case WEIGHT_HEIGHT: - CalWeightHeightImageShape(shape, image_shape); - break; - case WEIGHT_WIDTH: - CalWeightWidthImageShape(shape, image_shape); - break; - default: - LOG(FATAL) << "Mace not supported yet."; - } -} - std::vector FormatBufferShape( const std::vector &buffer_shape, - const BufferType type) { - + const OpenCLBufferType type) { const size_t buffer_shape_size = buffer_shape.size(); switch (type) { case IN_OUT_CHANNEL: diff --git a/mace/ops/opencl/helper.h b/mace/ops/opencl/helper.h index d4b5aa51a0f6a53da15e4862135bf6d0b6fd721e..e2f51a43d7dab565067d1f8bf450fc3a97f060c8 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/ops/opencl/helper.h @@ -24,8 +24,8 @@ #include "mace/core/macros.h" #include 
"mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/types.h" -#include "mace/ops/opencl/common.h" #include "mace/utils/utils.h" namespace mace { @@ -92,14 +92,9 @@ const float kMaxKernelExecTime = 1000.0; // microseconds // Base GPU cache size used for computing local work group size. const int32_t kBaseGPUMemCacheSize = 16384; -void CalImage2DShape(const std::vector &shape, /* NHWC */ - const BufferType type, - std::vector *image_shape, - const int wino_blk_size = 2); - std::vector FormatBufferShape( const std::vector &buffer_shape, - const BufferType type); + const OpenCLBufferType type); // CPU data type to OpenCL command data type std::string DtToCLCMDDt(const DataType dt); diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h index bde9c6b06ff8bc7bfcfc63fcd5cd324d7f23cb83..7692ac06b8e281295381b7ecf77d446784988859 100644 --- a/mace/ops/opencl/image/addn.h +++ b/mace/ops/opencl/image/addn.h @@ -101,8 +101,8 @@ MaceStatus AddNKernel::Compute( MACE_OUT_OF_RANGE_INIT(kernel_); if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR( output_tensor->ResizeImage(output_shape, output_image_shape)); diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h index b92c9a4eea17bfb1ea5df710de9485c1e5293b7c..9d91802627c840538b70d5a4f994d3ca572e8504 100644 --- a/mace/ops/opencl/image/batch_to_space.h +++ b/mace/ops/opencl/image/batch_to_space.h @@ -56,8 +56,8 @@ MaceStatus BatchToSpaceKernel::Compute( const std::vector &output_shape, Tensor *space_tensor) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR( space_tensor->ResizeImage(output_shape, output_image_shape)); diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index e84691f85d1149f5cc87cbc6659b80ae786f2c71..14a0ae4b3e474eb464580701446346248f5d1982 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -36,7 +36,7 @@ class BufferToImage : public OpenCLBufferTransformKernel { MaceStatus Compute( OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) override; @@ -49,20 +49,16 @@ template MaceStatus BufferToImage::Compute( OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; - CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); - if (type == WINOGRAD_FILTER) { - std::vector new_shape = - {(wino_blk_size + 2) * (wino_blk_size + 2), - input->dim(0), input->dim(1)}; - MACE_RETURN_IF_ERROR(output->ResizeImage(new_shape, image_shape)); - } else { - MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); - } + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); + MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); 
uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; @@ -196,9 +192,6 @@ MaceStatus BufferToImage::Compute( }; } - // Mark the buffer unused. - const_cast(input)->MarkUnused(); - return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h index f12ad25cddecaa85989921445f8e0c258c83989a..c7f5e099168f43182cdb9e7bb39ac9df0dbdaeb6 100644 --- a/mace/ops/opencl/image/concat.h +++ b/mace/ops/opencl/image/concat.h @@ -92,7 +92,9 @@ MaceStatus ConcatKernel::Compute( inputs_count == 2 || divisible_four, "Dimensions of inputs should be divisible by 4 when inputs_count > 2."); std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); switch (inputs_count) { diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h index 224432e894fe7e25f873ac45a725ea2e8de13571..51c9d1dfe8a6f9ddbf2fccbae600576d536c5301 100644 --- a/mace/ops/opencl/image/conv_2d.h +++ b/mace/ops/opencl/image/conv_2d.h @@ -28,55 +28,76 @@ namespace ops { namespace opencl { namespace image { -extern MaceStatus Conv2dOpenclK1x1(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size); - -extern MaceStatus Conv2dOpenclK3x3(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size); - -extern MaceStatus Conv2dOpencl(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size); +extern MaceStatus Conv2dK1x1(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size); +extern MaceStatus Conv2dK3x3(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size); + +extern MaceStatus Conv2d(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size); + +extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, + 
cl::Kernel *kernels[3], + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *padding, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + const int wino_blk_size, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size[3]); template class Conv2dKernel : public OpenCLConv2dKernel { public: + bool CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) override; + MaceStatus Compute( OpContext *context, const Tensor *input, @@ -88,14 +109,54 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, + const int wino_blk_size, Tensor *output) override; private: - cl::Kernel kernel_; - uint32_t kwg_size_; + cl::Kernel kernels_[3]; + uint32_t kwg_size_[3]; std::vector input_shape_; }; +template +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_blk_size) { + if (filter_shape[2] != 3 || filter_shape[3] != 3 || + strides[0] > 1 || strides[1] > 1 || + dilations[0] > 1 || dilations[1] > 1) { + return false; + } + index_t out_channels = filter_shape[0]; + index_t in_channels = filter_shape[1]; + auto opencl_image_max_size = runtime->GetMaxImage2DSize(); + auto check_opencl_limit = [&](int block_size) -> bool { + int sqr_block = (block_size + 2) * (block_size + 2); + uint64_t transformed_width = static_cast(output_shape[0] * + ((output_shape[1] + block_size - 1) / block_size) * + ((output_shape[2] + block_size - 1) / block_size)); + return (transformed_width < opencl_image_max_size[0] && + static_cast(sqr_block * in_channels) + < opencl_image_max_size[1] && + static_cast(sqr_block * out_channels) + < opencl_image_max_size[1]); + }; + // GPU only supports 4x4 and 2x2 gpu winograd convolution + if (*wino_blk_size == 4) { + // if block size == 4 exceed OpenCL image size limitation, fallback to 2 + if (!check_opencl_limit(4)) { + *wino_blk_size = 2; + } else { + return true; + } + } + return check_opencl_limit(2); +} + template MaceStatus Conv2dKernel::Compute( OpContext *context, @@ -108,19 +169,8 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, + const int wino_blk_size, Tensor *output) { - typedef MaceStatus (*Conv2dOpenclFunction)( - OpContext *context, - cl::Kernel *kernel, const Tensor *input, const Tensor *filter, - const Tensor *bias, const int stride, const int *padding, - const int *dilations, const ActivationType activation, - const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, - uint32_t *kwg_size); - // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[3] = { - Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3}; - index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); if (strides[0] != strides[1] || @@ -148,24 +198,85 @@ MaceStatus Conv2dKernel::Compute( } std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - if (kernel_h == kernel_w && kernel_h <= 3 && - selector[kernel_h - 
1] != nullptr) { - auto conv2d_func = selector[kernel_h - 1]; - return conv2d_func(context, - &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, - activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, &kwg_size_); + std::function conv_func; + + if (wino_blk_size != 0) { + // use winograd covolution + conv_func = [&]() -> MaceStatus { + cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]}; + uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]}; + return WinogradConv2dK3x3S1(context, + kernels, + input, + filter, + bias, + paddings.data(), + activation, + relux_max_limit, + DataTypeToEnum::value, + wino_blk_size, + &input_shape_, + output, + kwg_size); + }; + } else if (kernel_h == 1 && kernel_w == 1) { + conv_func = [&]() -> MaceStatus { + return Conv2dK1x1(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + DataTypeToEnum::value, + &input_shape_, + output, + &kwg_size_[0]); + }; + } else if (kernel_h == 3 && kernel_w == 3) { + conv_func = [&]() -> MaceStatus { + return Conv2dK3x3(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + DataTypeToEnum::value, + &input_shape_, + output, + &kwg_size_[0]); + }; } else { - return Conv2dOpencl( - context, &kernel_, input, filter, bias, - strides[0], paddings.data(), dilations, - activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, &kwg_size_); + conv_func = [&]() -> MaceStatus { + return Conv2d(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + DataTypeToEnum::value, + &input_shape_, + output, + &kwg_size_[0]); + }; } + + return conv_func(); } } // namespace image diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc index 74a7ddc9ace77bb5b2abfa2608cdb8aee35ea842..f88882ee645814f81d13bef5cd80ef9ebcb5092f 100644 --- a/mace/ops/opencl/image/conv_2d_1x1.cc +++ b/mace/ops/opencl/image/conv_2d_1x1.cc @@ -66,20 +66,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK1x1(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +extern MaceStatus Conv2dK1x1(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index 42a2a81ef3077197b0752b818cc6a34f48f6a233..3e5aee909c89bbed8e94488c5d38d8be3f93615d 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -59,20 +59,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK3x3(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const 
int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +extern MaceStatus Conv2dK3x3(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 9b577c2b08f1b2cfd1bb90b266b8cb45b9e72f5a..120a3daa3067d91118c101e8b95798f7bde84a1d 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -67,20 +67,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpencl(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +extern MaceStatus Conv2d(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index 95a989a1b901b73712efc4db463b403f85817b66..c8f98a4ca7a2f2cdf8ba96135444e31e25ed1867 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -129,7 +129,9 @@ MaceStatus CropKernel::Compute( << input1->dim(i) << "and offset" << offsets[i]; } std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index a0f51874dc2b1de7bbd3d7cd8be9dad8328be49e..f3d6cbe92049380634540ae94419b96a2a1444e1 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -64,8 +64,8 @@ MaceStatus Deconv2dKernel::Compute( const std::vector &output_shape, Tensor *output) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); const DataType dt = DataTypeToEnum::value; const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index 2ed253df28dc47c63bbb47f1009f8b2a70f80f74..77c4bd53dfc661fd23381d9e8ebac3cf33c15017 100644 --- 
a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -77,7 +77,9 @@ MaceStatus DepthToSpaceKernel::Compute( output_width, output_depth}; std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); const uint32_t gws[3] = { diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h index e818b039caac7e8a207fde5aef88c01bbe4f9bd7..c4ee3cb79ea54424938206cb16b2ec63a54c8cc9 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.h +++ b/mace/ops/opencl/image/depthwise_conv2d.h @@ -112,8 +112,8 @@ MaceStatus DepthwiseConv2dKernel::Compute( } std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); return depthwise::DepthwiseConv2d( diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h index 040c349d4382864a100060b539e4a323529963ee..96fdfa51e110395f3028003f3058a029765519f5 100644 --- a/mace/ops/opencl/image/depthwise_deconv2d.h +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -76,8 +76,8 @@ MaceStatus DepthwiseDeconv2dKernel::Compute( "opencl image deconv only supports depthwise type group."); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); const DataType dt = DataTypeToEnum::value; diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index 25235a442ffb0a5cab8ca90ddc29a8fb9caead88..2afb334233731307582d83ea77d2ec1ad77ce661 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -101,8 +101,8 @@ MaceStatus EltwiseKernel::Compute( output_shape[3] = input0->dim(3); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h index 2d8fbb88ccf4abbfe46c9a23056af611ec59bc6a..962ffaf082ca93e1f6129fa2f5d123c0e3454603 100644 --- a/mace/ops/opencl/image/fully_connected.h +++ b/mace/ops/opencl/image/fully_connected.h @@ -60,8 +60,8 @@ MaceStatus FullyConnectedKernel::Compute( Tensor *output) { std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index 4200087eeeb6052e2e36a1f63e9ce373dd773cd6..6ca73fa6af9b8a39c43d6586d9167ca8655d6ffa 100644 --- 
a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -33,7 +33,7 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) override; @@ -45,12 +45,15 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { template MaceStatus ImageToBuffer::Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; - CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); MACE_RETURN_IF_ERROR(output->Resize(input->shape())); uint32_t gws[2] = {static_cast(image_shape[0]), diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h index 967c4bf4c101e31f0f88413e216e82bf87d804ae..546b4a792de1c892a3fd9d6c0e11f255b9cb7501 100644 --- a/mace/ops/opencl/image/lstm_cell.h +++ b/mace/ops/opencl/image/lstm_cell.h @@ -98,8 +98,9 @@ MaceStatus LSTMCellKernel::Compute( if (!IsVecEqual(input_shape_, input->shape())) { std::vector output_shape_padded = {height, 1, 1, hidden_units}; std::vector output_image_shape; - CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape_padded, + OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), output_image_shape)); MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h index 899df5a5a8899cade7608e1895208113c4c26d00..763082f610f5b4a115a76fc55be08c459a278d14 100644 --- a/mace/ops/opencl/image/matmul.h +++ b/mace/ops/opencl/image/matmul.h @@ -70,7 +70,9 @@ MaceStatus MatMulKernel::Compute( c_shape[rank - 1] = width; std::vector c_image_shape; std::vector padded_c_shape = {batch, height, width, 1}; - CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); + OpenCLUtil::CalImage2DShape(padded_c_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &c_image_shape); MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); const index_t height_blocks = RoundUpDiv4(height); diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h index c96d964a5623f68f1df7d441ec61ff675d218296..cb0c390b667a46329ab4f9728caeea10f1eea0c7 100644 --- a/mace/ops/opencl/image/pad.h +++ b/mace/ops/opencl/image/pad.h @@ -68,7 +68,9 @@ MaceStatus PadKernel::Compute( input_shape[3] + this->paddings_[6] + this->paddings_[7]}; std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h index 1384b54b812e85bb20d75aa9f25a9dbcb257f44d..f246efa426618e9c197f30d253e23338bd11f73d 100644 --- a/mace/ops/opencl/image/pooling.h +++ b/mace/ops/opencl/image/pooling.h @@ -108,8 +108,8 @@ MaceStatus PoolingKernel::Compute( } std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + 
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/reduce_mean.h b/mace/ops/opencl/image/reduce_mean.h index 953742cbbec2e24f257f28d4684a80729cadf9ac..95b51d86f883338fd0e4e57952edfd5965f85a61 100644 --- a/mace/ops/opencl/image/reduce_mean.h +++ b/mace/ops/opencl/image/reduce_mean.h @@ -72,8 +72,8 @@ MaceStatus ReduceMeanKernel::Compute( std::vector lws(3); std::vector output_shape{batch, 1, 1, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h index e801e59f35c3514b7d02ff1173899eb9a2466a4e..bf5bfcf1921254c3939f77a5f3dc7711ea780289 100644 --- a/mace/ops/opencl/image/resize_bicubic.h +++ b/mace/ops/opencl/image/resize_bicubic.h @@ -133,8 +133,8 @@ MaceStatus ResizeBicubicKernel::Compute( std::vector output_shape{batch, out_height, out_width, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h index 7af9a5f60a0b69775923beebc71bc2e7276983cc..b3f1b09c6ee08f356f328e9e729c573abd5bb4e4 100644 --- a/mace/ops/opencl/image/resize_bilinear.h +++ b/mace/ops/opencl/image/resize_bilinear.h @@ -134,8 +134,8 @@ MaceStatus ResizeBilinearKernel::Compute( std::vector output_shape{batch, out_height, out_width, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h index 9924f02fdc82fe30527a7c958341f9cd3f2c5540..f2baaba48259da64f2f8ed18620da37edd154245 100644 --- a/mace/ops/opencl/image/space_to_batch.h +++ b/mace/ops/opencl/image/space_to_batch.h @@ -56,8 +56,8 @@ MaceStatus SpaceToBatchKernel::Compute( const std::vector &output_shape, Tensor *batch_tensor) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR( batch_tensor->ResizeImage(output_shape, output_image_shape)); const char *kernel_name = "space_to_batch"; diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h index 961d16066097d9f5448c2d3c61752cb97695e316..e225b37693377acf57f2d91b17cc3269bc8a20a3 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -74,7 +74,9 @@ MaceStatus SpaceToDepthKernel::Compute( output_depth}; std::vector image_shape; - 
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 12286a6d737eff94f96ec4d3194b8d2bc5a36d6f..7b7f790597f4daba916a0ab2cc1d103fdf11df26 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -34,7 +34,9 @@ namespace image { template class SplitKernel : public OpenCLSplitKernel { public: - explicit SplitKernel(const int32_t axis) : axis_(axis) {} + explicit SplitKernel(const int32_t axis) : axis_(axis) { + MACE_CHECK(axis == 3) << "GPU only support channel-dimension split"; + } MaceStatus Compute( OpContext *context, const Tensor *input, @@ -60,7 +62,9 @@ MaceStatus SplitKernel::Compute( {input->dim(0), input->dim(1), input->dim(2), output_channels}); std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); for (size_t i = 0; i < outputs_count; ++i) { MACE_RETURN_IF_ERROR( output_list[i]->ResizeImage(output_shape, image_shape)); diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index d356b89859ee9a9c24541a1270f919f188be62eb..d0c217fe450018d038e2d617fe4bdf5e6c4ba5de 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -68,8 +68,8 @@ MaceStatus SqrDiffMeanKernel::Compute( std::vector lws(3); std::vector output_shape{batch, 1, 1, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9bd717155b7cfed5f5a6cac32a64d57fad63545 --- /dev/null +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -0,0 +1,350 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/op_context.h" +#include "mace/ops/activation.h" +#include "mace/ops/conv_pool_2d_util.h" +#include "mace/ops/opencl/helper.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +namespace { +MaceStatus WinogradInputTransform(OpContext *context, + cl::Kernel *kernel, + const Tensor *input_tensor, + const DataType dt, + const int *paddings, + const index_t round_h, + const index_t round_w, + const int wino_blk_size, + const bool input_changed, + Tensor *output_tensor, + uint32_t *kwg_size, + StatsFuture *future) { + OpenCLRuntime *runtime = context->device()->opencl_runtime(); + const index_t out_width = output_tensor->dim(2); + + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel->get() == nullptr) { + std::string obfuscated_kernel_name; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + if (wino_blk_size == 4) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); + built_options.emplace("-Dwinograd_transform_4x4=" + + obfuscated_kernel_name); + } else if (wino_blk_size == 2) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); + built_options.emplace("-Dwinograd_transform_2x2=" + + obfuscated_kernel_name); + } else { + MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); + return MaceStatus::MACE_SUCCESS; + } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + kernel)); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const uint32_t gws[2] = { + static_cast(out_width), + static_cast(RoundUpDiv4(input_tensor->dim(3))) + }; + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_2D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(input_tensor->opencl_image())); + kernel->setArg(idx++, *(output_tensor->opencl_image())); + kernel->setArg(idx++, static_cast(input_tensor->dim(1))); + kernel->setArg(idx++, static_cast(input_tensor->dim(2))); + kernel->setArg(idx++, static_cast(input_tensor->dim(3))); + kernel->setArg(idx++, static_cast(round_h * round_w)); + kernel->setArg(idx++, static_cast(round_w)); + kernel->setArg(idx++, static_cast(paddings[0] / 2)); + kernel->setArg(idx++, static_cast(paddings[1] / 2)); + } + + + const std::vector lws = {*kwg_size / 8, 8, 0}; + std::string tuning_key = Concat("winograd_transform_kernel", + output_tensor->dim(0), + output_tensor->dim(1), + output_tensor->dim(2)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus WinogradOutputTransform(OpContext *context, + cl::Kernel *kernel, + const Tensor *input_tensor, + const Tensor *bias, + const DataType dt, + const index_t round_h, + const index_t round_w, + const int wino_blk_size, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output_tensor, + uint32_t *kwg_size, + StatsFuture *future) { + OpenCLRuntime *runtime = context->device()->opencl_runtime(); + auto &output_shape = output_tensor->shape(); + + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel->get() == nullptr) { + std::string 
obfuscated_kernel_name; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + if (wino_blk_size == 4) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); + built_options.emplace("-Dwinograd_inverse_transform_4x4=" + + obfuscated_kernel_name); + } else if (wino_blk_size == 2) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); + built_options.emplace("-Dwinograd_inverse_transform_2x2=" + + obfuscated_kernel_name); + } else { + MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); + return MaceStatus::MACE_SUCCESS; + } + + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + kernel)); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const uint32_t gws[2] = { + static_cast(input_tensor->dim(2)), + static_cast(RoundUpDiv4(input_tensor->dim(1)))}; + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_2D_GWS_ARGS(*kernel, gws); + kernel->setArg( + idx++, + *(static_cast(input_tensor->opencl_image()))); + if (bias != nullptr) { + kernel->setArg(idx++, + *(static_cast(bias->opencl_image()))); + } + kernel->setArg( + idx++, *(static_cast(output_tensor->opencl_image()))); + kernel->setArg(idx++, static_cast(output_shape[1])); + kernel->setArg(idx++, static_cast(output_shape[2])); + kernel->setArg(idx++, static_cast(round_h * round_w)); + kernel->setArg(idx++, static_cast(round_w)); + kernel->setArg(idx++, relux_max_limit); + } + const std::vector lws = {*kwg_size / 8, 8, 0}; + std::string tuning_key = + Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), + output_tensor->dim(1), output_tensor->dim(2), + output_tensor->dim(3), input_tensor->dim(2)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} +} // namespace + + +extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, + cl::Kernel *kernels[3], + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *paddings, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + const int wino_blk_size, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size[3]) { + OpenCLRuntime *runtime = context->device()->opencl_runtime(); + ScratchImageManager *scratch_manager = runtime->scratch_image_manager(); + StatsFuture t_input_future, mm_future, t_output_future; + bool input_changed = !IsVecEqual(*prev_input_shape, input->shape()); + *prev_input_shape = input->shape(); + + auto output_shape = output->shape(); + const index_t round_h = + (output_shape[1] + wino_blk_size - 1) / wino_blk_size; + const index_t round_w = + (output_shape[2] + wino_blk_size - 1) / 
wino_blk_size; + const index_t out_width = input->dim(0) * round_h * round_w; + + const index_t blk_sqr = (wino_blk_size + 2) * (wino_blk_size + 2); + + index_t in_channel = input->dim(3); + index_t out_channel = output->dim(3); + + // 0. transform input + // input(NHWC) -> t_input(blk_sqr, in_channel, out_width) + std::vector t_input_shape = + {blk_sqr, in_channel, out_width}; + std::vector padded_t_input_shape = { + t_input_shape[0], t_input_shape[1], t_input_shape[2], 1 + }; + std::vector t_input_image_shape; + OpenCLUtil::CalImage2DShape(padded_t_input_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &t_input_image_shape); + ScratchImage transformed_input_image(scratch_manager); + std::unique_ptr transformed_input(new Tensor( + transformed_input_image.Scratch(context->device()->allocator(), + t_input_image_shape, dt), dt)); + MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, + t_input_image_shape)); + MACE_RETURN_IF_ERROR(WinogradInputTransform( + context, kernels[0], input, dt, paddings, + round_h, round_w, wino_blk_size, + input_changed, transformed_input.get(), + kwg_size[0], &t_input_future)); + + // 1. mat mul + // t_filter(blk_sqr, out_chan, in_chan)*t_input(blk_sqr, in_chan, out_width) + // -> t_output (blk_sqr, out_chan, out_width) + std::vector mm_output_shape = + {blk_sqr, out_channel, out_width}; + + std::vector padded_mm_output_shape = + {mm_output_shape[0], mm_output_shape[1], mm_output_shape[2], 1}; + std::vector mm_output_image_shape; + OpenCLUtil::CalImage2DShape(padded_mm_output_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &mm_output_image_shape); + + ScratchImage mm_output_image(scratch_manager); + std::unique_ptr mm_output(new Tensor( + mm_output_image.Scratch(context->device()->allocator(), + mm_output_image_shape, dt), dt)); + MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, + mm_output_image_shape)); + + const index_t height_blocks = RoundUpDiv4(mm_output_shape[1]); + const index_t width_blocks = RoundUpDiv4(mm_output_shape[2]); + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * blk_sqr), + }; + + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernels[1]->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, kernels[1])); + + *kwg_size[1] = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernels[1])); + } + MACE_OUT_OF_RANGE_INIT(*kernels[1]); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernels[1]); + MACE_SET_2D_GWS_ARGS(*kernels[1], gws); + kernels[1]->setArg(idx++, *(filter->opencl_image())); + kernels[1]->setArg(idx++, *(transformed_input->opencl_image())); + kernels[1]->setArg(idx++, *(mm_output->opencl_image())); + kernels[1]->setArg(idx++, static_cast(mm_output_shape[1])); + kernels[1]->setArg(idx++, static_cast(mm_output_shape[2])); + kernels[1]->setArg(idx++, static_cast(in_channel)); + kernels[1]->setArg(idx++, static_cast(height_blocks)); + kernels[1]->setArg(idx++, static_cast(RoundUpDiv4(in_channel))); + + const std::vector lws = {*kwg_size[1] / 64, 64, 0}; + std::string tuning_key = Concat("matmul_opencl_kernel", mm_output_shape[0], + mm_output_shape[1], mm_output_shape[2]); + 
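The new WinogradConv2dK3x3S1 above tiles the convolution output into wino_blk_size x wino_blk_size blocks, transforms the input into a {(m+2)^2, in_channel, N*round_h*round_w} tensor, and batch-multiplies it against the transformed filter to produce {(m+2)^2, out_channel, N*round_h*round_w}. The standalone sketch below reproduces just that shape arithmetic for F(m x m, 3x3); the names are mine, not MACE API:

#include <cstdint>
#include <iostream>

// Standalone sketch of the Winograd F(m x m, 3x3) tiling arithmetic used by
// WinogradConv2dK3x3S1 above: how many tiles cover the output, and the shapes
// of the transformed-input and batched-matmul intermediates.
struct WinogradShapes {
  int64_t round_h, round_w;  // tiles along H and W
  int64_t blk_sqr;           // (m + 2)^2 transformed positions per tile
  int64_t t_input[3];        // {blk_sqr, in_channel, N * round_h * round_w}
  int64_t mm_output[3];      // {blk_sqr, out_channel, N * round_h * round_w}
};

WinogradShapes ComputeWinogradShapes(int64_t batch,
                                     int64_t out_h, int64_t out_w,
                                     int64_t in_channel, int64_t out_channel,
                                     int blk /* 2 or 4 */) {
  WinogradShapes s;
  s.round_h = (out_h + blk - 1) / blk;
  s.round_w = (out_w + blk - 1) / blk;
  s.blk_sqr = (blk + 2) * (blk + 2);
  const int64_t out_width = batch * s.round_h * s.round_w;
  s.t_input[0] = s.blk_sqr;   s.t_input[1] = in_channel;   s.t_input[2] = out_width;
  s.mm_output[0] = s.blk_sqr; s.mm_output[1] = out_channel; s.mm_output[2] = out_width;
  return s;
}

int main() {
  // 1x56x56 output, 32 -> 64 channels, 4x4 blocks: 14 x 14 tiles, 36 positions.
  WinogradShapes s = ComputeWinogradShapes(1, 56, 56, 32, 64, 4);
  std::cout << s.round_h << " x " << s.round_w << " tiles, blk_sqr = "
            << s.blk_sqr << ", t_input = {" << s.t_input[0] << ", "
            << s.t_input[1] << ", " << s.t_input[2] << "}\n";
  return 0;
}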
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernels[1], tuning_key, + gws, lws, &mm_future)); + + MACE_OUT_OF_RANGE_VALIDATION; + + // 2. transform output + // t_output (blk_sqr, out_chan, out_width) -> output(NHWC) + MACE_RETURN_IF_ERROR(WinogradOutputTransform( + context, kernels[2], mm_output.get(), bias, + dt, round_h, round_w, wino_blk_size, activation, relux_max_limit, + input_changed, output, kwg_size[2], &t_output_future)) + + MergeMultipleFutureWaitFn({t_input_future, mm_future, t_output_future}, + context->future()); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/winograd_transform.h b/mace/ops/opencl/image/winograd_transform.h deleted file mode 100644 index a555322dfc6327fbfd3d1f6e448af8b649724901..0000000000000000000000000000000000000000 --- a/mace/ops/opencl/image/winograd_transform.h +++ /dev/null @@ -1,316 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ -#define MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ - -#include "mace/ops/opencl/winograd_transform.h" - -#include -#include -#include -#include - -#include "mace/core/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/activation.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/opencl/helper.h" - -namespace mace { -namespace ops { -namespace opencl { -namespace image { - -template -class WinogradTransformKernel : public OpenCLWinogradTransformKernel { - public: - WinogradTransformKernel( - Padding padding_type, - const std::vector &paddings, - const int block_size) - : strides_({1, 1}), - dilations_({1, 1}), - padding_type_(padding_type), - paddings_(paddings), - wino_blk_size_(block_size) {} - MaceStatus Compute( - OpContext *context, - const Tensor *input_tensor, - Tensor *output_tensor) override; - - private: - const std::vector strides_; // [stride_h, stride_w] - const std::vector dilations_; // [dilation_h, dilation_w] - Padding padding_type_; - std::vector paddings_; - const int wino_blk_size_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::vector input_shape_; -}; - -template -MaceStatus WinogradTransformKernel::Compute( - OpContext *context, - const Tensor *input_tensor, - Tensor *output_tensor) { - auto runtime = context->device()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - if (wino_blk_size_ == 4) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); - built_options.emplace("-Dwinograd_transform_4x4=" - + obfuscated_kernel_name); - } else if (wino_blk_size_ == 2) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); - built_options.emplace("-Dwinograd_transform_2x2=" - + obfuscated_kernel_name); - } else { - MACE_CHECK(false, 
"mace only supports 4x4 and 2x2 gpu winograd."); - return MaceStatus::MACE_SUCCESS; - } - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - std::vector output_shape(4); - std::vector filter_shape = {1, input_tensor->dim(3), 3, 3}; - std::vector paddings(2); - if (paddings_.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), dilations_.data(), - strides_.data(), padding_type_, output_shape.data(), paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), - paddings_.data(), dilations_.data(), strides_.data(), - RoundType::FLOOR, output_shape.data()); - } - const index_t round_h = - (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_; - const index_t round_w = - (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_; - const index_t out_width = input_tensor->dim(0) * round_h * round_w; - - const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2); - - const uint32_t gws[2] = { - static_cast(out_width), - static_cast(RoundUpDiv4(input_tensor->dim(3))) - }; - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input_tensor->shape())) { - output_shape = {blk_sqr, input_tensor->dim(3), out_width}; - std::vector padded_output_shape = { - output_shape[0], output_shape[1], output_shape[2], 1 - }; - std::vector image_shape; - CalImage2DShape(padded_output_shape, - BufferType::IN_OUT_HEIGHT, - &image_shape); - // remove unused last dimension - MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input_tensor->opencl_image())); - kernel_.setArg(idx++, *(output_tensor->opencl_image())); - kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); - kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, static_cast(paddings[0] / 2)); - kernel_.setArg(idx++, static_cast(paddings[1] / 2)); - - input_shape_ = input_tensor->shape(); - } - - - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = Concat("winograd_transform_kernel", - output_tensor->dim(0), - output_tensor->dim(1), - output_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - -template -class WinogradInverseTransformKernel - : public OpenCLWinogradInverseTransformKernel { - public: - WinogradInverseTransformKernel( - ActivationType activation, - const float relux_max_limit, - const int block_size) - : wino_blk_size_(block_size), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - MaceStatus Compute( - OpContext *context, - const std::vector &inputs, - Tensor *output_tensor) override; - - private: - const int wino_blk_size_; - const ActivationType activation_; - const float relux_max_limit_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::vector input_shape_; -}; 
- -template -MaceStatus WinogradInverseTransformKernel::Compute( - OpContext *context, - const std::vector &inputs, - Tensor *output_tensor) { - auto runtime = context->device()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - const Tensor *input_tensor = inputs[0]; - const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - if (wino_blk_size_ == 4) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); - built_options.emplace("-Dwinograd_inverse_transform_4x4=" - + obfuscated_kernel_name); - } else if (wino_blk_size_ == 2) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); - built_options.emplace("-Dwinograd_inverse_transform_2x2=" - + obfuscated_kernel_name); - } else { - MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MaceStatus::MACE_SUCCESS; - } - - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - Tensor::MappingGuard output_shape_guard(inputs[1]); - const int32_t *output_shape_data = inputs[1]->data(); - const index_t batch = output_shape_data[0]; - const index_t height = output_shape_data[1]; - const index_t width = output_shape_data[2]; - const uint32_t gws[2] = { - static_cast(input_tensor->dim(2)), - static_cast(RoundUpDiv4(input_tensor->dim(1)))}; - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input_tensor->shape())) { - std::vector output_shape = {batch, height, width, - input_tensor->dim(1)}; - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - - const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_; - const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_; - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg( - idx++, - *(static_cast(input_tensor->opencl_image()))); - if (bias != nullptr) { - kernel_.setArg(idx++, - *(static_cast(bias->opencl_image()))); - } - kernel_.setArg( - idx++, *(static_cast(output_tensor->opencl_image()))); - kernel_.setArg(idx++, static_cast(output_shape[1])); - kernel_.setArg(idx++, static_cast(output_shape[2])); - kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, relux_max_limit_); - - input_shape_ = input_tensor->shape(); - } - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = - Concat("winograd_inverse_transform_kernel", 
output_tensor->dim(0), - output_tensor->dim(1), output_tensor->dim(2), - output_tensor->dim(3), input_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} -} // namespace image -} // namespace opencl -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc index f63d1db056e5ace6eb10cf37ba8cefa08d4f1bac..eb2236931b08561715ef08e3e3194084261004d8 100644 --- a/mace/ops/opencl/out_of_range_check_test.cc +++ b/mace/ops/opencl/out_of_range_check_test.cc @@ -144,7 +144,9 @@ TEST(OutOfRangeCheckTest, RandomTest) { std::vector image_shape; Tensor *image = ws.CreateTensor("Image", device->allocator(), DataTypeToEnum::v()); - CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(buffer->shape(), + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); image->ResizeImage(buffer->shape(), image_shape); ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape) != MaceStatus::MACE_SUCCESS); diff --git a/mace/ops/opencl/winograd_transform.h b/mace/ops/opencl/winograd_transform.h deleted file mode 100644 index f150481a7cacd173fcec7bb0a705206acebc6c45..0000000000000000000000000000000000000000 --- a/mace/ops/opencl/winograd_transform.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ -#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ - -#include - -#include "mace/public/mace.h" -#include "mace/utils/utils.h" -namespace mace { - -class OpContext; -class Tensor; - -namespace ops { - -class OpenCLWinogradTransformKernel { - public: - virtual MaceStatus Compute( - OpContext *context, - const Tensor *input, - Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel); -}; - -class OpenCLWinogradInverseTransformKernel { - public: - virtual MaceStatus Compute( - OpContext *context, - const std::vector &inputs, - Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/ops_registry.cc b/mace/ops/ops_registry.cc index 48a893760a8789fb8f831774726332d55e17a922..7407683d6464ea2559eca1d55ee548bd4e3c75dc 100644 --- a/mace/ops/ops_registry.cc +++ b/mace/ops/ops_registry.cc @@ -69,10 +69,7 @@ extern void RegisterQuantize(OpRegistryBase *op_registry); #ifdef MACE_ENABLE_OPENCL extern void RegisterBufferTransform(OpRegistryBase *op_registry); -extern void RegisterBufferInverseTransform(OpRegistryBase *op_registry); extern void RegisterLSTMCell(OpRegistryBase *op_registry); -extern void RegisterWinogradInverseTransform(OpRegistryBase *op_registry); -extern void RegisterWinogradTransform(OpRegistryBase *op_registry); #endif // MACE_ENABLE_OPENCL } // namespace ops @@ -130,10 +127,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { #ifdef MACE_ENABLE_OPENCL ops::RegisterBufferTransform(this); - ops::RegisterBufferInverseTransform(this); ops::RegisterLSTMCell(this); - ops::RegisterWinogradInverseTransform(this); - ops::RegisterWinogradTransform(this); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index a41e5b0947b59718a9ef275b9971eb71726e2f17..21407c6a743491820d431e077d01e30aa629ac9b 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -13,11 +13,93 @@ // limitations under the License. 
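With the standalone Winograd transform ops removed, ops_registry.cc above now registers only BufferTransform and LSTMCell under MACE_ENABLE_OPENCL. A toy sketch of that registration pattern (factory functions keyed by op type, backend ops guarded by a compile-time flag); the types and the ENABLE_GPU macro here are stand-ins, not the MACE OpRegistryBase API:

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Toy registry: each op type maps to a factory, and GPU-only ops are only
// registered when the backend is compiled in (mirroring #ifdef MACE_ENABLE_OPENCL).
struct Op { virtual ~Op() = default; virtual void Run() = 0; };

class MiniRegistry {
 public:
  using Factory = std::function<std::unique_ptr<Op>()>;
  void Register(const std::string &type, Factory f) { factories_[type] = std::move(f); }
  std::unique_ptr<Op> Create(const std::string &type) const {
    auto it = factories_.find(type);
    return it == factories_.end() ? nullptr : it->second();
  }
 private:
  std::map<std::string, Factory> factories_;
};

struct Softmax : Op { void Run() override { std::cout << "softmax\n"; } };
#ifdef ENABLE_GPU
struct BufferTransform : Op { void Run() override { std::cout << "buffer transform\n"; } };
#endif

int main() {
  MiniRegistry registry;
  registry.Register("Softmax", [] { return std::unique_ptr<Op>(new Softmax); });
#ifdef ENABLE_GPU  // stand-in for MACE_ENABLE_OPENCL
  registry.Register("BufferTransform",
                    [] { return std::unique_ptr<Op>(new BufferTransform); });
#endif
  if (auto op = registry.Create("Softmax")) op->Run();
  return 0;
}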
#include "mace/ops/ops_test_util.h" +#include "mace/core/memory_optimizer.h" namespace mace { namespace ops { namespace test { + +OpDefBuilder::OpDefBuilder(const char *type, const std::string &name) { + op_def_.set_type(type); + op_def_.set_name(name); +} + +OpDefBuilder &OpDefBuilder::Input(const std::string &input_name) { + op_def_.add_input(input_name); + return *this; +} + +OpDefBuilder &OpDefBuilder::Output(const std::string &output_name) { + op_def_.add_output(output_name); + return *this; +} + +OpDefBuilder &OpDefBuilder::OutputType( + const std::vector &output_type) { + for (auto out_t : output_type) { + op_def_.add_output_type(out_t); + } + return *this; +} + +OpDefBuilder &OpDefBuilder::OutputShape( + const std::vector &output_shape) { + auto shape = op_def_.add_output_shape(); + for (auto s : output_shape) { + shape->add_dims(s); + } + return *this; +} + +OpDefBuilder OpDefBuilder::AddIntArg(const std::string &name, const int value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_i(value); + return *this; +} + +OpDefBuilder OpDefBuilder::AddFloatArg(const std::string &name, + const float value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_f(value); + return *this; +} + +OpDefBuilder OpDefBuilder::AddStringArg(const std::string &name, + const char *value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_s(value); + return *this; +} + +OpDefBuilder OpDefBuilder::AddIntsArg(const std::string &name, + const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_ints(value); + } + return *this; +} + +OpDefBuilder OpDefBuilder::AddFloatsArg(const std::string &name, + const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_floats(value); + } + return *this; +} + +void OpDefBuilder::Finalize(OperatorDef *op_def) const { + MACE_CHECK(op_def != nullptr, "input should not be null."); + *op_def = op_def_; +} + OpTestContext *OpTestContext::Get(int num_threads, CPUAffinityPolicy cpu_affinity_policy, bool use_gemmlowp) { @@ -67,6 +149,100 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() { opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; } +bool OpsTestNet::Setup(mace::DeviceType device) { + NetDef net_def; + for (auto &op_def_ : op_defs_) { + net_def.add_op()->CopyFrom(op_def_); + + for (auto input : op_def_.input()) { + if (ws_.GetTensor(input) != nullptr && + !ws_.GetTensor(input)->is_weight()) { + auto input_info = net_def.add_input_info(); + input_info->set_name(input); + auto &shape = ws_.GetTensor(input)->shape(); + for (auto d : shape) { + input_info->add_dims(static_cast(d)); + } + } + } + + for (auto output : op_def_.output()) { + ws_.RemoveTensor(output); + auto output_info = net_def.add_output_info(); + output_info->set_name(output); + } + } + MemoryOptimizer mem_optimizer; + net_ = std::unique_ptr(new SerialNet( + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + &mem_optimizer)); + MaceStatus status = (ws_.PreallocateOutputTensor( + net_def, + &mem_optimizer, + OpTestContext::Get()->GetDevice(device))); + if (status != MaceStatus::MACE_SUCCESS) return false; + status = net_->Init(); + device_type_ = device; + return status == MaceStatus::MACE_SUCCESS; +} + +MaceStatus OpsTestNet::Run() { + MACE_CHECK_NOTNULL(net_); + MACE_RETURN_IF_ERROR(net_->Run()); + Sync(); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus 
OpsTestNet::RunOp(mace::DeviceType device) { + if (device == DeviceType::GPU) { + auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); + for (auto type : opencl_mem_types) { + OpTestContext::Get()->GetDevice(device) + ->opencl_runtime()->set_mem_type(type); + Setup(device); + MACE_RETURN_IF_ERROR(Run()); + } + return MaceStatus::MACE_SUCCESS; + } else { + Setup(device); + return Run(); + } +} + +MaceStatus OpsTestNet::RunOp() { + return RunOp(DeviceType::CPU); +} + +MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, + const mace::DeviceType device) { + device_type_ = device; + MemoryOptimizer mem_optimizer; + net_ = std::unique_ptr(new SerialNet( + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + &mem_optimizer)); + MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( + net_def, + &mem_optimizer, + OpTestContext::Get()->GetDevice(device))); + MACE_RETURN_IF_ERROR(net_->Init()); + return net_->Run(); +} + +void OpsTestNet::Sync() { +#ifdef MACE_ENABLE_OPENCL + if (net_ && device_type_ == DeviceType::GPU) { + OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() + ->command_queue().finish(); + } +#endif +} + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 5bf842f38be4b7612669c122a8853be7dbb4537a..0596119194b30850eb2aca8492a23e86e7efc9d3 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -29,9 +29,9 @@ #include "mace/core/net.h" #include "mace/core/device_context.h" #include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" -#include "mace/ops/opencl/common.h" #include "mace/ops/ops_registry.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" @@ -43,73 +43,29 @@ namespace test { class OpDefBuilder { public: - OpDefBuilder(const char *type, const std::string &name) { - op_def_.set_type(type); - op_def_.set_name(name); - } + OpDefBuilder(const char *type, const std::string &name); - OpDefBuilder &Input(const std::string &input_name) { - op_def_.add_input(input_name); - return *this; - } + OpDefBuilder &Input(const std::string &input_name); - OpDefBuilder &Output(const std::string &output_name) { - op_def_.add_output(output_name); - return *this; - } + OpDefBuilder &Output(const std::string &output_name); - OpDefBuilder &OutputType(const std::vector &output_type) { - for (auto out_t : output_type) { - op_def_.add_output_type(out_t); - } - return *this; - } + OpDefBuilder &OutputType(const std::vector &output_type); - OpDefBuilder AddIntArg(const std::string &name, const int value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_i(value); - return *this; - } + OpDefBuilder &OutputShape(const std::vector &output_shape); - OpDefBuilder AddFloatArg(const std::string &name, const float value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_f(value); - return *this; - } + OpDefBuilder AddIntArg(const std::string &name, const int value); - OpDefBuilder AddStringArg(const std::string &name, const char *value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_s(value); - return *this; - } + OpDefBuilder AddFloatArg(const std::string &name, const float value); + + OpDefBuilder AddStringArg(const std::string &name, const char *value); OpDefBuilder AddIntsArg(const std::string &name, - const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - 
for (auto value : values) { - arg->add_ints(value); - } - return *this; - } + const std::vector &values); OpDefBuilder AddFloatsArg(const std::string &name, - const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - for (auto value : values) { - arg->add_floats(value); - } - return *this; - } + const std::vector &values); - void Finalize(OperatorDef *op_def) const { - MACE_CHECK(op_def != nullptr, "input should not be null."); - *op_def = op_def_; - } + void Finalize(OperatorDef *op_def) const; OperatorDef op_def_; }; @@ -146,11 +102,12 @@ class OpsTestNet { void AddInputFromArray(const std::string &name, const std::vector &shape, const std::vector &data, + bool is_weight = false, const float scale = 0.0, const int32_t zero_point = 0) { Tensor *input = ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), is_weight); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -163,10 +120,11 @@ class OpsTestNet { template void AddRepeatedInput(const std::string &name, const std::vector &shape, - const T data) { + const T data, + bool is_weight = false) { Tensor *input = ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), is_weight); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -176,13 +134,14 @@ class OpsTestNet { template void AddRandomInput(const std::string &name, const std::vector &shape, + bool is_weight = false, bool positive = true, bool truncate = false, const float truncate_min = 0.001f, const float truncate_max = 100.f) { Tensor *input = ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), is_weight); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -220,28 +179,6 @@ class OpsTestNet { } } - template - void Transpose2D(const std::string &src_name, const std::string &dst_name) { - Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor( - dst_name, - OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); - const std::vector input_shape = input->shape(); - MACE_CHECK(input_shape.size() == 2, "input shape != 2"); - output->Resize({input_shape[1], input_shape[0]}); - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - for (index_t i = 0; i < input_shape[0]; ++i) { - for (index_t j = 0; j < input_shape[1]; ++j) { - output_data[j * input_shape[0] + i] = - input_data[i * input_shape[1] + j]; - } - } - } - template void CopyData(const std::string &src_name, const std::string &dst_name) { @@ -249,7 +186,8 @@ class OpsTestNet { Tensor *output = ws_.CreateTensor( dst_name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), + input->is_weight()); const std::vector input_shape = input->shape(); output->Resize(input_shape); @@ -267,7 +205,8 @@ class OpsTestNet { Tensor *output = ws_.CreateTensor( dst_name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), + input->is_weight()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); @@ -311,7 +250,25 @@ class OpsTestNet { } } } - } else if (src_format == HWOI && 
dst_format == OIHW) { + } else { + MACE_NOT_IMPLEMENTED; + } + } + + template + void TransformFilterDataFormat(const std::string &src_name, + const FilterDataFormat src_format, + const std::string &dst_name, + const FilterDataFormat dst_format) { + Tensor *input = ws_.GetTensor(src_name); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v(), + input->is_weight()); + const std::vector input_shape = input->shape(); + MACE_CHECK(input_shape.size() == 4, "input shape != 4"); + if (src_format == HWOI && dst_format == OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t out_channels = input_shape[2]; @@ -392,34 +349,6 @@ class OpsTestNet { } } - template - void FillNHWCInputToNCHWInput(const std::string &name_nchw, - const std::string &name_nhwc) { - Tensor *input = ws_.GetTensor(name_nhwc); - Tensor *output = ws_.CreateTensor( - name_nchw, - OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); - const std::vector input_shape = input->shape(); - index_t batch = input_shape[0]; - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channels = input_shape[3]; - output->Resize({batch, channels, height, width}); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t h = 0; h < height; ++h) { - for (index_t w = 0; w < width; ++w) { - output_data[((b * channels + c) * height + h) * width + w] = - input_data[((b * height + h) * width + w) * channels + c]; - } - } - } - } - } - // Create standalone tensor on device D with T type. template std::unique_ptr CreateTensor( @@ -447,89 +376,33 @@ class OpsTestNet { return &op_defs_[op_defs_.size() - 1]; } - Workspace *ws() { return &ws_; } + inline Workspace *ws() { return &ws_; } - bool Setup(DeviceType device) { - NetDef net_def; - for (auto &op_def_ : op_defs_) { - net_def.add_op()->CopyFrom(op_def_); - } - net_ = std::unique_ptr(new SerialNet( - op_registry_.get(), - &net_def, - &ws_, - OpTestContext::Get()->GetDevice(device))); - MaceStatus status = net_->Init(); - device_type_ = device; - return status == MaceStatus::MACE_SUCCESS; - } + bool Setup(DeviceType device); - MaceStatus Run() { - MACE_CHECK_NOTNULL(net_); - MACE_RETURN_IF_ERROR(net_->Run()); - Sync(); - return MaceStatus::MACE_SUCCESS; - } + MaceStatus Run(); // DEPRECATED(liyin): // Test and benchmark should setup model once and run multiple times. // Setup time should not be counted during benchmark. - MaceStatus RunOp(DeviceType device) { - if (device == DeviceType::GPU) { - auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); - for (auto type : opencl_mem_types) { - OpTestContext::Get()->GetDevice(device) - ->opencl_runtime()->set_mem_type(type); - Setup(device); - MACE_RETURN_IF_ERROR(Run()); - } - return MaceStatus::MACE_SUCCESS; - } else { - Setup(device); - return Run(); - } - } + MaceStatus RunOp(DeviceType device); // DEPRECATED(liyin): // Test and benchmark should setup model once and run multiple times. // Setup time should not be counted during benchmark. 
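The test utilities above now funnel layout conversion through TransformDataFormat and the new TransformFilterDataFormat instead of ad-hoc helpers such as the removed FillNHWCInputToNCHWInput. The index arithmetic those helpers performed is shown below as a standalone sketch over std::vector rather than mace::Tensor:

#include <cstdint>
#include <vector>

// Standalone sketch of the NHWC -> NCHW copy that the removed
// FillNHWCInputToNCHWInput helper performed (same index arithmetic).
template <typename T>
std::vector<T> NHWCToNCHW(const std::vector<T> &in,
                          int64_t batch, int64_t height,
                          int64_t width, int64_t channels) {
  std::vector<T> out(in.size());
  for (int64_t b = 0; b < batch; ++b)
    for (int64_t c = 0; c < channels; ++c)
      for (int64_t h = 0; h < height; ++h)
        for (int64_t w = 0; w < width; ++w)
          out[((b * channels + c) * height + h) * width + w] =
              in[((b * height + h) * width + w) * channels + c];
  return out;
}

int main() {
  // {1, 1, 2, 3} NHWC -> {1, 3, 1, 2} NCHW: interleaved channels become planar.
  std::vector<int> nhwc = {0, 1, 2, 3, 4, 5};
  std::vector<int> nchw = NHWCToNCHW(nhwc, 1, 1, 2, 3);  // {0, 3, 1, 4, 2, 5}
  return nchw == std::vector<int>{0, 3, 1, 4, 2, 5} ? 0 : 1;
}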
- MaceStatus RunOp() { return RunOp(DeviceType::CPU); } - - MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { - device_type_ = device; - auto net = std::unique_ptr(new SerialNet( - op_registry_.get(), - &net_def, - &ws_, - OpTestContext::Get()->GetDevice(device), - NetMode::INIT)); - MACE_RETURN_IF_ERROR(net->Init()); - MACE_RETURN_IF_ERROR(net->Run()); - net_ = std::unique_ptr(new SerialNet( - op_registry_.get(), - &net_def, - &ws_, - OpTestContext::Get()->GetDevice(device))); - MACE_RETURN_IF_ERROR(net_->Init()); - return net_->Run(); - } + MaceStatus RunOp(); + + MaceStatus RunNet(const NetDef &net_def, const DeviceType device); - Tensor *GetOutput(const char *output_name) { + inline Tensor *GetOutput(const char *output_name) { return ws_.GetTensor(output_name); } - Tensor *GetTensor(const char *tensor_name) { + inline Tensor *GetTensor(const char *tensor_name) { return ws_.GetTensor(tensor_name); } - void Sync() { -#ifdef MACE_ENABLE_OPENCL - if (net_ && device_type_ == DeviceType::GPU) { - OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() - ->command_queue().finish(); - } -#endif - } + void Sync(); public: std::shared_ptr op_registry_; @@ -773,50 +646,6 @@ void ExpectTensorSimilar(const Tensor &x, EXPECT_NEAR(1.0, similarity, abs_err); } -template -void BufferToImage(OpsTestNet *net, - const std::string &input_name, - const std::string &output_name, - const ops::BufferType type, - const int wino_block_size = 2) { - MACE_CHECK_NOTNULL(net); - - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("wino_block_size", wino_block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); - - // TODO(liuqi): Use AddNewOperatorDef, and run all ops with same NetDef. 
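The OpDefBuilder methods moved out-of-line above all follow the same fluent-builder shape that the tests rely on (chained setters ending in Finalize). A toy version of the pattern, not the MACE class; note how returning *this by reference keeps every chained call acting on the same builder:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy fluent builder illustrating the chaining style used by OpDefBuilder.
class ToyOpDefBuilder {
 public:
  ToyOpDefBuilder(std::string type, std::string name)
      : type_(std::move(type)), name_(std::move(name)) {}
  ToyOpDefBuilder &Input(const std::string &in) { inputs_.push_back(in); return *this; }
  ToyOpDefBuilder &Output(const std::string &out) { outputs_.push_back(out); return *this; }
  ToyOpDefBuilder &AddIntArg(const std::string &key, int value) {
    int_args_.emplace_back(key, value);
    return *this;
  }
  void Finalize(std::ostream &os) const {
    os << type_ << '(' << name_ << "): " << inputs_.size() << " input(s), "
       << outputs_.size() << " output(s), " << int_args_.size() << " int arg(s)\n";
  }
 private:
  std::string type_, name_;
  std::vector<std::string> inputs_, outputs_;
  std::vector<std::pair<std::string, int>> int_args_;
};

int main() {
  ToyOpDefBuilder("Pad", "PadTest")
      .Input("Input")
      .Output("Output")
      .AddIntArg("T", 1)
      .Finalize(std::cout);
  return 0;
}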
- net->RunOp(D); - - net->Sync(); -} - -template -void ImageToBuffer(OpsTestNet *net, - const std::string &input_name, - const std::string &output_name, - const ops::BufferType type, - const int wino_block_size = 2) { - MACE_CHECK_NOTNULL(net); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("wino_block_size", wino_block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); - - // Run - net->RunOp(D); - - net->Sync(); -} - } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 60bfbc192c6f50a05ca68dd0ad6d82d12182080d..cb7979063097a07be88337b5b14db63a7ffe99f4 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -33,7 +33,10 @@ class PadOp : public Operation { : Operation(context), paddings_(Operation::GetRepeatedArgs("paddings")), constant_value_(Operation::GetOptionalArg( - "constant_value", 0.0)) {} + "constant_value", 0.0)) { + MACE_CHECK(paddings_.size() == 8); + paddings_ = TransposeShape(paddings_, {0, 1, 6, 7, 2, 3, 4, 5}); + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index e295d2ad8799cc52d4b9c9ed52218a9a85ddd0cc..fb7f4e14426677b1ee26bf0ba3459ea5043074ea 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -31,23 +31,13 @@ void Pad(int iters, int batch, int height, net.AddRandomInput("Input", {batch, height, width, channels}); const std::vector paddings = {0, 0, pad, pad, pad, pad, 0, 0}; - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Pad", "PadTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("paddings", paddings) - .AddFloatArg("constant_value", 1.0) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Pad", "PadTest") - .Input("Input") - .Output("Output") - .AddIntsArg("paddings", paddings) - .AddFloatArg("constant_value", 1.0) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Pad", "PadTest") + .Input("Input") + .Output("Output") + .AddIntsArg("paddings", paddings) + .AddFloatArg("constant_value", 1.0) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index f0eece255af09b8e9d44a16dfe5965e67d1503c0..5de799f243e9cc51fb541f6ad5c7601e5de34cc3 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -29,27 +29,22 @@ void Simple() { // Add input data net.AddRepeatedInput("Input", {1, 2, 3, 1}, 2); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { net.TransformDataFormat("Input", NHWC, "TInput", NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") - .AddIntsArg("paddings", {0, 0, 0, 0, 1, 2, 1, 2}) + .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); @@ -111,8 +106,7 @@ TEST_F(PadTest, ComplexCPU) { namespace { template 
void Complex(const std::vector &input_shape, - const std::vector &cpu_paddings, - const std::vector &gpu_paddings) { + const std::vector &paddings) { // Construct graph OpsTestNet net; @@ -124,7 +118,7 @@ void Complex(const std::vector &input_shape, OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") - .AddIntsArg("paddings", cpu_paddings) + .AddIntsArg("paddings", paddings) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); @@ -136,22 +130,17 @@ void Complex(const std::vector &input_shape, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("paddings", gpu_paddings) + .Input("Input") + .Output("Output") + .AddIntsArg("paddings", paddings) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OpenCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - - auto output = net.GetTensor("OpenCLOutput"); + auto output = net.GetTensor("Output"); if (DataTypeToEnum::value == DT_HALF) { ExpectTensorNear(*expected, *output, 1e-2, 1e-2); @@ -162,21 +151,15 @@ void Complex(const std::vector &input_shape, } // namespace TEST_F(PadTest, ComplexFloat) { - Complex({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1}, - {0, 0, 2, 2, 1, 1, 0, 0}); - Complex({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0}, - {0, 0, 2, 0, 1, 0, 0, 0}); - Complex({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2}, - {0, 0, 0, 1, 0, 2, 0, 0}); + Complex({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0}); + Complex({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0}); + Complex({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0}); } TEST_F(PadTest, ComplexHalf) { - Complex({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1}, - {0, 0, 2, 2, 1, 1, 0, 0}); - Complex({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0}, - {0, 0, 2, 0, 1, 0, 0, 0}); - Complex({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2}, - {0, 0, 0, 1, 0, 2, 0, 0}); + Complex({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0}); + Complex({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0}); + Complex({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0}); } } // namespace test diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 2ce9d6acb6ac535311b5dc77e6161721a6c716cd..b2aef666266dfcd77b06047eab7891fd6cb82cef 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -432,6 +432,7 @@ class PoolingOp : public PoolingOpBase { if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::PoolingKernel); } else { + context->set_output_mem_type(MemoryType::GPU_BUFFER); kernel_.reset(new opencl::buffer::PoolingKernel); } } diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index ff915ec0f60f07d5626cbf931bb36806caea997c..c48cc8771fec57898dfe648abc7db7438bd5e330 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -52,8 +52,7 @@ void Pooling(int iters, MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Pooling", "PoolingTest") + OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") .AddIntArg("pooling_type", pooling_type) @@ -63,22 +62,6 @@ void Pooling(int iters, .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Pooling", "PoolingTest") 
- .Input("InputImage") - .Output("OutputImage") - .AddIntArg("pooling_type", pooling_type) - .AddIntsArg("kernels", {kernel, kernel}) - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 99691db84f1ef4882e676ad335874d45ddda4a4e..6db144e4f8fc77f2b6d58219236c1edd439bf242 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -190,11 +190,9 @@ void SimpleMaxPooling3S2() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) .AddIntsArg("strides", {2, 2}) @@ -202,8 +200,6 @@ void SimpleMaxPooling3S2() { .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } // Check @@ -250,11 +246,9 @@ void MaxPooling3S2(const std::vector &input_shape, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) .AddIntsArg("strides", strides) @@ -263,14 +257,12 @@ void MaxPooling3S2(const std::vector &input_shape, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, 1e-4); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } } // namespace @@ -349,11 +341,9 @@ void SimpleAvgPoolingTest() { "Input", {1, 2, 8, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) @@ -362,9 +352,6 @@ void SimpleAvgPoolingTest() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - // Check auto expected = net.CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); @@ -408,11 +395,9 @@ void AvgPoolingTest(const std::vector &shape, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", kernels) .AddIntsArg("strides", strides) @@ -421,14 +406,12 @@ void AvgPoolingTest(const 
std::vector &shape, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, 1e-3); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } } // namespace @@ -578,12 +561,12 @@ void TestQuant(const index_t batch, OpsTestNet net; std::vector input_shape{batch, in_height, in_width, channels}; net.AddRandomInput( - "Input", input_shape, false); + "Input", input_shape, false, false); net.TransformDataFormat( "Input", NHWC, "InputNCHW", NCHW); net.AddRandomInput( - "OutputNCHW", input_shape, true, true); + "OutputNCHW", input_shape, false, true, true); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") .Output("OutputNCHW") diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc index 9364146f267cabd203dc75989c129c58ba466b76..20f7e81c8b54165388de9f5fd2f359c4d42d1862 100644 --- a/mace/ops/reduce_mean.cc +++ b/mace/ops/reduce_mean.cc @@ -47,7 +47,7 @@ class ReduceMeanOpBase : public Operation { } protected: - const std::vector axis_; + std::vector axis_; bool keep_dims_; }; @@ -58,7 +58,8 @@ template class ReduceMeanOp : public ReduceMeanOpBase { public: explicit ReduceMeanOp(OpConstructContext *context) - : ReduceMeanOpBase(context) {} + : ReduceMeanOpBase(context) { + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -80,9 +81,15 @@ class ReduceMeanOp : public ReduceMeanOpBase { } } else { for (unsigned int i = 0; i < axis_.size(); ++i) { - const int index = axis_[i] >= 0 ? - axis_[i] : - axis_[i] + input->dim_size(); + int index = axis_[i] >= 0 ? 
+ axis_[i] : + axis_[i] + input->dim_size(); + // axis format is NHWC + if (input->dim_size() == 4) { + if (index == 1) index = 2; + else if (index == 2) index = 3; + else if (index == 3) index = 1; + } bitmap[index] = true; } } diff --git a/mace/ops/reduce_mean_benchmark.cc b/mace/ops/reduce_mean_benchmark.cc index 24338ce77e3258af1f23f04a64ae57421f629a5e..60a255009c3b614c90aeb2607dc3c5e78ef2472e 100644 --- a/mace/ops/reduce_mean_benchmark.cc +++ b/mace/ops/reduce_mean_benchmark.cc @@ -27,26 +27,20 @@ void ReduceMean(int iters, int batch, int channels, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - + std::vector axis = {1, 2}; if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ReduceMean", "ReduceMeanBM") - .Input("InputImage") - .AddIntsArg("axis", {1, 2}) - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input", {batch, height, width, channels}); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); - OpDefBuilder("ReduceMean", "ReduceMeanBM") - .Input("InputNCHW") - .AddIntsArg("axis", {2, 3}) - .Output("Output") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input", {batch, channels, height, width}); } + OpDefBuilder("ReduceMean", "ReduceMeanBM") + .Input("Input") + .AddIntsArg("axis", axis) + .Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index bc2577e29352f193d97403d6f221c94e033ddb2e..ef455f85a4cf0961fb24975b47fe88640d2e7150 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -34,32 +34,54 @@ void Simple(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input); if (D == DeviceType::CPU) { + net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); OpDefBuilder("ReduceMean", "ReduceMeanTest") - .Input("Input") + .Input("InputNCHW") .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) - .Output("Output") + .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); + net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ReduceMean", "ReduceMeanTest") - .Input("InputImg") + .Input("Input") .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); } +template +void Simple3D(const std::vector &input_shape, + const std::vector &input, + const std::vector &axis, + const std::vector &output_shape, + const std::vector &output, + const bool keepdims = true) { + // Construct graph + OpsTestNet net; + // Add input data + net.AddInputFromArray("Input", input_shape, input); + + OpDefBuilder("ReduceMean", "ReduceMeanTest") + .Input("Input") + .AddIntsArg("axis", axis) + .AddIntArg("keepdims", keepdims ? 
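The ReduceMeanOp change above remaps axes for 4-D inputs because the axis attribute keeps NHWC semantics while the CPU tensor is laid out NCHW: H(1) maps to 2, W(2) to 3, and C(3) to 1, which is why the hand-written axis_cpu remapping was dropped from the tests. A standalone sketch of that mapping:

#include <iostream>

// Standalone sketch of ReduceMeanOp's NHWC-to-NCHW axis remapping for 4-D
// tensors, mirroring the branch added above.
int RemapNHWCAxisToNCHW(int axis, int dim_size) {
  int index = axis >= 0 ? axis : axis + dim_size;
  if (dim_size == 4) {
    if (index == 1) index = 2;
    else if (index == 2) index = 3;
    else if (index == 3) index = 1;
  }
  return index;
}

int main() {
  // The NHWC spatial axes {1, 2} become the NCHW axes {2, 3}; channel 3 becomes 1.
  for (int a : {1, 2, 3})
    std::cout << a << " -> " << RemapNHWCAxisToNCHW(a, 4) << '\n';
  return 0;
}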
1 : 0) + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + auto expected = net.CreateTensor(output_shape, output); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); +} + template void Simple12Test() { Simple({2, 2, 3, 4}, @@ -157,26 +179,6 @@ void Simple2Axis() { {0, 1}, {1, 1, 3, 4}, {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); - Simple({2, 3, 4}, - {0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - 16, 17, 18, 19, - 20, 21, 22, 23}, - {0, 1}, - {1, 1, 4}, - {10, 11, 12, 13}); - Simple({2, 3, 4}, - {0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - 16, 17, 18, 19, - 20, 21, 22, 23}, - {1, 2}, - {2, 1, 1}, - {5.5, 17.5}); Simple({1, 2, 3, 4}, {0, 1, 2, 3, 4, 5, 6, 7, @@ -220,6 +222,31 @@ void Simple2Axis() { {4, 13, 22}); } +template +void Simple2Axis3D() { + Simple3D({2, 3, 4}, + {0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 17, 18, 19, + 20, 21, 22, 23}, + {0, 1}, + {1, 1, 4}, + {10, 11, 12, 13}); + Simple3D({2, 3, 4}, + {0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 17, 18, 19, + 20, 21, 22, 23}, + {1, 2}, + {2, 1, 1}, + {5.5, 17.5}); +} + + template void Simple3Axis() { Simple({1, 2, 3, 4}, @@ -310,21 +337,22 @@ TEST_F(ReduceMeanOpTest, CPUSimple2Axis) { Simple2Axis(); } +TEST_F(ReduceMeanOpTest, CPUSimple2Axis3D) { + Simple2Axis3D(); +} + TEST_F(ReduceMeanOpTest, CPUSimple3Axis) { Simple3Axis(); } TEST_F(ReduceMeanOpTest, CPUSimpleReduceDims) { - Simple({2, 2, 3, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, - {1, 2}, - {2, 4}, - {10, 11, 12, 13, - 10, 11, 12, 13}, - false); + Simple3D({2, 3, 4}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {0, 1}, + {4}, + {10, 11, 12, 13}, + false); } namespace { @@ -338,21 +366,11 @@ void RandomTest(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - std::vector axis_cpu(axis.size()); - for (unsigned int i = 0; i < axis.size(); ++i) { - if (axis[i] == 1 || axis[i] == 2) - axis_cpu[i] = axis[i] + 1; - else if (axis[i] == 3) - axis_cpu[i] = 1; - else - axis_cpu[i] = axis[i]; - } - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); OpDefBuilder("ReduceMean", "ReduceMeanTest") .Input("InputNCHW") - .AddIntsArg("axis", axis_cpu) + .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); @@ -360,18 +378,14 @@ void RandomTest(const std::vector &input_shape, net.RunOp(); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ReduceMean", "ReduceMeanTest") - .Input("InputImg") + .Input("Input") .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) - .Output("OutputImg") + .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index 896fb1e0056fdb37aa095707a7504d5e75da7533..5ababebaa29676f289c368222bde120acf9c0aca 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -43,30 +43,13 @@ void 
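As a standalone check of the new Simple2Axis3D expectations above: reducing the {2, 3, 4} tensor holding 0..23 over axes {0, 1} gives {10, 11, 12, 13}, and over axes {1, 2} gives {5.5, 17.5}. Recomputed directly:

#include <iostream>
#include <vector>

// Recompute the Simple2Axis3D reduce-mean expectations by hand.
int main() {
  const int d0 = 2, d1 = 3, d2 = 4;
  std::vector<float> x(d0 * d1 * d2);
  for (int i = 0; i < d0 * d1 * d2; ++i) x[i] = static_cast<float>(i);

  std::vector<float> mean_01(d2, 0.f);  // reduce over axes {0, 1}
  std::vector<float> mean_12(d0, 0.f);  // reduce over axes {1, 2}
  for (int i = 0; i < d0; ++i)
    for (int j = 0; j < d1; ++j)
      for (int k = 0; k < d2; ++k) {
        const float v = x[(i * d1 + j) * d2 + k];
        mean_01[k] += v / (d0 * d1);
        mean_12[i] += v / (d1 * d2);
      }

  for (float v : mean_01) std::cout << v << ' ';  // 10 11 12 13
  std::cout << '\n';
  for (float v : mean_12) std::cout << v << ' ';  // 5.5 17.5
  std::cout << '\n';
  return 0;
}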
ResizeBicubicBenchmark(int iters, } else { MACE_NOT_IMPLEMENTED; } - net.AddInputFromArray("OutSize", {2}, - {output_height, output_width}); - if (D == DeviceType::CPU) { - OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark") + OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark") .Input("Input") - .Input("OutSize") .Output("Output") .AddIntsArg("size", {output_height, output_width}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark") - .Input("InputImage") - .Input("OutSize") - .Output("OutputImage") - .AddIntsArg("size", {output_height, output_width}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 3a33eefc5b930ba0a89ae3d579e42efd5abdc620..5a4afc355a021179d0453344b6d2247a62721cf6 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -132,7 +132,7 @@ void TestRandomResizeBicubic() { // Add input data net.AddRandomInput("Input", {batch, in_height, in_width, channels}, - true, true); + false, true, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -151,23 +151,17 @@ void TestRandomResizeBicubic() { expected.Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("align_corners", align_corners) .AddIntsArg("size", {height, width}) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "DeviceOutput", - ops::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 345f86bb041730337a594922d09a6ca3d2a32743..bace4f10374d681df889e6fd5451c37abc2d646c 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -50,30 +50,12 @@ void ResizeBilinearBenchmark(int iters, } else { MACE_NOT_IMPLEMENTED; } - net.AddInputFromArray("OutSize", {2}, - {output_height, output_width}); - - if (D == DeviceType::CPU) { - OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") + OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") .Input("Input") - .Input("OutSize") .Output("Output") .AddIntsArg("size", {output_height, output_width}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") - .Input("InputImage") - .Input("OutSize") - .Output("OutputImage") - .AddIntsArg("size", {output_height, output_width}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 
b611854fadb86814fd8b24732ba9eb1de07931b9..e7b7a296929b0aae2fef068a072c03a9fdabfebc 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -118,23 +118,17 @@ void TestRandomResizeBilinear() { expected->Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("align_corners", align_corners) .AddIntsArg("size", {height, width}) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "DeviceOutput", - ops::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-6); } } @@ -157,6 +151,7 @@ void TestQuantizedResizeBilinear() { net.AddRandomInput("Input", {batch, in_height, in_width, channels}, false, + false, true, -1.f, 1.f); diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 4a866ae7592787f89b002bac6153b820fc4aaf2a..5539e53f83be152a839e9bfa98178c2fedb933c6 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -93,7 +93,11 @@ class ScalarMathOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)) {} + "scalar_input_index", 1)) { + if (D == DeviceType::GPU) { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc index b981267aa3c9e20e3c793a5174c3813fb9c59d1a..675ab7c82a7fa553d9ec69cd6f4a77b68f5ceb98 100644 --- a/mace/ops/shape.cc +++ b/mace/ops/shape.cc @@ -21,7 +21,11 @@ template class ShapeOp : public Operation { public: explicit ShapeOp(OpConstructContext *context) - : Operation(context) {} + : Operation(context) { + if (D == DeviceType::GPU) { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index bf06114430be46dfd37046921f09afa33ce3fe5d..4a7505ae79bcbc211ae9fa17f65a4f941b8988a2 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -367,6 +367,7 @@ class SoftmaxOp : public Operation { if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::SoftmaxKernel); } else { + context->set_output_mem_type(MemoryType::GPU_BUFFER); kernel_.reset(new opencl::buffer::SoftmaxKernel); } } diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 66e27434c82175b895272d5bab862ee83d0c5ae2..25095da54f94324afd34274f79b09c59c1b4e3a7 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -38,22 +38,11 @@ void SoftmaxBenchmark( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Softmax", "SoftmaxBM") + OpDefBuilder("Softmax", "SoftmaxBM") .Input("Input") .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Softmax", "SoftmaxBM") - .Input("InputImage") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for 
(int i = 0; i < 5; ++i) { diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 69b5dafdfeb8390615f926f092d77c0b47fea071..af32d4ab8ad97a10ed58707f02efdd1c67741fb1 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -59,21 +59,14 @@ void Simple() { net.GetOutput("Output")->Reshape({1, 1, 2, 4}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Softmax", "SoftmaxTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { MACE_NOT_IMPLEMENTED; @@ -115,22 +108,15 @@ void Complex(const std::vector &logits_shape) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Softmax", "SoftmaxTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on gpu net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -158,7 +144,7 @@ namespace { void TestQuantizedSoftmax(const std::vector &input_shape) { OpsTestNet net; - net.AddRandomInput("Input", input_shape, false, true); + net.AddRandomInput("Input", input_shape, false, false, true); OpDefBuilder("Softmax", "SoftmaxTest") .Input("Input") diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index f6d5ad1aab8a98352d4548e1002694b219a77334..cacadfcd9673019a9c3f7938d72ebc3d45608c96 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -38,24 +38,13 @@ void BMSpaceToBatch( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::CPU) { - OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("Input") - .Output("Output") - .AddIntsArg("paddings", {shape, shape, shape, shape}) - .AddIntsArg("block_shape", {shape, shape}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("paddings", {shape, shape, shape, shape}) - .AddIntsArg("block_shape", {shape, shape}) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") + .Input("Input") + .Output("Output") + .AddIntsArg("paddings", {shape, shape, shape, shape}) + .AddIntsArg("block_shape", {shape, shape}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 956dedc2c7f069c937bb09e5174225c426b3c7a5..3a928c6de0802ecd194ddae9723e4c4399a03dc1 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -32,11 +32,9 @@ void RunSpaceToBatch(const std::vector 
&input_shape, net.AddInputFromArray("Input", input_shape, input_data); if (D == GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntsArg("paddings", padding_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); @@ -54,10 +52,7 @@ void RunSpaceToBatch(const std::vector &input_shape, // Run net.RunOp(D); - if (D == GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else if (D == CPU) { + if (D == CPU) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } @@ -76,11 +71,9 @@ void RunBatchToSpace(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); if (D == GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntsArg("crops", crops_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); @@ -98,10 +91,7 @@ void RunBatchToSpace(const std::vector &input_shape, // Run net.RunOp(D); - if (D == GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else if (D == CPU) { + if (D == CPU) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } @@ -156,17 +146,13 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, net.AddRandomInput("Input", input_shape); // run gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("OutputGPU") .AddIntsArg("paddings", padding_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(GPU); - ImageToBuffer(&net, "OutputImage", "OutputGPU", - ops::BufferType::IN_OUT_CHANNEL); // run cpu net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -193,17 +179,13 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, net.AddRandomInput("Input", input_shape); // run gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("OutputGPU") .AddIntsArg("crops", crops_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(GPU); - ImageToBuffer(&net, "OutputImage", "OutputGPU", - ops::BufferType::IN_OUT_CHANNEL); // run cpu net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -227,7 +209,13 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, const std::vector &block_shape_data, const std::vector &padding_data) { OpsTestNet net; - net.AddRandomInput("Input", input_shape, false, true, -1.f, 1.f); + net.AddRandomInput("Input", + input_shape, + false, + false, + true, + -1.f, + 1.f); // run cpu net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -282,7 +270,13 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, const std::vector &block_shape_data, const std::vector &crops_data) { OpsTestNet net; - net.AddRandomInput("Input", input_shape, false, true, -1.f, 1.f); + net.AddRandomInput("Input", + input_shape, + false, + false, + true, + -1.f, + 1.f); // run cpu net.TransformDataFormat("Input", NHWC, 
"InputNCHW", diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index 04760c5454457ab06848c3b715abd7697ef27ce0..3311d6186272cee46cc53f8e6d9426e9eb962295 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -36,23 +36,12 @@ void SpaceToDepth( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("SpaceToDepth", "SpaceToDepthBM") + OpDefBuilder("SpaceToDepth", "SpaceToDepthBM") .Input("Input") .Output("Output") + .AddIntArg("block_size", block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("SpaceToDepth", "SpaceToDepthBM") - .Input("InputImage") - .Output("Output") - .AddIntArg("block_size", block_size) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index e7ae77d6637225b9a7377d5fcb6e806c22992931..a0c4a9b86a4994e5ba7d59e297ae274132a9db37 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -45,21 +45,15 @@ void RunSpaceToDepth(const std::vector &input_shape, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("block_size", block_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - if (D == DeviceType::GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -127,22 +121,16 @@ void RandomTest(const int block_size, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") - .Input("InputImg") + .Input("Input") .AddIntArg("block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") + .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-5); diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 1d632329cdef22fd37a2689498b05bbf1f2a60dc..2e09663178c45495b670b75a72ac7a013f478dc0 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -31,16 +31,27 @@ class SplitOp : public Operation { public: explicit SplitOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 3)) {} + axis_(Operation::GetOptionalArg("axis", 3)), + checked_(false) {} + + void Validate() { + if (this->Input(0)->dim_size() == 4) { + if (axis_ == 3) axis_ = 1; + else if (axis_ == 2) axis_ = 3; + else if (axis_ == 1) axis_ = 2; + } + MACE_CHECK(this->OutputSize() >= 2) + << "There must be at least two outputs for slicing"; + MACE_CHECK((this->Input(0)->dim(axis_) % this->OutputSize()) == 0) + << "Outputs do not split input equally."; + checked_ = true; + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - MACE_CHECK(this->OutputSize() >= 2) - << "There 
must be at least two outputs for slicing"; + if (!checked_) Validate(); const Tensor *input = this->Input(0); const std::vector output_list = this->Outputs(); - MACE_CHECK((input->dim(axis_) % this->OutputSize()) == 0) - << "Outputs do not split input equally."; const index_t input_channels = input->dim(axis_); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; @@ -83,6 +94,7 @@ class SplitOp : public Operation { private: int32_t axis_; + bool checked_; }; diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 687fc5739bdb9f01c262b4cebfd7cf1361890f7d..b21da8f5c7f055437a6a59952c3bea4957636efd 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -37,26 +37,14 @@ void BMSplitHelper(int iters, GenerateRandomRealTypeData(input_shape, &input_data); net.AddInputFromArray("Input", input_shape, input_data); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - auto builder = OpDefBuilder("Split", "SplitTest"); - builder.Input("InputImage"); - for (int i = 0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("OutputImage", i)); - } - builder - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - auto builder = OpDefBuilder("Split", "SplitTest"); - builder.Input("Input"); - for (int i = 0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("Output", i)); - } - builder.Finalize(net.NewOperatorDef()); + auto builder = OpDefBuilder("Split", "SplitTest"); + builder.Input("Input"); + for (int i = 0; i < num_outputs; ++i) { + builder = builder.Output(MakeString("Output", i)); } + builder + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 2; ++i) { diff --git a/mace/ops/split_test.cc b/mace/ops/split_test.cc index 906a47dd8f0a3ad8b74f091bd5c50b20d53669ad..89fbbadefbb39b4a3bc6446f8c6ed58e074636f5 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -26,7 +26,7 @@ class SplitOpTest : public OpsTestBase {}; namespace { template -void RandomTest(const int num_outputs, const int axis) { +void RandomTest(const int num_outputs, int axis) { static unsigned int seed = time(NULL); const index_t output_channels = 4 * (1 + rand_r(&seed) % 10); const index_t input_channels = num_outputs * output_channels; @@ -38,9 +38,9 @@ void RandomTest(const int num_outputs, const int axis) { OpsTestNet net; std::vector input_shape; - if (axis == 1) + if (D == DeviceType::CPU) input_shape = {batch, input_channels, height, width}; - else if (axis == 3) + else input_shape = {batch, height, width, input_channels}; const index_t input_size = std::accumulate( input_shape.begin(), input_shape.end(), 1, std::multiplies()); @@ -48,43 +48,25 @@ void RandomTest(const int num_outputs, const int axis) { GenerateRandomRealTypeData(input_shape, &input_data); net.AddInputFromArray("Input", input_shape, input_data); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - auto builder = OpDefBuilder("Split", "SplitTest"); - builder.Input("InputImage"); - for (int i = 0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("OutputImage", i)); - } - builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - auto builder = OpDefBuilder("Split", "SplitTest").AddIntArg("axis", axis); - builder.Input("Input"); - for (int i = 
0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("Output", i)); - } - builder.Finalize(net.NewOperatorDef()); + auto builder = OpDefBuilder("Split", "SplitTest").AddIntArg("axis", axis); + builder.Input("Input"); + for (int i = 0; i < num_outputs; ++i) { + builder = builder.Output(MakeString("Output", i)); } + builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - if (D == DeviceType::GPU) { - for (int i = 0; i < num_outputs; ++i) { - ImageToBuffer(&net, MakeString("OutputImage", i), - MakeString("Output", i), - ops::BufferType::IN_OUT_CHANNEL); - } - } - // Check std::vector expected_shape; - if (axis == 1) + if (D == DeviceType::CPU) { + if (axis == 3) axis = 1; expected_shape = {batch, output_channels, height, width}; - else if (axis == 3) + } else { expected_shape = {batch, height, width, output_channels}; + } const index_t outer_size = std::accumulate(expected_shape.begin(), expected_shape.begin() + axis, 1, std::multiplies()); @@ -117,9 +99,9 @@ TEST_F(SplitOpTest, CPU) { } TEST_F(SplitOpTest, CPUAxis1) { - RandomTest(2, 1); - RandomTest(4, 1); - RandomTest(11, 1); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); } TEST_F(SplitOpTest, OPENCLFloat) { diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc index bcf075004835b99e347c2139acd3f2e2244a65aa..353d8e7addfa4748fb7a160710bea226d3c569ab 100644 --- a/mace/ops/sqrdiff_mean_benchmark.cc +++ b/mace/ops/sqrdiff_mean_benchmark.cc @@ -29,35 +29,21 @@ void SqrDiffMean(int iters, int batch, int channels, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Input1", {batch, 1, 1, channels}); - - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM") - .Input("InputImage") - .Input("InputImage1") - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input1", {batch, channels, 1, 1}); } else { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Input1", - NHWC, - "InputNCHW1", - NCHW); - OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM") - .Input("InputNCHW") - .Input("InputNCHW1") - .Output("Output") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input1", {batch, 1, 1, channels}); } + OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM") + .Input("Input") + .Input("Input1") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/sqrdiff_mean_test.cc b/mace/ops/sqrdiff_mean_test.cc index 66f852b71653b2b2769dd1fac46542bd7a8b48b3..d71e8f7f0107af479b7f728dde253f63b728fe05 100644 --- a/mace/ops/sqrdiff_mean_test.cc +++ b/mace/ops/sqrdiff_mean_test.cc @@ -58,19 +58,13 @@ void Simple(const std::vector &input_shape0, "Output", NHWC); } else { - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") - .Input("InputImg0") - .Input("InputImg1") - .Output("OutputImg") + .Input("Input0") + 
.Input("Input1") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); @@ -126,19 +120,13 @@ void RandomTest(const std::vector &input_shape0, net.RunOp(); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") - .Input("InputImg0") - .Input("InputImg1") - .Output("OutputImg") + .Input("Input0") + .Input("Input1") + .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-4, 1e-3); diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 0cd15752f52adc7383239591c96c077e4354ac04..bf86a84feb33026047c44951e2acdfbc30467ec2 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -25,10 +25,20 @@ class SqueezeOp : public Operation { public: explicit SqueezeOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetRepeatedArgs("axis", {})) {} + axis_(Operation::GetRepeatedArgs("axis", {})), + checked_(false) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); + if (!checked_ && D == DeviceType::CPU + && DataTypeToEnum::value != DT_UINT8 + && this->Input(0)->dim_size() == 4) { + if (axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2) { + axis_[0] = 2; + axis_[1] = 3; + } + checked_ = true; + } const Tensor *input = this->Input(0); Tensor *output = this->Output(0); @@ -48,6 +58,7 @@ class SqueezeOp : public Operation { private: std::vector axis_; + bool checked_; }; void RegisterSqueeze(OpRegistryBase *op_registry) { diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index 1bcd6c37c41facaa857b9d31a46b17c4bdc5178d..b0fc972cd0479d52bbbd0eff3e96a0bdd7b0a176 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -58,6 +58,7 @@ TEST_F(SqueezeOpTest, TestSqueeze) { TestSqueeze({1, 2, 1, 4}, {1}, {1, 2, 1, 4}); TestSqueeze({1, 2, 1, 4}, {2}, {1, 2, 4}); TestSqueeze({1}, {}, {}); + TestSqueeze({1, 4, 1, 1}, {1, 2}, {1, 4}); } } // namespace test diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc index de795965d3211d50d8c29aabbe87294754dbe502..f6269b0f4a08d471a0e25efbe3374142e5a9e20c 100644 --- a/mace/ops/stack.cc +++ b/mace/ops/stack.cc @@ -25,7 +25,11 @@ class StackOp : public Operation { public: explicit StackOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 0)) {} + axis_(Operation::GetOptionalArg("axis", 0)) { + if (D == DeviceType::GPU) { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/transformer.cc b/mace/ops/transformer.cc deleted file mode 100644 index 7df66ffaf96f79d84d2ef454f16728e959386373..0000000000000000000000000000000000000000 --- a/mace/ops/transformer.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/transformer.h" - -#include -#include - -namespace mace { -namespace ops { - -std::unique_ptr Transformer::DoTransform( - mace::OperatorDef *op_def, - const int input_idx, - const mace::DataType dt, - const BufferType buffer_type, - const MemoryType mem_type) { - int32_t device = op_def->device_type(); - std::string input_name = op_def->input(input_idx); - std::string output_name = input_name + "_transformed"; - - op_def->set_input(input_idx, output_name); - std::unique_ptr op(new OperatorDef); - op->set_name(output_name); - op->set_type("BufferTransform"); - op->add_input(input_name); - op->add_output(output_name); - Argument *arg = op->add_arg(); - arg->set_name("buffer_type"); - arg->set_i(static_cast(buffer_type)); - arg = op->add_arg(); - arg->set_name("mem_type"); - arg->set_i(static_cast(mem_type)); - arg = op->add_arg(); - arg->set_name("T"); - arg->set_i(static_cast(dt)); - arg = op->add_arg(); - arg->set_name("device"); - arg->set_i(device); - - return std::move(op); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/transformer.h b/mace/ops/transformer.h deleted file mode 100644 index 67ecd60f768f9b4326f8f331fa600134d0d8776b..0000000000000000000000000000000000000000 --- a/mace/ops/transformer.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_TRANSFORMER_H_ -#define MACE_KERNELS_TRANSFORMER_H_ - -#include "mace/core/transformer.h" -#include "mace/ops/opencl/common.h" - -namespace mace { -class OpContext; -namespace ops { - -class Transformer : public TransformerBase { - public: - // Transform source tensor to target. 
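
The reduce, split, and squeeze changes earlier in this patch all apply the same convention: axes are specified in NHWC order in the model, but the CPU implementation operates on NCHW tensors, so for 4-D inputs an NHWC axis index is remapped to its NCHW position before use (H: 1 to 2, W: 2 to 3, C: 3 to 1, N unchanged). The patch writes this remapping inline in each op; the helper below is not part of the patch, it only restates that mapping in one place:

// Map an axis given in NHWC order to the corresponding NCHW position.
// Only meaningful for 4-D tensors; other ranks are returned unchanged.
inline int MapAxisFromNHWCToNCHW(int axis, int dim_size) {
  if (dim_size != 4) return axis;
  switch (axis) {
    case 1: return 2;   // H
    case 2: return 3;   // W
    case 3: return 1;   // C
    default: return axis;  // N stays at index 0
  }
}

For example, SqueezeOp rewrites axis {1, 2} (H, W in NHWC) to {2, 3} for the NCHW CPU path, and SplitOp's new Validate() turns axis 3 (channels in NHWC) into axis 1.
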
- std::vector> ConstructTranformOp( - OperatorDef *op_def, - bool transform_filter = true) override; - private: - std::unique_ptr DoTransform( - mace::OperatorDef *op_def, - const int input_idx, - const mace::DataType dt, - const BufferType buffer_type, - const MemoryType mem_type); -}; - - -} // namespace ops -} // namespace mace - -#endif // MACE_KERNELS_TENSOR_TRANSFORMER_H_ diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index 4e98944cd889aa4669fe8fd1d53003b6d069896d..7c25ea4f5b679eef411202bfdbe0d01a03aa2977 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -20,14 +20,16 @@ #include #include "mace/core/operator.h" +#include "mace/ops/transpose.h" namespace mace { namespace ops { -static void TransposeNHWCToNCHWC3(const float *input, - float *output, - const index_t height, - const index_t width) { +namespace { +void TransposeNHWCToNCHWC3(const float *input, + float *output, + const index_t height, + const index_t width) { index_t image_size = height * width; #pragma omp parallel for @@ -62,10 +64,10 @@ static void TransposeNHWCToNCHWC3(const float *input, } } -static void TransposeNCHWToNHWCC2(const float *input, - float *output, - const index_t height, - const index_t width) { +void TransposeNCHWToNHWCC2(const float *input, + float *output, + const index_t height, + const index_t width) { index_t image_size = height * width; #pragma omp parallel for for (index_t h = 0; h < height; ++h) { @@ -97,9 +99,125 @@ static void TransposeNCHWToNHWCC2(const float *input, #endif } } +} // namespace + +MaceStatus Transpose(const float *input, + const std::vector &input_shape, + const std::vector &dst_dims, + float *output) { + MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || + (input_shape.size() == 4 && dst_dims.size() == 4), + "Only support 2D or 4D transpose"); + + std::vector output_shape; + for (size_t i = 0; i < dst_dims.size(); ++i) { + output_shape.push_back(input_shape[dst_dims[i]]); + } + + if (input_shape.size() == 2) { + MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform"); + index_t height = input_shape[0]; + index_t width = input_shape[1]; + index_t stride_i = height; + index_t stride_j = width; + index_t tile_size = height > 512 || width > 512 ? 
64 : 32; +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < height; i += tile_size) { + for (index_t j = 0; j < width; j += tile_size) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + output[tile_j * stride_i + tile_i] = + input[tile_i * stride_j + tile_j]; + } + } + } + } + } else if (input_shape.size() == 4) { + std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; + std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; + index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3]; + + if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) { + for (index_t b = 0; b < input_shape[0]; ++b) { + TransposeNHWCToNCHWC3(input + b * batch_size, + output + b * batch_size, + input_shape[1], + input_shape[2]); + } + } else if (dst_dims == transpose_order_from_NCHW_to_NHWC + && input_shape[1] == 2) { + for (index_t b = 0; b < input_shape[0]; ++b) { + TransposeNCHWToNHWCC2(input + b * batch_size, + output + b * batch_size, + input_shape[2], + input_shape[3]); + } + } else if (dst_dims == std::vector{0, 2, 1, 3}) { + index_t height = input_shape[1]; + index_t width = input_shape[2]; + index_t channel = input_shape[3]; + index_t channel_raw_size = channel * sizeof(float); + index_t stride_i = height; + index_t stride_j = width; + index_t tile_size = std::max(static_cast(1), + static_cast(std::sqrt( + 8 * 1024 / channel))); +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < height; i += tile_size) { + for (index_t j = 0; j < width; j += tile_size) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + memcpy(output + (tile_j * stride_i + tile_i) * channel, + input + (tile_i * stride_j + tile_j) * channel, + channel_raw_size); + } + } + } + } + } else { + std::vector + in_stride{input_shape[1] * input_shape[2] * input_shape[3], + input_shape[2] * input_shape[3], input_shape[3], 1}; + std::vector + out_stride{output_shape[1] * output_shape[2] * output_shape[3], + output_shape[2] * output_shape[3], output_shape[3], 1}; + + std::vector idim(4, 0); + std::vector odim(4, 0); + for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { + for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { + for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { + for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { + idim[dst_dims[0]] = odim[0]; + idim[dst_dims[1]] = odim[1]; + idim[dst_dims[2]] = odim[2]; + idim[dst_dims[3]] = odim[3]; + + output[odim[0] * out_stride[0] + odim[1] * out_stride[1] + + odim[2] * out_stride[2] + odim[3]] = + input[idim[0] * in_stride[0] + idim[1] * in_stride[1] + + idim[2] * in_stride[2] + idim[3]]; + } + } + } + } + } + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; +} template -class TransposeOp : public Operation { +class TransposeOp; + +template +class TransposeOp : public Operation { public: explicit TransposeOp(OpConstructContext *context) : Operation(context), @@ -121,106 +239,10 @@ class TransposeOp : public Operation { Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - - if (input->dim_size() == 2) { - MACE_CHECK(dims_[0] == 1 && dims_[1] == 0, "no 
need transform"); - index_t height = input_shape[0]; - index_t width = input_shape[1]; - index_t stride_i = height; - index_t stride_j = width; - index_t tile_size = height > 512 || width > 512 ? 64 : 32; -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - output_data[tile_j * stride_i + tile_i] = - input_data[tile_i * stride_j + tile_j]; - } - } - } - } - } else if (input->dim_size() == 4) { - std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; - std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; - index_t batch_size = input->dim(1) * input->dim(2) * input->dim(3); - - if (dims_ == transpose_order_from_NHWC_to_NCHW && input->dim(3) == 3) { - for (index_t b = 0; b < input->dim(0); ++b) { - TransposeNHWCToNCHWC3(input_data + b * batch_size, - output_data + b * batch_size, - input->dim(1), - input->dim(2)); - } - } else if (dims_ == transpose_order_from_NCHW_to_NHWC - && input->dim(1) == 2) { - for (index_t b = 0; b < input->dim(0); ++b) { - TransposeNCHWToNHWCC2(input_data + b * batch_size, - output_data + b * batch_size, - input->dim(2), - input->dim(3)); - } - } else if (dims_ == std::vector{0, 2, 1, 3}) { - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channel = input_shape[3]; - index_t channel_raw_size = channel * sizeof(T); - index_t stride_i = height; - index_t stride_j = width; - index_t tile_size = std::max(static_cast(1), - static_cast(std::sqrt( - 8 * 1024 / channel))); -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - memcpy(output_data + (tile_j * stride_i + tile_i) * channel, - input_data + (tile_i * stride_j + tile_j) * channel, - channel_raw_size); - } - } - } - } - } else { - std::vector - in_stride{input_shape[1] * input_shape[2] * input_shape[3], - input_shape[2] * input_shape[3], input_shape[3], 1}; - std::vector - out_stride{output_shape[1] * output_shape[2] * output_shape[3], - output_shape[2] * output_shape[3], output_shape[3], 1}; - - std::vector idim(4, 0); - std::vector odim(4, 0); - for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { - for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { - for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { - for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { - idim[dims_[0]] = odim[0]; - idim[dims_[1]] = odim[1]; - idim[dims_[2]] = odim[2]; - idim[dims_[3]] = odim[3]; - - output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1] - + odim[2] * out_stride[2] + odim[3]] = - input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1] - + idim[2] * in_stride[2] + idim[3]]; - } - } - } - } - } - } else { - MACE_NOT_IMPLEMENTED; - } + const float *input_data = input->data(); + float *output_data = output->mutable_data(); - return MaceStatus::MACE_SUCCESS; + return Transpose(input_data, input->shape(), dims_, output_data); } private: diff --git a/mace/ops/opencl/common.h b/mace/ops/transpose.h similarity index 66% rename from mace/ops/opencl/common.h rename to 
mace/ops/transpose.h index f0bf872eb84c4b4dd1705ec0b594b10d987b03a7..c4ab39dcaa5ed87877eda681febf82901dfa2b81 100644 --- a/mace/ops/opencl/common.h +++ b/mace/ops/transpose.h @@ -12,23 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_COMMON_H_ -#define MACE_OPS_OPENCL_COMMON_H_ +#ifndef MACE_OPS_TRANSPOSE_H_ +#define MACE_OPS_TRANSPOSE_H_ + +#include + +#include "mace/public/mace.h" namespace mace { namespace ops { -enum BufferType { - CONV2D_FILTER = 0, - IN_OUT_CHANNEL = 1, - ARGUMENT = 2, - IN_OUT_HEIGHT = 3, - IN_OUT_WIDTH = 4, - WINOGRAD_FILTER = 5, - DW_CONV2D_FILTER = 6, - WEIGHT_HEIGHT = 7, - WEIGHT_WIDTH = 8, -}; +MaceStatus Transpose(const float *input, + const std::vector &input_shape, + const std::vector &dst_dims, + float *output); + } // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_COMMON_H_ + +#endif // MACE_OPS_TRANSPOSE_H_ diff --git a/mace/ops/winograd_convolution_benchmark.cc b/mace/ops/winograd_convolution_benchmark.cc deleted file mode 100644 index 624851657e7b704e4eea46a213978541facd52dc..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_convolution_benchmark.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
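
With this change, mace/ops/transpose.cc exposes Transpose() as a free function declared in the new mace/ops/transpose.h, so the tiled 2-D and 4-D permutation paths can be reused outside TransposeOp. A minimal calling sketch follows; the rendered diff drops template arguments, so the vector element types (index_t for the shape, int for the permutation) are inferred from how TransposeOp forwards input->shape() and dims_:

#include <vector>

#include "mace/ops/transpose.h"

void TransposeExample() {
  // Permute a 1x2x3x4 NHWC tensor to NCHW layout ({0, 3, 1, 2}).
  std::vector<mace::index_t> in_shape{1, 2, 3, 4};  // index_t is MACE's index type
  std::vector<int> perm{0, 3, 1, 2};
  std::vector<float> src(1 * 2 * 3 * 4, 0.f);
  std::vector<float> dst(src.size());
  mace::MaceStatus status =
      mace::ops::Transpose(src.data(), in_shape, perm, dst.data());
  (void)status;  // MACE_SUCCESS on completion
}

Per the dispatch in the function body above, a {0, 3, 1, 2} permutation with three input channels takes the specialized TransposeNHWCToNCHWC3 path, and {0, 2, 3, 1} with two channels takes TransposeNCHWToNHWCC2; everything else falls back to the generic tiled or strided loops.
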
- -#include "mace/core/testing/test_benchmark.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -template -void BMWinogradConvolution( - int iters, int batch, int height, int width, - int in_channels, int out_channels, int block_size) { - mace::testing::StopTiming(); - OpsTestNet net; - net.AddRandomInput("Input", {batch, height, width, in_channels}); - - net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); - net.AddRandomInput("Bias", {out_channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT); - - // Winograd convolution - // transform filter - BufferToImage(&net, "Filter", "WinoFilter", - ops::BufferType::WINOGRAD_FILTER, block_size); - - // Inference convolution output shape - OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest") - .Input("InputImage") - .Output("ShapeOutput") - .AddIntArg("data_format", 0) - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("kernels", {static_cast(out_channels), - static_cast(in_channels), - 3, 3}) - .AddIntArg("padding", Padding::SAME) - .OutputType({DataTypeToEnum::v()}) - .Finalize(net.NewOperatorDef()); - - // Transform input - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("WinoInput") - .AddIntArg("padding", Padding::SAME) - .AddIntArg("wino_block_size", block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.AddNewOperatorDef()); - - // MatMul - OpDefBuilder("MatMul", "MatMulTest") - .Input("WinoFilter") - .Input("WinoInput") - .Output("WinoGemm") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.AddNewOperatorDef()); - - // Inverse transform - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("WinoGemm") - .Input("ShapeOutput") - .Input("BiasImage") - .AddIntArg("batch", batch) - .AddIntArg("height", height) - .AddIntArg("width", width) - .AddIntArg("wino_block_size", block_size) - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.AddNewOperatorDef()); - net.Setup(D); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \ - static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\ - TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * IC * H * W; \ - const int64_t macc = \ - static_cast(iters) * N * OC * H * W * (3 * 3 * IC + 1); \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMWinogradConvolution(iters, N, H, W, IC, OC, M); \ - } \ - MACE_BENCHMARK( \ - MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \ - MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU); - - -MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2); -MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2); -MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4); -MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4); -MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2); 
-MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4); -MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2); -MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4); - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc deleted file mode 100644 index 556ee0ba8a3d20de45711b4b201682fcf662a9e6..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_convolution_test.cc +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -class WinogradConvolutionTest : public OpsTestBase {}; - -namespace { - -template -void WinogradConvolution(const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - const Padding padding, - const int block_size) { - // srand(time(NULL)); - - // Construct graph - OpsTestNet net; - // Add input data - net.AddRandomInput("Input", {batch, height, width, in_channels}); - net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); - net.AddRandomInput("Bias", {out_channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "ConvOutput", - ops::BufferType::IN_OUT_CHANNEL); - - auto expected = net.CreateTensor(); - expected->Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected->shape(); - - // Winograd convolution - // transform filter - BufferToImage(&net, "Filter", "WinoFilter", - ops::BufferType::WINOGRAD_FILTER, block_size); - // transform input - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("WinoInput") - .AddIntArg("padding", padding) - .AddIntArg("wino_block_size", block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - - OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest") - .Input("InputImage") - .Output("ShapeOutput") - .AddIntArg("data_format", 0) - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("kernels", {static_cast(out_channels), - static_cast(in_channels), - 3, 3}) - .AddIntArg("padding", padding) - .OutputType({DataTypeToEnum::v()}) - 
.Finalize(net.NewOperatorDef()); - net.RunOp(D); - - // MatMul - OpDefBuilder("MatMul", "MatMulTest") - .Input("WinoFilter") - .Input("WinoInput") - .Output("WinoGemm") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on opencl - net.RunOp(D); - - // Inverse transform - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("WinoGemm") - .Input("ShapeOutput") - .Input("BiasImage") - .AddIntArg("wino_block_size", block_size) - .Output("WinoOutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - net.Sync(); - - ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-2, 1e-2); - } else { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-5, 1e-4); - } -} -} // namespace - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM2) { - WinogradConvolution(1, 32, 32, 3, 3, - Padding::VALID, 2); - WinogradConvolution(1, 32, 32, 3, 3, - Padding::SAME, 2); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2) { - WinogradConvolution(1, 61, 67, 31, 37, - Padding::VALID, 2); - WinogradConvolution(1, 61, 67, 37, 31, - Padding::SAME, 2); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionM2) { - WinogradConvolution(3, 64, 64, 32, 32, - Padding::VALID, 2); - WinogradConvolution(5, 61, 67, 37, 31, - Padding::SAME, 2); -} - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM4) { - WinogradConvolution(1, 32, 32, 3, 3, - Padding::VALID, 4); - WinogradConvolution(1, 32, 32, 3, 3, - Padding::SAME, 4); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) { - WinogradConvolution(1, 61, 67, 31, 37, - Padding::VALID, 4); - WinogradConvolution(1, 61, 67, 37, 31, - Padding::SAME, 4); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionM4) { - WinogradConvolution(3, 107, 113, 5, 7, - Padding::VALID, 4); - WinogradConvolution(5, 107, 113, 5, 7, - Padding::SAME, 4); -} - -namespace { -template -void WinogradConvolutionWithPad(const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - const int padding, - const int block_size) { - // srand(time(NULL)); - - // Construct graph - OpsTestNet net; - // Add input data - net.AddRandomInput("Input", {batch, height, width, in_channels}); - net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); - net.AddRandomInput("Bias", {out_channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "ConvOutput", - ops::BufferType::IN_OUT_CHANNEL); - auto expected = net.CreateTensor(); - expected->Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected->shape(); - - // Winograd convolution - // transform filter - BufferToImage(&net, "Filter", "WinoFilter", - 
ops::BufferType::WINOGRAD_FILTER, block_size); - // transform input - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("WinoInput") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntArg("wino_block_size", block_size) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - - OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest") - .Input("InputImage") - .Output("ShapeOutput") - .AddIntArg("data_format", 0) - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("kernels", {static_cast(out_channels), - static_cast(in_channels), - 3, 3}) - .AddIntsArg("padding_values", {padding, padding}) - .OutputType({DataTypeToEnum::v()}) - .Finalize(net.NewOperatorDef()); - net.RunOp(D); - - // MatMul - OpDefBuilder("MatMul", "MatMulTest") - .Input("WinoFilter") - .Input("WinoInput") - .Output("WinoGemm") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on opencl - net.RunOp(D); - - // Inverse transform - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("WinoGemm") - .Input("ShapeOutput") - .Input("BiasImage") - .AddIntArg("wino_block_size", block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("WinoOutputImage") - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - net.Sync(); - - ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-2, 1e-2); - } else { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-5, 1e-4); - } -} -} // namespace - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM2WithPad) { - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 1, 2); - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 2, 2); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2WithPad) { - WinogradConvolutionWithPad(1, 61, 67, 31, 37, - 1, 2); - WinogradConvolutionWithPad(1, 61, 67, 37, 31, - 2, 2); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionWithM2Pad) { - WinogradConvolutionWithPad(3, 64, 64, 32, 32, - 1, 2); - WinogradConvolutionWithPad(5, 61, 67, 37, 31, - 2, 2); -} - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM4WithPad) { - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 1, 4); - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 2, 4); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4WithPad) { - WinogradConvolutionWithPad(1, 61, 67, 31, 37, - 1, 4); - WinogradConvolutionWithPad(1, 61, 67, 37, 31, - 2, 4); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionWithM4Pad) { - WinogradConvolutionWithPad(3, 64, 64, 32, 32, - 1, 4); - WinogradConvolutionWithPad(5, 61, 67, 37, 31, - 2, 4); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc deleted file mode 100644 index b2635f4de1d9f622d99808af2d1dc7fcb69c720b..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_transform.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "mace/core/operator.h" -#include "mace/ops/activation.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/opencl/image/winograd_transform.h" - -namespace mace { -namespace ops { - -template -class WinogradTransformOp; - -template -class WinogradTransformOp : public Operation { - public: - explicit WinogradTransformOp(OpConstructContext *context) - : Operation(context) { - Padding padding_type = static_cast(Operation::GetOptionalArg( - "padding", static_cast(VALID))); - std::vector paddings = Operation::GetRepeatedArgs( - "padding_values"); - int block_size = Operation::GetOptionalArg("wino_block_size", 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradTransformKernel( - padding_type, paddings, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } - } - - MaceStatus Run(OpContext *context) override { - const Tensor *input_tensor = this->Input(0); - Tensor *output_tensor = this->Output(0); - return kernel_->Compute(context, input_tensor, output_tensor); - } - - private: - std::unique_ptr kernel_; -}; - -template -class WinogradInverseTransformOp; - -template -class WinogradInverseTransformOp : public Operation { - public: - explicit WinogradInverseTransformOp(OpConstructContext *context) - : Operation(context) { - ActivationType activation = ops::StringToActivationType( - Operation::GetOptionalArg("activation", "NOOP")); - float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); - int block_size = Operation::GetOptionalArg("wino_block_size", 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradInverseTransformKernel( - activation, relux_max_limit, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } - } - - MaceStatus Run(OpContext *context) override { - Tensor *output_tensor = this->Output(0); - return kernel_->Compute(context, inputs_, output_tensor); - } - - private: - std::unique_ptr kernel_; -}; - -void RegisterWinogradTransform(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "WinogradTransform", - WinogradTransformOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "WinogradTransform", - WinogradTransformOp, DeviceType::GPU, half); -} - -void RegisterWinogradInverseTransform( - OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", - WinogradInverseTransformOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", - WinogradInverseTransformOp, DeviceType::GPU, half); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc deleted file mode 100644 index bb6679bbe12d147fa3842369cb81071f746f970d..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_transform_benchmark.cc +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/testing/test_benchmark.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -template -void BMWinogradTransform( - int iters, int batch, int height, int width, int channels, int block_size) { - mace::testing::StopTiming(); - - OpsTestNet net; - net.AddRandomInput("Input", {batch, height, width, channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("block_size", block_size) - .Finalize(net.NewOperatorDef()); - - net.Setup(D); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \ - static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\ - DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMWinogradTransform(iters, N, H, W, C, M); \ - } \ - MACE_BENCHMARK( \ - MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \ - MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU); - -MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2); -MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2); -MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2); -MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4); -MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4); -MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4); - -namespace { -template -void BMWinogradInverseTransform( - int iters, int batch, int height, int width, int channels, int block_size) { - mace::testing::StopTiming(); - - index_t p = batch * ((height + block_size - 1) / block_size) * - ((width + block_size - 1) / block_size); - OpsTestNet net; - net.AddRandomInput("Input", {(block_size + 2) * - (block_size + 2), channels, p, 1}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_HEIGHT); - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("InputImage") - .AddIntArg("batch", batch) - .AddIntArg("height", height) - .AddIntArg("width", width) - .AddIntArg("block_size", block_size) - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - net.Setup(D); - - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \ - static void \ - MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\ - ##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - 
BMWinogradInverseTransform(iters, N, H, W, C, M); \ - } \ - MACE_BENCHMARK( \ - MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\ - DEVICE) - -#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \ - MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU); - -MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2); - -MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4); - -namespace { -template -void WinoFilterBufferToImage(int iters, - int out_channel, int in_channel, - int height, int width, int wino_block_size) { - mace::testing::StopTiming(); - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", - {out_channel, in_channel, height, width}); - - OpDefBuilder("BufferToImage", "BufferToImageTest") - .Input("Input") - .Output("Output") - .AddIntArg("buffer_type", ops::BufferType::WINOGRAD_FILTER) - .AddIntArg("wino_block_size", wino_block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Warm-up - net.Setup(D); - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \ - static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\ - int iters) { \ - const int64_t tot = static_cast(iters) * O * I * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - WinoFilterBufferToImage(iters, O, I, H, W, M); \ - } \ - MACE_BENCHMARK(\ - MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINO_B2I(O, I, H, W, M) \ - MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU); - -MACE_BM_WINO_B2I(16, 3, 3, 3, 2); -MACE_BM_WINO_B2I(16, 3, 3, 3, 4); -MACE_BM_WINO_B2I(32, 3, 3, 3, 2); -MACE_BM_WINO_B2I(32, 3, 3, 3, 4); -MACE_BM_WINO_B2I(128, 3, 3, 3, 2); -MACE_BM_WINO_B2I(128, 3, 3, 3, 4); -MACE_BM_WINO_B2I(256, 3, 3, 3, 2); -MACE_BM_WINO_B2I(256, 3, 3, 3, 4); - -namespace { -template -void WinoMatMulBenchmark( - int iters, int out_channels, int in_channels, - int height, int width, int block_size) { - mace::testing::StopTiming(); - - OpsTestNet net; - const int batch = (block_size + 2) * (block_size + 2); - const index_t round_h = (height + block_size - 1) / block_size; - const index_t round_w = (width + block_size - 1) / block_size; - const index_t out_width = round_h * round_w; - // Add input data - net.AddRandomInput("A", {batch, out_channels, in_channels}); - net.AddRandomInput("B", {batch, in_channels, out_width}); - - if (D == DeviceType::GPU) { - BufferToImage(&net, "A", "AImage", ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulBM") - .Input("AImage") - .Input("BImage") - .Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("MatMul", "MatMulBM") - .Input("A") - .Input("B") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } - net.Setup(D); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, 
M, TYPE, DEVICE) \ - static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\ - DEVICE(int iters) { \ - const int64_t macc = static_cast(iters) * OC * IC * H * W; \ - const int64_t tot = static_cast(iters) * OC * (IC * H + H * W); \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - WinoMatMulBenchmark(iters, OC, IC, H, W, M); \ - } \ - MACE_BENCHMARK(\ - MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \ - MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU); - -MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2); -MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4); -MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2); -MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4); - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index a3064df0f2a945c0b960e7c6b55bff103c71519a..530de3aedfcd6a94d9ee840f8e368a4447d6cd8c 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -7,11 +7,6 @@ option optimize_for = LITE_RUNTIME; // For better compatibility, // the mace.proto is refered from tensorflow and caffe2. -enum NetMode { - INIT = 0; - NORMAL = 1; -} - enum DataType { DT_INVALID = 0; DT_FLOAT = 1; @@ -90,18 +85,6 @@ message OperatorDef { repeated int32 out_max_byte_size = 104; // only support 32-bit len } -// for memory optimization -message MemoryBlock { - optional int32 mem_id = 1; - optional int32 device_type = 2; - optional MemoryType mem_type = 3; - optional uint32 x = 4; - optional uint32 y = 5; -} -message MemoryArena { - repeated MemoryBlock mem_block = 1; -} - // for hexagon mace-nnlib message InputInfo { optional string name = 1; @@ -109,6 +92,7 @@ message InputInfo { repeated int32 dims = 3; optional int32 max_byte_size = 4; // only support 32-bit len optional DataType data_type = 5 [default = DT_FLOAT]; + optional int32 data_format = 6 [default = 1]; // NHWC } message OutputInfo { optional string name = 1; @@ -116,6 +100,7 @@ message OutputInfo { repeated int32 dims = 3; optional int32 max_byte_size = 4; // only support 32-bit len optional DataType data_type = 5 [default = DT_FLOAT]; + optional int32 data_format = 6 [default = 1]; // NHWC } message NetDef { @@ -123,9 +108,6 @@ message NetDef { repeated Argument arg = 2; repeated ConstTensor tensors = 3; - // for mem optimization - optional MemoryArena mem_arena = 10; - // for hexagon mace-nnlib repeated InputInfo input_info = 100; repeated OutputInfo output_info = 101; diff --git a/mace/public/mace.h b/mace/public/mace.h index 9e7f568638cc71a9cf358f141b4c0ed46853ab34..01818ef5719b48298bd501967bb91cb99521336f 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -34,6 +34,8 @@ class NetDef; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; +enum DataFormat { DF_NONE = 0, NHWC = 1, NCHW = 2}; + enum GPUPerfHint { PERF_DEFAULT = 0, PERF_LOW = 1, @@ -259,7 +261,8 @@ class MACE_API MaceTensor { // data - the buffer of the tensor, must not be null with size equals // shape[0] * shape[1] * ... 
* shape[n-1] MaceTensor(const std::vector &shape, - std::shared_ptr data); + std::shared_ptr data, + const DataFormat format = DataFormat::NHWC); MaceTensor(); MaceTensor(const MaceTensor &other); MaceTensor(const MaceTensor &&other); @@ -270,6 +273,7 @@ class MACE_API MaceTensor { const std::vector &shape() const; const std::shared_ptr data() const; std::shared_ptr data(); + DataFormat data_format() const; private: class Impl; diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD index 693ed9dea2fecce8df3cdc246fa6d4ff87b47024..41f039476ee7f6b50a15ac8cac1dc30dc7738121 100644 --- a/mace/python/tools/BUILD +++ b/mace/python/tools/BUILD @@ -21,7 +21,6 @@ py_library( ], srcs_version = "PY2AND3", deps = [ - ":memory_optimizer", ":quantization_lib", "//mace/proto:mace_py", "//third_party/caffe:caffe_py", @@ -39,15 +38,6 @@ py_library( ], ) -py_binary( - name = "memory_optimizer", - srcs = ["memory_optimizer.py"], - srcs_version = "PY2AND3", - deps = [ - "//mace/proto:mace_py", - ], -) - py_binary( name = "converter", srcs = ["converter.py"], diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index a89e3abdb1e4a75fdf3ee5489439cb7d89cbfcfd..790654f49472a7159740db72a2ec6c6ff6c9be52 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -21,7 +21,6 @@ import copy import six from mace.proto import mace_pb2 -from mace.python.tools import memory_optimizer from mace.python.tools import model_saver from mace.python.tools.converter_tool import base_converter as cvt from mace.python.tools.converter_tool import transformer @@ -36,7 +35,13 @@ FLAGS = None device_type_map = {'cpu': cvt.DeviceType.CPU.value, 'gpu': cvt.DeviceType.GPU.value, - 'dsp': cvt.DeviceType.HEXAGON.value} + 'dsp': cvt.DeviceType.HEXAGON.value, + 'cpu+gpu': cvt.DeviceType.CPU.value} + +data_format_map = { + 'NONE': cvt.DataFormat.DF_NONE, + 'NHWC': cvt.DataFormat.NHWC, +} def parse_data_type(data_type, device_type): @@ -117,6 +122,7 @@ def main(unused_args): input_node_names = FLAGS.input_node.split(',') input_node_shapes = FLAGS.input_shape.split(':') + input_node_formats = FLAGS.input_data_formats.split(",") if FLAGS.input_range: input_node_ranges = FLAGS.input_range.split(':') else: @@ -126,6 +132,10 @@ def main(unused_args): for i in six.moves.range(len(input_node_names)): input_node = cvt.NodeInfo() input_node.name = input_node_names[i] + if len(input_node_formats) == 1: + input_node.data_format = data_format_map[input_node_formats[0]] + else: + input_node.data_format = data_format_map[input_node_formats[i]] input_node.shape = parse_int_array_from_str(input_node_shapes[i]) if len(input_node_ranges) > i: input_node.range = parse_float_array_from_str(input_node_ranges[i]) @@ -133,11 +143,16 @@ def main(unused_args): output_node_names = FLAGS.output_node.split(',') output_node_shapes = FLAGS.output_shape.split(':') + output_node_formats = FLAGS.output_data_formats.split(",") if len(output_node_names) != len(output_node_shapes): raise Exception('output node count and shape count do not match.') for i in six.moves.range(len(output_node_names)): output_node = cvt.NodeInfo() output_node.name = output_node_names[i] + if len(output_node_formats) == 1: + output_node.data_format = data_format_map[output_node_formats[0]] + else: + output_node.data_format = data_format_map[output_node_formats[i]] output_node.shape = parse_int_array_from_str(output_node_shapes[i]) option.add_output_node(output_node) @@ -179,74 +194,25 @@ def main(unused_args): output_graph_def = converter.run() - if 
FLAGS.runtime == 'cpu+gpu': - cpu_graph_def = copy.deepcopy(output_graph_def) - - option.device = cvt.DeviceType.GPU.value - option.data_type = parse_data_type( - FLAGS.data_type, cvt.DeviceType.GPU.value) - mace_gpu_transformer = transformer.Transformer( - option, output_graph_def) - output_graph_def, _ = mace_gpu_transformer.run() - six.print_("start optimize gpu memory.") - memory_optimizer.optimize_gpu_memory(output_graph_def) - six.print_("GPU memory optimization done.") - - option.device = cvt.DeviceType.CPU.value - option.data_type = parse_data_type( - FLAGS.data_type, cvt.DeviceType.CPU.value) - option.disable_transpose_filters() - mace_cpu_transformer = transformer.Transformer( - option, cpu_graph_def) - cpu_graph_def, _ = mace_cpu_transformer.run() - print("start optimize cpu memory.") - memory_optimizer.optimize_cpu_memory(cpu_graph_def) - print("CPU memory optimization done.") - - print("Merge cpu and gpu ops together") - output_graph_def.op.extend(cpu_graph_def.op) - output_graph_def.mem_arena.mem_block.extend( - cpu_graph_def.mem_arena.mem_block) - output_graph_arg_names = set() - for arg in output_graph_def.arg: - output_graph_arg_names.add(arg.name) - - for arg in cpu_graph_def.arg: - if arg.name not in output_graph_arg_names: - output_graph_def.arg.extend(arg) - print("Merge done") - else: - option.device = device_type_map[FLAGS.runtime] - option.data_type = parse_data_type( - FLAGS.data_type, option.device) - mace_transformer = transformer.Transformer( - option, output_graph_def) - output_graph_def, quantize_activation_info = mace_transformer.run() - - if FLAGS.runtime == 'dsp': - from mace.python.tools.converter_tool import hexagon_converter - converter = hexagon_converter.HexagonConverter( - option, output_graph_def, quantize_activation_info) - output_graph_def = converter.run() - - print("start optimize memory.") - if FLAGS.runtime == 'gpu': - memory_optimizer.optimize_gpu_memory(output_graph_def) - elif FLAGS.runtime == 'cpu': - memory_optimizer.optimize_cpu_memory(output_graph_def) - elif FLAGS.runtime == 'dsp': - pass - else: - mace_check(False, "runtime only support [gpu|cpu|dsp]") - - print("Memory optimization done.") + option.device = device_type_map[FLAGS.runtime] + option.data_type = parse_data_type( + FLAGS.data_type, option.device) + mace_transformer = transformer.Transformer( + option, output_graph_def) + output_graph_def, quantize_activation_info = mace_transformer.run() + + if FLAGS.runtime == 'dsp': + from mace.python.tools.converter_tool import hexagon_converter + converter = hexagon_converter.HexagonConverter( + option, output_graph_def, quantize_activation_info) + output_graph_def = converter.run() model_saver.save_model( - output_graph_def, model_checksum, weight_checksum, + option, output_graph_def, model_checksum, weight_checksum, FLAGS.template_dir, FLAGS.obfuscate, FLAGS.model_tag, - FLAGS.output_dir, FLAGS.runtime, + FLAGS.output_dir, FLAGS.embed_model_data, - FLAGS.winograd, FLAGS.data_type, + FLAGS.winograd, FLAGS.model_graph_format) @@ -293,8 +259,18 @@ def parse_args(): type=str, default="input_node", help="e.g., input_node") + parser.add_argument( + "--input_data_formats", + type=str, + default="NHWC", + help="e.g., NHWC,NONE") parser.add_argument( "--output_node", type=str, default="softmax", help="e.g., softmax") + parser.add_argument( + "--output_data_formats", + type=str, + default="NHWC", + help="e.g., NHWC,NONE") parser.add_argument( "--check_node", type=str, default="softmax", help="e.g., softmax") parser.add_argument( diff --git 
a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 6b5d227eb5e5e6967b7510602e7d03fd9ef033c4..3f8d7164b64ec4d64253f8562b54e9e7b31f377d 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -25,15 +25,16 @@ class DeviceType(Enum): class DataFormat(Enum): - NHWC = 0 - NCHW = 1 + DF_NONE = 0 + NHWC = 1 + NCHW = 2 class FilterFormat(Enum): - HWIO = 0 - OIHW = 1 - HWOI = 2 - OHWI = 3 + HWIO = 100 + OIHW = 101 + HWOI = 102 + OHWI = 103 class PaddingMode(Enum): @@ -113,7 +114,6 @@ MaceSupportedOps = [ 'ResizeBilinear', 'Reverse', 'ScalarMath', - 'Slice', 'Split', 'Shape', 'Squeeze', @@ -137,9 +137,6 @@ class MaceKeyword(object): mace_input_node_name = 'mace_input_node' mace_output_node_name = 'mace_output_node' mace_buffer_type = 'buffer_type' - mace_mode = 'mode' - mace_buffer_transform = 'BufferTransform' - mace_buffer_inverse_transform = 'BufferInverseTransform' # arg related str mace_padding_str = 'padding' mace_padding_values_str = 'padding_values' @@ -185,6 +182,8 @@ class MaceKeyword(object): mace_opencl_mem_type = "opencl_mem_type" mace_framework_type_str = "framework_type" mace_group_str = "group" + mace_wino_arg_str = "wino_block_size" + mace_quantize_flag_arg_str = "quantize_flag" class TransformerRule(Enum): @@ -195,7 +194,7 @@ class TransformerRule(Enum): FOLD_BATCHNORM = 5 FOLD_CONV_AND_BN = 6 FOLD_DEPTHWISE_CONV_AND_BN = 7 - TRANSFORM_GPU_WINOGRAD = 8 + ADD_WINOGRAD_ARG = 8 TRANSFORM_ADD_TO_BIASADD = 9 FOLD_BIASADD = 10 FLATTEN_ATROUS_CONV = 11 @@ -238,6 +237,7 @@ class NodeInfo(object): def __init__(self): self._name = None self._shape = [] + self._data_format = DataFormat.NHWC self._range = [-1.0, 1.0] @property @@ -248,6 +248,10 @@ class NodeInfo(object): def shape(self): return self._shape + @property + def data_format(self): + return self._data_format + @property def range(self): return self._range @@ -260,6 +264,10 @@ class NodeInfo(object): def shape(self, shape): self._shape = shape + @data_format.setter + def data_format(self, data_format): + self._data_format = data_format + @range.setter def range(self, range): self._range = range @@ -410,7 +418,6 @@ class ConverterOption(object): TransformerRule.FOLD_CONV_AND_BN, TransformerRule.FOLD_DECONV_AND_BN, TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN, - TransformerRule.TRANSFORM_GPU_WINOGRAD, TransformerRule.TRANSFORM_ADD_TO_BIASADD, TransformerRule.REARRANGE_BATCH_TO_SPACE, TransformerRule.FOLD_BIASADD, @@ -422,16 +429,14 @@ class ConverterOption(object): # Model data format related transformation TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, + # Add winograd argument + TransformerRule.ADD_WINOGRAD_ARG, # Mace model structure related transformation TransformerRule.ADD_IN_OUT_TENSOR_INFO, - # Device related transformation - TransformerRule.ADD_BUFFER_TRANSFORM, - TransformerRule.ADD_DEVICE, # Data type related transformation TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, # Transform finalization TransformerRule.ADD_OPENCL_INFORMATIONS, - TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES, # for quantization entropy calibration use TransformerRule.SORT_BY_EXECUTION, # Need to be put after SORT_BY_EXECUTION diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 7769b81e5f83e38add2289d564fdbb39c47b929b..d736719355d80df993f08c395f3495fb592cc993 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ 
b/mace/python/tools/converter_tool/transformer.py @@ -13,7 +13,6 @@ # limitations under the License. -import enum import re import numpy as np @@ -21,7 +20,6 @@ import six from mace.proto import mace_pb2 from mace.python.tools.converter_tool import base_converter -from mace.python.tools.converter_tool.base_converter import ActivationType from mace.python.tools.converter_tool.base_converter import ConverterUtil from mace.python.tools.converter_tool.base_converter import DataFormat from mace.python.tools.converter_tool.base_converter import DeviceType @@ -32,13 +30,9 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import TransformerRule -from mace.python.tools.convert_util import calculate_image_shape from mace.python.tools.convert_util import mace_check -from mace.python.tools.convert_util import OpenCLBufferType from mace.python.tools.quantization import quantize_util -OPENCL_IMAGE_MAX_SIZE = 16384 - class Transformer(base_converter.ConverterInterface): """A class for transform naive mace model to optimized model. @@ -69,8 +63,6 @@ class Transformer(base_converter.ConverterInterface): self.fold_deconv_and_bn, # data_format related TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN: self.fold_depthwise_conv_and_bn, # data_format related - TransformerRule.TRANSFORM_GPU_WINOGRAD: - self.transform_gpu_winograd, # data_format related TransformerRule.TRANSFORM_ADD_TO_BIASADD: self.transform_add_to_biasadd, TransformerRule.REARRANGE_BATCH_TO_SPACE: @@ -84,25 +76,20 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.TRANSPOSE_MATMUL_WEIGHT: self.transpose_matmul_weight, TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format, + TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg, TransformerRule.ADD_IN_OUT_TENSOR_INFO: self.add_in_out_tensor_info, TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: self.transform_global_conv_to_fc, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, - TransformerRule.ADD_BUFFER_TRANSFORM: - self.add_buffer_transform, TransformerRule.QUANTIZE_NODES: self.quantize_nodes, TransformerRule.ADD_QUANTIZE_TENSOR_RANGE: self.add_quantize_tensor_range, TransformerRule.QUANTIZE_WEIGHTS: self.quantize_weights, - TransformerRule.ADD_DEVICE: - self.add_device, TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE: self.update_float_op_data_type, - TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES: - self.add_mace_input_and_output_nodes, TransformerRule.ADD_OPENCL_INFORMATIONS: self.add_opencl_informations, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, @@ -112,28 +99,22 @@ class Transformer(base_converter.ConverterInterface): self._option = option self._model = model - self._gpu_wino_blk = self._option.winograd + self._wino_arg = self._option.winograd self._ops = {} self._consts = {} self._consumers = {} self._producer = {} self._target_data_format = DataFormat.NHWC - self._input_output_added = False - self._opencl_max_image_size = [0, 0] self._output_op_names = set() self._quantize_activation_info = {} self._quantized_tensor = set() - if self._option.device == DeviceType.CPU.value and \ - not self._option.quantize: - self._target_data_format = DataFormat.NCHW - def run(self): for key in self._option.transformer_option: transformer = self._registered_transformers[key] while True: - self.construct_ops_and_consumers() + 
self.construct_ops_and_consumers(key) changed = transformer() if not changed: break @@ -162,7 +143,7 @@ class Transformer(base_converter.ConverterInterface): MaceKeyword.mace_filter_format_str) arg.i = filter_format.value - def construct_ops_and_consumers(self): + def construct_ops_and_consumers(self, key): self._ops.clear() self._consumers.clear() self._producer.clear() @@ -178,27 +159,28 @@ class Transformer(base_converter.ConverterInterface): for output_tensor in op.output: self._producer[output_tensor] = op - for input_node in self._option.input_nodes.values(): - input_node_existed = False - for op in self._model.op: - if input_node.name in op.output: - input_node_existed = True - break - if not input_node_existed: - op = mace_pb2.OperatorDef() - op.name = self.normalize_op_name(input_node.name) - op.type = 'Input' - op.output.extend([input_node.name]) - output_shape = op.output_shape.add() - output_shape.dims.extend(input_node.shape) - if ConverterUtil.data_format( - self._consumers[input_node.name][0]) \ - == DataFormat.NCHW: - self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) - else: - ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) - self._producer[op.output[0]] = op + if key != TransformerRule.SORT_BY_EXECUTION: + for input_node in self._option.input_nodes.values(): + input_node_existed = False + for op in self._model.op: + if input_node.name in op.output: + input_node_existed = True + break + if not input_node_existed: + op = mace_pb2.OperatorDef() + op.name = self.normalize_op_name(input_node.name) + op.type = "Input" + op.output.extend([input_node.name]) + output_shape = op.output_shape.add() + output_shape.dims.extend(input_node.shape) + if ConverterUtil.data_format( + self._consumers[input_node.name][0]) \ + == DataFormat.NCHW: + self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) + ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + else: + ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) + self._producer[op.output[0]] = op @staticmethod def replace(obj_list, source, target): @@ -288,21 +270,17 @@ class Transformer(base_converter.ConverterInterface): for input_node in self._option.input_nodes.values(): input_info = net.input_info.add() input_info.name = input_node.name + input_info.data_format = input_node.data_format.value input_info.dims.extend(input_node.shape) - if self._option.quantize: - input_info.data_type = mace_pb2.DT_FLOAT - else: - input_info.data_type = self._option.data_type + input_info.data_type = mace_pb2.DT_FLOAT for output_node in self._option.output_nodes.values(): output_info = net.output_info.add() output_info.name = output_node.name + output_info.data_format = output_node.data_format.value output_info.dims.extend( self._producer[output_node.name].output_shape[0].dims) - if self._option.quantize: - output_info.data_type = mace_pb2.DT_FLOAT - else: - output_info.data_type = self._option.data_type + output_info.data_type = mace_pb2.DT_FLOAT return False @@ -725,173 +703,6 @@ class Transformer(base_converter.ConverterInterface): mace_check(False, "filter format %s not supported" % filter_format) return filter_height, filter_width, in_channels, out_channels - def check_if_gpu_use_winograd_conv(self, op): - if not self._option.winograd: - return False - if op.type != MaceOp.Conv2D.name: - return False - - filter_shape = self._consts[op.input[1]].dims - output_shape = op.output_shape[0].dims - strides = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str).ints - dilations_arg 
= ConverterUtil.get_arg(op, - MaceKeyword.mace_dilations_str) - if dilations_arg is None: - dilations = [1, 1] - else: - dilations = dilations_arg.ints - filter_height, filter_width, in_channels, out_channels = \ - Transformer.sort_filter_shape(filter_shape, self.filter_format()) - batch, out_height, out_width, _ = Transformer.sort_feature_map_shape( - output_shape, ConverterUtil.data_format(op)) - - if filter_height != 3 or filter_width != 3 or strides[0] > 1 \ - or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1: - return False - self._gpu_wino_blk = self._option.winograd - block_size = self._option.winograd - blk_sqr = (block_size + 2) * (block_size + 2) - width =\ - batch * ((out_height + block_size - 1) / block_size) *\ - ((out_width + block_size - 1) / block_size) - if blk_sqr * in_channels >= OPENCL_IMAGE_MAX_SIZE \ - or blk_sqr * out_channels >= OPENCL_IMAGE_MAX_SIZE \ - or width >= OPENCL_IMAGE_MAX_SIZE: - self._gpu_wino_blk = 2 - block_size = self._gpu_wino_blk - blk_sqr = (block_size + 2) * (block_size + 2) - width = \ - batch * ((out_height + block_size - 1) / block_size) * \ - ((out_width + block_size - 1) / block_size) - return (blk_sqr * in_channels < OPENCL_IMAGE_MAX_SIZE) and \ - (blk_sqr * out_channels < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) - - def transform_gpu_winograd(self): - """Only gpu needs winograd transform.""" - net = self._model - filter_format = self.filter_format() - if self._option.device == DeviceType.GPU.value: - for op in net.op: - if op.type == MaceOp.Conv2D.name \ - and self.check_if_gpu_use_winograd_conv(op): - print("Transform gpu winograd %s(%s)" % (op.name, op.type)) - block_size = self._gpu_wino_blk - blk_sqr = (block_size + 2) * (block_size + 2) - output_shape = op.output_shape[0].dims - filter = self._consts[op.input[1]] - filter_shape = filter.dims - data_format = ConverterUtil.data_format(op) - filter_height, filter_width, in_channels, out_channels = \ - Transformer.sort_filter_shape(filter_shape, - filter_format) - batch, out_height, out_width, _ = \ - Transformer.sort_feature_map_shape(output_shape, - data_format) - - # Input transform - wt_op = net.op.add() - wt_op.name = op.name + '_input_transform' - wt_op.type = MaceOp.WinogradTransform.name - wt_op.input.extend([op.input[0]]) - wt_op.output.extend([wt_op.name]) - wt_output_shape = wt_op.output_shape.add() - wt_output_width =\ - batch * ((out_height + block_size - 1) / block_size) *\ - ((out_width + block_size - 1) / block_size) - wt_output_shape.dims.extend( - [blk_sqr, in_channels, wt_output_width]) - - blk_size_arg = wt_op.arg.add() - blk_size_arg.name = MaceKeyword.mace_wino_block_size - blk_size_arg.i = block_size - - if ConverterUtil.get_arg(op, - MaceKeyword.mace_padding_str) \ - is not None: - padding_arg = wt_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_str - padding_arg.i = ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_str).i - elif ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str) \ - is not None: - padding_arg = wt_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_values_str - padding_arg.ints.extend(ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str).ints) - - # MatMul - matmul_op = net.op.add() - matmul_op.name = op.name + '_matmul' - matmul_op.type = MaceOp.MatMul.name - matmul_op.input.extend([op.input[1], wt_op.output[0]]) - matmul_op.output.extend([matmul_op.name]) - matmul_output_shape = matmul_op.output_shape.add() - matmul_output_shape.dims.extend( - [blk_sqr, out_channels, 
wt_output_width]) - - arg = matmul_op.arg.add() - arg.name = MaceKeyword.mace_winograd_filter_transformed - arg.i = 1 - - shape_op = net.op.add() - shape_op.name = op.name + '_infer_shape' - shape_op.type = MaceOp.InferConv2dShape.name - shape_op.input.extend([op.input[0]]) - shape_op.output.extend([shape_op.name]) - shape_output_shape = shape_op.output_shape.add() - shape_output_shape.dims.extend([4]) - - kernels_arg = shape_op.arg.add() - kernels_arg.name = MaceKeyword.mace_kernel_str - kernels_arg.ints.extend([out_channels, - in_channels, - filter_height, - filter_width]) - - if data_format is not None: - data_format_arg = shape_op.arg.add() - data_format_arg.name = MaceKeyword.mace_data_format_str - data_format_arg.i = data_format.value - - if ConverterUtil.get_arg(op, - MaceKeyword.mace_padding_str) \ - is not None: - padding_arg = shape_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_str - padding_arg.i = ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_str).i - elif ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str) \ - is not None: - padding_arg = shape_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_values_str - padding_arg.ints.extend(ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str).ints) - - # Inverse transform - iwt_op = net.op.add() - iwt_op.name = op.name + '_inverse_transform' - iwt_op.type = MaceOp.WinogradInverseTransform.name - iwt_op.input.extend([matmul_op.output[0]]) - iwt_op.input.extend([shape_op.output[0]]) - # biasadd - if len(op.input) >= 3: - iwt_op.input.extend([op.input[2]]) - iwt_op.output.extend(op.output) - iwt_output_shape = iwt_op.output_shape.add() - iwt_output_shape.dims.extend(op.output_shape[0].dims) - - blk_size_arg = iwt_op.arg.add() - blk_size_arg.name = MaceKeyword.mace_wino_block_size - blk_size_arg.i = block_size - ConverterUtil.add_data_format_arg(iwt_op, data_format) - - self.safe_remove_node(op, iwt_op) - - return False - def transform_add_to_biasadd(self): net = self._model for op in net.op: @@ -1105,37 +916,25 @@ class Transformer(base_converter.ConverterInterface): if arg.name == MaceKeyword.mace_paddings_str: mace_check(len(arg.ints) == 8, "pad dim rank should be 8.") - if ConverterUtil.data_format(op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose pad args: %s(%s)" - % (op.name, op.type)) - self.transpose_shape(arg.ints, - [0, 1, 6, 7, 2, 3, 4, 5]) - elif ConverterUtil.data_format(op) == DataFormat.NCHW \ + if ConverterUtil.data_format(op) == DataFormat.NCHW \ and self._target_data_format == DataFormat.NHWC: # noqa print("Transpose pad args: %s(%s)" % (op.name, op.type)) self.transpose_shape(arg.ints, [0, 1, 4, 5, 6, 7, 2, 3]) - elif op.type == MaceOp.Concat.name or op.type == MaceOp.Slice.name: + elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name: for arg in op.arg: if arg.name == MaceKeyword.mace_axis_str: - if ConverterUtil.data_format(op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose slice args: %s(%s)" - % (op.name, op.type)) - mace_check(arg.i == 3, - 'only support concat at ' - 'channel dimension') - arg.i = 1 - elif ConverterUtil.data_format(op) == DataFormat.NCHW \ + if ConverterUtil.data_format(op) == DataFormat.NCHW \ and self._target_data_format == DataFormat.NHWC: # noqa - print("Transpose slice args: %s(%s)" + print("Transpose concat/split args: %s(%s)" % (op.name, op.type)) - mace_check(arg.i == 1, - "only support concat at " - "channel dimension") 
- arg.i = 3 + if arg.i == 1: + arg.i = 3 + elif arg.i == 2: + arg.i = 1 + elif arg.i == 3: + arg.i = 2 producer = self._producer[op.input[0]] input_shape = producer.output_shape[0].dims @@ -1150,17 +949,7 @@ class Transformer(base_converter.ConverterInterface): elif op.type == MaceOp.Squeeze.name: for arg in op.arg: if arg.name == MaceKeyword.mace_axis_str: - if ConverterUtil.data_format( - op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose squeeze args: %s(%s)" - % (op.name, op.type)) - mace_check(list(arg.ints) == [1, 2], - 'only support squeeze at at [1, 2]') - arg.ints[:] = [2, 3] - elif ConverterUtil.data_format( - op) == DataFormat.NCHW \ - and self._target_data_format == DataFormat.NHWC: # noqa + if ConverterUtil.data_format(op) == DataFormat.NCHW: print("Transpose squeeze args: %s(%s)" % (op.name, op.type)) mace_check(list(arg.ints) == [2, 3], @@ -1171,24 +960,6 @@ class Transformer(base_converter.ConverterInterface): for arg in op.arg: if arg.name == MaceKeyword.mace_axis_str: if ConverterUtil.data_format( - op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose reduce mean args: %s(%s)" - % (op.name, op.type)) - reduce_axises = list(arg.ints) - new_axises = [] - for i in range(len(reduce_axises)): - idx = reduce_axises[i] - if idx == 1 or idx == 2: - new_axises.append(idx + 1) - elif idx == 3: - new_axises.append(1) - else: - new_axises.append(idx) - new_axises.sort() - arg.ints[:] = [] - arg.ints.extend(new_axises) - elif ConverterUtil.data_format( op) == DataFormat.NCHW \ and self._target_data_format == DataFormat.NHWC: # noqa print("Transpose reduce mean args: %s(%s)" @@ -1212,69 +983,26 @@ class Transformer(base_converter.ConverterInterface): if data_format is not None \ and data_format != self._target_data_format: print("Transpose output shapes: %s(%s)" % (op.name, op.type)) - if self._target_data_format == DataFormat.NHWC: # NCHW -> NHWC - for output_shape in op.output_shape: - if len(output_shape.dims) == 4: - self.transpose_shape(output_shape.dims, - [0, 2, 3, 1]) - else: # NHWC -> NCHW - for output_shape in op.output_shape: - if len(output_shape.dims) == 4: - self.transpose_shape(output_shape.dims, - [0, 3, 1, 2]) + for output_shape in op.output_shape: + if len(output_shape.dims) == 4: + self.transpose_shape(output_shape.dims, + [0, 2, 3, 1]) ConverterUtil.get_arg(op, MaceKeyword.mace_data_format_str).i = \ self._target_data_format.value - # transpose input/output - if self._target_data_format == DataFormat.NCHW: - print("Transpose input/output to NCHW") - for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name - op = net.op.add() - op.name = self.normalize_op_name(input_node.name) - op.input.extend([new_input_name]) - op.output.extend([input_node.name]) - output_shape = op.output_shape.add() - output_shape.dims.extend(input_node.shape) - if len(output_shape.dims) == 4: - op.type = MaceOp.Transpose.name - self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) - - dims_arg = op.arg.add() - dims_arg.name = MaceKeyword.mace_dims_str - dims_arg.ints.extend([0, 3, 1, 2]) - else: - op.type = MaceOp.Identity.name - - ConverterUtil.add_data_type_arg(op, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + return False - for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name - op = self._model.op.add() - 
op.name = self.normalize_op_name(output_name) - op.input.extend([output_node.name]) - op.output.extend([output_name]) - output_shape = op.output_shape.add() - output_shape.dims.extend( - self._producer[output_node.name].output_shape[0].dims) - if len(output_shape.dims) == 4: - op.type = MaceOp.Transpose.name - self.transpose_shape(output_shape.dims, [0, 2, 3, 1]) - - dims_arg = op.arg.add() - dims_arg.name = MaceKeyword.mace_dims_str - dims_arg.ints.extend([0, 2, 3, 1]) - - ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) - else: - op.type = MaceOp.Identity.name - ConverterUtil.add_data_type_arg(op, mace_pb2.DT_FLOAT) + def add_winograd_arg(self): + if self._wino_arg == 0: + return False + net = self._model - self._input_output_added = True + for op in net.op: + if op.type == MaceOp.Conv2D.name: + winograd_arg = op.arg.add() + winograd_arg.name = MaceKeyword.mace_wino_arg_str + winograd_arg.i = self._wino_arg return False @@ -1400,168 +1128,6 @@ class Transformer(base_converter.ConverterInterface): return False - def buffer_transform(self, op, input_idx, input_type): - net = self._model - input_name = op.input[input_idx] - op_def = net.op.add() - op_def.name = input_name.replace(':', '_') + "_b2i" - output_name = op_def.name - op_def.type = MaceKeyword.mace_buffer_transform - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_buffer_type - arg.i = input_type.value - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_mode - arg.i = 0 - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - - tensor_shape = list(self._consts[input_name].dims) - if input_type == OpenCLBufferType.WINOGRAD_FILTER: - blk_sqr = op.output_shape[0].dims[0] - wino_blk = int(np.sqrt(blk_sqr)) - 2 - wino_arg = op_def.arg.add() - wino_arg.name = MaceKeyword.mace_wino_block_size - wino_arg.i = wino_blk - img_shape = calculate_image_shape(input_type, tensor_shape, - wino_blk) - else: - img_shape = calculate_image_shape(input_type, tensor_shape) - - op.input[input_idx] = output_name - - # update OpenCL max image size - self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0], - img_shape[0]) - self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], - img_shape[1]) - - def add_buffer_transform(self): - if self._option.device != DeviceType.GPU.value: - return False - - print("Add buffer transform op") - - net = self._model - for op in net.op: - if op.type == MaceOp.Conv2D.name: - self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER) - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.Deconv2D.name\ - or op.type == MaceOp.DepthwiseDeconv2d.name: - if op.type == MaceOp.Deconv2D.name: - self.buffer_transform(op, 1, - OpenCLBufferType.CONV2D_FILTER) - elif op.type == MaceOp.DepthwiseDeconv2d.name: - self.buffer_transform(op, 1, - OpenCLBufferType.DW_CONV2D_FILTER) - if ConverterUtil.get_arg( - op, - MaceKeyword.mace_framework_type_str).i == \ - FrameworkType.CAFFE.value: - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif len(op.input) >= 4: - self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.DepthwiseConv2d.name: - self.buffer_transform(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.BiasAdd.name: - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == 
MaceOp.Eltwise.name and len(op.input) == 2: - if op.input[0] in self._consts \ - and len(self._consts[op.input[0]].dims) == 1: - self.buffer_transform(op, 0, OpenCLBufferType.ARGUMENT) - if op.input[1] in self._consts \ - and len(self._consts[op.input[1]].dims) == 1: - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.BatchNorm.name: - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - if len(op.input) >= 4: - self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.MatMul.name and \ - ConverterUtil.get_arg( - op, - MaceKeyword.mace_winograd_filter_transformed - ) is not None: # noqa - self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER) - elif op.type == MaceOp.WinogradInverseTransform.name \ - and len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.FullyConnected.name: - self.buffer_transform(op, 1, OpenCLBufferType.WEIGHT_WIDTH) - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.Activation.name: - if ConverterUtil.get_arg( - op, - MaceKeyword.mace_activation_type_str - ).s == ActivationType.PRELU.name: # noqa - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.LSTMCell.name: - if op.input[1] in self._consts: - self.buffer_transform(op, 1, - OpenCLBufferType.IN_OUT_CHANNEL) - self.buffer_transform(op, 2, OpenCLBufferType.IN_OUT_CHANNEL) - self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) - if op.input[4] in self._consts: - self.buffer_transform(op, 4, - OpenCLBufferType.IN_OUT_CHANNEL) - - # Add OpenCL max image size - if self._option.cl_mem_type == "image": - arg = net.arg.add() - arg.name = MaceKeyword.mace_opencl_max_image_size - arg.ints.extend(self._opencl_max_image_size) - - for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name - op_def = self._model.op.add() - - op_def.name = self.normalize_op_name(input_node.name) - op_def.type = MaceKeyword.mace_buffer_transform - op_def.input.extend([new_input_name]) - op_def.output.extend([input_node.name]) - output_shape = op_def.output_shape.add() - output_shape.dims.extend(input_node.shape) - - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_buffer_type - arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) - - for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name - op_def = self._model.op.add() - op_def.name = self.normalize_op_name(output_name) - op_def.type = MaceKeyword.mace_buffer_inverse_transform - op_def.input.extend([output_node.name]) - op_def.output.extend([output_name]) - if output_node.shape: - output_shape = op_def.output_shape.add() - output_shape.dims.extend(output_node.shape) - - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_buffer_type - arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) - self._output_op_names.add(op_def.name) - - self._input_output_added = True - - return False - def fold_reshape(self): net = self._model for op in net.op: @@ -1654,37 +1220,33 @@ class Transformer(base_converter.ConverterInterface): return False - def add_device(self): 
- # TODO(liuqi) add device definition in OperatorDef - net = self._model - for op in net.op: - arg = op.arg.add() - arg.name = MaceKeyword.mace_device - arg.i = self._option.device - - return False - def update_float_op_data_type(self): if self._option.quantize: return print("update op with float data type") net = self._model + # TODO(liuqi): unify the data_type when CPU support half storage + data_type = self._option.data_type + if self._option.device == DeviceType.CPU.value: + data_type = mace_pb2.DT_HALF for op in net.op: data_type_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_op_data_type_str) if not data_type_arg: data_type_arg = op.arg.add() data_type_arg.name = MaceKeyword.mace_op_data_type_str - data_type_arg.i = self._option.data_type - elif data_type_arg.i != self._option.data_type \ + data_type_arg.i = data_type + elif data_type_arg.i != data_type \ and data_type_arg.i == mace_pb2.DT_FLOAT \ and op.name not in self._output_op_names: - data_type_arg.i = self._option.data_type + data_type_arg.i = data_type return False def sort_dfs(self, op, visited, sorted_nodes): + if op.name in visited: + return visited.update([op.name]) if len(op.input) > 0: for input_tensor in op.input: @@ -1695,40 +1257,6 @@ class Transformer(base_converter.ConverterInterface): self.sort_dfs(producer_op, visited, sorted_nodes) sorted_nodes.append(op) - def add_mace_input_and_output_nodes(self): - if self._input_output_added: - return - - print("add mace input and output nodes") - - for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name - op_def = self._model.op.add() - op_def.name = self.normalize_op_name(input_node.name) - op_def.type = MaceOp.Identity.name - op_def.input.extend([new_input_name]) - op_def.output.extend([input_node.name]) - output_shape = op_def.output_shape.add() - output_shape.dims.extend(input_node.shape) - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) - - for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name - op_def = self._model.op.add() - op_def.name = self.normalize_op_name(output_name) - op_def.type = MaceOp.Identity.name - op_def.input.extend([output_node.name]) - op_def.output.extend([output_name]) - output_shape = op_def.output_shape.add() - output_shape.dims.extend( - self._producer[output_node.name].output_shape[0].dims) - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - def sort_by_execution(self): print("Sort by execution") net = self._model @@ -1736,11 +1264,9 @@ class Transformer(base_converter.ConverterInterface): sorted_nodes = [] for output_node in self._option.output_nodes: - output_tensor = MaceKeyword.mace_output_node_name \ - + '_' + output_node - mace_check(output_tensor in self._producer, - "output_tensor %s not existed in model" % output_tensor) - self.sort_dfs(self._producer[output_tensor], visited, sorted_nodes) + mace_check(output_node in self._producer, + "output_tensor %s not existed in model" % output_node) + self.sort_dfs(self._producer[output_node], visited, sorted_nodes) del net.op[:] net.op.extend(sorted_nodes) @@ -1756,28 +1282,50 @@ class Transformer(base_converter.ConverterInterface): return False print("Add mace quantize and dequantize nodes") + input_name_map = {} + output_name_map = {} + + for input_node in self._option.input_nodes.values(): + new_input_name = MaceKeyword.mace_input_node_name \ + + 
'_' + input_node.name + input_name_map[input_node.name] = new_input_name + + for output_node in self._option.output_nodes.values(): + new_output_name = MaceKeyword.mace_output_node_name \ + + '_' + output_node.name + output_name_map[output_node.name] = new_output_name for op in self._model.op: + for i in range(len(op.input)): + if op.input[i] in input_name_map: + op.input[i] = input_name_map[op.input[i]] + for i in range(len(op.output)): + if op.output[i] in output_name_map: + op.output[i] = output_name_map[op.output[i]] + data_type_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_op_data_type_str) mace_check(data_type_arg, "Data type does not exist for %s(%s)" % (op.name, op.type)) if data_type_arg.i == mace_pb2.DT_FLOAT: data_type_arg.i = mace_pb2.DT_UINT8 + elif data_type_arg.i == mace_pb2.DT_UINT8: + mace_check(op.type == MaceOp.Quantize.name + or op.type == MaceOp.Dequantize.name, + "Only Quantization ops support uint8, " + "but got %s(%s)" % (op.name, op.type)) else: - mace_check(False, + mace_check(op.type == MaceOp.Quantize.name, "Quantization only support float ops, " "but get %s(%s)" % (op.name, op.type)) for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name op_def = self._model.op.add() op_def.name = self.normalize_op_name(input_node.name) op_def.type = MaceOp.Quantize.name - op_def.input.extend([new_input_name]) - op_def.output.extend([input_node.name]) + op_def.input.extend([input_node.name]) + op_def.output.extend([input_name_map[input_node.name]]) output_shape = op_def.output_shape.add() output_shape.dims.extend(input_node.shape) @@ -1785,13 +1333,12 @@ class Transformer(base_converter.ConverterInterface): ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name op_def = self._model.op.add() - op_def.name = self.normalize_op_name(output_name) + op_def.name = self.normalize_op_name( + output_name_map[output_node.name]) op_def.type = MaceOp.Dequantize.name - op_def.input.extend([output_node.name]) - op_def.output.extend([output_name]) + op_def.input.extend([output_name_map[output_node.name]]) + op_def.output.extend([output_node.name]) output_shape = op_def.output_shape.add() output_shape.dims.extend( self._producer[output_node.name].output_shape[0].dims) @@ -1799,7 +1346,9 @@ class Transformer(base_converter.ConverterInterface): ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) - self._input_output_added = True + quantize_flag_arg = self._model.arg.add() + quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str + quantize_flag_arg.i = 1 return False @@ -2057,6 +1606,8 @@ class Transformer(base_converter.ConverterInterface): if input_node.name not in self._quantize_activation_info: print("Input range %s: %s" % (input_node.name, str(input_node.range))) + new_input_name = MaceKeyword.mace_input_node_name \ + + '_' + input_node.name scale, zero, minval, maxval = \ quantize_util.adjust_range(input_node.range[0], input_node.range[1], @@ -2066,7 +1617,7 @@ class Transformer(base_converter.ConverterInterface): quantize_info.maxval = maxval quantize_info.scale = scale quantize_info.zero_point = zero - self._quantize_activation_info[input_node.name] = quantize_info + self._quantize_activation_info[new_input_name] = quantize_info return False @@ -2084,9 +1635,6 @@ class Transformer(base_converter.ConverterInterface): "missing quantize info: %s" % op) def 
add_opencl_informations(self): - if self._option.device != DeviceType.GPU.value: - return False - print("Add OpenCL informations") net = self._model diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py deleted file mode 100644 index 1de554d4cd5df0dadf83e60b2231750f691ded62..0000000000000000000000000000000000000000 --- a/mace/python/tools/memory_optimizer.py +++ /dev/null @@ -1,349 +0,0 @@ -# Copyright 2018 Xiaomi, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import operator - -import six - -from mace.proto import mace_pb2 - -from mace.python.tools.converter_tool import base_converter as cvt -from mace.python.tools.converter_tool.base_converter import DeviceType -from mace.python.tools.converter_tool.base_converter import ConverterUtil -from mace.python.tools.converter_tool.base_converter import MaceKeyword -from mace.python.tools.convert_util import calculate_image_shape -from mace.python.tools.convert_util import OpenCLBufferType - - -def MemoryTypeToStr(mem_type): - if mem_type == mace_pb2.CPU_BUFFER: - return 'CPU_BUFFER' - elif mem_type == mace_pb2.GPU_BUFFER: - return 'GPU_BUFFER' - elif mem_type == mace_pb2.GPU_IMAGE: - return 'GPU_IMAGE' - else: - return 'UNKNOWN' - - -class MemoryBlock(object): - def __init__(self, mem_type, block): - self._mem_type = mem_type - self._block = block - - @property - def mem_type(self): - return self._mem_type - - @property - def block(self): - return self._block - - -class MemoryOptimizer(object): - def __init__(self, net_def): - self.net_def = net_def - self.idle_mem = set() - self.op_mem = {} # op_name->mem_id - self.mem_block = {} # mem_id->[size] or mem_id->[x, y] - self.total_mem_count = 0 - self.input_ref_counter = {} - self.mem_ref_counter = {} - ocl_mem_type_arg = ConverterUtil.get_arg( - net_def, MaceKeyword.mace_opencl_mem_type) - self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \ - else None - - consumers = {} - for op in net_def.op: - if not self.op_need_optimize_memory(op): - continue - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - # only ref op's output tensor - for op in net_def.op: - if not self.op_need_optimize_memory(op): - continue - for output in op.output: - tensor_name = output - if tensor_name in consumers: - self.input_ref_counter[tensor_name] = \ - len(consumers[tensor_name]) - else: - self.input_ref_counter[tensor_name] = 0 - - def op_need_optimize_memory(self, op): - return True - - def get_op_mem_block(self, op_type, output_shape, output_type): - data_type_size = 4 - if output_type == mace_pb2.DT_UINT8: - data_type_size = 1 - return MemoryBlock(mace_pb2.CPU_BUFFER, - [six.moves.reduce(operator.mul, output_shape, 1) * - data_type_size]) - - def mem_size(self, memory_block): - return memory_block.block[0] - - def sub_mem_block(self, mem_block1, mem_block2): - return self.mem_size(mem_block1) - self.mem_size(mem_block2) - - def resize_mem_block(self, old_mem_block, 
op_mem_block): - return MemoryBlock( - old_mem_block.mem_type, - [max(old_mem_block.block[0], op_mem_block.block[0])]) - - def add_net_mem_blocks(self): - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.device_type = DeviceType.CPU.value - block.mem_type = self.mem_block[mem].mem_type - block.x = self.mem_block[mem].block[0] - block.y = 1 - - def get_total_origin_mem_size(self): - origin_mem_size = 0 - for op in self.net_def.op: - if not self.op_need_optimize_memory(op): - continue - origin_mem_size += six.moves.reduce(operator.mul, - op.output_shape[0].dims, - 1) - return origin_mem_size - - def get_total_optimized_mem_size(self): - optimized_mem_size = 0 - for mem in self.mem_block: - print(mem, MemoryTypeToStr(self.mem_block[mem].mem_type), - self.mem_block[mem].block) - optimized_mem_size += self.mem_size(self.mem_block[mem]) - return optimized_mem_size - - @staticmethod - def is_memory_reuse_op(op): - return op.type == 'Reshape' or op.type == 'Identity' \ - or op.type == 'Squeeze' or op.type == 'ExpandDims' - - def optimize(self): - for op in self.net_def.op: - if not self.op_need_optimize_memory(op): - continue - if not op.output_shape: - six.print_("WARNING: There is no output shape information to " - "do memory optimization. %s (%s)" % - (op.name, op.type), file=sys.stderr) - return - if len(op.output_shape) != len(op.output): - six.print_('WARNING: the number of output shape is ' - 'not equal to the number of output.', - file=sys.stderr) - return - for i in range(len(op.output)): - if self.is_memory_reuse_op(op): - # make these ops reuse memory of input tensor - mem_id = self.op_mem.get(op.input[0], -1) - else: - output_type = mace_pb2.DT_FLOAT - for arg in op.arg: - if arg.name == 'T': - output_type = arg.i - if len(op.output_type) > i: - output_type = op.output_type[i] - op_mem_block = self.get_op_mem_block( - op.type, - op.output_shape[i].dims, - output_type) - mem_id = -1 - if len(self.idle_mem) > 0: - best_mem_add_size = six.MAXSIZE - best_mem_waste_size = six.MAXSIZE - for mid in self.idle_mem: - old_mem_block = self.mem_block[mid] - if old_mem_block.mem_type != op_mem_block.mem_type: - continue - new_mem_block = self.resize_mem_block( - old_mem_block, op_mem_block) - add_mem_size = self.sub_mem_block(new_mem_block, - old_mem_block) - waste_mem_size = self.sub_mem_block(new_mem_block, - op_mem_block) - - # minimize add_mem_size; if best_mem_add_size is 0, - # then minimize waste_mem_size - if (best_mem_add_size > 0 and - add_mem_size < best_mem_add_size) \ - or (best_mem_add_size == 0 and - waste_mem_size < best_mem_waste_size): - best_mem_id = mid - best_mem_add_size = add_mem_size - best_mem_waste_size = waste_mem_size - best_mem_block = new_mem_block - - # if add mem size < op mem size, then reuse it - if best_mem_add_size <= self.mem_size(op_mem_block): - self.mem_block[best_mem_id] = best_mem_block - mem_id = best_mem_id - self.idle_mem.remove(mem_id) - - if mem_id == -1: - mem_id = self.total_mem_count - self.total_mem_count += 1 - self.mem_block[mem_id] = op_mem_block - - if mem_id != -1: - op.mem_id.extend([mem_id]) - self.op_mem[op.output[i]] = mem_id - if mem_id not in self.mem_ref_counter: - self.mem_ref_counter[mem_id] = 1 - else: - self.mem_ref_counter[mem_id] += 1 - - # de-ref input tensor mem - for idx in six.moves.range(len(op.input)): - ipt = op.input[idx] - if ipt in self.input_ref_counter: - self.input_ref_counter[ipt] -= 1 - if self.input_ref_counter[ipt] == 0 \ - and ipt in 
self.op_mem: - mem_id = self.op_mem[ipt] - self.mem_ref_counter[mem_id] -= 1 - if self.mem_ref_counter[mem_id] == 0: - self.idle_mem.add(self.op_mem[ipt]) - elif self.input_ref_counter[ipt] < 0: - raise Exception('ref count is less than 0') - - self.add_net_mem_blocks() - - print("total op: %d" % len(self.net_def.op)) - print("origin mem: %d, optimized mem: %d" % ( - self.get_total_origin_mem_size(), - self.get_total_optimized_mem_size())) - - -class GPUMemoryOptimizer(MemoryOptimizer): - def op_need_optimize_memory(self, op): - if op.type == MaceKeyword.mace_buffer_transform: - for arg in op.arg: - if arg.name == 'mode' and arg.i == 0: - return False - return op.type != MaceKeyword.mace_buffer_inverse_transform - - def get_op_image_mem_block(self, op_type, output_shape): - if op_type == 'WinogradTransform' or op_type == 'MatMul': - buffer_shape = list(output_shape) + [1] - mem_block = MemoryBlock( - mace_pb2.GPU_IMAGE, - calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT, - buffer_shape)) - elif op_type in ['Shape', - 'InferConv2dShape', - 'StridedSlice', - 'Stack', - 'ScalarMath']: - if len(output_shape) == 1: - mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, - [output_shape[0], 1]) - elif len(output_shape) == 0: - mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, - [1, 1]) - else: - raise Exception('%s output shape dim size is not 0 or 1.' % - op_type) - else: - if len(output_shape) == 2: # only support fc/softmax - buffer_shape = [output_shape[0], output_shape[1]] - elif len(output_shape) == 4: - buffer_shape = output_shape - else: - raise Exception('%s output shape dim size is not 2 or 4.' % - op_type) - mem_block = MemoryBlock( - mace_pb2.GPU_IMAGE, - calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, - buffer_shape)) - return mem_block - - def get_op_buffer_mem_block(self, output_shape): - return MemoryBlock(mace_pb2.GPU_BUFFER, - [reduce(operator.mul, output_shape, 1), 1]) - - def get_op_mem_block(self, op_type, output_shape, output_type): - if self.cl_mem_type == mace_pb2.GPU_IMAGE: - return self.get_op_image_mem_block(op_type, output_shape) - else: - return self.get_op_buffer_mem_block(output_shape) - - def mem_size(self, memory_block): - if memory_block.mem_type == mace_pb2.GPU_IMAGE: - return memory_block.block[0] * memory_block.block[1] * 4 - else: - return memory_block.block[0] - - def resize_mem_block(self, old_mem_block, op_mem_block): - resize_mem_block = MemoryBlock( - old_mem_block.mem_type, - [ - max(old_mem_block.block[0], op_mem_block.block[0]), - max(old_mem_block.block[1], op_mem_block.block[1]) - ]) - - return resize_mem_block - - def add_net_mem_blocks(self): - max_image_size_x = 0 - max_image_size_y = 0 - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.device_type = DeviceType.GPU.value - block.mem_type = self.mem_block[mem].mem_type - block.x = self.mem_block[mem].block[0] - block.y = self.mem_block[mem].block[1] - if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE: - max_image_size_x = max(max_image_size_x, block.x) - max_image_size_y = max(max_image_size_y, block.y) - - if self.cl_mem_type == mace_pb2.GPU_IMAGE: - # Update OpenCL max image size - net_ocl_max_img_size_arg = None - for arg in self.net_def.arg: - if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: - net_ocl_max_img_size_arg = arg - max_image_size_x = max(arg.ints[0], max_image_size_x) - max_image_size_y = max(arg.ints[1], max_image_size_y) - break - if net_ocl_max_img_size_arg is None: - net_ocl_max_img_size_arg = 
self.net_def.arg.add() - net_ocl_max_img_size_arg.name = \ - cvt.MaceKeyword.mace_opencl_max_image_size - - net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, - max_image_size_y] - - -def optimize_gpu_memory(net_def): - mem_optimizer = GPUMemoryOptimizer(net_def) - mem_optimizer.optimize() - - -def optimize_cpu_memory(net_def): - mem_optimizer = MemoryOptimizer(net_def) - mem_optimizer.optimize() diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 3f4ba1c4f5d907352a0cee9bca719fa29be08768..f985a75abc718c32adf1215777a01aaaa8fe5df9 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -80,6 +80,7 @@ void CreateInputInfo(NetDef *net_def) { input_info = net_def->add_input_info(); input_info->set_name({{ net.input_info[idx].name|tojson }}); input_info->set_data_type(static_cast({{ net.input_info[idx].data_type }})); + input_info->set_data_format(static_cast({{ net.input_info[idx].data_format }})); input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }}); {% for dim in net.input_info[idx].dims %} input_info->add_dims({{ dim }}); @@ -96,6 +97,7 @@ void CreateOutputInfo(NetDef *net_def) { output_info = net_def->add_output_info(); output_info->set_name({{ net.output_info[idx].name|tojson }}); output_info->set_data_type(static_cast({{ net.output_info[idx].data_type }})); + output_info->set_data_format(static_cast({{ net.output_info[idx].data_format }})); output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }}); {% for dim in net.output_info[idx].dims %} output_info->add_dims({{dim}}); @@ -121,23 +123,6 @@ void CreateTensors(NetDef *net_def) { mace::{{tag}}::CreateTensor{{ i }}(net_def->add_tensors()); {% endfor %} } - -{% if net.mem_arena.mem_block|length != 0 %} -void CreateMemoryArena(mace::MemoryArena *mem_arena) { - mem_arena->mutable_mem_block()->Reserve({{ net.mem_arena.mem_block|length }}); - {% for i in range(net.mem_arena.mem_block|length) %} - - mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block(); - mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}}); - mem_block{{i}}->set_device_type(static_cast({{net.mem_arena.mem_block[i].device_type}})); - mem_block{{i}}->set_mem_type(static_cast({{net.mem_arena.mem_block[i].mem_type}})); - mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}}); - mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}}); - - {% endfor %} -} -{% endif %} - } // namespace namespace {{tag}} { @@ -153,9 +138,6 @@ const std::shared_ptr CreateNet() { {% if net.arg|length != 0 %} CreateNetArg(net_def.get()); {% endif %} - {% if net.mem_arena.mem_block|length != 0 %} - CreateMemoryArena(net_def->mutable_mem_arena()); - {% endif %} {% if net.input_info | length > 0 %} CreateInputInfo(net_def.get()); {% endif %} @@ -179,8 +161,8 @@ const std::string ModelBuildTime() { } const std::string ModelBuildOptions() { - return {{ "runtime: {}, obfuscate: {}, embed_model_data: {}, winograd: {}" - .format(runtime, obfuscate, embed_model_data, winograd_conv)|tojson }}; + return {{ "obfuscate: {}, embed_model_data: {}, winograd: {}" + .format(obfuscate, embed_model_data, winograd_conv)|tojson }}; } } // namespace {{tag}} diff --git a/mace/python/tools/model_saver.py b/mace/python/tools/model_saver.py index ea90a6264cc6a8697fc1857d0ff58833abd29389..c2221426f03ead1e6709fb756521d4ca75bc731a 100644 --- a/mace/python/tools/model_saver.py +++ b/mace/python/tools/model_saver.py @@ -20,6 +20,7 @@ import hashlib from enum import Enum from mace.proto import mace_pb2 +from 
mace.python.tools.converter_tool import base_converter as cvt from mace.python.tools.convert_util import mace_check from jinja2 import Environment, FileSystemLoader @@ -82,20 +83,24 @@ def generate_in_out_map(ops, tensor_map): return in_out_map -def obfuscate_name(net_def): - input_node = "mace_input_node" - output_node = "mace_output_node" +def obfuscate_name(option, net_def): + input_nodes = set() + for name in option.input_nodes: + input_nodes.add(name) + output_nodes = set() + for name in option.output_nodes: + output_nodes.add(name) tensor_map = generate_tensor_map(net_def.tensors) in_out_map = generate_in_out_map(net_def.op, tensor_map) for t in net_def.tensors: - if input_node not in t.name and output_node not in t.name: + if t.name not in input_nodes and t.name not in output_nodes: t.name = tensor_map[t.name] for op in net_def.op: for i in range(len(op.input)): - if input_node not in op.input[i]: + if op.input[i] not in input_nodes: op.input[i] = in_out_map[op.input[i]] for i in range(len(op.output)): - if output_node not in op.output[i]: + if op.output[i] not in output_nodes: op.output[i] = in_out_map[op.output[i]] @@ -124,15 +129,14 @@ class TensorInfo: tensor.data_type) -def update_tensor_infos(net_def, runtime, data_type): +def update_tensor_infos(net_def, data_type, device): offset = 0 counter = 0 tensor_infos = [] for tensor in net_def.tensors: - # update data_type - if tensor.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \ - and data_type == GPUDataType.fp16_fp32: - tensor.data_type = mace_pb2.DT_HALF + if device == cvt.DeviceType.GPU.value and\ + tensor.data_type == mace_pb2.DT_FLOAT: + tensor.data_type = data_type # Add offset and data_size tensor_info = TensorInfo(counter, tensor) @@ -195,7 +199,7 @@ def save_model_to_proto(net_def, model_tag, output_dir): f.write(str(net_def)) -def save_model_to_code(net_def, model_tag, runtime, +def save_model_to_code(net_def, model_tag, device, template_dir, output_dir, embed_model_data, model_checksum, weight_checksum, obfuscate, winograd_conv): @@ -241,7 +245,7 @@ def save_model_to_code(net_def, model_tag, runtime, end=min(start + 10, op_size), net=net_def, tag=model_tag, - runtime=runtime, + device=device, ) with open(output_dir + 'op' + str(counter) + '.cc', "w") as f: f.write(source) @@ -256,7 +260,6 @@ def save_model_to_code(net_def, model_tag, runtime, source = j2_env.get_template(template_name).render( net=net_def, tag=model_tag, - runtime=runtime, obfuscate=obfuscate, embed_model_data=embed_model_data, winograd_conv=winograd_conv, @@ -272,15 +275,15 @@ def save_model_to_code(net_def, model_tag, runtime, f.write(source) -def save_model(net_def, model_checksum, weight_checksum, template_dir, - obfuscate, model_tag, output_dir, runtime, embed_model_data, - winograd_conv, data_type, model_graph_format): +def save_model(option, net_def, model_checksum, weight_checksum, template_dir, + obfuscate, model_tag, output_dir, embed_model_data, + winograd_conv, model_graph_format): if obfuscate: - obfuscate_name(net_def) + obfuscate_name(option, net_def) output_dir = output_dir + '/' # update tensor type - update_tensor_infos(net_def, runtime, data_type) + update_tensor_infos(net_def, option.data_type, option.device) if model_graph_format == ModelFormat.file or not embed_model_data: save_model_data(net_def, model_tag, output_dir) @@ -288,7 +291,7 @@ def save_model(net_def, model_checksum, weight_checksum, template_dir, if model_graph_format == ModelFormat.file: save_model_to_proto(net_def, model_tag, output_dir) else: - 
save_model_to_code(net_def, model_tag, runtime, + save_model_to_code(net_def, model_tag, option.device, template_dir, output_dir, embed_model_data, model_checksum, weight_checksum, obfuscate, winograd_conv) diff --git a/mace/python/tools/operator.jinja2 b/mace/python/tools/operator.jinja2 index e3492ddf7f112fef067c1b51ef7ba83b3065711d..7b4c95029d4d087d438c3f019cac51275880ce1e 100644 --- a/mace/python/tools/operator.jinja2 +++ b/mace/python/tools/operator.jinja2 @@ -132,7 +132,7 @@ void CreateOperator{{i}}(mace::OperatorDef *op) { quantize_info{{j}}->set_maxval({{ net.op[i].quantize_info[j].maxval }}); {% endfor %} - {% if runtime == 'dsp' %} + {% if device == 3 %} op->set_padding({{ net.op[i].padding }}); {% if net.op[i].node_input | length > 0 %} std::vector input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} }); diff --git a/mace/test/BUILD b/mace/test/BUILD index 04253cda9a117cd6b7905837e8e4a09ffdd1ca21..593076a385725ae6058d7aa0070d1f03d1b9caba 100644 --- a/mace/test/BUILD +++ b/mace/test/BUILD @@ -6,6 +6,14 @@ licenses(["notice"]) # Apache 2.0 load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled", "if_android_armv7", "if_hexagon_enabled") +cc_library( + name = "mace_api_test_header", + hdrs = [ + "mace_api_test.h", + ], + copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], +) + cc_test( name = "mace_api_test", testonly = 1, @@ -20,6 +28,7 @@ cc_test( linkopts = ["-fopenmp"], linkstatic = 1, deps = [ + ":mace_api_test_header", "//mace/ops:test", "//mace/libmace:libmace", "@gtest//:gtest_main", @@ -40,6 +49,7 @@ cc_test( linkopts = ["-fopenmp"], linkstatic = 1, deps = [ + ":mace_api_test_header", "//mace/ops:test", "//mace/libmace:libmace", "@gtest//:gtest_main", diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 0bb8342dcffe376598fd061795b9eb26e971ce65..cce492736b6344ae2e064cab87e8ca687c37ed36 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
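The module removed above, mace/python/tools/memory_optimizer.py, planned buffer reuse at model-conversion time with a reference-counted, best-fit strategy, and model.jinja2 accordingly no longer emits CreateMemoryArena. To keep that strategy documented, here is a condensed, standalone sketch of it in plain Python; the names are illustrative, MACE's protobuf types are not used, and Reshape-style in-place reuse and GPU image sizing are omitted.

import operator
from functools import reduce


def plan_memory(ops):
    """Greedy buffer reuse. `ops` is a topologically ordered list of
    (op_name, input_tensor_names, output_tensor_names, output_shapes)."""
    # Count consumers of every tensor so its buffer can be recycled
    # right after its last reader has executed.
    ref_count = {}
    for _, inputs, outputs, _ in ops:
        for t in outputs:
            ref_count.setdefault(t, 0)
        for t in inputs:
            ref_count[t] = ref_count.get(t, 0) + 1

    blocks = {}      # mem_id -> size in elements
    assigned = {}    # tensor name -> mem_id
    idle = set()     # mem_ids currently free for reuse
    next_id = 0

    for _, inputs, outputs, shapes in ops:
        for tensor, shape in zip(outputs, shapes):
            need = reduce(operator.mul, shape, 1)
            # Best fit: reuse the idle block that has to grow the least.
            best, best_grow = None, None
            for candidate in idle:
                grow = max(blocks[candidate], need) - blocks[candidate]
                if best is None or grow < best_grow:
                    best, best_grow = candidate, grow
            if best is not None and best_grow <= need:
                idle.remove(best)
                blocks[best] = max(blocks[best], need)
                mem_id = best
            else:
                mem_id, next_id = next_id, next_id + 1
                blocks[mem_id] = need
            assigned[tensor] = mem_id
        # Release input buffers whose last consumer has just run.
        for tensor in inputs:
            if tensor in ref_count:
                ref_count[tensor] -= 1
                if ref_count[tensor] == 0 and tensor in assigned:
                    idle.add(assigned[tensor])

    return assigned, blocks


plan, sizes = plan_memory([
    ("conv1", ["input"], ["conv1_out"], [[1, 32, 32, 16]]),
    ("relu1", ["conv1_out"], ["relu1_out"], [[1, 32, 32, 16]]),
    ("conv2", ["relu1_out"], ["conv2_out"], [[1, 32, 32, 16]]),
])
# conv1_out's block is idle once relu1 has run, so conv2_out reuses it:
# plan == {'conv1_out': 0, 'relu1_out': 1, 'conv2_out': 0}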
-#include #include // NOLINT(build/c++11) -#include "mace/core/operator.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" +#include "mace/test/mace_api_test.h" namespace mace { namespace test { @@ -26,253 +23,6 @@ class MaceMTAPITest : public ::testing::Test {}; namespace { -void GenerateInputs(const std::vector &input_names, - const std::vector &input_shape, - std::map *inputs) { - size_t input_size = input_names.size(); - for (size_t i = 0; i < input_size; ++i) { - // Allocate input and output - int64_t input_size = - std::accumulate(input_shape.begin(), input_shape.end(), 1, - std::multiplies()); - auto buffer_in = std::shared_ptr(new float[input_size], - std::default_delete()); - // load input - std::vector input_data; - ops::test::GenerateRandomRealTypeData(input_shape, &input_data); - memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float)); - (*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in); - } -} - -void GenerateOutputs(const std::vector &output_names, - const std::vector &output_shape, - std::map *outputs) { - size_t output_size = output_names.size(); - for (size_t i = 0; i < output_size; ++i) { - int64_t output_size = - std::accumulate(output_shape.begin(), output_shape.end(), 1, - std::multiplies()); - auto buffer_out = std::shared_ptr(new float[output_size], - std::default_delete()); - (*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out); - } -} - -template -void BufferToImage(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def, - const int mode = NetMode::NORMAL) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .AddIntArg("mode", mode) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - net_def->add_op()->CopyFrom(operator_def); -} - -template -void ImageToBuffer(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void Conv3x3(const std::string &input_name, - const std::string &filter_name, - const std::string &output_name, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Conv2D", "Conv2dOp") - .Input(input_name) - .Input(filter_name) - .Output(output_name) - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - net_def->add_op()->CopyFrom(operator_def); -} - -template 
-void Relu(const std::string &input_name, - const std::string &output_name, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Activation", "ReluTest") - .Input(input_name) - .Output(output_name) - .AddStringArg("activation", "RELU") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void AddTensor(const std::string &name, - const std::vector &shape, - const int offset, - const int data_size, - NetDef *net_def) { - ConstTensor *tensor_ptr = net_def->add_tensors(); - tensor_ptr->set_name(name); - tensor_ptr->mutable_dims()->Reserve(shape.size()); - for (auto dim : shape) { - tensor_ptr->add_dims(dim); - } - tensor_ptr->set_offset(offset); - tensor_ptr->set_data_size(data_size); - tensor_ptr->set_data_type(DataTypeToEnum::value); -} - -template -void CheckOutputs(const NetDef &net_def, - const std::map &inputs, - const std::map &outputs, - const std::vector &tensor_data) { - ops::test::OpsTestNet net; - for (auto input : inputs) { - auto input_shape = input.second.shape(); - const int64_t data_size = std::accumulate(input_shape.begin(), - input_shape.end(), 1, - std::multiplies()); - std::vector input_data(data_size); - memcpy(input_data.data(), input.second.data().get(), - data_size * sizeof(float)); - std::string input_name = MakeString("mace_input_node_", - input.first); - net.AddInputFromArray(input_name, input.second.shape(), - input_data); - } - auto tensors = net_def.tensors(); - for (auto tensor : tensors) { - std::vector shape = {tensor.dims().begin(), tensor.dims().end()}; - const int64_t data_size = std::accumulate(shape.begin(), - shape.end(), 1, - std::multiplies()); - std::vector data(data_size); - memcpy(data.data(), - reinterpret_cast(tensor_data.data()) + tensor.offset(), - tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); - } - net.RunNet(net_def, D); - - for (auto output : outputs) { - std::unique_ptr tmp_tensor( - new Tensor(GetCPUAllocator(), - DataTypeToEnum::v())); - auto output_shape = output.second.shape(); - const int64_t data_size = std::accumulate(output_shape.begin(), - output_shape.end(), 1, - std::multiplies()); - tmp_tensor->Resize(output.second.shape()); - float *data = tmp_tensor->mutable_data(); - memcpy(data, output.second.data().get(), data_size * sizeof(float)); - std::string output_name = MakeString("mace_output_node_", - output.first); - ops::test::ExpectTensorNear(*tmp_tensor, - *net.GetOutput(output_name.data()), - 1e-5); - } -} - -std::map AddMemoryOptimization( - const std::vector &input_names, - const std::vector &output_names, - const std::vector> &input_shapes, - const std::vector> &output_shapes, - NetDef *net_def) { - std::map res; - int mem_id = 0; - size_t input_shape_size = input_shapes.size(); - uint32_t in_mem_block_x = 0; - uint32_t in_mem_block_y = 0; - for (size_t i = 0; i < input_shape_size; ++i) { - in_mem_block_x = std::max(in_mem_block_x, - input_shapes[i][2] * - RoundUpDiv4(input_shapes[i][3])); - in_mem_block_y = std::max(in_mem_block_y, - input_shapes[i][0] * - input_shapes[i][1]); - } - size_t input_size = input_names.size(); - size_t output_size = output_names.size(); - MemoryArena *mem_arena_ptr = net_def->mutable_mem_arena(); - mem_arena_ptr->mutable_mem_block()->Reserve(input_size + output_size); - for (size_t i = 0; i < input_size; ++i) { - MemoryBlock *mem_blk_ptr = 
mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(in_mem_block_x); - mem_blk_ptr->set_y(in_mem_block_y); - res[input_names[i]] = mem_id; - mem_id++; - } - size_t output_shape_size = output_shapes.size(); - uint32_t out_mem_block_x = 0; - uint32_t out_mem_block_y = 0; - for (size_t i = 0; i < output_shape_size; ++i) { - out_mem_block_x = std::max(out_mem_block_x, - output_shapes[i][2] * - RoundUpDiv4(output_shapes[i][3])); - out_mem_block_y = std::max(out_mem_block_y, - output_shapes[i][0] * - output_shapes[i][1]); - } - for (size_t i = 0; i < output_size; ++i) { - MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(out_mem_block_x); - mem_blk_ptr->set_y(out_mem_block_y); - res[output_names[i]] = mem_id; - mem_id++; - } - return res; -} - // The height and width of input and output must be equal. void MaceRunFunc(const int in_out_size) { std::vector input_names; @@ -282,7 +32,6 @@ void MaceRunFunc(const int in_out_size) { output_names.push_back(MakeString("output", i)); } std::string filter_tensor_name = "filter"; - std::string filter_tensor_img_name = filter_tensor_name + "_image"; const DeviceType device = DeviceType::GPU; @@ -292,10 +41,6 @@ void MaceRunFunc(const int in_out_size) { std::shared_ptr net_def(new NetDef()); - // Add memory optimization - auto mem_map = AddMemoryOptimization(input_names, output_names, - input_shapes, output_shapes, - net_def.get()); std::vector data; ops::test::GenerateRandomRealTypeData(filter_shape, &data); @@ -303,35 +48,21 @@ void MaceRunFunc(const int in_out_size) { filter_tensor_name, filter_shape, 0, data.size(), net_def.get()); for (size_t i = 0; i < input_names.size(); ++i) { - std::string input_name = MakeString("mace_input_node_", - input_names[i]); - BufferToImage(input_name, input_names[i], - mace::ops::IN_OUT_CHANNEL, - {mem_map[input_names[i]]}, - device, - net_def.get()); InputInfo *info = net_def->add_input_info(); info->set_name(input_names[i]); - } - BufferToImage(filter_tensor_name, filter_tensor_img_name, - mace::ops::CONV2D_FILTER, {}, device, - net_def.get(), NetMode::INIT); - for (size_t i = 0; i < output_names.size(); ++i) { - Conv3x3(input_names[i], filter_tensor_img_name, - output_names[i], {mem_map[output_names[i]]}, - device, - net_def.get()); + for (auto d : input_shapes[0]) { + info->add_dims(static_cast(d)); + } } for (size_t i = 0; i < output_names.size(); ++i) { - std::string output_name = MakeString("mace_output_node_", - output_names[i]); - ImageToBuffer(output_names[i], output_name, - mace::ops::IN_OUT_CHANNEL, - device, - net_def.get()); OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + for (size_t i = 0; i < output_names.size(); ++i) { + Conv3x3(input_names[i], filter_tensor_name, + output_names[i], output_shapes[0], + net_def.get()); + } MaceEngineConfig config(DeviceType::GPU); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 127e58492f4f2178744267f1d61df9edc0345e8f..48011ace3e9bf50a1fff270a206e57b9b698f044 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -12,12 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
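The obfuscate_name() rewrite in mace/python/tools/model_saver.py above keeps exactly the tensor names declared as the model's input and output nodes and renames everything else, replacing the old check for the "mace_input_node" / "mace_output_node" prefixes. A toy illustration of that rule follows; the hashing helper is a stand-in, not MACE's generate_tensor_map().

import hashlib


def obfuscate(tensor_names, input_nodes, output_nodes):
    keep = set(input_nodes) | set(output_nodes)
    mapping = {}
    for name in tensor_names:
        if name in keep:
            mapping[name] = name  # model I/O keeps its public name
        else:
            # opaque, deterministic replacement for internal tensors
            mapping[name] = "t_" + hashlib.md5(name.encode()).hexdigest()[:8]
    return mapping


print(obfuscate(["input", "conv1/weights", "conv1/output", "output"],
                input_nodes=["input"], output_nodes=["output"]))
# only conv1/weights and conv1/output are renamed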
- -#include - -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" -#include "mace/public/mace.h" +#include "mace/test/mace_api_test.h" namespace mace { namespace test { @@ -26,258 +21,11 @@ class MaceAPITest : public ::testing::Test {}; namespace { -void GenerateInputs(const std::vector &input_names, - const std::vector &input_shape, - std::map *inputs) { - size_t input_size = input_names.size(); - for (size_t i = 0; i < input_size; ++i) { - // Allocate input and output - int64_t input_size = - std::accumulate(input_shape.begin(), input_shape.end(), 1, - std::multiplies()); - auto buffer_in = std::shared_ptr(new float[input_size], - std::default_delete()); - // load input - std::vector input_data; - ops::test::GenerateRandomRealTypeData(input_shape, &input_data); - memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float)); - (*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in); - } -} - -void GenerateOutputs(const std::vector &output_names, - const std::vector &output_shape, - std::map *outputs) { - size_t output_size = output_names.size(); - for (size_t i = 0; i < output_size; ++i) { - int64_t output_size = - std::accumulate(output_shape.begin(), output_shape.end(), 1, - std::multiplies()); - auto buffer_out = std::shared_ptr(new float[output_size], - std::default_delete()); - (*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out); - } -} - -template -void BufferToImage(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def, - const int mode = NetMode::NORMAL) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .AddIntArg("mode", mode) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void ImageToBuffer(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void Conv3x3(const std::string &input_name, - const std::string &filter_name, - const std::string &output_name, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Conv2D", "Conv2dOp") - .Input(input_name) - .Input(filter_name) - .Output(output_name) - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - net_def->add_op()->CopyFrom(operator_def); -} - -template -void Relu(const std::string 
&input_name, - const std::string &output_name, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Activation", "ReluTest") - .Input(input_name) - .Output(output_name) - .AddStringArg("activation", "RELU") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void AddTensor(const std::string &name, - const std::vector &shape, - const int offset, - const int data_size, - NetDef *net_def) { - ConstTensor *tensor_ptr = net_def->add_tensors(); - tensor_ptr->set_name(name); - tensor_ptr->mutable_dims()->Reserve(shape.size()); - for (auto dim : shape) { - tensor_ptr->add_dims(dim); - } - tensor_ptr->set_offset(offset); - tensor_ptr->set_data_size(data_size); - tensor_ptr->set_data_type(DataTypeToEnum::value); -} - -template -void CheckOutputs(const NetDef &net_def, - const std::map &inputs, - const std::map &outputs, - const std::vector &tensor_data) { - ops::test::OpsTestNet net; - for (auto input : inputs) { - auto input_shape = input.second.shape(); - const int64_t data_size = std::accumulate(input_shape.begin(), - input_shape.end(), 1, - std::multiplies()); - std::vector input_data(data_size); - memcpy(input_data.data(), input.second.data().get(), - data_size * sizeof(float)); - std::string input_name = MakeString("mace_input_node_", - input.first); - net.AddInputFromArray(input_name, input.second.shape(), - input_data); - } - auto tensors = net_def.tensors(); - for (auto tensor : tensors) { - std::vector shape = {tensor.dims().begin(), tensor.dims().end()}; - const int64_t data_size = std::accumulate(shape.begin(), - shape.end(), 1, - std::multiplies()); - std::vector data(data_size); - memcpy(data.data(), - reinterpret_cast(tensor_data.data()) + tensor.offset(), - tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); - } - net.RunNet(net_def, D); - - std::unique_ptr allocator(new CPUAllocator); - for (auto output : outputs) { - std::unique_ptr tmp_tensor( - new Tensor(allocator.get(), - DataTypeToEnum::v())); - auto output_shape = output.second.shape(); - const int64_t data_size = std::accumulate(output_shape.begin(), - output_shape.end(), 1, - std::multiplies()); - tmp_tensor->Resize(output.second.shape()); - float *data = tmp_tensor->mutable_data(); - memcpy(data, output.second.data().get(), data_size * sizeof(float)); - std::string output_name = MakeString("mace_output_node_", - output.first); - ops::test::ExpectTensorNear(*tmp_tensor, - *net.GetOutput(output_name.data()), - 1e-5); - } -} - -std::map AddMemoryOptimization( - const std::vector &input_names, - const std::vector &output_names, - const std::vector> &input_shapes, - const std::vector> &output_shapes, - NetDef *net_def) { - std::map res; - int mem_id = 0; - size_t input_shape_size = input_shapes.size(); - uint32_t in_mem_block_x = 0; - uint32_t in_mem_block_y = 0; - for (size_t i = 0; i < input_shape_size; ++i) { - in_mem_block_x = std::max(in_mem_block_x, - input_shapes[i][2] * - RoundUpDiv4(input_shapes[i][3])); - in_mem_block_y = std::max(in_mem_block_y, - input_shapes[i][0] * - input_shapes[i][1]); - } - size_t input_size = input_names.size(); - size_t output_size = output_names.size(); - MemoryArena *mem_arena_ptr = net_def->mutable_mem_arena(); - mem_arena_ptr->mutable_mem_block()->Reserve(input_size + output_size); - for (size_t i = 0; i < input_size; ++i) { - MemoryBlock *mem_blk_ptr = 
mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(in_mem_block_x); - mem_blk_ptr->set_y(in_mem_block_y); - res[input_names[i]] = mem_id; - mem_id++; - } - size_t output_shape_size = output_shapes.size(); - uint32_t out_mem_block_x = 0; - uint32_t out_mem_block_y = 0; - for (size_t i = 0; i < output_shape_size; ++i) { - out_mem_block_x = std::max(out_mem_block_x, - output_shapes[i][2] * - RoundUpDiv4(output_shapes[i][3])); - out_mem_block_y = std::max(out_mem_block_y, - output_shapes[i][0] * - output_shapes[i][1]); - } - for (size_t i = 0; i < output_size; ++i) { - MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(out_mem_block_x); - mem_blk_ptr->set_y(out_mem_block_y); - res[output_names[i]] = mem_id; - mem_id++; - } - return res; -} // The height and width of input and output must be equal. -template +template void MaceRun(const int in_out_size, + const std::vector &max_shape, const std::vector> &input_shapes, const std::vector> &output_shapes, const std::vector &filter_shape) { @@ -288,52 +36,31 @@ void MaceRun(const int in_out_size, output_names.push_back(MakeString("output", i)); } std::string filter_tensor_name = "filter"; - std::string filter_tensor_img_name = filter_tensor_name + "_image"; - - const DeviceType device = DeviceType::GPU; std::shared_ptr net_def(new NetDef()); - // Add memory optimization - auto mem_map = AddMemoryOptimization(input_names, output_names, - input_shapes, output_shapes, - net_def.get()); - std::vector data; ops::test::GenerateRandomRealTypeData(filter_shape, &data); AddTensor(filter_tensor_name, filter_shape, 0, data.size(), net_def.get()); for (size_t i = 0; i < input_names.size(); ++i) { - std::string input_name = MakeString("mace_input_node_", - input_names[i]); - BufferToImage(input_name, input_names[i], - mace::ops::IN_OUT_CHANNEL, - {mem_map[input_names[i]]}, - device, - net_def.get()); InputInfo *info = net_def->add_input_info(); info->set_name(input_names[i]); - } - BufferToImage(filter_tensor_name, filter_tensor_img_name, - mace::ops::CONV2D_FILTER, {}, device, - net_def.get(), NetMode::INIT); - for (size_t i = 0; i < output_names.size(); ++i) { - Conv3x3(input_names[i], filter_tensor_img_name, - output_names[i], {mem_map[output_names[i]]}, - device, net_def.get()); + for (auto d : max_shape) { + info->add_dims(static_cast(d)); + } } for (size_t i = 0; i < output_names.size(); ++i) { - std::string output_name = MakeString("mace_output_node_", - output_names[i]); - ImageToBuffer(output_names[i], output_name, - mace::ops::IN_OUT_CHANNEL, - device, - net_def.get()); OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + for (size_t i = 0; i < output_names.size(); ++i) { + Conv3x3(input_names[i], filter_tensor_name, + output_names[i], max_shape, + net_def.get()); + } - MaceEngineConfig config(DeviceType::GPU); + MaceEngineConfig config(D); MaceEngine engine(config); MaceStatus status = engine.Init(net_def.get(), input_names, output_names, @@ -354,36 +81,64 @@ void MaceRun(const int in_out_size, } } - CheckOutputs(*net_def, inputs, outputs, data); + CheckOutputs(*net_def, inputs, outputs, data); } } // namespace -TEST_F(MaceAPITest, GPUSingleInputOutput) { - MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {16, 16, 3, 3}); 
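The AddMemoryOptimization() helpers deleted from both tests above sized the shared OpenCL images from the NHWC shapes they had to hold: the image width is W * ceil(C / 4), since four channels are packed into one RGBA texel, and the height is N * H, each taken as the maximum over the inputs (or outputs) sharing the block. A standalone restatement of that arithmetic, with an illustrative function name:

def image_block_for(nhwc_shapes):
    """(width, height) of one shared GPU image able to hold every
    NHWC tensor in nhwc_shapes, packing 4 channels per RGBA texel."""
    width = height = 0
    for n, h, w, c in nhwc_shapes:
        width = max(width, w * ((c + 3) // 4))
        height = max(height, n * h)
    return width, height


print(image_block_for([(1, 16, 32, 16), (1, 32, 64, 16)]))  # (256, 32)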
- MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {16, 16, 3, 3}); +TEST_F(MaceAPITest, SingleInputOutput) { + MaceRun(1, + {1, 32, 32, 16}, + {{1, 32, 32, 16}}, + {{1, 32, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(1, + {1, 32, 32, 16}, + {{1, 32, 32, 16}}, + {{1, 32, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(1, + {1, 32, 32, 16}, + {{1, 32, 32, 16}}, + {{1, 32, 32, 16}}, + {16, 16, 3, 3}); } -TEST_F(MaceAPITest, GPUMultipleInputOutput) { - MaceRun(2, - {{1, 16, 32, 16}}, - {{1, 16, 32, 16}}, - {16, 16, 3, 3}); - MaceRun(2, - {{1, 16, 32, 16}}, - {{1, 16, 32, 16}}, - {16, 16, 3, 3}); +TEST_F(MaceAPITest, MultipleInputOutput) { + MaceRun(2, + {1, 16, 32, 16}, + {{1, 16, 32, 16}}, + {{1, 16, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(2, + {1, 16, 32, 16}, + {{1, 16, 32, 16}}, + {{1, 16, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(2, + {1, 16, 32, 16}, + {{1, 16, 32, 16}}, + {{1, 16, 32, 16}}, + {16, 16, 3, 3}); } -TEST_F(MaceAPITest, GPUVariableInputShape) { - MaceRun(1, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {16, 16, 3, 3}); - MaceRun(2, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {16, 16, 3, 3}); +TEST_F(MaceAPITest, VariableInputShape) { + // TODO(liyin): there is a bug of cpu convolution +// MaceRun(1, +// {1, 32, 64, 16}, +// {{1, 16, 32, 16}, {1, 32, 64, 16}}, +// {{1, 16, 32, 16}, {1, 32, 64, 16}}, +// {16, 16, 3, 3}); + MaceRun(1, + {1, 32, 64, 16}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {16, 16, 3, 3}); + MaceRun(2, + {1, 32, 64, 16}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {16, 16, 3, 3}); } } // namespace test diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h new file mode 100644 index 0000000000000000000000000000000000000000..f43815ccc0d39596229108c62d6082e120ed71ae --- /dev/null +++ b/mace/test/mace_api_test.h @@ -0,0 +1,187 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
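The rewritten tests above now cover DeviceType::CPU as well, and the shared header that follows converts the NHWC test data to NCHW on the way in (and back to NHWC for the outputs) through TransformDataFormat. The numpy snippet below is only a conceptual illustration of that transpose, not MACE's implementation.

import numpy as np


def nhwc_to_nchw(x):
    return np.transpose(x, (0, 3, 1, 2))


def nchw_to_nhwc(x):
    return np.transpose(x, (0, 2, 3, 1))


x = np.random.rand(1, 32, 32, 16).astype(np.float32)  # N, H, W, C
print(nhwc_to_nchw(x).shape)                           # (1, 16, 32, 32)
assert nchw_to_nhwc(nhwc_to_nchw(x)).shape == x.shape

The TransposeShape helper added to mace/utils/utils.h below applies the same kind of permutation to shape vectors.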
+ +#ifndef MACE_TEST_MACE_API_TEST_H_ +#define MACE_TEST_MACE_API_TEST_H_ + +#include +#include +#include +#include +#include + +#include "mace/ops/conv_pool_2d_util.h" +#include "mace/ops/ops_test_util.h" +#include "mace/public/mace.h" + +namespace mace { +namespace test { + +inline void GenerateInputs(const std::vector &input_names, + const std::vector &input_shape, + std::map *inputs) { + size_t input_size = input_names.size(); + for (size_t i = 0; i < input_size; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape.begin(), input_shape.end(), 1, + std::multiplies()); + auto buffer_in = std::shared_ptr(new float[input_size], + std::default_delete()); + // load input + std::vector input_data; + ops::test::GenerateRandomRealTypeData(input_shape, &input_data); + memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float)); + (*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in); + } +} + +inline void GenerateOutputs(const std::vector &output_names, + const std::vector &output_shape, + std::map *outputs) { + size_t output_size = output_names.size(); + for (size_t i = 0; i < output_size; ++i) { + int64_t output_size = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + (*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out); + } +} + +template +void Conv3x3(const std::string &input_name, + const std::string &filter_name, + const std::string &output_name, + const std::vector &output_shape, + NetDef *net_def) { + OperatorDef operator_def; + ops::test::OpDefBuilder("Conv2D", "Conv2dOp") + .Input(input_name) + .Input(filter_name) + .Output(output_name) + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(&operator_def); + + OutputShape *shape = operator_def.add_output_shape(); + for (auto dim : output_shape) { + shape->add_dims(dim); + } + + net_def->add_op()->CopyFrom(operator_def); +} + +template +void Relu(const std::string &input_name, + const std::string &output_name, + const DeviceType device_type, + NetDef *net_def) { + OperatorDef operator_def; + ops::test::OpDefBuilder("Activation", "ReluTest") + .Input(input_name) + .Output(output_name) + .AddStringArg("activation", "RELU") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("device", static_cast(device_type)) + .Finalize(&operator_def); + + net_def->add_op()->CopyFrom(operator_def); +} + +template +void AddTensor(const std::string &name, + const std::vector &shape, + const int offset, + const int data_size, + NetDef *net_def) { + ConstTensor *tensor_ptr = net_def->add_tensors(); + tensor_ptr->set_name(name); + tensor_ptr->mutable_dims()->Reserve(shape.size()); + for (auto dim : shape) { + tensor_ptr->add_dims(dim); + } + tensor_ptr->set_offset(offset); + tensor_ptr->set_data_size(data_size); + tensor_ptr->set_data_type(DataTypeToEnum::value); +} + +template +void CheckOutputs(const NetDef &net_def, + const std::map &inputs, + const std::map &outputs, + const std::vector &tensor_data) { + ops::test::OpsTestNet net; + for (auto input : inputs) { + auto input_shape = input.second.shape(); + const int64_t data_size = std::accumulate(input_shape.begin(), + input_shape.end(), 1, + std::multiplies()); + std::vector input_data(data_size); + memcpy(input_data.data(), input.second.data().get(), + data_size * 
sizeof(float)); + if (D == DeviceType::CPU) { + std::string input_name = input.first + "NHWC"; + net.AddInputFromArray(input_name, input_shape, input_data); + net.TransformDataFormat(input_name, NHWC, input.first, NCHW); + } else { + net.AddInputFromArray(input.first, input_shape, input_data); + } + } + auto tensors = net_def.tensors(); + for (auto tensor : tensors) { + std::vector shape = {tensor.dims().begin(), tensor.dims().end()}; + const int64_t data_size = std::accumulate(shape.begin(), + shape.end(), 1, + std::multiplies()); + std::vector data(data_size); + memcpy(data.data(), + reinterpret_cast(tensor_data.data()) + tensor.offset(), + tensor.data_size() * sizeof(T)); + net.AddInputFromArray(tensor.name(), shape, data); + } + net.RunNet(net_def, D); + + std::unique_ptr allocator(new CPUAllocator); + for (auto output : outputs) { + std::unique_ptr tmp_tensor( + new Tensor(allocator.get(), + DataTypeToEnum::v())); + auto output_shape = output.second.shape(); + const int64_t data_size = std::accumulate(output_shape.begin(), + output_shape.end(), 1, + std::multiplies()); + tmp_tensor->Resize(output.second.shape()); + float *data = tmp_tensor->mutable_data(); + memcpy(data, output.second.data().get(), data_size * sizeof(float)); + + std::string output_name = output.first; + if (D == DeviceType::CPU) { + output_name = output.first + "NHWC"; + net.TransformDataFormat(output.first, + NCHW, + output_name, + NHWC); + } + ops::test::ExpectTensorNear(*tmp_tensor, + *net.GetOutput(output_name.data()), + 1e-5); + } +} +} // namespace test +} // namespace mace +#endif // MACE_TEST_MACE_API_TEST_H_ diff --git a/mace/utils/utils.h b/mace/utils/utils.h index 237febcce69f9d849ad3431c502295273bea89b3..1d9eebc9ff5a2897bd70e5c8cac439957c4b9441 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -174,5 +174,16 @@ inline bool EnvEnabled(std::string env_name) { return !(!env || env[0] == 0 || env[0] == '0'); } +template +std::vector TransposeShape(const std::vector &shape, + const std::vector &dst_dims) { + size_t shape_dims = shape.size(); + std::vector output_shape(shape_dims); + for (size_t i = 0; i < shape_dims; ++i) { + output_shape[i] = static_cast(shape[dst_dims[i]]); + } + return output_shape; +} + } // namespace mace #endif // MACE_UTILS_UTILS_H_ diff --git a/tools/converter.py b/tools/converter.py index 6f66dafd3038e97360b0c451cdd4ce33d11e44f6..e98715fc95def1972c376c76c211758b19c6b2b2 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -163,6 +163,16 @@ DSPDataType = Enum('DSPDataType', [(ele, ele) for ele in DSPDataTypeStrs], WinogradParameters = [0, 2, 4] +DataFormatStrs = [ + "NONE", + "NHWC", +] + + +class DataFormat(object): + NONE = "NONE" + NHWC = "NHWC" + class DefaultValues(object): mace_lib_type = MACELibType.static @@ -195,6 +205,8 @@ class YAMLKeyword(object): runtime = 'runtime' data_type = 'data_type' input_data_types = 'input_data_types' + input_data_formats = 'input_data_formats' + output_data_formats = 'output_data_formats' limit_opencl_kernel_time = 'limit_opencl_kernel_time' nnlib_graph_mode = 'nnlib_graph_mode' obfuscate = 'obfuscate' @@ -487,7 +499,7 @@ def format_model_config(flags): if input_data_types: if not isinstance(input_data_types, list): subgraph[YAMLKeyword.input_data_types] = [input_data_types] - for input_data_type in input_data_types: + for input_data_type in subgraph[YAMLKeyword.input_data_types]: mace_check(input_data_type in InputDataTypeStrs, ModuleName.YAML_CONFIG, "'input_data_types' must be in " @@ -495,6 +507,49 @@ def 
format_model_config(flags): else: subgraph[YAMLKeyword.input_data_types] = [] + input_data_formats = subgraph.get(YAMLKeyword.input_data_formats, + []) + if input_data_formats: + if not isinstance(input_data_formats, list): + subgraph[YAMLKeyword.input_data_formats] =\ + [input_data_formats] + else: + mace_check(len(input_data_formats) + == len(subgraph[YAMLKeyword.input_tensors]), + ModuleName.YAML_CONFIG, + "input_data_formats should match" + " the size of input") + for input_data_format in\ + subgraph[YAMLKeyword.input_data_formats]: + mace_check(input_data_format in DataFormatStrs, + ModuleName.YAML_CONFIG, + "'input_data_formats' must be in " + + str(DataFormatStrs) + ", but got " + + input_data_formats) + else: + subgraph[YAMLKeyword.input_data_formats] = [DataFormat.NHWC] + + output_data_formats = subgraph.get(YAMLKeyword.output_data_formats, + []) + if output_data_formats: + if not isinstance(output_data_formats, list): + subgraph[YAMLKeyword.output_data_formats] = \ + [output_data_formats] + else: + mace_check(len(output_data_formats) + == len(subgraph[YAMLKeyword.output_tensors]), + ModuleName.YAML_CONFIG, + "output_data_formats should match" + " the size of output") + for output_data_format in\ + subgraph[YAMLKeyword.output_data_formats]: + mace_check(output_data_format in DataFormatStrs, + ModuleName.YAML_CONFIG, + "'input_data_formats' must be in " + + str(DataFormatStrs)) + else: + subgraph[YAMLKeyword.output_data_formats] = [DataFormat.NHWC] + validation_threshold = subgraph.get( YAMLKeyword.validation_threshold, {}) if not isinstance(validation_threshold, dict): @@ -803,7 +858,9 @@ def convert_model(configs, cl_mem_type): model_config[YAMLKeyword.model_sha256_checksum], model_config[YAMLKeyword.weight_sha256_checksum], ",".join(subgraphs[0][YAMLKeyword.input_tensors]), + ",".join(subgraphs[0][YAMLKeyword.input_data_formats]), ",".join(subgraphs[0][YAMLKeyword.output_tensors]), + ",".join(subgraphs[0][YAMLKeyword.output_data_formats]), ",".join(subgraphs[0][YAMLKeyword.check_tensors]), runtime, model_name, diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 0eb991296d180225b25bc2cc1429f0de17c10e76..601f5b2cbe45b4898f0683dcd93095dfca333bc1 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -555,7 +555,9 @@ def gen_model_code(model_codegen_dir, model_sha256_checksum, weight_sha256_checksum, input_nodes, + input_data_formats, output_nodes, + output_data_formats, check_nodes, runtime, model_tag, @@ -588,7 +590,9 @@ def gen_model_code(model_codegen_dir, "--model_checksum=%s" % model_sha256_checksum, "--weight_checksum=%s" % weight_sha256_checksum, "--input_node=%s" % input_nodes, + "--input_data_formats=%s" % input_data_formats, "--output_node=%s" % output_nodes, + "--output_data_formats=%s" % output_data_formats, "--check_node=%s" % check_nodes, "--runtime=%s" % runtime, "--template=%s" % "mace/python/tools",
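The tools/converter.py additions above normalize and validate the new input_data_formats / output_data_formats fields before they are passed on to gen_model_code: an unset field defaults to NHWC, a single value stands for all tensors, and a list must match the tensor count and contain only NONE or NHWC. A self-contained sketch of that normalization, mirroring the diff's intent rather than its exact code:

DATA_FORMATS = ["NONE", "NHWC"]


def normalize_formats(formats, tensors, default="NHWC"):
    if not formats:
        return [default]            # unset: NHWC is assumed for every tensor
    if not isinstance(formats, list):
        formats = [formats]         # a single value applies to all tensors
    elif len(formats) != len(tensors):
        raise ValueError("data formats should match the number of tensors")
    for fmt in formats:
        if fmt not in DATA_FORMATS:
            raise ValueError("data format must be one of %s" % DATA_FORMATS)
    return formats


print(normalize_formats("NHWC", ["input0", "input1"]))   # ['NHWC']
print(normalize_formats(["NONE", "NHWC"], ["a", "b"]))   # ['NONE', 'NHWC']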