提交 50cf1737 编写于 作者: 李寅

Merge branch 'refactor-data-format' into 'master'

Refactor data format

See merge request !1069
......@@ -83,7 +83,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
} else if (data_format_str == "OIHW") {
return DataFormat::OIHW;
} else {
return DataFormat::DF_NONE;
return DataFormat::NONE;
}
}
......
......@@ -96,6 +96,43 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
// MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname) expands to a template
// specialization of SetProtoArg<T> for the proto message type `Def`
// (OperatorDef or NetDef).  The generated function updates the argument
// named `arg_name` in place when one already exists, otherwise appends a
// new argument; `value` is stored through the proto setter
// `set_<fieldname>` (fieldname `f` for float, `i` for bool/int/int64_t).
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname)    \
  template<>                                                  \
  void SetProtoArg<T>(Def *def,                               \
                      const std::string &arg_name,            \
                      const T &value) {                       \
    int size = def->arg_size();                               \
    for (int i = 0; i < size; ++i) {                          \
      auto arg = def->mutable_arg(i);                         \
      if (arg->name() == arg_name) {                          \
        VLOG(3) << "Update old argument value from "          \
                << arg->fieldname() << " to "                 \
                << value << " for " << arg_name;              \
        arg->set_##fieldname(value);                          \
        return;                                               \
      }                                                       \
    }                                                         \
    VLOG(3) << "Add new argument " << arg_name << "(name: "   \
            << arg_name << ", value: " << value << ")";       \
    auto arg = def->add_arg();                                \
    arg->set_name(arg_name);                                  \
    arg->set_##fieldname(value);                              \
  }

// Instantiates SetProtoArg for every supported argument type on `Def`.
// Note: bool and the integer types all share the proto `i` field.
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(Def)     \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f)       \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i)        \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i)         \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i)

MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef)
MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef)
#undef MACE_SET_OPTIONAL_ARGUMENT_FUNC
// Returns the canonical argument name used to tag an op's output memory type.
const std::string OutputMemoryTypeTagName() {
  return std::string("output_mem_type");
}
bool IsQuantizedModel(const NetDef &net_def) {
return
......
......@@ -55,6 +55,18 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_;
};
template <typename T>
void SetProtoArg(OperatorDef *op_def,
const std::string &arg_name,
const T&value);
template <typename T>
void SetProtoArg(NetDef *op_def,
const std::string &arg_name,
const T&value);
const std::string OutputMemoryTypeTagName();
bool IsQuantizedModel(const NetDef &def);
} // namespace mace
......
......@@ -33,7 +33,7 @@ namespace mace {
// Returns true for ops that can reuse (alias) their input's memory block:
// these ops only change tensor metadata, not the underlying buffer.
// Note: the stripped-diff residue had left both the old and the new
// initializer line in place; adjacent string literals would have silently
// concatenated into a bogus "SqueezeReshape" entry.  Only the intended
// list (including "ExpandDims") is kept.
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOp = {
      "Reshape", "Identity", "Squeeze", "ExpandDims"
  };
  return kReuseOp.count(op_type) == 1;
}
......@@ -124,8 +124,10 @@ void MemoryOptimizer::Optimize(
op_def->output_type_size());
DataType dt;
bool has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "has_data_format", 0) != 0;
DataFormat data_format = static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "data_format",
static_cast<int>(DataFormat::NONE)));
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) {
......@@ -209,7 +211,7 @@ void MemoryOptimizer::Optimize(
mem_ref_count_[best_mem_id] = 1;
}
tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id,
dt, has_data_format));
dt, data_format));
}
}
......
......@@ -22,6 +22,7 @@
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/types.h"
namespace mace {
......@@ -81,10 +82,10 @@ class MemoryOptimizer {
struct TensorMemInfo {
int mem_id;
DataType data_type;
bool has_data_format;
DataFormat data_format;
TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) :
mem_id(mem_id), data_type(data_type), has_data_format(has_data_format)
TensorMemInfo(int mem_id, DataType data_type, DataFormat data_format) :
mem_id(mem_id), data_type(data_type), data_format(data_format)
{}
};
......
......@@ -31,99 +31,8 @@
#include "mace/utils/memory.h"
#include "mace/utils/timer.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace {
// Tracks, for each tensor produced while building the net, where it lives
// (memory type), what it looks like (dtype / data format / shape), and
// which operation produced it, so later consumers can decide whether a
// transform op must be inserted first.
struct InternalOutputInfo {
  InternalOutputInfo(const MemoryType mem_type,
                     const DataType dtype,
                     const DataFormat data_format,
                     const std::vector<index_t> &shape,
                     int op_idx)
      : mem_type(mem_type), dtype(dtype), data_format(data_format),
        shape(shape), op_idx(op_idx) {}

  MemoryType mem_type;  // transformed memory type
  DataType dtype;
  DataFormat data_format;
  std::vector<index_t> shape;  // tensor shape
  int op_idx;  // operation which generate the tensor
};
#ifdef MACE_ENABLE_OPENCL
// Derives a unique name for the transformed copy of `input_name` that
// lives in memory of type `mem_type`.
std::string TransformedName(const std::string &input_name,
                            const mace::MemoryType mem_type) {
  std::stringstream name_builder;
  name_builder << input_name << "_mem_type_" << mem_type;
  return name_builder.str();
}
// Returns true when the op's inputs must go through a memory-type
// transform; shape-computing ops are exempt because they consume tensor
// metadata rather than tensor data.
bool TransformRequiredOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kNoTransformOp = {
      "Shape", "InferConv2dShape"
  };
  return kNoTransformOp.find(op_type) == kNoTransformOp.end();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
// Builds a single Operation from |op_def|:
//  - picks the device: the target device if the op's registration lists it,
//    otherwise falls back to CPU;
//  - configures |construct_context| (device, op def, output memory type);
//  - when the op will run on CPU, rewrites 4-D output shapes NHWC -> NCHW.
std::unique_ptr<Operation> SerialNet::CreateOperation(
    const OpRegistryBase *op_registry,
    OpConstructContext *construct_context,
    std::shared_ptr<OperatorDef> op_def,
    bool has_data_format,
    bool is_quantize_model) {
  // Create the Operation
  DeviceType target_device_type = target_device_->device_type();
  // Default to CPU until the registry confirms the target device is usable.
  DeviceType device_type = DeviceType::CPU;
  construct_context->set_device(cpu_device_.get());
  construct_context->set_operator_def(op_def);
  construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
  // Get available devices
  auto available_devices =
      op_registry->AvailableDevices(op_def->type(), construct_context);
  // Find the device type to run the op.
  // If the target_device_type in available devices, use target_device_type,
  // otherwise, fallback to CPU device.
  for (auto device : available_devices) {
    if (device == target_device_type) {
      device_type = target_device_type;
      construct_context->set_device(target_device_);
      if (target_device_->device_type() == DeviceType::GPU) {
        construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
      }
      break;
    }
  }
  op_def->set_device_type(device_type);
  // transpose output shape if run on CPU (default format is NHWC)
  // Quantized models keep their layout, hence the is_quantize_model guard.
  if (!is_quantize_model && device_type == DeviceType::CPU &&
      op_def->output_shape_size() == op_def->output_size()) {
    for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
      // Only formatted 4-D outputs are transposed.
      if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) {
        // NHWC -> NCHW
        std::vector<index_t> output_shape =
            TransposeShape<index_t, index_t>(
                std::vector<index_t>(
                    op_def->output_shape(out_idx).dims().begin(),
                    op_def->output_shape(out_idx).dims().end()),
                {0, 3, 1, 2});
        for (int i = 0; i < 4; ++i) {
          op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
        }
      }
    }
  }
  return op_registry->CreateOperation(construct_context, device_type);
}
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
......@@ -138,237 +47,47 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(),
&target_device->cpu_runtime()->thread_pool())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
// Tensor Shape map
std::unordered_map<std::string, std::vector<index_t>> tensor_shape_map;
for (auto &op : net_def->op()) {
if (op.output_size() != op.output_shape_size()) {
continue;
}
for (int i = 0; i < op.output_size(); ++i) {
tensor_shape_map[op.output(i)] = std::vector<index_t>(
op.output_shape(i).dims().begin(),
op.output_shape(i).dims().end());
}
}
for (auto &tensor : net_def->tensors()) {
tensor_shape_map[tensor.name()] =
std::vector<index_t>(tensor.dims().begin(), tensor.dims().end());
}
bool has_data_format = false;
if (target_device_->device_type() == DeviceType::CPU) {
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
// Only could be NONE or NHWC
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = has_data_format ||
(input_data_format != DataFormat::DF_NONE);
if (!is_quantize_model && input_data_format == DataFormat::NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
}
}
}
#ifdef MACE_ENABLE_OPENCL
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_set<std::string> transformed_set;
// add input information
MemoryType target_mem_type;
// default data format of output tensor
DataFormat default_output_df = DataFormat::DF_NONE;
if (target_device_->device_type() == DeviceType::GPU) {
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = input_data_format != DataFormat::DF_NONE;
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_data_format,
input_shape, -1));
}
default_output_df =
has_data_format ? DataFormat::NHWC : DataFormat::DF_NONE;
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_, &tensor_shape_map);
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
has_data_format,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
// the outputs' memory type of the operation
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
// if op is memory-unused op, no transformation
if (TransformRequiredOp(op_def->type())) {
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether to do transform
MemoryType wanted_in_mem_type =
construct_context.GetInputMemType(i);
DataType wanted_in_dt = construct_context.GetInputDataType(i);
if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
|| output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
auto t_input_name = TransformedName(op_def->input(i),
wanted_in_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation " << op_def->name()
<< " to transform tensor "
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< wanted_in_dt << ". with data format "
<< output_info.data_format;
std::string input_name = op_def->input(i);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
output_info.data_format == DataFormat::NCHW &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name, wanted_in_dt,
construct_context.GetInputOpenCLBufferType(i),
wanted_in_mem_type, has_data_format);
OpConstructContext t_construct_context(ws_);
auto transform_op = CreateOperation(
op_registry,
&t_construct_context,
transform_op_def,
has_data_format);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, t_input_name);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
}
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
DataType dt;
if (op_def->output_type_size() == op_def->output_size()) {
dt = op_def->output_type(out_idx);
} else {
dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
}
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
default_output_df,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
auto op_device_type = static_cast<DeviceType>(op_def->device_type());
if (op_device_type == target_device_->device_type()) {
construct_context.set_device(target_device_);
} else if (op_device_type == DeviceType::CPU) {
construct_context.set_device(cpu_device_.get());
} else {
LOG(FATAL) << "Encounter unexpected error: "
<< op_device_type << " vs " << target_device_->device_type();
}
#endif // MACE_ENABLE_OPENCL
construct_context.set_operator_def(op_def);
auto op = op_registry->CreateOperation(&construct_context,
op_device_type);
operators_.emplace_back(std::move(op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != output_info.data_type()) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << output_info.data_type();
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
bool output_has_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
output_info.data_type(),
OpenCLBufferType::IN_OUT_CHANNEL,
target_mem_type,
output_has_data_format);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_has_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
if (target_device_->device_type() == DeviceType::GPU) {
// update the map : output_tensor -> MemoryType
MemoryType out_mem_type =
static_cast<MemoryType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
net_def->op(idx), OutputMemoryTypeTagName(),
static_cast<int>(MemoryType::CPU_BUFFER)));
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
}
}
}
#endif // MACE_ENABLE_OPENCL
}
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
......
......@@ -54,14 +54,6 @@ class SerialNet : public NetBase {
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
bool has_data_format,
bool is_quantize_model = false);
protected:
Workspace *ws_;
Device *target_device_;
......
此差异已折叠。
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_DEF_ADAPTER_H_
#define MACE_CORE_NET_DEF_ADAPTER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/operator.h"
#include "mace/core/net_optimizer.h"
namespace mace {
class OpRegistryBase;
class Workspace;
class Device;
/// Conventions:
/// 1. DataFormat::AUTO stands for formatted (NHWC or NCHW)
/// 2. if Op with DataFormat::AUTO, the arguments of this op
///    is formatted to NHWC
class NetDefAdapter {
 public:
  NetDefAdapter(const OpRegistryBase *op_registry,
                const Workspace *ws);

  // Adapt original net_def to a better net.
  // 1. Adapt device: choose best device for every op in the net.
  // 2. Adapt data type: Add data type related transform ops
  //    for mixing precision.
  // 3. Adapt data format: confirm data format of every op
  //    and add transpose if necessary.
  // 4. Adapt memory type: Add BufferTransform if necessary
  //    for transforming memory type between ops.
  MaceStatus AdaptNetDef(
      const NetDef *net_def,
      Device *target_device,
      NetDef *target_net_def);

 public:
  // Non-copyable and non-movable.
  NetDefAdapter(const NetDefAdapter&) = delete;
  NetDefAdapter(const NetDefAdapter&&) = delete;
  NetDefAdapter &operator=(const NetDefAdapter &) = delete;
  NetDefAdapter &operator=(const NetDefAdapter &&) = delete;

 private:
  // Where/how a tensor produced by an earlier op currently lives, used to
  // decide whether transform ops must be inserted before a consumer.
  struct InternalOutputInfo {
    InternalOutputInfo(const MemoryType mem_type,
                       const DataType dtype,
                       const DataFormat data_format,
                       const std::vector<index_t> &shape,
                       int op_idx)
        : mem_type(mem_type), dtype(dtype), data_format(data_format),
          shape(shape), op_idx(op_idx) {}

    MemoryType mem_type;
    DataType dtype;
    DataFormat data_format;
    std::vector<index_t> shape;  // tensor shape
    int op_idx;  // operation which generate the tensor
  };

  // Maps tensor name -> bookkeeping info for tensors produced so far.
  typedef std::unordered_map<std::string, InternalOutputInfo> TensorInfoMap;

 private:
  // Selects the device the op should run on (step 1 of AdaptNetDef).
  MaceStatus AdaptDevice(OpConditionContext *context,
                         Device *target_device,
                         Device *cpu_device,
                         const TensorInfoMap &output_map,
                         const NetDef *net_def,
                         OperatorDef *op);
  // Handles data-type adaptation for mixed precision (step 2).
  MaceStatus AdaptDataType(OpConditionContext *context,
                           OperatorDef *op);
  // Confirms each input's data format, adding transposes as needed (step 3).
  MaceStatus AdaptDataFormat(
      OpConditionContext *context,
      OperatorDef *op,
      bool is_quantized_model,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      DataFormat *op_output_df,
      NetDef *target_net_def);
  // Inserts BufferTransform ops when memory types disagree (step 4).
  MaceStatus AdaptMemoryType(
      OpConditionContext *context,
      OperatorDef *op_def,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      MemoryType *op_output_mem_types,
      NetDef *target_net_def);
  // Human-readable dump of |net_def| for logging/debugging.
  std::string DebugString(const NetDef *net_def);

 private:
  const OpRegistryBase *op_registry_;  // not owned
  const Workspace *ws_;                // not owned
  NetOptimizer net_optimizer_;         // device-selection helper
};
} // namespace mace
#endif // MACE_CORE_NET_DEF_ADAPTER_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/net_optimizer.h"
#include <string>
namespace mace {
// Chooses the device an op should run on.  Falls back to CPU unless the op
// supports the target device; keeps compute-intensive ops on the target
// device; otherwise greedily follows the devices of the producing ops so
// chains of lightweight ops stay on one device.
DeviceType NetOptimizer::SelectBestDevice(
    const OperatorDef *op_def,
    DeviceType target_device_type,
    const std::set<DeviceType> &available_devices,
    const std::vector<DeviceType> &inputs_op_devices) {
  // Ops heavy enough to always justify running on the target device.
  static const std::set<std::string> kComputeIntensiveOps = {
      "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d",
      "FullyConnected"
  };
  // CPU is the fallback when the target device is unavailable for this op.
  const bool target_supported =
      available_devices.count(target_device_type) == 1;
  DeviceType chosen =
      target_supported ? target_device_type : DeviceType::CPU;
  if (chosen == DeviceType::CPU) {
    return chosen;
  }
  // Compute-intensive ops stay on the target device.
  if (kComputeIntensiveOps.count(op_def->type()) == 1) {
    return chosen;
  }
  // Greedy strategy: track the devices of the input (producer) ops.
  for (const auto input_device : inputs_op_devices) {
    if (input_device != chosen) {
      chosen = input_device;
    }
  }
  return chosen;
}
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_OPTIMIZER_H_
#define MACE_CORE_NET_OPTIMIZER_H_
#include <set>
#include <vector>
#include "mace/port/port.h"
#include "mace/proto/mace.pb.h"
namespace mace {
/// Any optimization for Net could be put in here in the future.
/// Stateless: holds no data members, so an instance is cheap to share.
class NetOptimizer {
 public:
  /// Select best device for the op to support mixing usage of CPU and GPU.
  /// Greedy strategy: one way to the end. If the op falls back to CPU, then
  /// the follow-up ops will run on CPU too until they meet
  /// some compute-intensive ops (Convolution) to
  /// reduce the memory copy between CPU and GPU.
  /// Simple but effective.
  ///
  /// \param op_def the op
  /// \param target_device target device to run on
  /// \param available_devices available devices of the op
  /// \param inputs_op_devices devices the parent (input) ops run on
  /// \return Best device for the op_def
  DeviceType SelectBestDevice(const OperatorDef *op_def,
                              DeviceType target_device,
                              const std::set<DeviceType> &available_devices,
                              const std::vector<DeviceType> &inputs_op_devices);
};
} // namespace mace
#endif // MACE_CORE_NET_OPTIMIZER_H_
......@@ -20,36 +20,23 @@
#include "mace/core/operator.h"
namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(nullptr) {}
OpConstructContext::OpConstructContext(
mace::Workspace *ws,
mace::OpConstructContext::TensorShapeMap *info)
OpConditionContext::OpConditionContext(
const Workspace *ws,
OpConditionContext::TensorShapeMap *info)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(info) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) {
void OpConditionContext::set_operator_def(
const OperatorDef *operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConstructContext::set_output_mem_type(mace::MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
void OpConstructContext::SetInputInfo(size_t idx,
mace::MemoryType mem_type,
mace::DataType dt) {
void OpConditionContext::SetInputInfo(size_t idx,
MemoryType mem_type,
DataType dt) {
if (input_mem_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
......@@ -66,7 +53,13 @@ void OpConstructContext::SetInputInfo(size_t idx,
input_data_types_[idx] = dt;
}
MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
// Sets the expected output memory type and resets any per-input overrides:
// inputs default to the output memory type until SetInputInfo is called.
// Requires set_operator_def to have been called first.
void OpConditionContext::set_output_mem_type(MemoryType type) {
  MACE_CHECK(operator_def_ != nullptr);
  output_mem_type_ = type;
  input_mem_types_.clear();
}
MemoryType OpConditionContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
......@@ -75,7 +68,7 @@ MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
return input_mem_types_[idx];
}
DataType OpConstructContext::GetInputDataType(size_t idx) const {
DataType OpConditionContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
return static_cast<DataType>(
......@@ -87,17 +80,17 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const {
}
#ifdef MACE_ENABLE_OPENCL
void OpConstructContext::SetInputOpenCLBufferType(
void OpConditionContext::SetInputOpenCLBufferType(
size_t idx, OpenCLBufferType buffer_type) {
if (input_opencl_buffer_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_opencl_buffer_types_.resize(operator_def_->input_size(),
OpenCLBufferType::IN_OUT_CHANNEL);
OpenCLBufferType::IN_OUT_CHANNEL);
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
input_opencl_buffer_types_[idx] = buffer_type;
}
OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType(
size_t idx) const {
if (input_opencl_buffer_types_.empty()) {
return OpenCLBufferType::IN_OUT_CHANNEL;
......@@ -107,6 +100,16 @@ OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
}
#endif // MACE_ENABLE_OPENCL
// Context used while constructing operations; the caller sets the device
// and operator def per-op before CreateOperation.
OpConstructContext::OpConstructContext(Workspace *ws)
    : operator_def_(nullptr),
      ws_(ws),
      device_(nullptr) {}

// Takes shared ownership of the op definition being constructed.
void OpConstructContext::set_operator_def(
    std::shared_ptr<OperatorDef> operator_def) {
  operator_def_ = operator_def;
}

// Context passed to Operation::Init: just the workspace and target device.
OpInitContext::OpInitContext(Workspace *ws, Device *device)
    : ws_(ws), device_(device) {}
......@@ -202,19 +205,40 @@ const std::string OpKeyBuilder::Build() {
} // namespace
// Installs the default op-condition callbacks.  The stripped-diff residue
// had left the superseded OpConstructContext-based device_placer lambda
// (with its 4-D output check) in front of the new default one; only the
// intended defaults remain.
OpRegistrationInfo::OpRegistrationInfo() {
  // default device type placer: every registered device is eligible.
  device_placer = [this](OpConditionContext *context) -> std::set<DeviceType> {
    MACE_UNUSED(context);
    return this->devices;
  };

  // default input and output memory type setter
  memory_type_setter = [](OpConditionContext *context) -> void {
    if (context->device()->device_type() == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
      if (context->device()->gpu_runtime()->UseImageMemory()) {
        context->set_output_mem_type(MemoryType::GPU_IMAGE);
      } else {
        context->set_output_mem_type(MemoryType::GPU_BUFFER);
      }
#endif  // MACE_ENABLE_OPENCL
    } else {
      context->set_output_mem_type(MemoryType::CPU_BUFFER);
    }
  };

  // default data format selector: every input expects the op's own
  // "data_format" argument (NONE when absent).
  data_format_selector = [](OpConditionContext *context)
      -> std::vector<DataFormat> {
    DataFormat op_data_format =
        static_cast<DataFormat>(
            ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                *context->operator_def(), "data_format",
                static_cast<int>(DataFormat::NONE)));
    return std::vector<DataFormat>(context->operator_def()->input_size(),
                                   op_data_format);
  };
}
void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
void OpRegistrationInfo::AddDevice(DeviceType device) {
devices.insert(device);
}
......@@ -226,9 +250,9 @@ void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
MaceStatus OpRegistryBase::Register(
const std::string &op_type,
const mace::DeviceType device_type,
const mace::DataType dt,
mace::OpRegistrationInfo::OpCreator creator) {
const DeviceType device_type,
const DataType dt,
OpRegistrationInfo::OpCreator creator) {
if (registry_.count(op_type) == 0) {
registry_[op_type] = std::unique_ptr<OpRegistrationInfo>(
new OpRegistrationInfo);
......@@ -255,13 +279,29 @@ MaceStatus OpRegistryBase::Register(
}
const std::set<DeviceType> OpRegistryBase::AvailableDevices(
const std::string &op_type, OpConstructContext *context) const {
const std::string &op_type, OpConditionContext *context) const {
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
return registry_.at(op_type)->device_placer(context);
}
// Fills |context| with the op's expected input/output memory types via the
// registered memory_type_setter.  |op_type| must already be registered.
void OpRegistryBase::GetInOutMemoryTypes(
    const std::string &op_type,
    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
  return registry_.at(op_type)->memory_type_setter(context);
}

// Returns the data format expected for each input of |op_type|, as decided
// by the registered data_format_selector.
const std::vector<DataFormat> OpRegistryBase::InputsDataFormat(
    const std::string &op_type,
    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
  return registry_.at(op_type)->data_format_selector(context);
}
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type) const {
......@@ -269,15 +309,6 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
......@@ -308,9 +339,30 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
return *this;
}
// Overrides the default memory-type setter for the ops being registered.
// Returns *this to allow chained builder calls.
OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter(
    OpRegistrationInfo::MemoryTypeSetter setter) {
  memory_type_setter_ = setter;
  return *this;
}

// Overrides the default inputs data-format selector for the ops being
// registered.  Returns *this to allow chained builder calls.
OpConditionBuilder& OpConditionBuilder::SetInputsDataFormatSelector(
    OpRegistrationInfo::DataFormatSelector selector) {
  data_format_selector_ = selector;
  return *this;
}
// Copies every callback that was explicitly configured on this builder
// into |info|, leaving the registration's defaults in place otherwise.
// (The stripped-diff residue had left the old
// `if (info != nullptr && placer_)` lines interleaved with the new
// version, breaking the brace structure; only the intended body remains.)
void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
  if (info != nullptr) {
    if (placer_) {
      info->device_placer = placer_;
    }
    if (memory_type_setter_) {
      info->memory_type_setter = memory_type_setter_;
    }
    if (data_format_selector_) {
      info->data_format_selector = data_format_selector_;
    }
  }
}
......
......@@ -32,22 +32,20 @@
namespace mace {
// memory_optimizer, device
class OpConstructContext {
// OpConditionContext has all information used for choosing proper Op
class OpConditionContext {
public:
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(Workspace *ws, TensorShapeMap *info);
~OpConstructContext() = default;
void set_operator_def(const OperatorDef* operator_def);
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
inline const OperatorDef *operator_def() const {
return operator_def_;
}
inline Workspace *workspace() const {
inline const Workspace *workspace() const {
return ws_;
}
......@@ -81,8 +79,8 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
const OperatorDef *operator_def_;
const Workspace *ws_;
Device *device_;
TensorShapeMap *tensor_shape_info_;
// used for memory transform
......@@ -94,6 +92,46 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL
};
// memory_optimizer, device
// Context handed to an Operation's constructor. Carries the operator
// definition, the workspace that owns tensors, and the target device.
// It does not own the workspace or the device.
class OpConstructContext {
  // Maps tensor name -> shape.
  // NOTE(review): not referenced by any member visible here — presumably kept
  // for parity with OpConditionContext; confirm before removing.
  typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
 public:
  // |ws| is borrowed, not owned; it must outlive this context.
  explicit OpConstructContext(Workspace *ws);
  ~OpConstructContext() = default;
  // Replaces the operator definition this context describes
  // (ownership shared with the caller; defined out of line).
  void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
  inline std::shared_ptr<OperatorDef> operator_def() const {
    return operator_def_;
  }
  inline Workspace *workspace() const {
    return ws_;
  }
  // |device| is borrowed, not owned.
  inline void set_device(Device* device) {
    device_ = device;
  }
  inline Device *device() const {
    return device_;
  }
#ifdef MACE_ENABLE_OPENCL
  // Reads the op's "output_mem_type" argument (OutputMemoryTypeTagName());
  // falls back to MemoryType::CPU_BUFFER when the argument is absent.
  inline MemoryType GetOpMemoryType() const {
    return static_cast<MemoryType>(
        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
            *operator_def_, OutputMemoryTypeTagName(),
            static_cast<int>(MemoryType::CPU_BUFFER)));
  }
#endif  // MACE_ENABLE_OPENCL
 private:
  std::shared_ptr<OperatorDef> operator_def_;
  Workspace *ws_;   // not owned
  Device *device_;  // not owned
};
// memory_optimizer, device
class OpInitContext {
public:
......@@ -207,8 +245,11 @@ struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
OpCreator;
typedef std::function<std::set<DeviceType>(OpConstructContext *)>
typedef std::function<std::set<DeviceType>(OpConditionContext *)>
DevicePlacer;
typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
DataFormatSelector;
OpRegistrationInfo();
......@@ -219,6 +260,8 @@ struct OpRegistrationInfo {
std::set<DeviceType> devices;
std::unordered_map<std::string, OpCreator> creators;
DevicePlacer device_placer;
MemoryTypeSetter memory_type_setter;
DataFormatSelector data_format_selector;
};
class OpConditionBuilder {
......@@ -230,11 +273,21 @@ class OpConditionBuilder {
OpConditionBuilder &SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer);
// If you set input memory type for specified Op,
// you must call OpConditionContext::set_output_mem_type
OpConditionBuilder &SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter);
OpConditionBuilder &SetInputsDataFormatSelector(
OpRegistrationInfo::DataFormatSelector selector);
void Finalize(OpRegistrationInfo *info) const;
private:
std::string type_;
OpRegistrationInfo::DevicePlacer placer_;
OpRegistrationInfo::MemoryTypeSetter memory_type_setter_;
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
......@@ -250,7 +303,13 @@ class OpRegistryBase {
MaceStatus Register(const OpConditionBuilder &builder);
const std::set<DeviceType> AvailableDevices(
const std::string &op_type, OpConstructContext *context) const;
const std::string &op_type, OpConditionContext *context) const;
void GetInOutMemoryTypes(
const std::string &op_type, OpConditionContext *context) const;
const std::vector<DataFormat> InputsDataFormat(
const std::string &op_type, OpConditionContext *context) const;
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
......
......@@ -147,38 +147,38 @@ void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
}
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
void OpenCLUtil::BuildTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const OpenCLBufferType buffer_type,
const mace::MemoryType mem_type,
bool has_data_format) {
std::unique_ptr<OperatorDef> op(new OperatorDef);
DataFormat data_format,
OperatorDef *op_def) {
std::string op_name = "mace_node_" + output_name;
op->set_name(op_name);
op->set_type("BufferTransform");
op->add_input(input_name);
op->add_output(output_name);
Argument *arg = op->add_arg();
op_def->set_name(op_name);
op_def->set_type("BufferTransform");
op_def->add_input(input_name);
op_def->add_output(output_name);
op_def->set_device_type(DeviceType::GPU);
Argument *arg = op_def->add_arg();
arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(buffer_type));
arg = op->add_arg();
arg = op_def->add_arg();
arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type));
arg = op->add_arg();
arg = op_def->add_arg();
arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg();
arg->set_name("has_data_format");
arg->set_i(has_data_format);
arg = op_def->add_arg();
arg->set_name("data_format");
arg->set_i(static_cast<int>(data_format));
if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape();
OutputShape *shape = op_def->add_output_shape();
for (auto value : input_shape) {
shape->add_dims(value);
}
}
return std::move(op);
}
} // namespace mace
......@@ -43,14 +43,15 @@ class OpenCLUtil {
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef(
static void BuildTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
bool has_data_format);
DataFormat data_format,
OperatorDef *op_def);
};
} // namespace mace
......
......@@ -263,13 +263,13 @@ MaceStatus Workspace::PreallocateOutputTensor(
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id),
tensor_mem.second.data_type,
false, tensor_mem.first));
if (tensor_mem.second.has_data_format) {
tensor->set_data_format(tensor_mem.second.data_format);
if (tensor_mem.second.data_format != DataFormat::NONE) {
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
......@@ -279,22 +279,12 @@ MaceStatus Workspace::PreallocateOutputTensor(
<< tensor->UnderlyingBuffer()->shape()[0]
<< ", "
<< tensor->UnderlyingBuffer()->shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
} else {
tensor->set_data_format(DataFormat::DF_NONE);
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
......
......@@ -94,7 +94,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
} else if (data_format_str == "OIHW") {
return DataFormat::OIHW;
} else {
return DataFormat::DF_NONE;
return DataFormat::NONE;
}
}
......
......@@ -143,7 +143,7 @@ void BMNet::SetUp() {
// Add input and output information
for (size_t i = 0; i < input_names_.size(); ++i) {
InputOutputInfo *info = net_.add_input_info();
info->set_data_format(DataFormat::NHWC);
info->set_data_format(static_cast<int>(DataFormat::NHWC));
info->set_name(input_names_[i]);
for (auto d : input_shapes_[i]) {
info->add_dims(static_cast<int>(d));
......@@ -244,7 +244,7 @@ void BMNet::AddConv(const std::string &conv_type,
op_def->add_output(output_name);
AddIntsArg(op_def, "strides", strides);
AddIntArg(op_def, "padding", padding_type);
AddIntArg(op_def, "has_data_format", 1);
AddIntArg(op_def, "data_format", static_cast<int>(DataFormat::AUTO));
AddIntArg(op_def, "T", DT_HALF);
if (has_relu6) {
AddStringArg(op_def, "activation", "RELUX");
......@@ -271,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name,
op_def->add_output(output);
AddIntArg(op_def, "type", type);
AddIntArg(op_def, "T", DT_HALF);
AddIntArg(op_def, "has_data_format", 1);
AddIntArg(op_def, "data_format", static_cast<int>(DataFormat::AUTO));
OutputShape *shape = op_def->add_output_shape();
for (auto dim : output_shape) {
shape->add_dims(dim);
......
......@@ -27,6 +27,7 @@
#include "mace/public/mace.h"
#include "mace/port/env.h"
#include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
......@@ -282,9 +283,9 @@ MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<void> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get());
MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC
|| format == DataFormat::NCHW || format == OIHW,
"MACE only support DF_NONE, NHWC, NCHW and OIHW "
MACE_CHECK(format == DataFormat::NONE || format == DataFormat::NHWC
|| format == DataFormat::NCHW || format == DataFormat::OIHW,
"MACE only support NONE, NHWC, NCHW and OIHW "
"formats of input now.");
impl_ = make_unique<MaceTensor::Impl>();
impl_->shape = shape;
......@@ -495,7 +496,7 @@ MaceStatus MaceEngine::Impl::Init(
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(NHWC);
output_tensor->set_data_format(DataFormat::NHWC);
#endif
}
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
......@@ -512,26 +513,32 @@ MaceStatus MaceEngine::Impl::Init(
}
} else {
#endif
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
device_.get(),
model_data));
MemoryOptimizer mem_optimizer;
// Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
device_.get(),
model_data));
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), ws_.get());
net_def_adapter.AdaptNetDef(net_def, device_.get(), &adapted_net_def);
MemoryOptimizer mem_optimizer;
// Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
&adapted_net_def,
ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(adapted_net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(adapted_net_def,
model_data,
device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
}
#endif
......@@ -578,14 +585,14 @@ MaceEngine::Impl::~Impl() {
MaceStatus MaceEngine::Impl::TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor) {
bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
DataFormat data_format = DataFormat::DF_NONE;
bool has_data_format = input_tensor->data_format() != DataFormat::NONE;
DataFormat data_format = DataFormat::NONE;
DataType input_dt = input_tensor->dtype();
if (has_data_format) {
std::vector<int> dst_dims;
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
input.second.data_format() == DataFormat::NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
......@@ -647,28 +654,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
DataType output_dt = output_tensor->dtype();
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (output_tensor->data_format() != DataFormat::DF_NONE &&
output->second.data_format() != DataFormat::DF_NONE &&
if (output_tensor->data_format() != DataFormat::NONE &&
output->second.data_format() != DataFormat::NONE &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
<< static_cast<int>(output_tensor->data_format()) << " to "
<< static_cast<int>(output->second.data_format());
std::vector<int> dst_dims;
if (output_tensor->data_format() == NCHW &&
output->second.data_format() == NHWC) {
if (output_tensor->data_format() == DataFormat::NCHW &&
output->second.data_format() == DataFormat::NHWC) {
dst_dims = {0, 2, 3, 1};
} else if (output_tensor->data_format() == NHWC &&
output->second.data_format() == NCHW) {
} else if (output_tensor->data_format() == DataFormat::NHWC &&
output->second.data_format() == DataFormat::NCHW) {
dst_dims = {0, 3, 1, 2};
} else {
LOG(FATAL) << "Not supported output data format: "
<< output->second.data_format() << " vs "
<< output_tensor->data_format();
<< static_cast<int>(output->second.data_format()) << " vs "
<< static_cast<int>(output_tensor->data_format());
}
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
<< static_cast<int>(output_tensor->data_format()) << " to "
<< static_cast<int>(output->second.data_format());
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
......
......@@ -15,6 +15,8 @@
#include "mace/ops/activation.h"
#include <memory>
#include <set>
#include "mace/core/operator.h"
#if defined(MACE_ENABLE_NEON)
......@@ -94,7 +96,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
type, relux_max_limit, leakyrelu_coefficient);
......@@ -132,6 +134,24 @@ void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -207,7 +207,8 @@ void TestSimplePrelu() {
// Run
net.RunOp(D);
} else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Activation", "PreluTest")
.Input("InputNCHW")
.Input("Alpha")
......@@ -217,7 +218,8 @@ void TestSimplePrelu() {
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>(
......
......@@ -67,7 +67,7 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -101,6 +101,24 @@ void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -54,7 +54,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......
......@@ -174,7 +174,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
float leakyrelu_coefficient = Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient);
......
......@@ -34,7 +34,8 @@ void Simple() {
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
......@@ -47,7 +48,8 @@ void Simple() {
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
......@@ -93,8 +95,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("BatchNorm", "BatchNormTest")
......@@ -112,8 +114,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -163,8 +165,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
......@@ -179,8 +181,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -230,8 +232,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
......@@ -246,8 +248,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -296,8 +298,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
......@@ -312,8 +314,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......
......@@ -264,7 +264,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -103,7 +103,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 1)) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
} else {
......@@ -145,6 +145,24 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -27,9 +27,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net;
// Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
......
......@@ -31,8 +31,8 @@ void BiasAddSimple() {
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
......@@ -41,8 +41,8 @@ void BiasAddSimple() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
......@@ -83,8 +83,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("BiasAdd", "BiasAddTest")
......@@ -97,8 +97,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -132,8 +132,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("BiasAdd", "BiasAddTest")
......@@ -146,8 +146,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......
......@@ -48,7 +48,6 @@ void FilterBufferToImage(int iters,
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
DataFormat::NHWC,
b2i_output);
};
......
......@@ -37,14 +37,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -178,14 +178,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -218,14 +218,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
// Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......
......@@ -39,14 +39,11 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
bool has_data_format = Operation::GetOptionalArg<int>("has_data_format", 0)
!= 0;
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_,
has_data_format, output);
context, input, type, out_mem_type_, wino_blk_size_, output);
}
private:
......
......@@ -48,7 +48,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, bt_output);
type, MemoryType::GPU_BUFFER, 0, bt_output);
// Inverse Transform
Tensor *output = net.ws()->CreateTensor(
......@@ -57,7 +57,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, output);
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
......@@ -94,7 +94,7 @@ void TestArgumentTransform(const index_t input_size) {
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, DataFormat::NHWC, output);
0, output);
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output->buffer_shape()[0]);
......
......@@ -82,7 +82,7 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -116,7 +116,7 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("ChannelShuffle")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
......
......@@ -28,8 +28,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
......@@ -40,8 +40,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>(
......
......@@ -40,19 +40,19 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
index_t input_height = 0, input_width = 0;
index_t kernel_height = 0, kernel_width = 0;
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
input_height = input_shape[2];
input_width = input_shape[3];
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
input_height = input_shape[1];
input_width = input_shape[2];
} else {
MACE_NOT_IMPLEMENTED;
}
if (filter_format == OIHW) {
if (filter_format == DataFormat::OIHW) {
kernel_height = filter_shape[2];
kernel_width = filter_shape[3];
} else if (filter_format == OHWI) {
} else if (filter_format == DataFormat::OHWI) {
kernel_height = filter_shape[1];
kernel_width = filter_shape[2];
} else {
......@@ -97,11 +97,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
0, (output_width - 1) * strides[1] + k_extent_width - input_width);
output_shape[0] = input_shape[0];
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
output_shape[1] = output_channels;
output_shape[2] = output_height;
output_shape[3] = output_width;
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
output_shape[1] = output_height;
output_shape[2] = output_width;
output_shape[3] = output_channels;
......@@ -117,7 +117,8 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW
Padding padding,
index_t *output_shape,
int *padding_size) {
CalcPaddingAndOutputSize(input_shape, NCHW, filter_shape, OIHW, dilations,
CalcPaddingAndOutputSize(input_shape, DataFormat::NCHW, filter_shape,
DataFormat::OIHW, dilations,
strides, padding, output_shape, padding_size);
}
......@@ -128,7 +129,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
Padding padding,
index_t *output_shape,
int *padding_size) {
CalcPaddingAndOutputSize(input_shape, NHWC, filter_shape, OIHW, dilations,
CalcPaddingAndOutputSize(input_shape, DataFormat::NHWC, filter_shape,
DataFormat::OIHW, dilations,
strides, padding, output_shape, padding_size);
}
......@@ -151,19 +153,19 @@ void CalcOutputSize(const index_t *input_shape,
index_t input_height = 0, input_width = 0;
index_t kernel_height = 0, kernel_width = 0;
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
input_height = input_shape[2];
input_width = input_shape[3];
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
input_height = input_shape[1];
input_width = input_shape[2];
} else {
MACE_NOT_IMPLEMENTED;
}
if (filter_format == OIHW) {
if (filter_format == DataFormat::OIHW) {
kernel_height = filter_shape[2];
kernel_width = filter_shape[3];
} else if (filter_format == OHWI) {
} else if (filter_format == DataFormat::OHWI) {
kernel_height = filter_shape[1];
kernel_width = filter_shape[2];
} else {
......@@ -195,11 +197,11 @@ void CalcOutputSize(const index_t *input_shape,
}
output_shape[0] = input_shape[0];
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
output_shape[1] = output_channels;
output_shape[2] = output_height;
output_shape[3] = output_width;
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
output_shape[1] = output_height;
output_shape[2] = output_width;
output_shape[3] = output_channels;
......@@ -215,7 +217,8 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
const int *strides,
const RoundType round_type,
index_t *output_shape) {
CalcOutputSize(input_shape, NHWC, filter_shape, OIHW, padding_size, dilations,
CalcOutputSize(input_shape, DataFormat::NHWC, filter_shape,
DataFormat::OIHW, padding_size, dilations,
strides, round_type, output_shape);
}
......@@ -226,7 +229,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW
const int *strides,
const RoundType round_type,
index_t *output_shape) {
CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations,
CalcOutputSize(input_shape, DataFormat::NCHW, filter_shape,
DataFormat::OIHW, padding_size, dilations,
strides, round_type, output_shape);
}
......@@ -241,14 +245,18 @@ void CalcDeconvShape_TF(const std::vector<index_t> &input_shape,
std::vector<index_t> *padded_out_shape,
DataFormat data_format) {
const index_t
in_height = data_format == NCHW ? input_shape[2] : input_shape[1];
in_height =
data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1];
const index_t
in_width = data_format == NCHW ? input_shape[3] : input_shape[2];
in_width =
data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2];
const index_t
out_height = data_format == NCHW ? output_shape[2] : output_shape[1];
out_height =
data_format == DataFormat::NCHW ? output_shape[2] : output_shape[1];
const index_t
out_width = data_format == NCHW ? output_shape[3] : output_shape[2];
out_width =
data_format == DataFormat::NCHW ? output_shape[3] : output_shape[2];
const index_t extended_in_height = (in_height - 1) * strides[0] + 1;
const index_t extended_in_width = (in_width - 1) * strides[1] + 1;
......@@ -307,11 +315,11 @@ void CalcDeconvShape_TF(const std::vector<index_t> &input_shape,
padded_out_shape->resize(4);
(*padded_out_shape)[0] = output_shape[0];
(*padded_out_shape)[1] =
data_format == NCHW ? output_channel : padded_out_height;
data_format == DataFormat::NCHW ? output_channel : padded_out_height;
(*padded_out_shape)[2] =
data_format == NCHW ? padded_out_height : padded_out_width;
data_format == DataFormat::NCHW ? padded_out_height : padded_out_width;
(*padded_out_shape)[3] =
data_format == NCHW ? padded_out_width : output_channel;
data_format == DataFormat::NCHW ? padded_out_width : output_channel;
}
}
......@@ -325,9 +333,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
std::vector<index_t> *padded_out_shape,
DataFormat data_format) {
const index_t
in_height = data_format == NCHW ? input_shape[2] : input_shape[1];
in_height =
data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1];
const index_t
in_width = data_format == NCHW ? input_shape[3] : input_shape[2];
in_width =
data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2];
const index_t output_channel = filter_shape[0] * group;
......@@ -351,11 +361,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
padded_out_shape->resize(4);
(*padded_out_shape)[0] = input_shape[0];
(*padded_out_shape)[1] =
data_format == NCHW ? output_channel : padded_out_height;
data_format == DataFormat::NCHW ? output_channel : padded_out_height;
(*padded_out_shape)[2] =
data_format == NCHW ? padded_out_height : padded_out_width;
data_format == DataFormat::NCHW ? padded_out_height : padded_out_width;
(*padded_out_shape)[3] =
data_format == NCHW ? padded_out_width : output_channel;
data_format == DataFormat::NCHW ? padded_out_width : output_channel;
}
if (out_shape != nullptr) {
......@@ -363,9 +373,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
index_t out_width = padded_out_width - out_pad_size[1];
out_shape->resize(4);
(*out_shape)[0] = input_shape[0];
(*out_shape)[1] = data_format == NCHW ? output_channel : out_height;
(*out_shape)[2] = data_format == NCHW ? out_height : out_width;
(*out_shape)[3] = data_format == NCHW ? out_width : output_channel;
(*out_shape)[1] =
data_format == DataFormat::NCHW ? output_channel : out_height;
(*out_shape)[2] = data_format == DataFormat::NCHW ? out_height : out_width;
(*out_shape)[3] =
data_format == DataFormat::NCHW ? out_width : output_channel;
}
}
......@@ -385,7 +397,7 @@ void CalDeconvOutputShapeAndPadSize(const std::vector<index_t> &input_shape,
MACE_CHECK(output_shape->size() == 4,
"deconv output shape shoud be 4-dims");
std::vector<index_t> &out_shape = *output_shape;
if (data_format == NCHW) {
if (data_format == DataFormat::NCHW) {
const index_t t = out_shape[1];
out_shape[1] = out_shape[3];
out_shape[3] = out_shape[2];
......
......@@ -199,7 +199,7 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -241,12 +241,12 @@ void RegisterConcat(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Concat")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
......
......@@ -231,9 +231,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcPaddingAndOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
filter->shape().data(),
OHWI,
DataFormat::OHWI,
dilations_.data(),
strides_.data(),
padding_type_,
......@@ -242,9 +242,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
filter->shape().data(),
OHWI,
DataFormat::OHWI,
paddings_.data(),
dilations_.data(),
strides_.data(),
......@@ -459,14 +459,13 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
"leakyrelu_coefficient", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
......
......@@ -47,8 +47,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 1, 1, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -60,8 +60,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -105,8 +105,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 3, 3, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -118,8 +118,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -189,8 +189,8 @@ void TestNHWCSimple3x3WithoutBias() {
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -203,8 +203,8 @@ void TestNHWCSimple3x3WithoutBias() {
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -256,8 +256,8 @@ void TestNHWCCombined3x3() {
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -270,8 +270,8 @@ void TestNHWCCombined3x3() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -321,8 +321,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 1, 1, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -336,8 +336,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -376,8 +376,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 1, 1, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -391,8 +391,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -459,8 +459,8 @@ void TestConv1x1() {
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -472,8 +472,8 @@ void TestConv1x1() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -532,8 +532,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
false);
net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -552,8 +552,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -651,8 +651,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
float_bias_data,
true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -667,8 +667,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -811,8 +811,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -828,8 +828,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -900,8 +900,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -916,8 +916,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -979,8 +979,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -994,8 +994,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -1118,12 +1118,12 @@ void TestQuant(const index_t batch,
net.AddRandomInput<CPU, float>("Filter", {out_channels, k_height, k_width,
in_channels}, true);
net.AddRandomInput<CPU, float>("Bias", {out_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.TransformFilterDataFormat<DeviceType::CPU, float>("Filter",
OHWI,
DataFormat::OHWI,
"FilterOIHW",
OIHW);
DataFormat::OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -1136,8 +1136,8 @@ void TestQuant(const index_t batch,
.AddIntArg("T", static_cast<int>(DT_FLOAT))
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeFilter")
.Input("Filter")
......
......@@ -117,7 +117,7 @@ class CropOp<DeviceType::GPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
Operation::GetRepeatedArgs<int>("offset"));
} else {
......@@ -145,6 +145,24 @@ void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -42,13 +42,13 @@ void RunCrop(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input0",
NHWC,
DataFormat::NHWC,
"InputNCHW0",
NCHW);
DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1",
NHWC,
DataFormat::NHWC,
"InputNCHW1",
NCHW);
DataFormat::NCHW);
OpDefBuilder("Crop", "CropTest")
.Input("InputNCHW0")
.Input("InputNCHW1")
......@@ -62,8 +62,8 @@ void RunCrop(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
......
......@@ -32,8 +32,8 @@ void SimpleTestWithDataFormat(const std::vector<index_t> &shape,
OpsTestNet net;
net.AddInputFromArray<CPU, T>("Input", shape, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Cumsum", "CumsumTest")
.Input("InputNCHW")
......@@ -48,8 +48,8 @@ void SimpleTestWithDataFormat(const std::vector<index_t> &shape,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
net.AddInputFromArray<CPU, T>("ExpectedOutput", shape, output);
ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
......
......@@ -173,7 +173,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -197,7 +197,6 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
}
}
MaceStatus Run(OpContext *context) override {
......@@ -241,7 +240,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
&out_paddings,
nullptr,
model_type_,
NHWC);
DataFormat::NHWC);
return kernel_->Compute(context, input, filter, bias,
strides_.data(), in_paddings.data(), activation_,
......@@ -264,6 +263,30 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
MACE_NOT_IMPLEMENTED;
}
FrameworkType framework_type =
static_cast<FrameworkType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*(context->operator_def()), "framework_type",
FrameworkType::TENSORFLOW));
if (framework_type == FrameworkType::TENSORFLOW) {
context->SetInputInfo(2, MemoryType::CPU_BUFFER,
DataType::DT_INT32);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -47,7 +47,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
// TODO(liutuo): remove the unused transform
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.TransformFilterDataFormat<D, float>(
"Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW);
if (D == DeviceType::GPU) {
if (model_type == FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
......@@ -77,8 +78,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
}
net.RunOp(D);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
if (model_type == FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
......@@ -109,8 +110,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
......@@ -380,8 +381,8 @@ void TestComplexDeconvNxN(const int batch,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
false);
net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
int out_h = 0;
int out_w = 0;
......@@ -440,8 +441,8 @@ void TestComplexDeconvNxN(const int batch,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......
......@@ -96,7 +96,7 @@ class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -32,8 +32,8 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -41,8 +41,8 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
......@@ -114,8 +114,8 @@ void RandomTest(const int block_size,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
......@@ -125,8 +125,8 @@ void RandomTest(const int block_size,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("Input")
......
......@@ -188,9 +188,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
filter->dim(2) * filter->dim(3), filter->dim(0), filter->dim(1), 1};
if (paddings_.empty()) {
CalcPaddingAndOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
ohwi_shape.data(),
OHWI,
DataFormat::OHWI,
dilations_.data(),
strides_.data(),
padding_type_,
......@@ -199,9 +199,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
ohwi_shape.data(),
OHWI,
DataFormat::OHWI,
paddings_.data(),
dilations_.data(),
strides_.data(),
......@@ -375,14 +375,13 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
}
context->set_output_mem_type(mem_type);
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
......@@ -393,8 +392,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
} else {
context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
......@@ -440,7 +437,40 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
mem_type = MemoryType::GPU_BUFFER;
}
auto filter_tensor = context->workspace()->GetTensor(
context->operator_def()->input(1));
if (filter_tensor == nullptr || !filter_tensor->is_weight()) {
context->SetInputOpenCLBufferType(
1, OpenCLBufferType::DW_CONV2D_FILTER);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
.SetInputsDataFormatSelector(
[](OpConditionContext *context) -> std::vector<DataFormat> {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return {op_data_format, DataFormat::OIHW, DataFormat::NONE};
}));
}
} // namespace ops
......
......@@ -39,8 +39,8 @@ void SimpleValidTest() {
true);
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -52,8 +52,8 @@ void SimpleValidTest() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
......@@ -127,8 +127,8 @@ void ComplexValidTest(index_t batch,
true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -141,8 +141,8 @@ void ComplexValidTest(index_t batch,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
......@@ -249,8 +249,8 @@ void TestNxNS12(const index_t height, const index_t width) {
{multiplier * channel},
true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -267,8 +267,8 @@ void TestNxNS12(const index_t height, const index_t width) {
// Run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -389,9 +389,9 @@ void TestQuant(const index_t batch,
"Filter", {k_height, k_width, in_channels, multiplier}, true, false);
net.AddRandomInput<CPU, float>("Bias", {out_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.TransformFilterDataFormat<DeviceType::CPU, float>(
"Filter", HWIO, "FilterOIHW", OIHW);
"Filter", DataFormat::HWIO, "FilterOIHW", DataFormat::OIHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
......@@ -405,7 +405,7 @@ void TestQuant(const index_t batch,
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeFilter")
.Input("Filter")
......
......@@ -190,7 +190,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -230,7 +230,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
&out_paddings,
nullptr,
CAFFE,
NHWC);
DataFormat::NHWC);
return kernel_->Compute(context,
input,
......
......@@ -39,7 +39,8 @@ void RunTestSimple(const int group,
// Add input data
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.TransformFilterDataFormat<D, float>(
"Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW);
const index_t out_channels = expected_shape[3];
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
......@@ -56,8 +57,8 @@ void RunTestSimple(const int group,
net.RunOp(D);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
"InputNCHW", NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
......@@ -69,8 +70,8 @@ void RunTestSimple(const int group,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
......@@ -193,8 +194,8 @@ void RandomTest(index_t batch,
{channel * multiplier},
bias_data, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -210,8 +211,8 @@ void RandomTest(index_t batch,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
......
......@@ -1145,7 +1145,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1);
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
type, coeff, scalar_input, scalar_input_index);
......
......@@ -69,7 +69,8 @@ void SimpleTensorScalar(const ops::EltwiseType type,
net.AddInputFromArray<D, T>("Input", shape, input);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, T>("Input", NHWC, "TInput", NCHW);
net.TransformDataFormat<D, T>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
.AddIntArg("T", DataTypeToEnum<T>::v())
......@@ -81,7 +82,8 @@ void SimpleTensorScalar(const ops::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
net.TransformDataFormat<D, DstType>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input")
......@@ -124,13 +126,15 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
.OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("TOutput");
if (shape0.size() > 1) {
net.TransformDataFormat<D, T>("Input0", NHWC, "TInput0", NCHW);
net.TransformDataFormat<D, T>(
"Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
op_builder.Input("TInput0");
} else {
op_builder.Input("Input0");
}
if (shape1.size() > 1) {
net.TransformDataFormat<D, T>("Input1", NHWC, "TInput1", NCHW);
net.TransformDataFormat<D, T>(
"Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
op_builder.Input("TInput1");
} else {
op_builder.Input("Input1");
......@@ -139,7 +143,8 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
// Run
net.RunOp(D);
net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
net.TransformDataFormat<D, DstType>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input0")
......@@ -560,7 +565,8 @@ void GPUOverflowTest(const ops::EltwiseType type,
net.AddInputFromArray<DeviceType::GPU, T>(
"Filter",
{output_shape.back(), shape0.back(), 3, 3},
std::vector<float>(output_shape.back() * shape0.back() * 9, 1));
std::vector<float>(output_shape.back() * shape0.back() * 9, 1),
true);
OpDefBuilder("Conv2D", "Conv2D")
.AddIntArg("T", DataTypeToEnum<T>::v())
.Input("EltOutput")
......@@ -636,8 +642,8 @@ void RandomTensorScalar(const ops::EltwiseType type,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", shape, false, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
......@@ -647,8 +653,8 @@ void RandomTensorScalar(const ops::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -690,10 +696,10 @@ void RandomTensorEltwise(const ops::EltwiseType type,
true,
true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
......@@ -705,8 +711,8 @@ void RandomTensorEltwise(const ops::EltwiseType type,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -746,10 +752,10 @@ void Quantized(const std::vector<index_t> &shape,
true,
true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
......@@ -761,8 +767,8 @@ void Quantized(const std::vector<index_t> &shape,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeInput0")
.Input("Input0")
......
......@@ -14,7 +14,6 @@
#include "mace/core/operator.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/math.h"
namespace mace {
......@@ -44,27 +43,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
std::vector<index_t> output_shape(input_shape);
output_shape.insert(output_shape.begin() + axis_, 1);
bool has_data_format = Operation::GetOptionalArg<int>(
"has_data_format", 0) == 1;
if (has_data_format && output_shape.size() == 4) {
// only tensorflow support expand dim, so the default format is NHWC
// transform NHWC to NCHW
auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
{0, 3, 1, 2});
output->Resize(t_output_shape);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
Transpose(&context->device()->cpu_runtime()->thread_pool(),
input_data, output_shape, {0, 3, 1, 2}, output_data);
} else {
output->Resize(output_shape);
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<T>();
output->Copy<T>(input_data, input->size());
}
output->ReuseTensorBuffer(*input);
output->Reshape(output_shape);
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -49,7 +49,8 @@ void Simple() {
net.AddInputFromArray<D, float>("Offset", {1}, offset, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
......@@ -58,7 +59,8 @@ void Simple() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("Input")
......@@ -100,8 +102,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -113,8 +115,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -151,8 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -164,8 +166,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -205,8 +207,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -218,8 +220,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -254,11 +256,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -270,8 +272,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......
......@@ -190,7 +190,7 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
} else {
......
......@@ -48,7 +48,8 @@ void Simple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
......@@ -129,8 +130,8 @@ void Random(const index_t batch,
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}, true,
false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW")
.Input("Weight")
......@@ -143,7 +144,8 @@ void Random(const index_t batch,
// run cpu
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -215,8 +217,10 @@ void QuantRandom(const index_t batch,
net.AddRandomInput<CPU, float>(
"Weight", {out_channel, height, width, channels}, true);
net.AddRandomInput<CPU, float>("Bias", {out_channel}, true);
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformFilterDataFormat<CPU, float>("Weight", OHWI, "WeightOIHW", OIHW);
net.TransformDataFormat<CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.TransformFilterDataFormat<CPU, float>(
"Weight", DataFormat::OHWI, "WeightOIHW", DataFormat::OIHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW")
......@@ -226,7 +230,8 @@ void QuantRandom(const index_t batch,
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeWeight")
.Input("Weight")
......
......@@ -29,7 +29,8 @@ void Simple() {
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("InputNCHW")
......@@ -41,7 +42,8 @@ void Simple() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
......
......@@ -36,7 +36,7 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
Operation::GetOptionalArg<float>("scalar_input",
0.0));
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::LSTMCellKernel<T>>(forget_bias);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -518,14 +518,6 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -23,7 +23,6 @@
#include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/memory.h"
namespace mace {
......@@ -48,7 +47,6 @@ class OpenCLBufferTransformer {
const OpenCLBufferType type,
const MemoryType out_mem_type,
const int wino_blk_size,
bool has_data_format,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
......@@ -67,31 +65,11 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform CPU Buffer " << input->name()
<< " to GPU Buffer " << internal_tensor->name()
<< " with data type " << dt;
if (has_data_format && input->shape().size() == 4) {
// 1. (NCHW -> NHWC)
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(input->shape(),
dst_dims);
internal_tensor->Resize(output_shape);
internal_tensor->set_data_format(DataFormat::NHWC);
// TODO(liuqi): Only support float now
const float *input_ptr = input->data<float>();
Tensor::MappingGuard guard(internal_tensor);
float *internal_ptr = internal_tensor->mutable_data<float>();
MACE_RETURN_IF_ERROR(ops::Transpose(
&context->device()->cpu_runtime()->thread_pool(),
input_ptr,
input->shape(),
dst_dims,
internal_ptr));
} else {
internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size());
}
internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size());
// 2. convert the internal GPU Buffer to output.
return kernel_->Compute(
context, internal_tensor, type, wino_blk_size, output);
......@@ -108,30 +86,12 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform GPU Buffer " << internal_tensor.name()
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
if (has_data_format && internal_tensor.shape().size() == 4) {
// NHWC -> NCHW
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(internal_tensor.shape(),
dst_dims);
output->set_data_format(DataFormat::NCHW);
Tensor::MappingGuard guard(&internal_tensor);
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(output_shape);
float *output_ptr = output->mutable_data<float>();
return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(),
internal_ptr,
internal_tensor.shape(),
dst_dims,
output_ptr);
} else {
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS;
}
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
return MaceStatus::MACE_SUCCESS;
......@@ -172,7 +132,7 @@ MaceStatus TransformFilter(
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
DataFormat::DF_NONE, output);
output);
}
} // namespace ops
......
......@@ -71,14 +71,17 @@ MaceStatus EltwiseKernel<T>::Compute(
if (input1 == nullptr) {
input1_type = "INPUT_SCALAR";
} else {
MACE_CHECK(input0->dim_size() == input1->dim_size() ||
MACE_CHECK((input0->dim_size() == input1->dim_size()
&& input0->dim_size() == 4) ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
<< "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL";
// broadcast
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
if (input0->size() != input1->size() ||
input0->dim_size() != input1->dim_size()) {
if (input0->size() < input1->size()
|| input0->dim_size() < input1->dim_size()) {
std::swap(input0, input1);
swapped = true;
}
......
......@@ -59,11 +59,6 @@ MaceStatus ReduceKernel<T>::Compute(
const Tensor *input,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
MACE_CHECK(input->dim_size() == 4,
"reduce gpu only support 4-dim input");
MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
"reduce gpu only support 1,2-axis reduce");
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......
......@@ -15,6 +15,7 @@
#include "mace/ops/ops_test_util.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/memory.h"
#include "mace/core/net_def_adapter.h"
namespace mace {
namespace ops {
......@@ -175,26 +176,27 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() {
bool OpsTestNet::Setup(mace::DeviceType device) {
NetDef net_def;
for (auto &op_def : op_defs_) {
net_def.add_op()->CopyFrom(op_def);
auto target_op = net_def.add_op();
target_op->CopyFrom(op_def);
auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "has_data_format", 0);
auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "T", static_cast<int>(DT_FLOAT))
== static_cast<int>(DT_UINT8);
for (auto input : op_def.input()) {
if (ws_.GetTensor(input) != nullptr &&
!ws_.GetTensor(input)->is_weight()) {
auto input_info = net_def.add_input_info();
input_info->set_name(input);
auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "has_data_format", 1);
auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "T", static_cast<int>(DT_FLOAT))
== static_cast<int>(DT_UINT8);
if (has_data_format) {
if (is_quantized_op || device == DeviceType::GPU) {
input_info->set_data_format(NHWC);
input_info->set_data_format(static_cast<int>(DataFormat::NHWC));
} else {
input_info->set_data_format(NCHW);
input_info->set_data_format(static_cast<int>(DataFormat::NCHW));
}
} else {
input_info->set_data_format(DataFormat::DF_NONE);
input_info->set_data_format(static_cast<int>(DataFormat::NONE));
}
auto &shape = ws_.GetTensor(input)->shape();
for (auto d : shape) {
......@@ -202,6 +204,10 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
}
}
}
if (has_data_format) {
SetProtoArg<int>(target_op, "data_format",
static_cast<int>(DataFormat::AUTO));
}
}
if (!op_defs_.empty()) {
auto op_def = op_defs_.back();
......@@ -216,15 +222,21 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
}
}
}
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), &ws_);
net_def_adapter.AdaptNetDef(&net_def,
OpTestContext::Get()->GetDevice(device),
&adapted_net_def);
MemoryOptimizer mem_optimizer;
net_ = make_unique<SerialNet>(
op_registry_.get(),
&net_def,
&adapted_net_def,
&ws_,
OpTestContext::Get()->GetDevice(device),
&mem_optimizer);
MaceStatus status = (ws_.PreallocateOutputTensor(
net_def,
adapted_net_def,
&mem_optimizer,
OpTestContext::Get()->GetDevice(device)));
if (status != MaceStatus::MACE_SUCCESS) return false;
......@@ -267,15 +279,20 @@ MaceStatus OpsTestNet::RunOp() {
MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def,
const mace::DeviceType device) {
device_type_ = device;
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), &ws_);
net_def_adapter.AdaptNetDef(&net_def,
OpTestContext::Get()->GetDevice(device),
&adapted_net_def);
MemoryOptimizer mem_optimizer;
net_ = make_unique<SerialNet>(
op_registry_.get(),
&net_def,
&adapted_net_def,
&ws_,
OpTestContext::Get()->GetDevice(device),
&mem_optimizer);
MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor(
net_def,
adapted_net_def,
&mem_optimizer,
OpTestContext::Get()->GetDevice(device)));
MACE_RETURN_IF_ERROR(net_->Init());
......
......@@ -223,7 +223,7 @@ class OpsTestNet {
const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4");
if (src_format == NHWC && dst_format == NCHW) {
if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) {
index_t batch = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
......@@ -243,7 +243,8 @@ class OpsTestNet {
}
}
}
} else if (src_format == NCHW && dst_format == NHWC) {
} else if (src_format == DataFormat::NCHW &&
dst_format == DataFormat::NHWC) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -281,7 +282,7 @@ class OpsTestNet {
input->is_weight());
const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4");
if (src_format == HWOI && dst_format == OIHW) {
if (src_format == DataFormat::HWOI && dst_format == DataFormat::OIHW) {
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t out_channels = input_shape[2];
......@@ -299,7 +300,8 @@ class OpsTestNet {
input_data[j * out_channels * in_channels + i];
}
}
} else if (src_format == OIHW && dst_format == HWOI) {
} else if (src_format == DataFormat::OIHW &&
dst_format == DataFormat::HWOI) {
index_t out_channels = input_shape[0];
index_t in_channels = input_shape[1];
index_t height = input_shape[2];
......@@ -317,7 +319,8 @@ class OpsTestNet {
input_data[j * height * width + i];
}
}
} else if (src_format == HWIO && dst_format == OIHW) {
} else if (src_format == DataFormat::HWIO &&
dst_format == DataFormat::OIHW) {
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t in_channels = input_shape[2];
......@@ -337,7 +340,8 @@ class OpsTestNet {
}
}
}
} else if (src_format == OHWI && dst_format == OIHW) {
} else if (src_format == DataFormat::OHWI &&
dst_format == DataFormat::OIHW) {
index_t out_channels = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
......
......@@ -179,7 +179,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
std::vector<int> paddings = Operation::GetRepeatedArgs<int>("paddings");
float constant_value = Operation::GetOptionalArg<float>(
"constant_value", 0.0);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PadKernel<T>>(
type, paddings, constant_value);
} else {
......
......@@ -45,8 +45,8 @@ void SimpleConstant() {
// Run
net.RunOp(D);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
.Output("TOutput")
......@@ -58,8 +58,8 @@ void SimpleConstant() {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto output = net.GetTensor("Output");
......@@ -93,7 +93,8 @@ void Result(const std::vector<index_t> &input_shape,
if (D == DeviceType::CPU) {
t_input = "TInput";
t_output = "TOutput";
net.TransformDataFormat<DeviceType::CPU, T>(input, NHWC, t_input, NCHW);
net.TransformDataFormat<DeviceType::CPU, T>(
input, DataFormat::NHWC, t_input, DataFormat::NCHW);
}
OpDefBuilder("Pad", "PadTest")
......@@ -108,7 +109,8 @@ void Result(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, T>(t_output, NCHW, output, NHWC);
net.TransformDataFormat<DeviceType::CPU, T>(
t_output, DataFormat::NCHW, output, DataFormat::NHWC);
}
auto actual = net.GetTensor(output.c_str());
......@@ -172,8 +174,8 @@ TEST_F(PadTest, ComplexCPU) {
// Add input data
net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
.Output("TOutput")
......@@ -184,8 +186,8 @@ TEST_F(PadTest, ComplexCPU) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto output = net.GetTensor("Output");
......@@ -209,8 +211,8 @@ void Complex(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
.Output("TOutput")
......@@ -222,8 +224,8 @@ void Complex(const std::vector<index_t> &input_shape,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......
......@@ -270,9 +270,9 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcPaddingAndOutputSize(input_tensor->shape().data(),
NHWC,
DataFormat::NHWC,
filter_shape.data(),
OHWI,
DataFormat::OHWI,
dilations_.data(),
strides_.data(),
padding_type_,
......@@ -281,9 +281,9 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(),
NHWC,
DataFormat::NHWC,
filter_shape.data(),
OHWI,
DataFormat::OHWI,
paddings_.data(),
dilations_.data(),
strides_.data(),
......@@ -477,10 +477,9 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
public:
explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
}
}
......
......@@ -34,8 +34,8 @@ TEST_F(PoolingOpTest, MAX_VALID) {
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -50,8 +50,8 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected =
......@@ -68,8 +68,8 @@ TEST_F(PoolingOpTest, MAX_SAME) {
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -84,8 +84,8 @@ TEST_F(PoolingOpTest, MAX_SAME) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
......@@ -102,8 +102,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -118,8 +118,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
......@@ -136,8 +136,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -152,8 +152,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
......@@ -174,8 +174,8 @@ void SimpleMaxPooling3S2() {
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -187,8 +187,8 @@ void SimpleMaxPooling3S2() {
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
......@@ -224,8 +224,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -240,8 +240,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -304,8 +304,8 @@ TEST_F(PoolingOpTest, AVG_VALID) {
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -320,8 +320,8 @@ TEST_F(PoolingOpTest, AVG_VALID) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>(
......@@ -373,8 +373,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -389,8 +389,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -563,7 +563,7 @@ void TestQuant(const index_t batch,
net.AddRandomInput<CPU, float>(
"Input", input_shape, false, false);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddRandomInput<DeviceType::CPU, float>(
"OutputNCHW", input_shape, false, true, true);
......@@ -580,7 +580,7 @@ void TestQuant(const index_t batch,
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <memory>
#include <set>
#include <vector>
#include "mace/core/future.h"
......@@ -872,7 +873,7 @@ class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
public:
explicit ReduceOp(OpConstructContext *context)
: ReduceOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ReduceKernel<T>>(reduce_type_,
axis_,
keep_dims_);
......@@ -907,6 +908,34 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Reduce")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
bool keep_dims =
ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
*op, "keepdims", false);
if (!keep_dims) {
return { DeviceType::CPU };
}
auto axis =
ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
*op, "axis");
if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
return { DeviceType::CPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (tensor_shape_info->count(op->input(0)) == 0
|| tensor_shape_info->at(op->input(0)).size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -38,7 +38,8 @@ void Simple(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Reduce", "ReduceTest")
.Input("InputNCHW")
.AddIntsArg("axis", axis)
......@@ -49,7 +50,8 @@ void Simple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("Reduce", "ReduceTest")
.Input("Input")
......@@ -289,8 +291,8 @@ void RandomTest(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Reduce", "ReduceTest")
.Input("InputNCHW")
.AddIntsArg("axis", axis)
......@@ -301,8 +303,8 @@ void RandomTest(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Reduce", "ReduceTest")
.Input("Input")
.AddIntsArg("axis", axis)
......@@ -353,7 +355,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
net.AddRandomInput<CPU, float>(
"Input", input_shape, false, false);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddRandomInput<DeviceType::CPU, float>(
"OutputNCHW", input_shape, false, true, true);
......@@ -368,7 +370,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
......
......@@ -51,7 +51,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
&out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......
......@@ -50,7 +50,7 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
&out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......@@ -185,7 +185,7 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
&out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......
......@@ -212,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
"size", {-1, -1});
MACE_CHECK(size.size() == 2);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBicubicKernel<T>>(
align_corners, size[0], size[1]);
} else {
......
......@@ -31,8 +31,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -42,8 +42,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
......@@ -60,8 +60,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
std::vector<float> input(48);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 4, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -71,8 +71,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 2, 3, 3},
......@@ -92,8 +92,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -104,8 +104,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
......@@ -133,8 +133,8 @@ void TestRandomResizeBicubic() {
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels},
false, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -144,8 +144,8 @@ void TestRandomResizeBicubic() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......
......@@ -346,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
"size", {-1, -1});
MACE_CHECK(size.size() == 2);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBilinearKernel<T>>(
align_corners, size[0], size[1]);
} else {
......
......@@ -31,8 +31,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -42,8 +42,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
......@@ -60,8 +60,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -72,8 +72,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
......@@ -100,8 +100,8 @@ void TestRandomResizeBilinear() {
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -111,8 +111,8 @@ void TestRandomResizeBilinear() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -155,8 +155,8 @@ void TestQuantizedResizeBilinear() {
true,
-1.f,
1.f);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -166,8 +166,8 @@ void TestQuantizedResizeBilinear() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
......
......@@ -149,7 +149,7 @@ class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation {
: Operation(context) {
bool align_corners = Operation::GetOptionalArg<bool>(
"align_corners", false);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel<T>>(
align_corners);
} else {
......
......@@ -32,8 +32,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
std::iota(begin(input), end(input), 0);
std::vector<int32_t> size = {1, 2};
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
......@@ -45,8 +45,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
......@@ -64,8 +64,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
std::iota(begin(input), end(input), 0);
std::vector<int32_t> size = {1, 2};
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
......@@ -78,8 +78,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
......@@ -105,8 +105,8 @@ void TestRandomResizeNearestNeighbor() {
std::vector<int32_t> size = {20, 40};
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddInputFromArray<D, int32_t>("Size", {2}, size);
OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
.Input("InputNCHW")
......@@ -116,8 +116,8 @@ void TestRandomResizeNearestNeighbor() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......
......@@ -100,11 +100,7 @@ class ScalarMathOp : public Operation {
coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1)) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
"scalar_input_index", 1)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......
......@@ -414,10 +414,9 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
: Operation(context) {
bool use_log = (
Operation::GetOptionalArg<bool>("use_log", false));
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);
}
}
......@@ -456,7 +455,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Softmax")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
......
......@@ -50,7 +50,8 @@ void Simple(bool use_log = false) {
if (D == DeviceType::CPU) {
// test 4d softmax
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -59,7 +60,8 @@ void Simple(bool use_log = false) {
// Run
net.RunOp(D);
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
......@@ -109,7 +111,8 @@ void Complex(const std::vector<index_t> &logits_shape,
net.AddRandomInput<D, float>("Input", logits_shape);
if (logits_shape.size() == 4) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
......@@ -127,7 +130,8 @@ void Complex(const std::vector<index_t> &logits_shape,
net.RunOp();
if (logits_shape.size() == 4) {
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>();
......
......@@ -307,7 +307,7 @@ class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase {
public:
explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToBatchKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -39,8 +39,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -53,8 +53,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
......@@ -78,8 +78,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -92,8 +92,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
......@@ -155,8 +155,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
net.RunOp(GPU);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -164,8 +164,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
......@@ -188,8 +188,8 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
net.RunOp(GPU);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -197,8 +197,8 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
......@@ -218,8 +218,8 @@ void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -227,8 +227,8 @@ void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
......@@ -279,8 +279,8 @@ void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -288,8 +288,8 @@ void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
......
......@@ -94,7 +94,7 @@ class SpaceToDepthOp<DeviceType::GPU, T> : public Operation {
explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -32,8 +32,8 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -41,8 +41,8 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
......@@ -107,8 +107,8 @@ void RandomTest(const int block_size,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
......@@ -118,8 +118,8 @@ void RandomTest(const int block_size,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("Input")
......
......@@ -106,7 +106,7 @@ class SplitOp<DeviceType::GPU, T> : public Operation {
explicit SplitOp(OpConstructContext *context)
: Operation(context) {
int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SplitKernel<T>>(axis);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -144,7 +144,7 @@ void RegisterSplit(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Split")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU};
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册