Update the memory type choose logic and polish some code.

1. Change DataFormat from enum to enum class.

Update the memory type choose logic and polish some code.
1. Change DataFormat from enum to enum class.
95b32c24 · liuqi · 74dcd617 · 95b32c24 · 95b32c24 · 95b32c24
91 changed file
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -83,7 +83,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
  } else if (data_format_str == "OIHW") {
    return DataFormat::OIHW;
  } else {
-    return DataFormat::DF_NONE;
+    return DataFormat::NONE;
  }
 }

--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -123,14 +123,13 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f)       \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i)        \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i)         \
-  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i)     \
+  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i)
-  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, std::string, s)
 MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef)
 MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef)
 #undef MACE_SET_OPTIONAL_ARGUMENT_FUNC
-std::string OutputMemoryTypeTagName() {
+const std::string OutputMemoryTypeTagName() {
  static const char *kOutputMemTypeArgName = "output_mem_type";
  return kOutputMemTypeArgName;
 }

--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -65,7 +65,7 @@ void SetProtoArg(NetDef *op_def,
                 const std::string &arg_name,
                 const T&value);
-std::string OutputMemoryTypeTagName();
+const std::string OutputMemoryTypeTagName();
 bool IsQuantizedModel(const NetDef &def);

--- a/mace/core/memory_optimizer.cc
+++ b/mace/core/memory_optimizer.cc
@@ -126,7 +126,8 @@ void MemoryOptimizer::Optimize(
  DataFormat data_format = static_cast<DataFormat>(
      ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-          *op_def, "data_format", DataFormat::DF_NONE));
+          *op_def, "data_format",
+          static_cast<int>(DataFormat::NONE)));
  int output_size = op_def->output_size();
  for (int i = 0; i < output_size; ++i) {
    if (i < op_def->output_type_size()) {

--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -76,7 +76,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
 #ifdef MACE_ENABLE_OPENCL
    if (target_device_->device_type() == DeviceType::GPU) {
-      // update the map : output_tensor -> Operation
+      // update the map : output_tensor -> MemoryType
      MemoryType out_mem_type =
          static_cast<MemoryType>(
              ProtoArgHelper::GetOptionalArg<OperatorDef, int>(

--- a/mace/core/net_def_adapter.cc
+++ b/mace/core/net_def_adapter.cc
@@ -37,7 +37,7 @@ DataFormat GetDefaultDataFormat(DeviceType device_type,
    return DataFormat::NHWC;
  } else {
    LOG(FATAL) << "MACE do not support the device " << device_type;
-    return DataFormat::DF_NONE;
+    return DataFormat::NONE;
  }
 }
@@ -50,19 +50,21 @@ std::string TransformedName(const std::string &input_name,
  return ss.str();
 }
+#ifdef MACE_ENABLE_OPENCL
 bool TransformRequiredOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kNoTransformOp = {
      "Shape", "InferConv2dShape"
  };
  return kNoTransformOp.count(op_type) == 0;
 }
+#endif  // MACE_ENABLE_OPENCL
 void BuildTransposeOpDef(
    const std::string &input_name,
    const std::string &output_name,
-    const std::vector<mace::index_t> &output_shape,
+    const std::vector<index_t> &output_shape,
    const std::vector<int> dst_dims,
-    const mace::DataType dt,
+    const DataType dt,
    DeviceType device_type,
    OperatorDef *op_def) {
  std::string op_name = "mace_node_" + output_name;
@@ -89,21 +91,13 @@ void BuildTransposeOpDef(
 }  // namespace
-NetDefAdapter::NetDefAdapter(const mace::OpRegistryBase *op_registry,
+NetDefAdapter::NetDefAdapter(const OpRegistryBase *op_registry,
-                             const mace::Workspace *ws)
+                             const Workspace *ws)
    : op_registry_(op_registry), ws_(ws) {}
-// Adapt original net_def to a better net.
-// 1. Adapt device: choose best device for every op in the net.
-// 2. Adapt data type: Add data type related transform ops
-//                     for mixing precision.
-// 3. Adapt data format: confirm data format of every op
-//                       and add transpose if necessary.
-// 4. Adapt memory type: Add BufferTransform if necessary
-//                       for transforming memory type between ops.
 MaceStatus NetDefAdapter::AdaptNetDef(
-    const mace::NetDef *net_def,
+    const NetDef *net_def,
-    mace::Device *target_device,
+    Device *target_device,
    NetDef *target_net_def) {
  MACE_LATENCY_LOGGER(1, "Adapting original NetDef");
  // Copy from original op_def, leave ops alone.
@@ -115,7 +109,7 @@ MaceStatus NetDefAdapter::AdaptNetDef(
  std::unique_ptr<CPUDevice> cpu_device = make_unique<CPUDevice>(
      target_device->cpu_runtime()->num_threads(),
      target_device->cpu_runtime()->policy(),
-      target_device->cpu_runtime()->use_gemmlowp());
+      &(target_device->cpu_runtime()->thread_pool()));
  // quantize model flag
  bool is_quantized_model = IsQuantizedModel(*net_def);
@@ -131,40 +125,40 @@ MaceStatus NetDefAdapter::AdaptNetDef(
        std::vector<index_t>(tensor.dims().begin(), tensor.dims().end());
  }
+  MemoryType mem_type = MemoryType::CPU_BUFFER;
+  if (target_device->device_type() == DeviceType::CPU) {
+    mem_type = MemoryType::CPU_BUFFER;
+  } else if (target_device->device_type() == DeviceType::GPU) {
+    mem_type = MemoryType::GPU_BUFFER;
+  } else {
+    LOG(FATAL) << "MACE do not support the device type: "
+               << target_device->device_type();
+  }
  int input_size = target_net_def->input_info_size();
  for (int i = 0; i < input_size; ++i) {
    auto input_info = target_net_def->mutable_input_info(i);
-    MemoryType mem_type = MemoryType::CPU_BUFFER;
+    auto input_data_format = static_cast<DataFormat>(
-    if (target_device->device_type() == DeviceType::CPU) {
-      mem_type = MemoryType::CPU_BUFFER;
-    } else if (target_device->device_type() == DeviceType::GPU) {
-      mem_type = MemoryType::GPU_BUFFER;
-    } else {
-      LOG(FATAL) << "MACE do not support the device type: "
-                 << target_device->device_type();
-    }
-    DataFormat input_data_format = static_cast<DataFormat>(
        input_info->data_format());
    DataFormat expected_data_format = GetDefaultDataFormat(
        target_device->device_type(), is_quantized_model);
-    std::vector<index_t> input_shape =
+    std::vector<index_t> input_shape(input_info->dims().begin(),
-        std::vector<index_t>(input_info->dims().begin(),
+                                     input_info->dims().end());
-                             input_info->dims().end());
+    if (input_data_format != DataFormat::NONE
-    if (input_data_format != DataFormat::DF_NONE
        && input_data_format != expected_data_format
        && input_shape.size() == 4) {
      if (input_data_format == DataFormat::NHWC
          && expected_data_format == DataFormat::NCHW) {
-        std::vector<int> dst_dims = {0, 3, 1, 2};
+        std::vector<int> dst_dims{0, 3, 1, 2};
        input_data_format = DataFormat::NCHW;
        input_shape = TransposeShape<index_t, index_t>(input_shape, dst_dims);
      } else if (input_data_format == DataFormat::NCHW
          && expected_data_format == DataFormat::NHWC) {
-        std::vector<int> dst_dims = {0, 2, 3, 1};
+        std::vector<int> dst_dims{0, 2, 3, 1};
        input_data_format = DataFormat::NHWC;
        input_shape = TransposeShape<index_t, index_t>(input_shape, dst_dims);
      }
-      input_info->set_data_format(input_data_format);
+      input_info->set_data_format(static_cast<int>(input_data_format));
      int input_shape_size = input_shape.size();
      for (int j = 0; j < input_shape_size; ++j) {
        input_info->set_dims(j, input_shape[j]);
@@ -287,9 +281,10 @@ MaceStatus NetDefAdapter::AdaptNetDef(
            internal_output_info.data_format,
            transformed_op_def);
        // set data format arg
-        SetProtoArg<int>(transformed_op_def,
+        SetProtoArg<int>(
-                         "data_format",
+            transformed_op_def,
-                         internal_output_info.data_format);
+            "data_format",
+            static_cast<int>(internal_output_info.data_format));
        // set output memory type argument
        SetProtoArg<int>(transformed_op_def,
                         OutputMemoryTypeTagName(),
@@ -309,7 +304,7 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context,
                                      const TensorInfoMap &output_map,
                                      const NetDef *net_def,
                                      OperatorDef *op_def) {
-  VLOG(1) << "Adapt device for op " << op_def->name();
+  VLOG(3) << "Adapt device for op " << op_def->name();
  DeviceType target_device_type = target_device->device_type();
  DeviceType device_type = DeviceType::CPU;
  context->set_device(cpu_device);
@@ -335,15 +330,18 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context,
                                                  producer_devices);
    if (device_type == target_device_type) {
      context->set_device(target_device);
+    } else {
+      LOG(INFO) << "Op " << op_def->name() << " fall back to CPU";
    }
  }
  op_def->set_device_type(device_type);
  return MaceStatus::MACE_SUCCESS;
 }
-MaceStatus NetDefAdapter::AdaptDataType(mace::OpConditionContext *context,
+MaceStatus NetDefAdapter::AdaptDataType(OpConditionContext *context,
-                                        mace::OperatorDef *op_def) {
+                                        OperatorDef *op_def) {
  MACE_UNUSED(context);
+  // Where to add logic to support mixing precision
  // Adjust data type of op ran on CPU
  DataType dtype = static_cast<DataType>(
      ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
@@ -355,20 +353,20 @@ MaceStatus NetDefAdapter::AdaptDataType(mace::OpConditionContext *context,
 }
 MaceStatus NetDefAdapter::AdaptDataFormat(
-    mace::OpConditionContext *context,
+    OpConditionContext *context,
-    mace::OperatorDef *op_def,
+    OperatorDef *op_def,
    bool is_quantized_model,
    TensorInfoMap *output_map,
    std::unordered_set<std::string> *transformed_set,
    DataFormat *op_output_df,
-    mace::NetDef *target_net_def) {
+    NetDef *target_net_def) {
-  VLOG(1) << "Adapt data format for op " << op_def->name();
+  VLOG(3) << "Adapt data format for op " << op_def->name();
-  MACE_UNUSED(context);
  DataFormat op_data_format =
      static_cast<DataFormat>(ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-          *op_def, "data_format", 0));
+          *op_def, "data_format",
+          static_cast<int>(DataFormat::NONE)));
  // adjust the data format of operation
-  if (op_data_format == DataFormat::DF_AUTO) {
+  if (op_data_format == DataFormat::AUTO) {
    op_data_format = GetDefaultDataFormat(
        static_cast<DeviceType>(op_def->device_type()), is_quantized_model);
    SetProtoArg<int>(op_def, "data_format", static_cast<int>(op_data_format));
@@ -376,14 +374,15 @@ MaceStatus NetDefAdapter::AdaptDataFormat(
      int output_shape_size = op_def->output_shape_size();
      for (int i = 0; i < output_shape_size; ++i) {
        auto output_shape = op_def->mutable_output_shape(i);
-        if (output_shape->dims_size() == 4) {
+        MACE_CHECK(output_shape->dims_size() == 4,
-          // transpose output shape format from NHWC to NCHW
+                   "Output shape should be 4D if the of has data format. ",
-          int64_t height = output_shape->dims(1);
+                   op_def->name());
-          int64_t width = output_shape->dims(2);
+        // transpose output shape format from NHWC to NCHW
-          output_shape->set_dims(1, output_shape->dims(3));
+        int64_t height = output_shape->dims(1);
-          output_shape->set_dims(2, height);
+        int64_t width = output_shape->dims(2);
-          output_shape->set_dims(3, width);
+        output_shape->set_dims(1, output_shape->dims(3));
-        }
+        output_shape->set_dims(2, height);
+        output_shape->set_dims(3, width);
      }
    }
  }
@@ -394,8 +393,8 @@ MaceStatus NetDefAdapter::AdaptDataFormat(
  if (op_def->device_type() == DeviceType::GPU) {
    target_mem_type = MemoryType::GPU_BUFFER;
  }
-  // Use op's data format as inputs' data format for now.
+  auto inputs_data_format = op_registry_->InputsDataFormat(op_def->type(),
-  // Could move the logic to OpRegistry if necessary.
+      context);
  DataFormat src_df, dst_df;
  int input_size = op_def->input_size();
  for (int i = 0; i < input_size; ++i) {
@@ -408,20 +407,21 @@ MaceStatus NetDefAdapter::AdaptDataFormat(
      continue;
    }
    src_df = output_map->at(op_def->input(i)).data_format;
-    dst_df = op_data_format;
+    dst_df = inputs_data_format[i];
-    if (src_df == DataFormat::DF_NONE
+    if (src_df == DataFormat::NONE
-        || dst_df == DataFormat::DF_NONE
+        || dst_df == DataFormat::NONE
        || output_map->at(op_def->input(i)).shape.size() != 4) {
      continue;
    }
    if (src_df != dst_df) {
      std::string transformed_name = TransformedName(op_def->input(i),
-          "data_format", dst_df);
+          "data_format", static_cast<int>(dst_df));
      if (transformed_set->count(transformed_name) == 0) {
        VLOG(1) << "Add Transpose operation " << op_def->name()
                << " to transpose tensor "
                << op_def->input(i) << "', from data format "
-                << src_df << " to " << dst_df;
+                << static_cast<int>(src_df) << " to "
+                << static_cast<int>(dst_df);
        // Only support transpose between NHWC and NCHW for now.
        std::vector<int> dst_dims;
        if (src_df == DataFormat::NCHW && dst_df == DataFormat::NHWC) {
@@ -430,7 +430,8 @@ MaceStatus NetDefAdapter::AdaptDataFormat(
          dst_dims = {0, 3, 1, 2};
        } else {
          LOG(FATAL) << "Encounter unsupported data format transpose from "
-                     << src_df << " to " << dst_df;
+                     << static_cast<int>(src_df) << " to "
+                     << static_cast<int>(dst_df);
        }
        auto &input_info = output_map->at(op_def->input(i));
        auto output_shape = input_info.shape.empty() ?
@@ -449,7 +450,7 @@ MaceStatus NetDefAdapter::AdaptDataFormat(
        // set data format arg
        SetProtoArg<int>(transpose_op_def,
                         "data_format",
-                         dst_df);
+                         static_cast<int>(dst_df));
        // set output memory type argument
        SetProtoArg<int>(transpose_op_def,
                         OutputMemoryTypeTagName(),
@@ -475,20 +476,20 @@ MaceStatus NetDefAdapter::AdaptDataFormat(
 }
 MaceStatus NetDefAdapter::AdaptMemoryType(
-    mace::OpConditionContext *context,
+    OpConditionContext *context,
-    mace::OperatorDef *op_def,
+    OperatorDef *op_def,
-    mace::NetDefAdapter::TensorInfoMap *output_map,
+    NetDefAdapter::TensorInfoMap *output_map,
    std::unordered_set<std::string> *transformed_set,
    MemoryType *op_output_mem_types,
-    mace::NetDef *target_net_def) {
+    NetDef *target_net_def) {
-  VLOG(1) << "Adapt memory type for op " << op_def->name();
+  VLOG(3) << "Adapt memory type for op " << op_def->name();
  // Get expected output memory type
  // (only support one kind of memory type for multiple outputs)
  op_registry_->GetInOutMemoryTypes(op_def->type(), context);
 #ifdef MACE_ENABLE_OPENCL
-  int input_size = op_def->input_size();
  // if op is memory-unused op, no transformation
  if (TransformRequiredOp(op_def->type())) {
+    int input_size = op_def->input_size();
    for (int i = 0; i < input_size; ++i) {
      if (output_map->count(op_def->input(i)) == 0) {
        MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
@@ -498,14 +499,14 @@ MaceStatus NetDefAdapter::AdaptMemoryType(
        continue;
      }
      auto &input_info = output_map->at(op_def->input(i));
-      if (input_info.data_format == DataFormat::DF_NONE
-          || input_info.shape.size() != 4) {
-        continue;
-      }
      // check whether to do transform
      MemoryType src_mem_type = input_info.mem_type;
      MemoryType dst_mem_type = context->GetInputMemType(i);
-      if (src_mem_type != dst_mem_type) {
+      auto wanted_input_dtype = context->GetInputDataType(i);
+      if (src_mem_type != dst_mem_type ||
+          (input_info.dtype != wanted_input_dtype &&
+              (src_mem_type != MemoryType::CPU_BUFFER
+                  || dst_mem_type != MemoryType::CPU_BUFFER))) {
        auto transformed_name = TransformedName(op_def->input(i),
                                                "mem_type",
                                                dst_mem_type);
@@ -521,7 +522,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType(
              op_def->input(i),
              input_info.shape,
              transformed_name,
-              context->GetInputDataType(i),
+              wanted_input_dtype,
              context->GetInputOpenCLBufferType(i),
              dst_mem_type,
              input_info.data_format,
@@ -529,7 +530,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType(
          // set data format arg
          SetProtoArg<int>(transformed_op_def,
                           "data_format",
-                           input_info.data_format);
+                           static_cast<int>(input_info.data_format));
          // set output memory type argument
          SetProtoArg<int>(transformed_op_def,
                           OutputMemoryTypeTagName(),
@@ -564,7 +565,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType(
  return MaceStatus::MACE_SUCCESS;
 }
-std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) {
+std::string NetDefAdapter::DebugString(const NetDef *net_def) {
  std::stringstream sstream;
  auto DeviceTypeToStrFunc = [](DeviceType device_type) -> std::string {
    if (device_type == DeviceType::CPU) {
@@ -591,10 +592,10 @@ std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) {
      return "NHWC";
    } else if (type == DataFormat::NCHW) {
      return "NCHW";
-    } else if (type == DataFormat::DF_NONE) {
+    } else if (type == DataFormat::NONE) {
-      return "DF_NONE";
+      return "NONE";
-    } else if (type == DataFormat::DF_AUTO) {
+    } else if (type == DataFormat::AUTO) {
-      return "DT_AUTO";
+      return "AUTO";
    } else if (type == DataFormat::OIHW) {
      return "OIHW";
    } else {
@@ -615,7 +616,7 @@ std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) {
    std::string data_format = DataFormatToStrFunc(
        static_cast<DataFormat>(
            ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-                op, "data_format", 0)));
+                op, "data_format", static_cast<int>(DataFormat::NONE))));
    sstream << std::endl;
    sstream << "{" << std::endl;

--- a/mace/core/net_def_adapter.h
+++ b/mace/core/net_def_adapter.h
@@ -32,16 +32,22 @@ class OpRegistryBase;
 class Workspace;
 class Device;
-/**
+/// Conventions:
- * Conventions:
+/// 1. DataFormat::AUTO stands for formatted (NHWC or NCHW)
- * 1. DataFormat::DT_AUTO stands for formatted (NHWC or NCHW)
+/// 2. if Op with DataFormat::AUTO, the arguments of this op
- * 2. if Op with DataFormat::DT_AUTO, the arguments of this op
+///    is formatted to NHWC
- *    is formatted to NHWC
- */
 class NetDefAdapter {
 public:
  NetDefAdapter(const OpRegistryBase *op_registry,
                const Workspace *ws);
+  // Adapt original net_def to a better net.
+  // 1. Adapt device: choose best device for every op in the net.
+  // 2. Adapt data type: Add data type related transform ops
+  //                     for mixing precision.
+  // 3. Adapt data format: confirm data format of every op
+  //                       and add transpose if necessary.
+  // 4. Adapt memory type: Add BufferTransform if necessary
+  //                       for transforming memory type between ops.
  MaceStatus AdaptNetDef(
      const NetDef *net_def,
      Device *target_device,
@@ -91,12 +97,12 @@ class NetDefAdapter {
      NetDef *target_net_def);
  MaceStatus AdaptMemoryType(
-      mace::OpConditionContext *context,
+      OpConditionContext *context,
-      mace::OperatorDef *op_def,
+      OperatorDef *op_def,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      MemoryType *op_output_mem_types,
-      mace::NetDef *target_net_def);
+      NetDef *target_net_def);
  std::string DebugString(const NetDef *net_def);

--- a/mace/core/net_optimizer.cc
+++ b/mace/core/net_optimizer.cc
@@ -19,10 +19,10 @@
 namespace mace {
 DeviceType NetOptimizer::SelectBestDevice(
-    const mace::OperatorDef *op_def,
+    const OperatorDef *op_def,
    DeviceType target_device_type,
-    const std::set<mace::DeviceType> &available_devices,
+    const std::set<DeviceType> &available_devices,
-    const std::vector<mace::DeviceType> &inputs_op_devices) {
+    const std::vector<DeviceType> &inputs_op_devices) {
  static const std::set<std::string> kComputeIntensiveOps = {
      "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d",
      "FullyConnected"

--- a/mace/core/net_optimizer.h
+++ b/mace/core/net_optimizer.h
@@ -23,8 +23,21 @@
 namespace mace {
+/// Any optimization for Net could be put in here in the future.
 class NetOptimizer {
 public:
+  /// Select best device for the op to support mixing usage of CPU and GPU.
+  /// Greedy strategy: one way to the end. If the op fallback to CPU, then
+  /// the follow-up ops will run on CPU too util meet
+  /// some compute-intensive ops(Convolution) to
+  /// reduce the memory copy between CPU and GPU.
+  /// Simple but effective.
+  ///
+  /// \param op_def the op
+  /// \param target_device target device to run on
+  /// \param available_devices available devices of the op
+  /// \param inputs_op_devices devices of father ops run on
+  /// \return Best device for the op_def
  DeviceType SelectBestDevice(const OperatorDef *op_def,
                              DeviceType target_device,
                              const std::set<DeviceType> &available_devices,

--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -21,22 +21,22 @@
 namespace mace {
 OpConditionContext::OpConditionContext(
-    const mace::Workspace *ws,
+    const Workspace *ws,
-    mace::OpConditionContext::TensorShapeMap *info)
+    OpConditionContext::TensorShapeMap *info)
    : operator_def_(nullptr),
      ws_(ws),
      device_(nullptr),
      tensor_shape_info_(info) {}
 void OpConditionContext::set_operator_def(
-    const mace::OperatorDef *operator_def) {
+    const OperatorDef *operator_def) {
  operator_def_ = operator_def;
  input_data_types_.clear();
 }
 void OpConditionContext::SetInputInfo(size_t idx,
-                                      mace::MemoryType mem_type,
+                                      MemoryType mem_type,
-                                      mace::DataType dt) {
+                                      DataType dt) {
  if (input_mem_types_.empty()) {
    // the default inputs' memory types are same as output memory type.
    input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
@@ -53,7 +53,7 @@ void OpConditionContext::SetInputInfo(size_t idx,
  input_data_types_[idx] = dt;
 }
-void OpConditionContext::set_output_mem_type(mace::MemoryType type) {
+void OpConditionContext::set_output_mem_type(MemoryType type) {
  MACE_CHECK(operator_def_ != nullptr);
  output_mem_type_ = type;
  input_mem_types_.clear();
@@ -106,7 +106,7 @@ OpConstructContext::OpConstructContext(Workspace *ws)
      device_(nullptr) {}
 void OpConstructContext::set_operator_def(
-    std::shared_ptr<mace::OperatorDef> operator_def) {
+    std::shared_ptr<OperatorDef> operator_def) {
  operator_def_ = operator_def;
 }
@@ -225,9 +225,20 @@ OpRegistrationInfo::OpRegistrationInfo() {
      context->set_output_mem_type(MemoryType::CPU_BUFFER);
    }
  };
+  data_format_selector = [](OpConditionContext *context)
+      -> std::vector<DataFormat> {
+    DataFormat op_data_format =
+        static_cast<DataFormat>(
+            ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+                *context->operator_def(), "data_format",
+                static_cast<int>(DataFormat::NONE)));
+    return std::vector<DataFormat>(context->operator_def()->input_size(),
+                                   op_data_format);
+  };
 }
-void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
+void OpRegistrationInfo::AddDevice(DeviceType device) {
  devices.insert(device);
 }
@@ -239,9 +250,9 @@ void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
 MaceStatus OpRegistryBase::Register(
    const std::string &op_type,
-    const mace::DeviceType device_type,
+    const DeviceType device_type,
-    const mace::DataType dt,
+    const DataType dt,
-    mace::OpRegistrationInfo::OpCreator creator) {
+    OpRegistrationInfo::OpCreator creator) {
  if (registry_.count(op_type) == 0) {
    registry_[op_type] = std::unique_ptr<OpRegistrationInfo>(
        new OpRegistrationInfo);
@@ -277,12 +288,20 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
 void OpRegistryBase::GetInOutMemoryTypes(
    const std::string &op_type,
-    mace::OpConditionContext *context) const {
+    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
  return registry_.at(op_type)->memory_type_setter(context);
 }
+const std::vector<DataFormat> OpRegistryBase::InputsDataFormat(
+    const std::string &op_type,
+    OpConditionContext *context) const {
+  MACE_CHECK(registry_.count(op_type) != 0,
+             op_type, " operation is not registered.");
+  return registry_.at(op_type)->data_format_selector(context);
+}
 std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
    OpConstructContext *context,
    DeviceType device_type) const {
@@ -321,11 +340,17 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
 }
 OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter(
-    mace::OpRegistrationInfo::MemoryTypeSetter setter) {
+    OpRegistrationInfo::MemoryTypeSetter setter) {
  memory_type_setter_ = setter;
  return *this;
 }
+OpConditionBuilder& OpConditionBuilder::SetInputsDataFormatSelector(
+    OpRegistrationInfo::DataFormatSelector selector) {
+  data_format_selector_ = selector;
+  return *this;
+}
 void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
  if (info != nullptr) {
    if (placer_) {
@@ -334,6 +359,10 @@ void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
    if (memory_type_setter_) {
      info->memory_type_setter = memory_type_setter_;
    }
+    if (data_format_selector_) {
+      info->data_format_selector = data_format_selector_;
+    }
  }
 }

--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -117,6 +117,14 @@ class OpConstructContext {
  inline Device *device() const {
    return device_;
  }
+#ifdef MACE_ENABLE_OPENCL
+  inline MemoryType GetOpMemoryType() const {
+    return static_cast<MemoryType>(
+        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+            *operator_def_, OutputMemoryTypeTagName(),
+            static_cast<int>(MemoryType::CPU_BUFFER)));
+  }
+#endif  // MACE_ENABLE_OPENCL
 private:
  std::shared_ptr<OperatorDef> operator_def_;
@@ -270,6 +278,9 @@ class OpConditionBuilder {
  OpConditionBuilder &SetInputMemoryTypeSetter(
      OpRegistrationInfo::MemoryTypeSetter setter);
+  OpConditionBuilder &SetInputsDataFormatSelector(
+      OpRegistrationInfo::DataFormatSelector selector);
  void Finalize(OpRegistrationInfo *info) const;
 private:
@@ -297,6 +308,9 @@ class OpRegistryBase {
  void GetInOutMemoryTypes(
      const std::string &op_type, OpConditionContext *context) const;
+  const std::vector<DataFormat> InputsDataFormat(
+      const std::string &op_type, OpConditionContext *context) const;
  std::unique_ptr<Operation> CreateOperation(
      OpConstructContext *context,
      DeviceType device_type) const;

--- a/mace/core/runtime/opencl/opencl_util.cc
+++ b/mace/core/runtime/opencl/opencl_util.cc
@@ -173,7 +173,7 @@ void OpenCLUtil::BuildTransformOpDef(
  arg->set_i(static_cast<int32_t>(dt));
  arg = op_def->add_arg();
  arg->set_name("data_format");
-  arg->set_i(data_format);
+  arg->set_i(static_cast<int>(data_format));
  if (!input_shape.empty()) {
    OutputShape *shape = op_def->add_output_shape();
    for (auto value : input_shape) {

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -269,7 +269,7 @@ MaceStatus Workspace::PreallocateOutputTensor(
                    tensor_mem.second.data_type,
                    false, tensor_mem.first));
    tensor->set_data_format(tensor_mem.second.data_format);
-    if (tensor_mem.second.data_format != DataFormat::DF_NONE) {
+    if (tensor_mem.second.data_format != DataFormat::NONE) {
      if (mem_blocks[tensor_mem.second.mem_id].mem_type()
          == MemoryType::GPU_IMAGE) {
        VLOG(1) << "Tensor: " << tensor_mem.first

--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -94,7 +94,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
  } else if (data_format_str == "OIHW") {
    return DataFormat::OIHW;
  } else {
-    return DataFormat::DF_NONE;
+    return DataFormat::NONE;
  }
 }

--- a/mace/libmace/capability.cc
+++ b/mace/libmace/capability.cc
@@ -143,7 +143,7 @@ void BMNet::SetUp() {
  // Add input and output information
  for (size_t i = 0; i < input_names_.size(); ++i) {
    InputOutputInfo *info = net_.add_input_info();
-    info->set_data_format(DataFormat::NHWC);
+    info->set_data_format(static_cast<int>(DataFormat::NHWC));
    info->set_name(input_names_[i]);
    for (auto d : input_shapes_[i]) {
      info->add_dims(static_cast<int>(d));
@@ -244,7 +244,7 @@ void BMNet::AddConv(const std::string &conv_type,
  op_def->add_output(output_name);
  AddIntsArg(op_def, "strides", strides);
  AddIntArg(op_def, "padding", padding_type);
-  AddIntArg(op_def, "has_data_format", 1);
+  AddIntArg(op_def, "data_format", static_cast<int>(DataFormat::AUTO));
  AddIntArg(op_def, "T", DT_HALF);
  if (has_relu6) {
    AddStringArg(op_def, "activation", "RELUX");
@@ -271,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name,
  op_def->add_output(output);
  AddIntArg(op_def, "type", type);
  AddIntArg(op_def, "T", DT_HALF);
-  AddIntArg(op_def, "has_data_format", 1);
+  AddIntArg(op_def, "data_format", static_cast<int>(DataFormat::AUTO));
  OutputShape *shape = op_def->add_output_shape();
  for (auto dim : output_shape) {
    shape->add_dims(dim);

--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -283,9 +283,9 @@ MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
                       std::shared_ptr<void> data,
                       const DataFormat format) {
  MACE_CHECK_NOTNULL(data.get());
-  MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC
+  MACE_CHECK(format == DataFormat::NONE || format == DataFormat::NHWC
-                 || format == DataFormat::NCHW || format == OIHW,
+                 || format == DataFormat::NCHW || format == DataFormat::OIHW,
-             "MACE only support DF_NONE, NHWC, NCHW and OIHW "
+             "MACE only support NONE, NHWC, NCHW and OIHW "
             "formats of input now.");
  impl_ = make_unique<MaceTensor::Impl>();
  impl_->shape = shape;
@@ -496,7 +496,7 @@ MaceStatus MaceEngine::Impl::Init(
    DataType output_dt = output_info_map_[output_name].data_type();
    Tensor *output_tensor =
        ws_->CreateTensor(output_name, device_->allocator(), output_dt);
-    output_tensor->set_data_format(NHWC);
+    output_tensor->set_data_format(DataFormat::NHWC);
 #endif
  }
 #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
@@ -585,14 +585,14 @@ MaceEngine::Impl::~Impl() {
 MaceStatus MaceEngine::Impl::TransposeInput(
    const std::pair<const std::string, MaceTensor> &input,
    Tensor *input_tensor) {
-  bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
+  bool has_data_format = input_tensor->data_format() != DataFormat::NONE;
-  DataFormat data_format = DataFormat::DF_NONE;
+  DataFormat data_format = DataFormat::NONE;
  DataType input_dt = input_tensor->dtype();
  if (has_data_format) {
    std::vector<int> dst_dims;
    if (device_->device_type() == DeviceType::CPU &&
        input.second.shape().size() == 4 &&
-        input.second.data_format() == NHWC &&
+        input.second.data_format() == DataFormat::NHWC &&
        !is_quantized_model_) {
      VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
      input_tensor->set_data_format(DataFormat::NCHW);
@@ -654,28 +654,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
  DataType output_dt = output_tensor->dtype();
  // save output
  if (output_tensor != nullptr && output->second.data() != nullptr) {
-    if (output_tensor->data_format() != DataFormat::DF_NONE &&
+    if (output_tensor->data_format() != DataFormat::NONE &&
-        output->second.data_format() != DataFormat::DF_NONE &&
+        output->second.data_format() != DataFormat::NONE &&
        output->second.shape().size() == 4 &&
        output->second.data_format() != output_tensor->data_format()) {
      VLOG(1) << "Transform output " << output->first << " from "
-              << output_tensor->data_format() << " to "
+              << static_cast<int>(output_tensor->data_format()) << " to "
-              << output->second.data_format();
+              << static_cast<int>(output->second.data_format());
      std::vector<int> dst_dims;
-      if (output_tensor->data_format() == NCHW &&
+      if (output_tensor->data_format() == DataFormat::NCHW &&
-          output->second.data_format() == NHWC) {
+          output->second.data_format() == DataFormat::NHWC) {
        dst_dims = {0, 2, 3, 1};
-      } else if (output_tensor->data_format() == NHWC &&
+      } else if (output_tensor->data_format() == DataFormat::NHWC &&
-          output->second.data_format() == NCHW) {
+          output->second.data_format() == DataFormat::NCHW) {
        dst_dims = {0, 3, 1, 2};
      } else {
        LOG(FATAL) << "Not supported output data format: "
-                   << output->second.data_format() << " vs "
+                   << static_cast<int>(output->second.data_format()) << " vs "
-                   << output_tensor->data_format();
+                   << static_cast<int>(output_tensor->data_format());
      }
      VLOG(1) << "Transform output " << output->first << " from "
-              << output_tensor->data_format() << " to "
+              << static_cast<int>(output_tensor->data_format()) << " to "
-              << output->second.data_format();
+              << static_cast<int>(output->second.data_format());
      std::vector<index_t> shape =
          TransposeShape<index_t, index_t>(output_tensor->shape(),
                                           dst_dims);

--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -96,7 +96,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
    auto leakyrelu_coefficient = static_cast<T>(
        Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
    MemoryType mem_type;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
          type, relux_max_limit, leakyrelu_coefficient);
@@ -140,11 +140,13 @@ void RegisterActivation(OpRegistryBase *op_registry) {
          .SetDevicePlacerFunc(
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
+                if (op->output_shape_size() != op->output_size()) {
+                  return { DeviceType::CPU, DeviceType::GPU };
+                }
                int has_data_format =
                    ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                        *op, "has_data_format", 0);
                if (!has_data_format ||
-                    (op->output_shape_size() != op->output_size()) ||
                    op->output_shape(0).dims_size() != 4) {
                  return { DeviceType::CPU };
                }

--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -207,7 +207,8 @@ void TestSimplePrelu() {
    // Run
    net.RunOp(D);
  } else {
-    net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<D, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Activation", "PreluTest")
        .Input("InputNCHW")
        .Input("Alpha")
@@ -217,7 +218,8 @@ void TestSimplePrelu() {
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  auto expected = net.CreateTensor<float>(

--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -69,7 +69,7 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
 public:
  explicit AddNOp(OpConstructContext *context)
      : Operation(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::AddNKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;
@@ -109,11 +109,13 @@ void RegisterAddN(OpRegistryBase *op_registry) {
          .SetDevicePlacerFunc(
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
+                if (op->output_shape_size() != op->output_size()) {
+                  return { DeviceType::CPU, DeviceType::GPU };
+                }
                int has_data_format =
                    ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                        *op, "has_data_format", 0);
                if (!has_data_format ||
-                    (op->output_shape_size() != op->output_size()) ||
                    op->output_shape(0).dims_size() != 4) {
                  return { DeviceType::CPU };
                }

--- a/mace/ops/arm/fp32/deconv_2d.cc
+++ b/mace/ops/arm/fp32/deconv_2d.cc
@@ -54,7 +54,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
                                 out_pad_size,
                                 &padded_out_shape,
                                 framework_type_,
-                                 NCHW);
+                                 DataFormat::NCHW);
  MACE_RETURN_IF_ERROR(output->Resize(out_shape));

--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -174,7 +174,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
    float leakyrelu_coefficient = Operation::GetOptionalArg<float>(
        "leakyrelu_coefficient", 0.0f);
    MemoryType mem_type;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
          epsilon, activation, relux_max_limit, leakyrelu_coefficient);

--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -34,7 +34,8 @@ void Simple() {
  net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<D, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("BatchNorm", "BatchNormTest")
        .Input("InputNCHW")
        .Input("Scale")
@@ -47,7 +48,8 @@ void Simple() {
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("BatchNorm", "BatchNormTest")
        .Input("Input")
@@ -93,8 +95,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  // Construct graph
  OpDefBuilder("BatchNorm", "BatchNormTest")
@@ -112,8 +114,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -163,8 +165,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "BatchNormTest")
      .Input("InputNCHW")
@@ -179,8 +181,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -230,8 +232,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "BatchNormTest")
      .Input("InputNCHW")
@@ -246,8 +248,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -296,8 +298,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "BatchNormTest")
      .Input("InputNCHW")
@@ -312,8 +314,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();

--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -264,7 +264,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
 public:
  explicit BatchToSpaceNDOp(OpConstructContext *context)
      : BatchToSpaceOpBase(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/bias_add.cc
+++ b/mace/ops/bias_add.cc
@@ -103,7 +103,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
      : Operation(context),
        has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 1)) {
    MemoryType mem_type = MemoryType::CPU_BUFFER;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
    } else {
@@ -151,11 +151,13 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
          .SetDevicePlacerFunc(
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
+                if (op->output_shape_size() != op->output_size()) {
+                  return { DeviceType::CPU, DeviceType::GPU };
+                }
                int has_data_format =
                    ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                        *op, "has_data_format", 0);
                if (!has_data_format ||
-                    (op->output_shape_size() != op->output_size()) ||
                    op->output_shape(0).dims_size() != 4) {
                  return { DeviceType::CPU };
                }

--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -27,9 +27,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
  OpsTestNet net;
  // Add input data
-  DataFormat data_format = NHWC;
  if (D == DeviceType::CPU) {
-    data_format = NCHW;
    net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
  } else if (D == DeviceType::GPU) {
    net.AddRandomInput<D, T>("Input", {batch, height, width, channels});

--- a/mace/ops/bias_add_test.cc
+++ b/mace/ops/bias_add_test.cc
@@ -31,8 +31,8 @@ void BiasAddSimple() {
  net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("BiasAdd", "BiasAddTest")
        .Input("InputNCHW")
        .Input("Bias")
@@ -41,8 +41,8 @@ void BiasAddSimple() {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("BiasAdd", "BiasAddTest")
        .Input("Input")
@@ -83,8 +83,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
                                             {batch, height, width, channels});
  net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  // Construct graph
  OpDefBuilder("BiasAdd", "BiasAddTest")
@@ -97,8 +97,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -132,8 +132,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
                                             {batch, height, width, channels});
  net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  // Construct graph
  OpDefBuilder("BiasAdd", "BiasAddTest")
@@ -146,8 +146,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));

--- a/mace/ops/buffer_to_image_benchmark.cc
+++ b/mace/ops/buffer_to_image_benchmark.cc
@@ -48,7 +48,6 @@ void FilterBufferToImage(int iters,
                   OpenCLBufferType::IN_OUT_CHANNEL,
                   MemoryType::GPU_IMAGE,
                   0,
-                   DataFormat::NHWC,
                   b2i_output);
  };

--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -37,14 +37,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
      .Transform(&context, net.ws()->GetTensor("Input"),
-                 type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
+                 type, MemoryType::GPU_IMAGE, 0, b2i_output);
  // Inverse Transform
  Tensor *i2b_output = net.ws()->CreateTensor(
      "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
      .Transform(&context, b2i_output,
-                 type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
+                 type, MemoryType::GPU_BUFFER, 0, i2b_output);
  // Check
  ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
@@ -178,14 +178,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
      .Transform(&context, net.ws()->GetTensor("Input"),
-                 type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
+                 type, MemoryType::GPU_IMAGE, 0, b2i_output);
  // Inverse Transform
  Tensor *i2b_output = net.ws()->CreateTensor(
      "I2BOutput", context.device()->allocator(), DT_FLOAT);
  OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
      .Transform(&context, b2i_output,
-                 type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
+                 type, MemoryType::GPU_BUFFER, 0, i2b_output);
  // Check
  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
@@ -218,14 +218,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
  // Transform
  OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
      .Transform(&context, net.ws()->GetTensor("Input"),
-                 type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
+                 type, MemoryType::GPU_IMAGE, 0, b2i_output);
  // Inverse Transform
  Tensor *i2b_output = net.ws()->CreateTensor(
      "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
  OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
      .Transform(&context, b2i_output,
-                 type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
+                 type, MemoryType::GPU_BUFFER, 0, i2b_output);
  // Check
  ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),

--- a/mace/ops/buffer_transform.cc
+++ b/mace/ops/buffer_transform.cc
@@ -39,14 +39,11 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
    auto type =
        static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
            "buffer_type", static_cast<int>(CONV2D_FILTER)));
-    DataFormat data_format = static_cast<DataFormat>(
-        Operation::GetOptionalArg<int>("data_format", DataFormat::DF_NONE));
    MemoryType in_mem_type = context->workspace()->GetTensor(
        operator_def_->input(0))->memory_type();
    return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
-        context, input, type, out_mem_type_, wino_blk_size_,
+        context, input, type, out_mem_type_, wino_blk_size_, output);
-        data_format, output);
  }
 private:

--- a/mace/ops/buffer_transform_test.cc
+++ b/mace/ops/buffer_transform_test.cc
@@ -48,7 +48,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
  OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
                                   MemoryType::GPU_BUFFER)
      .Transform(&context, net.ws()->GetTensor("Input"),
-                 type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, bt_output);
+                 type, MemoryType::GPU_BUFFER, 0, bt_output);
  // Inverse Transform
  Tensor *output = net.ws()->CreateTensor(
@@ -57,7 +57,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
  OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
                                   MemoryType::GPU_BUFFER)
      .Transform(&context, bt_output,
-                 type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, output);
+                 type, MemoryType::GPU_BUFFER, 0, output);
  if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
    EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
@@ -94,7 +94,7 @@ void TestArgumentTransform(const index_t input_size) {
                             MemoryType::GPU_BUFFER)
      .Transform(&context, net.ws()->GetTensor("Input"),
                 OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
-                 0, DataFormat::NHWC, output);
+                 0, output);
  index_t expected_size = RoundUp<index_t>(input_size, 4);
  EXPECT_EQ(expected_size, output->buffer_shape()[0]);

--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -82,7 +82,7 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
  explicit ChannelShuffleOp(OpConstructContext *context)
      : Operation(context) {
    const int groups = Operation::GetOptionalArg<int>("group", 1);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
    } else {
      MACE_NOT_IMPLEMENTED;
@@ -119,7 +119,7 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
                if (op->output_shape_size() != op->output_size()) {
-                  return { DeviceType::CPU };
+                  return { DeviceType::CPU, DeviceType::GPU };
                }
                int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                    *op, "group", 1);

--- a/mace/ops/channel_shuffle_test.cc
+++ b/mace/ops/channel_shuffle_test.cc
@@ -28,8 +28,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
      "Input", {1, 1, 2, 8},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  // Construct graph
  OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
@@ -40,8 +40,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>(

--- a/mace/ops/common/conv_pool_2d_util.cc
+++ b/mace/ops/common/conv_pool_2d_util.cc
@@ -40,19 +40,19 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
  index_t input_height = 0, input_width = 0;
  index_t kernel_height = 0, kernel_width = 0;
-  if (input_format == NCHW) {
+  if (input_format == DataFormat::NCHW) {
    input_height = input_shape[2];
    input_width = input_shape[3];
-  } else if (input_format == NHWC) {
+  } else if (input_format == DataFormat::NHWC) {
    input_height = input_shape[1];
    input_width = input_shape[2];
  } else {
    MACE_NOT_IMPLEMENTED;
  }
-  if (filter_format == OIHW) {
+  if (filter_format == DataFormat::OIHW) {
    kernel_height = filter_shape[2];
    kernel_width = filter_shape[3];
-  } else if (filter_format == OHWI) {
+  } else if (filter_format == DataFormat::OHWI) {
    kernel_height = filter_shape[1];
    kernel_width = filter_shape[2];
  } else {
@@ -97,11 +97,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
      0, (output_width - 1) * strides[1] + k_extent_width - input_width);
  output_shape[0] = input_shape[0];
-  if (input_format == NCHW) {
+  if (input_format == DataFormat::NCHW) {
    output_shape[1] = output_channels;
    output_shape[2] = output_height;
    output_shape[3] = output_width;
-  } else if (input_format == NHWC) {
+  } else if (input_format == DataFormat::NHWC) {
    output_shape[1] = output_height;
    output_shape[2] = output_width;
    output_shape[3] = output_channels;
@@ -117,7 +117,8 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape,   // NCHW
                                  Padding padding,
                                  index_t *output_shape,
                                  int *padding_size) {
-  CalcPaddingAndOutputSize(input_shape, NCHW, filter_shape, OIHW, dilations,
+  CalcPaddingAndOutputSize(input_shape, DataFormat::NCHW, filter_shape,
+                           DataFormat::OIHW, dilations,
                           strides, padding, output_shape, padding_size);
 }
@@ -128,7 +129,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,   // NHWC
                                  Padding padding,
                                  index_t *output_shape,
                                  int *padding_size) {
-  CalcPaddingAndOutputSize(input_shape, NHWC, filter_shape, OIHW, dilations,
+  CalcPaddingAndOutputSize(input_shape, DataFormat::NHWC, filter_shape,
+                           DataFormat::OIHW, dilations,
                           strides, padding, output_shape, padding_size);
 }
@@ -151,19 +153,19 @@ void CalcOutputSize(const index_t *input_shape,
  index_t input_height = 0, input_width = 0;
  index_t kernel_height = 0, kernel_width = 0;
-  if (input_format == NCHW) {
+  if (input_format == DataFormat::NCHW) {
    input_height = input_shape[2];
    input_width = input_shape[3];
-  } else if (input_format == NHWC) {
+  } else if (input_format == DataFormat::NHWC) {
    input_height = input_shape[1];
    input_width = input_shape[2];
  } else {
    MACE_NOT_IMPLEMENTED;
  }
-  if (filter_format == OIHW) {
+  if (filter_format == DataFormat::OIHW) {
    kernel_height = filter_shape[2];
    kernel_width = filter_shape[3];
-  } else if (filter_format == OHWI) {
+  } else if (filter_format == DataFormat::OHWI) {
    kernel_height = filter_shape[1];
    kernel_width = filter_shape[2];
  } else {
@@ -195,11 +197,11 @@ void CalcOutputSize(const index_t *input_shape,
  }
  output_shape[0] = input_shape[0];
-  if (input_format == NCHW) {
+  if (input_format == DataFormat::NCHW) {
    output_shape[1] = output_channels;
    output_shape[2] = output_height;
    output_shape[3] = output_width;
-  } else if (input_format == NHWC) {
+  } else if (input_format == DataFormat::NHWC) {
    output_shape[1] = output_height;
    output_shape[2] = output_width;
    output_shape[3] = output_channels;
@@ -215,7 +217,8 @@ void CalcOutputSize(const index_t *input_shape,   // NHWC
                    const int *strides,
                    const RoundType round_type,
                    index_t *output_shape) {
-  CalcOutputSize(input_shape, NHWC, filter_shape, OIHW, padding_size, dilations,
+  CalcOutputSize(input_shape, DataFormat::NHWC, filter_shape,
+                 DataFormat::OIHW, padding_size, dilations,
                 strides, round_type, output_shape);
 }
@@ -226,7 +229,8 @@ void CalcNCHWOutputSize(const index_t *input_shape,   // NCHW
                        const int *strides,
                        const RoundType round_type,
                        index_t *output_shape) {
-  CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations,
+  CalcOutputSize(input_shape, DataFormat::NCHW, filter_shape,
+                 DataFormat::OIHW, padding_size, dilations,
                 strides, round_type, output_shape);
 }
@@ -241,14 +245,18 @@ void CalcDeconvShape_TF(const std::vector<index_t> &input_shape,
                        std::vector<index_t> *padded_out_shape,
                        DataFormat data_format) {
  const index_t
-      in_height = data_format == NCHW ? input_shape[2] : input_shape[1];
+      in_height =
+      data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1];
  const index_t
-      in_width = data_format == NCHW ? input_shape[3] : input_shape[2];
+      in_width =
+          data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2];
  const index_t
-      out_height = data_format == NCHW ? output_shape[2] : output_shape[1];
+      out_height =
+          data_format == DataFormat::NCHW ? output_shape[2] : output_shape[1];
  const index_t
-      out_width = data_format == NCHW ? output_shape[3] : output_shape[2];
+      out_width =
+          data_format == DataFormat::NCHW ? output_shape[3] : output_shape[2];
  const index_t extended_in_height = (in_height - 1) * strides[0] + 1;
  const index_t extended_in_width = (in_width - 1) * strides[1] + 1;
@@ -307,11 +315,11 @@ void CalcDeconvShape_TF(const std::vector<index_t> &input_shape,
    padded_out_shape->resize(4);
    (*padded_out_shape)[0] = output_shape[0];
    (*padded_out_shape)[1] =
-        data_format == NCHW ? output_channel : padded_out_height;
+        data_format == DataFormat::NCHW ? output_channel : padded_out_height;
    (*padded_out_shape)[2] =
-        data_format == NCHW ? padded_out_height : padded_out_width;
+        data_format == DataFormat::NCHW ? padded_out_height : padded_out_width;
    (*padded_out_shape)[3] =
-        data_format == NCHW ? padded_out_width : output_channel;
+        data_format == DataFormat::NCHW ? padded_out_width : output_channel;
  }
 }
@@ -325,9 +333,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
                           std::vector<index_t> *padded_out_shape,
                           DataFormat data_format) {
  const index_t
-      in_height = data_format == NCHW ? input_shape[2] : input_shape[1];
+      in_height =
+          data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1];
  const index_t
-      in_width = data_format == NCHW ? input_shape[3] : input_shape[2];
+      in_width =
+          data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2];
  const index_t output_channel = filter_shape[0] * group;
@@ -351,11 +361,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
    padded_out_shape->resize(4);
    (*padded_out_shape)[0] = input_shape[0];
    (*padded_out_shape)[1] =
-        data_format == NCHW ? output_channel : padded_out_height;
+        data_format == DataFormat::NCHW ? output_channel : padded_out_height;
    (*padded_out_shape)[2] =
-        data_format == NCHW ? padded_out_height : padded_out_width;
+        data_format == DataFormat::NCHW ? padded_out_height : padded_out_width;
    (*padded_out_shape)[3] =
-        data_format == NCHW ? padded_out_width : output_channel;
+        data_format == DataFormat::NCHW ? padded_out_width : output_channel;
  }
  if (out_shape != nullptr) {
@@ -363,9 +373,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
    index_t out_width = padded_out_width - out_pad_size[1];
    out_shape->resize(4);
    (*out_shape)[0] = input_shape[0];
-    (*out_shape)[1] = data_format == NCHW ? output_channel : out_height;
+    (*out_shape)[1] =
-    (*out_shape)[2] = data_format == NCHW ? out_height : out_width;
+        data_format == DataFormat::NCHW ? output_channel : out_height;
-    (*out_shape)[3] = data_format == NCHW ? out_width : output_channel;
+    (*out_shape)[2] = data_format == DataFormat::NCHW ? out_height : out_width;
+    (*out_shape)[3] =
+        data_format == DataFormat::NCHW ? out_width : output_channel;
  }
 }
@@ -385,7 +397,7 @@ void CalDeconvOutputShapeAndPadSize(const std::vector<index_t> &input_shape,
    MACE_CHECK(output_shape->size() == 4,
               "deconv output shape shoud be 4-dims");
    std::vector<index_t> &out_shape = *output_shape;
-    if (data_format == NCHW) {
+    if (data_format == DataFormat::NCHW) {
      const index_t t = out_shape[1];
      out_shape[1] = out_shape[3];
      out_shape[3] = out_shape[2];

--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -199,7 +199,7 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
 public:
  explicit ConcatOp(OpConstructContext *context)
      : ConcatOpBase(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;
@@ -243,9 +243,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
          .SetDevicePlacerFunc(
            [](OpConditionContext *context) -> std::set<DeviceType> {
              auto op = context->operator_def();
+              if (op->output_shape_size() != op->output_size()) {
+                return { DeviceType::CPU, DeviceType::GPU };
+              }
              auto tensor_shape_info = context->tensor_shape_info();
-              if (op->output_shape_size() != op->output_size() ||
+              if (op->output_shape(0).dims_size() != 4) {
-                  op->output_shape(0).dims_size() != 4) {
                return { DeviceType::CPU };
              } else {
                int has_data_format =

--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -231,9 +231,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
    std::vector<int> paddings(2);
    if (paddings_.empty()) {
      CalcPaddingAndOutputSize(input->shape().data(),
-                               NHWC,
+                               DataFormat::NHWC,
                               filter->shape().data(),
-                               OHWI,
+                               DataFormat::OHWI,
                               dilations_.data(),
                               strides_.data(),
                               padding_type_,
@@ -242,9 +242,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
    } else {
      paddings = paddings_;
      CalcOutputSize(input->shape().data(),
-                     NHWC,
+                     DataFormat::NHWC,
                     filter->shape().data(),
-                     OHWI,
+                     DataFormat::OHWI,
                     paddings_.data(),
                     dilations_.data(),
                     strides_.data(),
@@ -459,7 +459,7 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
              "leakyrelu_coefficient", 0.0f)),
        wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
    MemoryType mem_type;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
    } else {

--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -47,8 +47,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
  const std::vector<index_t> output_shape = {1, 1, 1, 1};
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -60,8 +60,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("Input")
@@ -105,8 +105,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
  const std::vector<index_t> output_shape = {1, 3, 3, 1};
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -118,8 +118,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("Input")
@@ -189,8 +189,8 @@ void TestNHWCSimple3x3WithoutBias() {
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -203,8 +203,8 @@ void TestNHWCSimple3x3WithoutBias() {
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("Input")
@@ -256,8 +256,8 @@ void TestNHWCCombined3x3() {
  net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f}, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -270,8 +270,8 @@ void TestNHWCCombined3x3() {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("Input")
@@ -321,8 +321,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
  const std::vector<index_t> output_shape = {1, 1, 1, 1};
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -336,8 +336,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("Input")
@@ -376,8 +376,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
  const std::vector<index_t> output_shape = {1, 1, 1, 1};
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -391,8 +391,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("Input")
@@ -459,8 +459,8 @@ void TestConv1x1() {
  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -472,8 +472,8 @@ void TestConv1x1() {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Conv2D", "Conv2DTest")
        .Input("Input")
@@ -532,8 +532,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
        "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
        false);
    net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    // Construct graph
    OpDefBuilder("Conv2D", "Conv2dTest")
@@ -552,8 +552,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
    // run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();
@@ -651,8 +651,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
                                    float_bias_data,
                                    true);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
@@ -667,8 +667,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
    // run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();
@@ -811,8 +811,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
        "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
    net.AddRandomInput<D, T>("Bias", {output_channels}, true);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    // Construct graph
    OpDefBuilder("Conv2D", "Conv2dTest")
@@ -828,8 +828,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
    // run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();
@@ -900,8 +900,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
        "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
    net.AddRandomInput<D, float>("Bias", {output_channels}, true);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    // Construct graph
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
@@ -916,8 +916,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
    // run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();
    expected->Copy(*net.GetOutput("Output"));
@@ -979,8 +979,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
        "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
    net.AddRandomInput<D, float>("Bias", {output_channels}, true);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    // Construct graph
    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputNCHW")
@@ -994,8 +994,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
    // run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();
@@ -1118,12 +1118,12 @@ void TestQuant(const index_t batch,
  net.AddRandomInput<CPU, float>("Filter", {out_channels, k_height, k_width,
                                            in_channels}, true);
  net.AddRandomInput<CPU, float>("Bias", {out_channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  net.TransformFilterDataFormat<DeviceType::CPU, float>("Filter",
-                                                        OHWI,
+                                                        DataFormat::OHWI,
                                                        "FilterOIHW",
-                                                        OIHW);
+                                                        DataFormat::OIHW);
  OpDefBuilder("Conv2D", "Conv2dTest")
      .Input("InputNCHW")
@@ -1136,8 +1136,8 @@ void TestQuant(const index_t batch,
      .AddIntArg("T", static_cast<int>(DT_FLOAT))
      .Finalize(net.NewOperatorDef());
  net.RunOp(CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "Output", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("Quantize", "QuantizeFilter")
      .Input("Filter")

--- a/mace/ops/crop.cc
+++ b/mace/ops/crop.cc
@@ -117,7 +117,7 @@ class CropOp<DeviceType::GPU, T> : public Operation {
 public:
  explicit CropOp(OpConstructContext *context)
      : Operation(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::CropKernel<T>>(
          Operation::GetRepeatedArgs<int>("offset"));
    } else {
@@ -151,11 +151,13 @@ void RegisterCrop(OpRegistryBase *op_registry) {
          .SetDevicePlacerFunc(
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
+                if (op->output_shape_size() != op->output_size()) {
+                  return { DeviceType::CPU, DeviceType::GPU };
+                }
                int has_data_format =
                    ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                        *op, "has_data_format", 0);
                if (!has_data_format ||
-                    (op->output_shape_size() != op->output_size()) ||
                    op->output_shape(0).dims_size() != 4) {
                  return { DeviceType::CPU };
                }

--- a/mace/ops/crop_test.cc
+++ b/mace/ops/crop_test.cc
@@ -42,13 +42,13 @@ void RunCrop(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
  } else if (D == CPU) {
    net.TransformDataFormat<DeviceType::CPU, float>("Input0",
-                                                    NHWC,
+                                                    DataFormat::NHWC,
                                                    "InputNCHW0",
-                                                    NCHW);
+                                                    DataFormat::NCHW);
    net.TransformDataFormat<DeviceType::CPU, float>("Input1",
-                                                    NHWC,
+                                                    DataFormat::NHWC,
                                                    "InputNCHW1",
-                                                    NCHW);
+                                                    DataFormat::NCHW);
    OpDefBuilder("Crop", "CropTest")
        .Input("InputNCHW0")
        .Input("InputNCHW1")
@@ -62,8 +62,8 @@ void RunCrop(const std::vector<index_t> &input_shape,
  net.RunOp(D);
  if (D == CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  // Check
  auto expected = net.CreateTensor<float>(expected_shape, expected_data);

--- a/mace/ops/cumsum_test.cc
+++ b/mace/ops/cumsum_test.cc
@@ -32,8 +32,8 @@ void SimpleTestWithDataFormat(const std::vector<index_t> &shape,
  OpsTestNet net;
  net.AddInputFromArray<CPU, T>("Input", shape, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Cumsum", "CumsumTest")
    .Input("InputNCHW")
@@ -48,8 +48,8 @@ void SimpleTestWithDataFormat(const std::vector<index_t> &shape,
  // Run
  net.RunOp(DeviceType::CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  net.AddInputFromArray<CPU, T>("ExpectedOutput", shape, output);
  ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),

--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -173,7 +173,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
  explicit Deconv2dOp(OpConstructContext *context)
      : Deconv2dOpBase(context) {
    MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;
@@ -240,7 +240,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
                                   &out_paddings,
                                   nullptr,
                                   model_type_,
-                                   NHWC);
+                                   DataFormat::NHWC);
    return kernel_->Compute(context, input, filter, bias,
                            strides_.data(), in_paddings.data(), activation_,
@@ -276,7 +276,7 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
                    MACE_NOT_IMPLEMENTED;
                  }
                  FrameworkType framework_type =
-                      static_cast<ops::FrameworkType>(
+                      static_cast<FrameworkType>(
                        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                            *(context->operator_def()), "framework_type",
                            FrameworkType::TENSORFLOW));

--- a/mace/ops/deconv_2d_test.cc
+++ b/mace/ops/deconv_2d_test.cc
@@ -47,7 +47,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
  net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
  net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
  // TODO(liutuo): remove the unused transform
-  net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
+  net.TransformFilterDataFormat<D, float>(
+      "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW);
  if (D == DeviceType::GPU) {
    if (model_type == FrameworkType::CAFFE) {
      OpDefBuilder("Deconv2D", "Deconv2dTest")
@@ -77,8 +78,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
    }
    net.RunOp(D);
  } else {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    if (model_type == FrameworkType::CAFFE) {
      OpDefBuilder("Deconv2D", "Deconv2dTest")
@@ -109,8 +110,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  auto expected = net.CreateTensor<float>(expected_shape, expected_data);
@@ -380,8 +381,8 @@ void TestComplexDeconvNxN(const int batch,
        "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
        false);
    net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    int out_h = 0;
    int out_w = 0;
@@ -440,8 +441,8 @@ void TestComplexDeconvNxN(const int batch,
    // run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();

--- a/mace/ops/depth_to_space.cc
+++ b/mace/ops/depth_to_space.cc
@@ -96,7 +96,7 @@ class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
  explicit DepthToSpaceOp(OpConstructContext *context)
      : Operation(context) {
    int block_size = Operation::GetOptionalArg<int>("block_size", 1);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/depth_to_space_test.cc
+++ b/mace/ops/depth_to_space_test.cc
@@ -32,8 +32,8 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
  net.AddInputFromArray<D, float>("Input", input_shape, input_data);
  // Construct graph
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
        .Input("InputNCHW")
        .Output("OutputNCHW")
@@ -41,8 +41,8 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else {
    OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
@@ -114,8 +114,8 @@ void RandomTest(const int block_size,
  // Add input data
  net.AddRandomInput<D, float>("Input", shape);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
      .Input("InputNCHW")
      .AddIntArg("block_size", block_size)
@@ -125,8 +125,8 @@ void RandomTest(const int block_size,
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
      .Input("Input")

--- a/mace/ops/depthwise_conv2d.cc
+++ b/mace/ops/depthwise_conv2d.cc
@@ -188,9 +188,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
        filter->dim(2) * filter->dim(3), filter->dim(0), filter->dim(1), 1};
    if (paddings_.empty()) {
      CalcPaddingAndOutputSize(input->shape().data(),
-                               NHWC,
+                               DataFormat::NHWC,
                               ohwi_shape.data(),
-                               OHWI,
+                               DataFormat::OHWI,
                               dilations_.data(),
                               strides_.data(),
                               padding_type_,
@@ -199,9 +199,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
    } else {
      paddings = paddings_;
      CalcOutputSize(input->shape().data(),
-                     NHWC,
+                     DataFormat::NHWC,
                     ohwi_shape.data(),
-                     OHWI,
+                     DataFormat::OHWI,
                     paddings_.data(),
                     dilations_.data(),
                     strides_.data(),
@@ -375,7 +375,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
  explicit DepthwiseConv2dOp(OpConstructContext *context)
      : DepthwiseConv2dOpBase(context) {
    MemoryType mem_type;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
    } else {
@@ -459,6 +459,18 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
                context->set_output_mem_type(mem_type);
              }));
 #endif  // MACE_ENABLE_OPENCL
+  MACE_REGISTER_OP_CONDITION(
+      op_registry,
+      OpConditionBuilder("DepthwiseConv2d")
+          .SetInputsDataFormatSelector(
+              [](OpConditionContext *context) -> std::vector<DataFormat> {
+                DataFormat op_data_format =
+                    static_cast<DataFormat>(
+                        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+                        *context->operator_def(), "data_format",
+                        static_cast<int>(DataFormat::NONE)));
+                return {op_data_format, DataFormat::OIHW, DataFormat::NONE};
+              }));
 }
 }  // namespace ops

--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -39,8 +39,8 @@ void SimpleValidTest() {
      true);
  net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f}, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -52,8 +52,8 @@ void SimpleValidTest() {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("Input")
@@ -127,8 +127,8 @@ void ComplexValidTest(index_t batch,
                                  true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -141,8 +141,8 @@ void ComplexValidTest(index_t batch,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("Input")
@@ -249,8 +249,8 @@ void TestNxNS12(const index_t height, const index_t width) {
                                               {multiplier * channel},
                                               true, false);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("InputNCHW")
        .Input("Filter")
@@ -267,8 +267,8 @@ void TestNxNS12(const index_t height, const index_t width) {
    // Run on cpu
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // Check
    auto expected = net.CreateTensor<float>();
@@ -389,9 +389,9 @@ void TestQuant(const index_t batch,
      "Filter", {k_height, k_width, in_channels, multiplier}, true, false);
  net.AddRandomInput<CPU, float>("Bias", {out_channels}, true);
  net.TransformDataFormat<DeviceType::CPU, float>(
-      "Input", NHWC, "InputNCHW", NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  net.TransformFilterDataFormat<DeviceType::CPU, float>(
-      "Filter", HWIO, "FilterOIHW", OIHW);
+      "Filter", DataFormat::HWIO, "FilterOIHW", DataFormat::OIHW);
  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
      .Input("InputNCHW")
@@ -405,7 +405,7 @@ void TestQuant(const index_t batch,
      .Finalize(net.NewOperatorDef());
  net.RunOp(CPU);
  net.TransformDataFormat<DeviceType::CPU, float>(
-      "OutputNCHW", NCHW, "Output", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("Quantize", "QuantizeFilter")
      .Input("Filter")

--- a/mace/ops/depthwise_deconv2d.cc
+++ b/mace/ops/depthwise_deconv2d.cc
@@ -190,7 +190,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
  explicit DepthwiseDeconv2dOp(OpConstructContext *context)
      : Deconv2dOpBase(context) {
    MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;
@@ -230,7 +230,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
                                   &out_paddings,
                                   nullptr,
                                   CAFFE,
-                                   NHWC);
+                                   DataFormat::NHWC);
    return kernel_->Compute(context,
                            input,

--- a/mace/ops/depthwise_deconv2d_test.cc
+++ b/mace/ops/depthwise_deconv2d_test.cc
@@ -39,7 +39,8 @@ void RunTestSimple(const int group,
  // Add input data
  net.AddInputFromArray<D, float>("Input", input_shape, input_data);
  net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
-  net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
+  net.TransformFilterDataFormat<D, float>(
+      "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW);
  const index_t out_channels = expected_shape[3];
  net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
@@ -56,8 +57,8 @@ void RunTestSimple(const int group,
    net.RunOp(D);
  } else {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "InputNCHW", NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
        .Input("InputNCHW")
        .Input("FilterOIHW")
@@ -69,8 +70,8 @@ void RunTestSimple(const int group,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  auto expected = net.CreateTensor<float>(expected_shape, expected_data);
@@ -193,8 +194,8 @@ void RandomTest(index_t batch,
                                                {channel * multiplier},
                                                bias_data, true, false);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
      .Input("InputNCHW")
      .Input("Filter")
@@ -210,8 +211,8 @@ void RandomTest(index_t batch,
      .Finalize(net.NewOperatorDef());
  // Run
  net.RunOp(DeviceType::CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "Output", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check

--- a/mace/ops/eltwise.cc
+++ b/mace/ops/eltwise.cc
@@ -1145,7 +1145,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
    int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
            "scalar_input_index", 1);
    MemoryType mem_type;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
          type, coeff, scalar_input, scalar_input_index);

--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -69,7 +69,8 @@ void SimpleTensorScalar(const ops::EltwiseType type,
  net.AddInputFromArray<D, T>("Input", shape, input);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<D, T>("Input", NHWC, "TInput", NCHW);
+    net.TransformDataFormat<D, T>(
+        "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
    OpDefBuilder("Eltwise", "EltwiseTest")
        .Input("TInput")
        .AddIntArg("T", DataTypeToEnum<T>::v())
@@ -81,7 +82,8 @@ void SimpleTensorScalar(const ops::EltwiseType type,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, DstType>(
+        "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else {
    OpDefBuilder("Eltwise", "EltwiseTest")
        .Input("Input")
@@ -124,13 +126,15 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
            .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
            .Output("TOutput");
    if (shape0.size() > 1) {
-      net.TransformDataFormat<D, T>("Input0", NHWC, "TInput0", NCHW);
+      net.TransformDataFormat<D, T>(
+          "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
      op_builder.Input("TInput0");
    } else {
      op_builder.Input("Input0");
    }
    if (shape1.size() > 1) {
-      net.TransformDataFormat<D, T>("Input1", NHWC, "TInput1", NCHW);
+      net.TransformDataFormat<D, T>(
+          "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
      op_builder.Input("TInput1");
    } else {
      op_builder.Input("Input1");
@@ -139,7 +143,8 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, DstType>(
+        "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else {
    OpDefBuilder("Eltwise", "EltwiseTest")
        .Input("Input0")
@@ -560,7 +565,8 @@ void GPUOverflowTest(const ops::EltwiseType type,
  net.AddInputFromArray<DeviceType::GPU, T>(
      "Filter",
      {output_shape.back(), shape0.back(), 3, 3},
-      std::vector<float>(output_shape.back() * shape0.back() * 9, 1));
+      std::vector<float>(output_shape.back() * shape0.back() * 9, 1),
+      true);
  OpDefBuilder("Conv2D", "Conv2D")
      .AddIntArg("T", DataTypeToEnum<T>::v())
      .Input("EltOutput")
@@ -636,8 +642,8 @@ void RandomTensorScalar(const ops::EltwiseType type,
  // Add input data
  net.AddRandomInput<DeviceType::GPU, float>("Input", shape, false, true, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("TInput")
      .AddIntArg("type", static_cast<int>(type))
@@ -647,8 +653,8 @@ void RandomTensorScalar(const ops::EltwiseType type,
      .Finalize(net.NewOperatorDef());
  // Run
  net.RunOp(DeviceType::CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));
@@ -690,10 +696,10 @@ void RandomTensorEltwise(const ops::EltwiseType type,
                                             true,
                                             true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("TInput0")
      .Input("TInput1")
@@ -705,8 +711,8 @@ void RandomTensorEltwise(const ops::EltwiseType type,
  // Run
  net.RunOp(DeviceType::CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));
@@ -746,10 +752,10 @@ void Quantized(const std::vector<index_t> &shape,
                                             true,
                                             true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("TInput0")
@@ -761,8 +767,8 @@ void Quantized(const std::vector<index_t> &shape,
  // Run
  net.RunOp(DeviceType::CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("Quantize", "QuantizeInput0")
      .Input("Input0")

--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -49,7 +49,8 @@ void Simple() {
  net.AddInputFromArray<D, float>("Offset", {1}, offset, true);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<D, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
        .Input("InputNCHW")
        .Input("Scale")
@@ -58,7 +59,8 @@ void Simple() {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
        .Input("Input")
@@ -100,8 +102,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
      .Input("InputNCHW")
@@ -113,8 +115,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -151,8 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
      .Input("InputNCHW")
@@ -164,8 +166,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -205,8 +207,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
      .Input("InputNCHW")
@@ -218,8 +220,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -254,11 +256,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
  // Add input data
  net.AddRandomInput<DeviceType::GPU, float>("Input",
                                             {batch, height, width, channels});
-  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
-  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
      .Input("InputNCHW")
@@ -270,8 +272,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();

--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -190,7 +190,7 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
  explicit FullyConnectedOp(OpConstructContext *context)
      : FullyConnectedOpBase(context) {
    MemoryType mem_type = MemoryType::CPU_BUFFER;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      mem_type = MemoryType::GPU_IMAGE;
      kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
    } else {

--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -48,7 +48,8 @@ void Simple(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("FullyConnected", "FullyConnectedTest")
        .Input("Input")
@@ -129,8 +130,8 @@ void Random(const index_t batch,
  net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}, true,
      false);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("FullyConnected", "FullyConnectedTest")
      .Input("InputNCHW")
      .Input("Weight")
@@ -143,7 +144,8 @@ void Random(const index_t batch,
  // run cpu
  net.RunOp();
-  net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+  net.TransformDataFormat<CPU, float>(
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>();
@@ -215,8 +217,10 @@ void QuantRandom(const index_t batch,
  net.AddRandomInput<CPU, float>(
      "Weight", {out_channel, height, width, channels}, true);
  net.AddRandomInput<CPU, float>("Bias", {out_channel}, true);
-  net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
+  net.TransformDataFormat<CPU, float>(
-  net.TransformFilterDataFormat<CPU, float>("Weight", OHWI, "WeightOIHW", OIHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+  net.TransformFilterDataFormat<CPU, float>(
+      "Weight", DataFormat::OHWI, "WeightOIHW", DataFormat::OIHW);
  OpDefBuilder("FullyConnected", "FullyConnectedTest")
      .Input("InputNCHW")
@@ -226,7 +230,8 @@ void QuantRandom(const index_t batch,
      .AddIntArg("T", DT_FLOAT)
      .Finalize(net.NewOperatorDef());
  net.RunOp();
-  net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+  net.TransformDataFormat<CPU, float>(
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("Quantize", "QuantizeWeight")
      .Input("Weight")

--- a/mace/ops/local_response_norm_test.cc
+++ b/mace/ops/local_response_norm_test.cc
@@ -29,7 +29,8 @@ void Simple() {
                                  {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<D, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
        .Input("InputNCHW")
@@ -41,7 +42,8 @@ void Simple() {
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  // Check

--- a/mace/ops/lstm_cell.cc
+++ b/mace/ops/lstm_cell.cc
@@ -36,7 +36,7 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
        Operation::GetOptionalArg<float>("scalar_input",
                                         0.0));
    MemoryType mem_type = MemoryType::GPU_IMAGE;
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::LSTMCellKernel<T>>(forget_bias);
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/opencl/buffer_transformer.h
+++ b/mace/ops/opencl/buffer_transformer.h
@@ -47,7 +47,6 @@ class OpenCLBufferTransformer {
                       const OpenCLBufferType type,
                       const MemoryType out_mem_type,
                       const int wino_blk_size,
-                       DataFormat data_format,
                       Tensor *output) {
    Workspace *ws = context->workspace();
    DataType dt = DataTypeToEnum<T>::value;
@@ -66,7 +65,6 @@ class OpenCLBufferTransformer {
        VLOG(2) << "Transform CPU Buffer " << input->name()
                << " to GPU Buffer " << internal_tensor->name()
                << " with data type " << dt;
-        MACE_CHECK(data_format == DataFormat::NHWC);
        internal_tensor->Resize(input->shape());
        const uint8_t *input_ptr = input->data<uint8_t>();
        Tensor::MappingGuard guard(internal_tensor);
@@ -88,7 +86,6 @@ class OpenCLBufferTransformer {
      VLOG(2) << "Transform GPU Buffer " << internal_tensor.name()
              << " to CPU Buffer " << output->name()
              << " with data type " << dt;
-      MACE_CHECK(data_format == DataFormat::NHWC);
      Tensor::MappingGuard guard(&internal_tensor);
      const T *internal_ptr = internal_tensor.data<T>();
      output->Resize(internal_tensor.shape());
@@ -135,7 +132,7 @@ MaceStatus TransformFilter(
  input->MarkUnused();
  return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
      Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
-                DataFormat::DF_NONE, output);
+                output);
 }
 }  // namespace ops

--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -15,6 +15,7 @@
 #include "mace/ops/ops_test_util.h"
 #include "mace/core/memory_optimizer.h"
 #include "mace/utils/memory.h"
+#include "mace/core/net_def_adapter.h"
 namespace mace {
 namespace ops {
@@ -164,26 +165,27 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() {
 bool OpsTestNet::Setup(mace::DeviceType device) {
  NetDef net_def;
  for (auto &op_def : op_defs_) {
-    net_def.add_op()->CopyFrom(op_def);
+    auto target_op = net_def.add_op();
+    target_op->CopyFrom(op_def);
+    auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+        op_def, "has_data_format", 0);
+    auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+        op_def, "T", static_cast<int>(DT_FLOAT))
+        == static_cast<int>(DT_UINT8);
    for (auto input : op_def.input()) {
      if (ws_.GetTensor(input) != nullptr &&
          !ws_.GetTensor(input)->is_weight()) {
        auto input_info = net_def.add_input_info();
        input_info->set_name(input);
-        auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-            op_def, "has_data_format", 1);
-        auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-            op_def, "T", static_cast<int>(DT_FLOAT))
-            == static_cast<int>(DT_UINT8);
        if (has_data_format) {
          if (is_quantized_op || device == DeviceType::GPU) {
-            input_info->set_data_format(NHWC);
+            input_info->set_data_format(static_cast<int>(DataFormat::NHWC));
          } else {
-            input_info->set_data_format(NCHW);
+            input_info->set_data_format(static_cast<int>(DataFormat::NCHW));
          }
        } else {
-          input_info->set_data_format(DataFormat::DF_NONE);
+          input_info->set_data_format(static_cast<int>(DataFormat::NONE));
        }
        auto &shape = ws_.GetTensor(input)->shape();
        for (auto d : shape) {
@@ -191,6 +193,10 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
        }
      }
    }
+    if (has_data_format) {
+      SetProtoArg<int>(target_op, "data_format",
+                       static_cast<int>(DataFormat::AUTO));
+    }
  }
  if (!op_defs_.empty()) {
    auto op_def = op_defs_.back();
@@ -205,15 +211,21 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
      }
    }
  }
+  NetDef adapted_net_def;
+  NetDefAdapter net_def_adapter(op_registry_.get(), &ws_);
+  net_def_adapter.AdaptNetDef(&net_def,
+                              OpTestContext::Get()->GetDevice(device),
+                              &adapted_net_def);
  MemoryOptimizer mem_optimizer;
  net_ = make_unique<SerialNet>(
      op_registry_.get(),
-      &net_def,
+      &adapted_net_def,
      &ws_,
      OpTestContext::Get()->GetDevice(device),
      &mem_optimizer);
  MaceStatus status = (ws_.PreallocateOutputTensor(
-      net_def,
+      adapted_net_def,
      &mem_optimizer,
      OpTestContext::Get()->GetDevice(device)));
  if (status != MaceStatus::MACE_SUCCESS) return false;
@@ -252,15 +264,20 @@ MaceStatus OpsTestNet::RunOp() {
 MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def,
                              const mace::DeviceType device) {
  device_type_ = device;
+  NetDef adapted_net_def;
+  NetDefAdapter net_def_adapter(op_registry_.get(), &ws_);
+  net_def_adapter.AdaptNetDef(&net_def,
+                              OpTestContext::Get()->GetDevice(device),
+                              &adapted_net_def);
  MemoryOptimizer mem_optimizer;
  net_ = make_unique<SerialNet>(
      op_registry_.get(),
-      &net_def,
+      &adapted_net_def,
      &ws_,
      OpTestContext::Get()->GetDevice(device),
      &mem_optimizer);
  MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor(
-      net_def,
+      adapted_net_def,
      &mem_optimizer,
      OpTestContext::Get()->GetDevice(device)));
  MACE_RETURN_IF_ERROR(net_->Init());

--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -216,7 +216,7 @@ class OpsTestNet {
    const std::vector<index_t> input_shape = input->shape();
    MACE_CHECK(input_shape.size() == 4, "input shape != 4");
-    if (src_format == NHWC && dst_format == NCHW) {
+    if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) {
      index_t batch = input_shape[0];
      index_t height = input_shape[1];
      index_t width = input_shape[2];
@@ -236,7 +236,8 @@ class OpsTestNet {
          }
        }
      }
-    } else if (src_format == NCHW && dst_format == NHWC) {
+    } else if (src_format == DataFormat::NCHW &&
+        dst_format == DataFormat::NHWC) {
      index_t batch = input_shape[0];
      index_t channels = input_shape[1];
      index_t height = input_shape[2];
@@ -274,7 +275,7 @@ class OpsTestNet {
        input->is_weight());
    const std::vector<index_t> input_shape = input->shape();
    MACE_CHECK(input_shape.size() == 4, "input shape != 4");
-    if (src_format == HWOI && dst_format == OIHW) {
+    if (src_format == DataFormat::HWOI && dst_format == DataFormat::OIHW) {
      index_t height = input_shape[0];
      index_t width = input_shape[1];
      index_t out_channels = input_shape[2];
@@ -292,7 +293,8 @@ class OpsTestNet {
              input_data[j * out_channels * in_channels + i];
        }
      }
-    } else if (src_format == OIHW && dst_format == HWOI) {
+    } else if (src_format == DataFormat::OIHW &&
+        dst_format == DataFormat::HWOI) {
      index_t out_channels = input_shape[0];
      index_t in_channels = input_shape[1];
      index_t height = input_shape[2];
@@ -310,7 +312,8 @@ class OpsTestNet {
              input_data[j * height * width + i];
        }
      }
-    } else if (src_format == HWIO && dst_format == OIHW) {
+    } else if (src_format == DataFormat::HWIO &&
+        dst_format == DataFormat::OIHW) {
      index_t height = input_shape[0];
      index_t width = input_shape[1];
      index_t in_channels = input_shape[2];
@@ -330,7 +333,8 @@ class OpsTestNet {
          }
        }
      }
-    } else if (src_format == OHWI && dst_format == OIHW) {
+    } else if (src_format == DataFormat::OHWI &&
+        dst_format == DataFormat::OIHW) {
      index_t out_channels = input_shape[0];
      index_t height = input_shape[1];
      index_t width = input_shape[2];

--- a/mace/ops/pad.cc
+++ b/mace/ops/pad.cc
@@ -179,7 +179,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
    std::vector<int> paddings = Operation::GetRepeatedArgs<int>("paddings");
    float constant_value = Operation::GetOptionalArg<float>(
        "constant_value", 0.0);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::PadKernel<T>>(
          type, paddings, constant_value);
    } else {

--- a/mace/ops/pad_test.cc
+++ b/mace/ops/pad_test.cc
@@ -45,8 +45,8 @@ void SimpleConstant() {
    // Run
    net.RunOp(D);
  } else {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
    OpDefBuilder("Pad", "PadTest")
        .Input("TInput")
        .Output("TOutput")
@@ -58,8 +58,8 @@ void SimpleConstant() {
    // Run
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NHWC);
+        "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  auto output = net.GetTensor("Output");
@@ -93,7 +93,8 @@ void Result(const std::vector<index_t> &input_shape,
  if (D == DeviceType::CPU) {
    t_input = "TInput";
    t_output = "TOutput";
-    net.TransformDataFormat<DeviceType::CPU, T>(input, NHWC, t_input, NCHW);
+    net.TransformDataFormat<DeviceType::CPU, T>(
+        input, DataFormat::NHWC, t_input, DataFormat::NCHW);
  }
  OpDefBuilder("Pad", "PadTest")
@@ -108,7 +109,8 @@ void Result(const std::vector<index_t> &input_shape,
  net.RunOp(D);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, T>(t_output, NCHW, output, NHWC);
+    net.TransformDataFormat<DeviceType::CPU, T>(
+        t_output, DataFormat::NCHW, output, DataFormat::NHWC);
  }
  auto actual = net.GetTensor(output.c_str());
@@ -172,8 +174,8 @@ TEST_F(PadTest, ComplexCPU) {
  // Add input data
  net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
  OpDefBuilder("Pad", "PadTest")
      .Input("TInput")
      .Output("TOutput")
@@ -184,8 +186,8 @@ TEST_F(PadTest, ComplexCPU) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  auto output = net.GetTensor("Output");
@@ -209,8 +211,8 @@ void Complex(const std::vector<index_t> &input_shape,
  // Add input data
  net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
  OpDefBuilder("Pad", "PadTest")
      .Input("TInput")
      .Output("TOutput")
@@ -222,8 +224,8 @@ void Complex(const std::vector<index_t> &input_shape,
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));

--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -270,9 +270,9 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
    std::vector<int> paddings(2);
    if (paddings_.empty()) {
      CalcPaddingAndOutputSize(input_tensor->shape().data(),
-                               NHWC,
+                               DataFormat::NHWC,
                               filter_shape.data(),
-                               OHWI,
+                               DataFormat::OHWI,
                               dilations_.data(),
                               strides_.data(),
                               padding_type_,
@@ -281,9 +281,9 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
    } else {
      paddings = paddings_;
      CalcOutputSize(input_tensor->shape().data(),
-                     NHWC,
+                     DataFormat::NHWC,
                     filter_shape.data(),
-                     OHWI,
+                     DataFormat::OHWI,
                     paddings_.data(),
                     dilations_.data(),
                     strides_.data(),
@@ -477,7 +477,7 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
 public:
  explicit PoolingOp(OpConstructContext *context)
      : PoolingOpBase(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
    } else {
      kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();

--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -34,8 +34,8 @@ TEST_F(PoolingOpTest, MAX_VALID) {
      {0, 16, 1, 17, 2,  18, 3,  19, 4,  20, 5,  21, 6,  22, 7,  23,
       8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -50,8 +50,8 @@ TEST_F(PoolingOpTest, MAX_VALID) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected =
@@ -68,8 +68,8 @@ TEST_F(PoolingOpTest, MAX_SAME) {
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
                                                {0, 1, 2, 3, 4, 5, 6, 7, 8});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -84,8 +84,8 @@ TEST_F(PoolingOpTest, MAX_SAME) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
@@ -102,8 +102,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
      "Input", {1, 4, 4, 1},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -118,8 +118,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
@@ -136,8 +136,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
      "Input", {1, 2, 9, 1},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -152,8 +152,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
@@ -174,8 +174,8 @@ void SimpleMaxPooling3S2() {
       14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    // Run
    OpDefBuilder("Pooling", "PoolingTest")
        .Input("InputNCHW")
@@ -187,8 +187,8 @@ void SimpleMaxPooling3S2() {
        .AddIntsArg("dilations", {1, 1})
        .Finalize(net.NewOperatorDef());
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else if (D == DeviceType::GPU) {
    OpDefBuilder("Pooling", "PoolingTest")
        .Input("Input")
@@ -224,8 +224,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
  // Add input data
  net.AddRandomInput<D, float>("Input", input_shape);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -240,8 +240,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
  // run on cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));
@@ -304,8 +304,8 @@ TEST_F(PoolingOpTest, AVG_VALID) {
      {0, 16, 1, 17, 2,  18, 3,  19, 4,  20, 5,  21, 6,  22, 7,  23,
       8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -320,8 +320,8 @@ TEST_F(PoolingOpTest, AVG_VALID) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>(
@@ -373,8 +373,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
  // Add input data
  net.AddRandomInput<D, float>("Input", shape);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("InputNCHW")
@@ -389,8 +389,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
  // run on cpu
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));
@@ -563,7 +563,7 @@ void TestQuant(const index_t batch,
  net.AddRandomInput<CPU, float>(
      "Input", input_shape, false, false);
  net.TransformDataFormat<DeviceType::CPU, float>(
-      "Input", NHWC, "InputNCHW", NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  net.AddRandomInput<DeviceType::CPU, float>(
      "OutputNCHW", input_shape, false, true, true);
@@ -580,7 +580,7 @@ void TestQuant(const index_t batch,
  net.RunOp(CPU);
  net.TransformDataFormat<DeviceType::CPU, float>(
-      "OutputNCHW", NCHW, "Output", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("Quantize", "QuantizeInput")
      .Input("Input")

--- a/mace/ops/reduce.cc
+++ b/mace/ops/reduce.cc
@@ -873,7 +873,7 @@ class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
 public:
  explicit ReduceOp(OpConstructContext *context)
      : ReduceOpBase(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::ReduceKernel<T>>(reduce_type_,
                                                            axis_,
                                                            keep_dims_);
@@ -914,6 +914,9 @@ void RegisterReduce(OpRegistryBase *op_registry) {
          .SetDevicePlacerFunc(
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
+                if (op->output_shape_size() != op->output_size()) {
+                  return { DeviceType::CPU, DeviceType::GPU };
+                }
                bool keep_dims =
                    ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
                        *op, "keepdims", false);
@@ -923,7 +926,7 @@ void RegisterReduce(OpRegistryBase *op_registry) {
                auto axis =
                    ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
                        *op, "axis");
-                if (axis.size() != 2 || axis[0] != 1 || axis[1] == 2) {
+                if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
                  return { DeviceType::CPU };
                }
                auto tensor_shape_info = context->tensor_shape_info();

--- a/mace/ops/reduce_test.cc
+++ b/mace/ops/reduce_test.cc
@@ -38,7 +38,8 @@ void Simple(const std::vector<index_t> &input_shape,
  net.AddInputFromArray<D, float>("Input", input_shape, input);
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<D, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Reduce", "ReduceTest")
        .Input("InputNCHW")
        .AddIntsArg("axis", axis)
@@ -49,7 +50,8 @@ void Simple(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<D, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else {
    OpDefBuilder("Reduce", "ReduceTest")
        .Input("Input")
@@ -289,8 +291,8 @@ void RandomTest(const std::vector<index_t> &input_shape,
    // Add input data
    net.AddRandomInput<D, float>("Input", input_shape);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Reduce", "ReduceTest")
        .Input("InputNCHW")
        .AddIntsArg("axis", axis)
@@ -301,8 +303,8 @@ void RandomTest(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp();
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    OpDefBuilder("Reduce", "ReduceTest")
        .Input("Input")
        .AddIntsArg("axis", axis)
@@ -353,7 +355,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
    net.AddRandomInput<CPU, float>(
        "Input", input_shape, false, false);
    net.TransformDataFormat<DeviceType::CPU, float>(
-        "Input", NHWC, "InputNCHW", NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    net.AddRandomInput<DeviceType::CPU, float>(
        "OutputNCHW", input_shape, false, true, true);
@@ -368,7 +370,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
    net.RunOp(CPU);
    net.TransformDataFormat<DeviceType::CPU, float>(
-        "OutputNCHW", NCHW, "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    OpDefBuilder("Quantize", "QuantizeInput")
        .Input("Input")

--- a/mace/ops/ref/deconv_2d.cc
+++ b/mace/ops/ref/deconv_2d.cc
@@ -51,7 +51,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
                                 &out_pad_size,
                                 &padded_out_shape,
                                 framework_type_,
-                                 NCHW);
+                                 DataFormat::NCHW);
  MACE_RETURN_IF_ERROR(output->Resize(out_shape));

--- a/mace/ops/ref/depthwise_deconv_2d.cc
+++ b/mace/ops/ref/depthwise_deconv_2d.cc
@@ -50,7 +50,7 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
                                 &out_pad_size,
                                 &padded_out_shape,
                                 framework_type_,
-                                 NCHW);
+                                 DataFormat::NCHW);
  MACE_RETURN_IF_ERROR(output->Resize(out_shape));
@@ -185,7 +185,7 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
                                 &out_pad_size,
                                 &padded_out_shape,
                                 framework_type_,
-                                 NCHW);
+                                 DataFormat::NCHW);
  MACE_RETURN_IF_ERROR(output->Resize(out_shape));

--- a/mace/ops/resize_bicubic.cc
+++ b/mace/ops/resize_bicubic.cc
@@ -212,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
    std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
        "size", {-1, -1});
    MACE_CHECK(size.size() == 2);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::ResizeBicubicKernel<T>>(
          align_corners, size[0], size[1]);
    } else {

--- a/mace/ops/resize_bicubic_test.cc
+++ b/mace/ops/resize_bicubic_test.cc
@@ -31,8 +31,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
  std::vector<float> input(24);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
      .Input("InputNCHW")
@@ -42,8 +42,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
@@ -60,8 +60,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
  std::vector<float> input(48);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 4, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
      .Input("InputNCHW")
@@ -71,8 +71,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 2, 3, 3},
@@ -92,8 +92,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
  std::vector<float> input(24);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
      .Input("InputNCHW")
@@ -104,8 +104,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
@@ -133,8 +133,8 @@ void TestRandomResizeBicubic() {
    net.AddRandomInput<D, float>("Input",
                                 {batch, in_height, in_width, channels},
                                 false, true, true);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
        .Input("InputNCHW")
@@ -144,8 +144,8 @@ void TestRandomResizeBicubic() {
        .Finalize(net.NewOperatorDef());
    // Run on CPU
    net.RunOp(DeviceType::CPU);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    Tensor expected;
    expected.Copy(*net.GetOutput("Output"));

--- a/mace/ops/resize_bilinear.cc
+++ b/mace/ops/resize_bilinear.cc
@@ -346,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
    std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
        "size", {-1, -1});
    MACE_CHECK(size.size() == 2);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::ResizeBilinearKernel<T>>(
          align_corners, size[0], size[1]);
    } else {

--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -31,8 +31,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
  std::vector<float> input(24);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
      .Input("InputNCHW")
@@ -42,8 +42,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
@@ -60,8 +60,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
  std::vector<float> input(24);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
      .Input("InputNCHW")
@@ -72,8 +72,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
@@ -100,8 +100,8 @@ void TestRandomResizeBilinear() {
    // Add input data
    net.AddRandomInput<D, float>("Input",
                                 {batch, in_height, in_width, channels});
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
        .Input("InputNCHW")
@@ -111,8 +111,8 @@ void TestRandomResizeBilinear() {
        .Finalize(net.NewOperatorDef());
    // Run on CPU
    net.RunOp(DeviceType::CPU);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    auto expected = net.CreateTensor<float>();
    expected->Copy(*net.GetOutput("Output"));
@@ -155,8 +155,8 @@ void TestQuantizedResizeBilinear() {
                                   true,
                                   -1.f,
                                   1.f);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
        .Input("InputNCHW")
@@ -166,8 +166,8 @@ void TestQuantizedResizeBilinear() {
        .Finalize(net.NewOperatorDef());
    // Run on CPU
    net.RunOp(DeviceType::CPU);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    // run quantize
    OpDefBuilder("Quantize", "QuantizeInput")

--- a/mace/ops/resize_nearest_neighbor.cc
+++ b/mace/ops/resize_nearest_neighbor.cc
@@ -149,7 +149,7 @@ class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation {
      : Operation(context) {
    bool align_corners = Operation::GetOptionalArg<bool>(
        "align_corners", false);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel<T>>(
          align_corners);
    } else {

--- a/mace/ops/resize_nearest_neighbor_test.cc
+++ b/mace/ops/resize_nearest_neighbor_test.cc
@@ -32,8 +32,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
  std::iota(begin(input), end(input), 0);
  std::vector<int32_t> size = {1, 2};
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
  OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
@@ -45,8 +45,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
@@ -64,8 +64,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
  std::iota(begin(input), end(input), 0);
  std::vector<int32_t> size = {1, 2};
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
  OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
@@ -78,8 +78,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  // Check
  auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
@@ -105,8 +105,8 @@ void TestRandomResizeNearestNeighbor() {
    std::vector<int32_t> size = {20, 40};
    net.AddRandomInput<D, float>("Input",
                                 {batch, in_height, in_width, channels});
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    net.AddInputFromArray<D, int32_t>("Size", {2}, size);
    OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
        .Input("InputNCHW")
@@ -116,8 +116,8 @@ void TestRandomResizeNearestNeighbor() {
        .Finalize(net.NewOperatorDef());
    // Run on CPU
    net.RunOp(DeviceType::CPU);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    auto expected = net.CreateTensor<float>();
    expected->Copy(*net.GetOutput("Output"));

--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -414,7 +414,7 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
      : Operation(context) {
    bool use_log = (
        Operation::GetOptionalArg<bool>("use_log", false));
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
    } else {
      kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);

--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -50,7 +50,8 @@ void Simple(bool use_log = false) {
  if (D == DeviceType::CPU) {
    // test 4d softmax
-    net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<CPU, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("InputNCHW")
        .Output("OutputNCHW")
@@ -59,7 +60,8 @@ void Simple(bool use_log = false) {
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<CPU, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
@@ -109,7 +111,8 @@ void Complex(const std::vector<index_t> &logits_shape,
  net.AddRandomInput<D, float>("Input", logits_shape);
  if (logits_shape.size() == 4) {
-    net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
+    net.TransformDataFormat<CPU, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("InputNCHW")
@@ -127,7 +130,8 @@ void Complex(const std::vector<index_t> &logits_shape,
  net.RunOp();
  if (logits_shape.size() == 4) {
-    net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+    net.TransformDataFormat<CPU, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  auto expected = net.CreateTensor<float>();

--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -307,7 +307,7 @@ class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase {
 public:
  explicit SpaceToBatchNDOp(OpConstructContext *context)
      : SpaceToBatchOpBase(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::SpaceToBatchKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -39,8 +39,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
        .AddIntsArg("block_shape", block_shape_data)
        .Finalize(net.NewOperatorDef());
  } else if (D == CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
        .Input("InputNCHW")
        .Output("OutputNCHW")
@@ -53,8 +53,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
  net.RunOp(D);
  if (D == CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  // Check
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
@@ -78,8 +78,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
        .AddIntsArg("block_shape", block_shape_data)
        .Finalize(net.NewOperatorDef());
  } else if (D == CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
        .Input("InputNCHW")
        .Output("OutputNCHW")
@@ -92,8 +92,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
  net.RunOp(D);
  if (D == CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  }
  // Check
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
@@ -155,8 +155,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
  net.RunOp(GPU);
  // run cpu
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
      .Input("InputNCHW")
      .Output("OutputNCHW")
@@ -164,8 +164,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
      .AddIntsArg("block_shape", block_shape_data)
      .Finalize(net.NewOperatorDef());
  net.RunOp(CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "OutputCPU", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
  // Check
  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
@@ -188,8 +188,8 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
  net.RunOp(GPU);
  // run cpu
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
      .Input("InputNCHW")
      .Output("OutputNCHW")
@@ -197,8 +197,8 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
      .AddIntsArg("block_shape", block_shape_data)
      .Finalize(net.NewOperatorDef());
  net.RunOp(CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "OutputCPU", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
  // Check
  ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
@@ -218,8 +218,8 @@ void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
                                 1.f);
  // run cpu
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
      .Input("InputNCHW")
      .Output("OutputNCHW")
@@ -227,8 +227,8 @@ void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
      .AddIntsArg("block_shape", block_shape_data)
      .Finalize(net.NewOperatorDef());
  net.RunOp(CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "OutputCPU", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
  // run quantize
  OpDefBuilder("Quantize", "QuantizeInput")
@@ -279,8 +279,8 @@ void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
                                 1.f);
  // run cpu
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
      .Input("InputNCHW")
      .Output("OutputNCHW")
@@ -288,8 +288,8 @@ void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
      .AddIntsArg("block_shape", block_shape_data)
      .Finalize(net.NewOperatorDef());
  net.RunOp(CPU);
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "OutputCPU", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
  // run quantize
  OpDefBuilder("Quantize", "QuantizeInput")

--- a/mace/ops/space_to_depth.cc
+++ b/mace/ops/space_to_depth.cc
@@ -94,7 +94,7 @@ class SpaceToDepthOp<DeviceType::GPU, T> : public Operation {
  explicit SpaceToDepthOp(OpConstructContext *context)
      : Operation(context) {
    int block_size = Operation::GetOptionalArg<int>("block_size", 1);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size);
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/space_to_depth_test.cc
+++ b/mace/ops/space_to_depth_test.cc
@@ -32,8 +32,8 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
  net.AddInputFromArray<D, float>("Input", input_shape, input_data);
  // Construct graph
  if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    NCHW);
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
    OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
        .Input("InputNCHW")
        .Output("OutputNCHW")
@@ -41,8 +41,8 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
        .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);
-    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+    net.TransformDataFormat<DeviceType::CPU, float>(
-                                                    "Output", NHWC);
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  } else {
    OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
@@ -107,8 +107,8 @@ void RandomTest(const int block_size,
  // Add input data
  net.AddRandomInput<D, float>("Input", shape);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
      .Input("InputNCHW")
      .AddIntArg("block_size", block_size)
@@ -118,8 +118,8 @@ void RandomTest(const int block_size,
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
      .Input("Input")

--- a/mace/ops/split.cc
+++ b/mace/ops/split.cc
@@ -106,7 +106,7 @@ class SplitOp<DeviceType::GPU, T> : public Operation {
  explicit SplitOp(OpConstructContext *context)
      : Operation(context) {
    int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::SplitKernel<T>>(axis);
    } else {
      MACE_NOT_IMPLEMENTED;
@@ -147,7 +147,7 @@ void RegisterSplit(OpRegistryBase *op_registry) {
              [](OpConditionContext *context) -> std::set<DeviceType> {
                auto op = context->operator_def();
                if (op->output_shape_size() != op->output_size()) {
-                  return { DeviceType::CPU };
+                  return {DeviceType::CPU, DeviceType::GPU};
                }
                int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                    *op, "axis", 3);

--- a/mace/ops/sqrdiff_mean.cc
+++ b/mace/ops/sqrdiff_mean.cc
@@ -83,7 +83,7 @@ class SqrDiffMeanOp<DeviceType::GPU, T> : public Operation {
 public:
  explicit SqrDiffMeanOp(OpConstructContext *context)
      : Operation(context) {
-    if (context->device()->gpu_runtime()->UseImageMemory()) {
+    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::SqrDiffMeanKernel<T>>();
    } else {
      MACE_NOT_IMPLEMENTED;

--- a/mace/ops/sqrdiff_mean_test.cc
+++ b/mace/ops/sqrdiff_mean_test.cc
@@ -36,13 +36,13 @@ void Simple(const std::vector<index_t> &input_shape0,
  net.AddInputFromArray<D, float>("Input1", input_shape1, input1);
  net.TransformDataFormat<DeviceType::CPU, float>("Input0",
-                                                  NHWC,
+                                                  DataFormat::NHWC,
                                                  "InputNCHW0",
-                                                  NCHW);
+                                                  DataFormat::NCHW);
  net.TransformDataFormat<DeviceType::CPU, float>("Input1",
-                                                  NHWC,
+                                                  DataFormat::NHWC,
                                                  "InputNCHW1",
-                                                  NCHW);
+                                                  DataFormat::NCHW);
  if (D == DeviceType::CPU) {
    OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest")
@@ -54,9 +54,9 @@ void Simple(const std::vector<index_t> &input_shape0,
    net.RunOp(D);
    net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
-                                                    NCHW,
+                                                    DataFormat::NCHW,
                                                    "Output",
-                                                    NHWC);
+                                                    DataFormat::NHWC);
  } else {
    OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest")
        .Input("Input0")
@@ -107,10 +107,10 @@ void RandomTest(const std::vector<index_t> &input_shape0,
  net.AddRandomInput<D, float>("Input0", input_shape0);
  net.AddRandomInput<D, float>("Input1", input_shape1);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "InputNCHW0",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input0", DataFormat::NHWC, "InputNCHW0", DataFormat::NCHW);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "InputNCHW1",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input1", DataFormat::NHWC, "InputNCHW1", DataFormat::NCHW);
  OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest")
      .Input("InputNCHW0")
      .Input("InputNCHW1")
@@ -118,8 +118,8 @@ void RandomTest(const std::vector<index_t> &input_shape0,
      .Finalize(net.NewOperatorDef());
  // Run
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  "Output", NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest")
      .Input("Input0")
      .Input("Input1")

--- a/mace/ops/strided_slice_test.cc
+++ b/mace/ops/strided_slice_test.cc
@@ -86,8 +86,8 @@ void TestStridedSliceWithDataFormat(const std::vector<index_t> &input_shape,
  net.AddInputFromArray<CPU, int32_t>(
      "Strides", {static_cast<int32_t>(strides.size())}, strides);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("StridedSlice", "StridedSliceOpTest")
      .Input("InputNCHW")
@@ -105,8 +105,8 @@ void TestStridedSliceWithDataFormat(const std::vector<index_t> &input_shape,
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  net.AddInputFromArray<CPU, float>("ExpectedOutput", output_shape, output);
  ExpectTensorNear<float>(*net.GetOutput("ExpectedOutput"),
                          *net.GetOutput("Output"));
@@ -154,8 +154,8 @@ void TestSliceWithDataFormat(const std::vector<index_t> &input_shape,
  net.AddInputFromArray<CPU, int32_t>(
      "IndicesSize", {static_cast<int32_t>(indices_size.size())}, indices_size);
-  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NCHW);
+      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
  OpDefBuilder("StridedSlice", "StridedSliceOpTest")
      .Input("InputNCHW")
@@ -168,8 +168,8 @@ void TestSliceWithDataFormat(const std::vector<index_t> &input_shape,
  net.RunOp();
-  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
+  net.TransformDataFormat<DeviceType::CPU, float>(
-                                                  NHWC);
+      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
  net.AddInputFromArray<CPU, float>("ExpectedOutput", output_shape, output);
  ExpectTensorNear<float>(*net.GetOutput("ExpectedOutput"),
                          *net.GetOutput("Output"));

--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -34,10 +34,10 @@ class NetDef;
 enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 };
-enum DataFormat {
+enum class DataFormat {
-  DF_NONE = 0, NHWC = 1, NCHW = 2,
+  NONE = 0, NHWC = 1, NCHW = 2,
  HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103,
-  DF_AUTO = 1000,
+  AUTO = 1000,
 };
 enum GPUPerfHint {

--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -41,7 +41,7 @@ device_type_map = {'cpu': cvt.DeviceType.CPU.value,
                   'cpu+gpu': cvt.DeviceType.CPU.value}
 data_format_map = {
-    'NONE': cvt.DataFormat.DF_NONE,
+    'NONE': cvt.DataFormat.NONE,
    'NHWC': cvt.DataFormat.NHWC,
    'NCHW': cvt.DataFormat.NCHW,
    'OIHW': cvt.DataFormat.OIHW,

--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -26,14 +26,14 @@ class DeviceType(Enum):
 class DataFormat(Enum):
-    DF_NONE = 0
+    NONE = 0
    NHWC = 1
    NCHW = 2
    HWIO = 100
    OIHW = 101
    HWOI = 102
    OHWI = 103
-    DF_AUTO = 1000
+    AUTO = 1000
 # SAME_LOWER: if the amount of paddings to be added is odd,
@@ -598,8 +598,8 @@ class ConverterUtil(object):
            return DataFormat.NHWC
        elif arg.i == DataFormat.NCHW.value:
            return DataFormat.NCHW
-        elif arg.i == DataFormat.DF_AUTO.value:
+        elif arg.i == DataFormat.AUTO.value:
-            return DataFormat.DF_AUTO
+            return DataFormat.AUTO
        else:
            return None

--- a/mace/python/tools/converter_tool/onnx_converter.py
+++ b/mace/python/tools/converter_tool/onnx_converter.py
@@ -387,7 +387,8 @@ class OnnxConverter(base_converter.ConverterInterface):
        self._mace_net_def = mace_pb2.NetDef()
        self._data_format = DataFormat.NCHW
        ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
-        ConverterUtil.add_data_format_arg(self._mace_net_def, self._data_format)
+        ConverterUtil.add_data_format_arg(self._mace_net_def,
+                                          self._data_format)
        onnx_model = onnx.load(src_model_file)
        ir_version = onnx_model.ir_version
@@ -403,7 +404,7 @@ class OnnxConverter(base_converter.ConverterInterface):
            print("constains ops domain: ", domain, "version:", version)
            if 'kaldi2onnx' in domain:
                polish_available = False
-                self._data_format = DataFormat.DF_NONE
+                self._data_format = DataFormat.NONE
                self._isKaldi = True
        if polish_available:
            onnx_model = onnx.utils.polish_model(onnx_model)

--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -27,7 +27,7 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType
 from mace.python.tools.converter_tool.base_converter import FrameworkType
 from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
-from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps
+from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps  # noqa
 from mace.python.tools.converter_tool.base_converter import MaceMayHasDataFormatOps  # noqa
 from mace.python.tools.converter_tool.base_converter import PaddingMode
 from mace.python.tools.converter_tool.base_converter import ReduceType
@@ -200,15 +200,15 @@ class Transformer(base_converter.ConverterInterface):
                    op.output.extend([input_node.name])
                    output_shape = op.output_shape.add()
                    output_shape.dims.extend(input_node.shape)
-                    if input_node.data_format != DataFormat.DF_NONE:
+                    if input_node.data_format != DataFormat.NONE:
                        if input_node.data_format == DataFormat.NCHW:
                            self.transpose_shape(output_shape.dims,
                                                 [0, 3, 1, 2])
                        ConverterUtil.add_data_format_arg(op,
-                                                          DataFormat.DF_AUTO)
+                                                          DataFormat.AUTO)
                    else:
                        ConverterUtil.add_data_format_arg(op,
-                                                          DataFormat.DF_NONE)
+                                                          DataFormat.NONE)
                    self._producer[op.output[0]] = op
    @staticmethod
@@ -261,7 +261,7 @@ class Transformer(base_converter.ConverterInterface):
            producer = self._producer[tensor]
            return ConverterUtil.data_format(producer)
        else:
-            return DataFormat.DF_NONE
+            return DataFormat.NONE
    def consumer_count(self, tensor_name):
        return len(self._consumers.get(tensor_name, []))
@@ -1021,7 +1021,6 @@ class Transformer(base_converter.ConverterInterface):
                                       filter_format.name)
        return False
    def add_winograd_arg(self):
        if self._wino_arg == 0:
            return False
@@ -1350,20 +1349,21 @@ class Transformer(base_converter.ConverterInterface):
                df_arg = op.arg.add()
                df_arg.name = MaceKeyword.mace_data_format_str
            if op.type in MaceHasDataFormatOps:
-                df_arg.i = DataFormat.DF_AUTO.value
+                df_arg.i = DataFormat.AUTO.value
            elif op.type in MaceMayHasDataFormatOps:
-                input_df = DataFormat.DF_AUTO.value
+                input_df = DataFormat.AUTO.value
                for input_tensor in op.input:
                    if input_tensor in self._consts:
                        continue
-                    mace_check(input_tensor in self._producer,
+                    mace_check(
-                               "Input tensor %s not in producer" % input_tensor)
+                        input_tensor in self._producer,
+                        "Input tensor %s not in producer" % input_tensor)
                    father_op = self._producer[input_tensor]
                    temp_input_df = ConverterUtil.get_arg(
                        father_op, MaceKeyword.mace_data_format_str)
-                    if temp_input_df.i != DataFormat.DF_AUTO.value:
+                    if temp_input_df.i != DataFormat.AUTO.value:
                        input_df = temp_input_df.i
-                if input_df == DataFormat.DF_AUTO.value:
+                if input_df == DataFormat.AUTO.value:
                    df_arg.i = input_df
                    # add flag to mark the ops may has data format
                    has_data_format_arg = op.arg.add()
@@ -1379,7 +1379,7 @@ class Transformer(base_converter.ConverterInterface):
        src_data_format = ConverterUtil.data_format(net)
        for op in net.op:
            has_data_format = ConverterUtil.data_format(op) == \
-                              DataFormat.DF_AUTO
+                              DataFormat.AUTO
            # transpose args
            if op.type == MaceOp.Pad.name:
                for arg in op.arg:

--- a/mace/python/tools/model.jinja2
+++ b/mace/python/tools/model.jinja2
@@ -80,7 +80,7 @@ void CreateInputInfo(NetDef *net_def) {
  input_info = net_def->add_input_info();
  input_info->set_name({{ net.input_info[idx].name|tojson }});
  input_info->set_data_type(static_cast<DataType>({{ net.input_info[idx].data_type }}));
-  input_info->set_data_format(static_cast<DataFormat>({{ net.input_info[idx].data_format }}));
+  input_info->set_data_format({{ net.input_info[idx].data_format }});
  input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }});
  {% for dim in net.input_info[idx].dims %}
  input_info->add_dims({{ dim }});
@@ -97,7 +97,7 @@ void CreateOutputInfo(NetDef *net_def) {
  output_info = net_def->add_output_info();
  output_info->set_name({{ net.output_info[idx].name|tojson }});
  output_info->set_data_type(static_cast<DataType>({{ net.output_info[idx].data_type }}));
-  output_info->set_data_format(static_cast<DataFormat>({{ net.output_info[idx].data_format }}));
+  output_info->set_data_format({{ net.output_info[idx].data_format }});
  output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }});
  {% for dim in net.output_info[idx].dims %}
  output_info->add_dims({{dim}});

--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -48,7 +48,7 @@ void MaceRunFunc(const int in_out_size) {
  for (size_t i = 0; i < input_names.size(); ++i) {
    InputOutputInfo *info = net_def->add_input_info();
-    info->set_data_format(DataFormat::NHWC);
+    info->set_data_format(static_cast<int>(DataFormat::NHWC));
    info->set_name(input_names[i]);
    for (auto d : input_shapes[0]) {
      info->add_dims(static_cast<int>(d));

--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -45,7 +45,7 @@ void MaceRun(const int in_out_size,
  for (size_t i = 0; i < input_names.size(); ++i) {
    InputOutputInfo *info = net_def->add_input_info();
-    info->set_data_format(DataFormat::NHWC);
+    info->set_data_format(static_cast<int>(DataFormat::NHWC));
    info->set_name(input_names[i]);
    for (auto d : max_shape) {
      info->add_dims(static_cast<int>(d));

--- a/mace/test/mace_api_test.h
+++ b/mace/test/mace_api_test.h
@@ -76,7 +76,7 @@ void Conv3x3(const std::string &input_name,
      .AddIntArg("padding", Padding::SAME)
      .AddIntsArg("dilations", {1, 1})
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-      .AddIntArg("has_data_format", 1)
+      .AddIntArg("data_format", static_cast<int>(DataFormat::AUTO))
      .Finalize(&operator_def);
  OutputShape *shape = operator_def.add_output_shape();
@@ -99,7 +99,7 @@ void Relu(const std::string &input_name,
      .AddStringArg("activation", "RELU")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .AddIntArg("device", static_cast<int>(device_type))
-      .AddIntArg("has_data_format", 1)
+      .AddIntArg("data_format", static_cast<int>(DataFormat::AUTO))
      .Finalize(&operator_def);
  net_def->add_op()->CopyFrom(operator_def);
@@ -139,7 +139,8 @@ void CheckOutputs(const NetDef &net_def,
    if (D == DeviceType::CPU) {
      std::string input_name = input.first + "NHWC";
      net.AddInputFromArray<D, float>(input_name, input_shape, input_data);
-      net.TransformDataFormat<D, float>(input_name, NHWC, input.first, NCHW);
+      net.TransformDataFormat<D, float>(
+          input_name, DataFormat::NHWC, input.first, DataFormat::NCHW);
    } else {
      net.AddInputFromArray<D, float>(input.first, input_shape, input_data);
    }
@@ -154,7 +155,7 @@ void CheckOutputs(const NetDef &net_def,
    memcpy(data.data(),
           reinterpret_cast<const T *>(tensor_data.data()) + tensor.offset(),
           tensor.data_size() * sizeof(T));
-    net.AddInputFromArray<D, T>(tensor.name(), shape, data);
+    net.AddInputFromArray<D, T>(tensor.name(), shape, data, true);
  }
  net.RunNet(net_def, D);
@@ -175,9 +176,9 @@ void CheckOutputs(const NetDef &net_def,
    if (D == DeviceType::CPU) {
      output_name = output.first + "NHWC";
      net.TransformDataFormat<CPU, float>(output.first,
-                                          NCHW,
+                                          DataFormat::NCHW,
                                          output_name,
-                                          NHWC);
+                                          DataFormat::NHWC);
    }
    ops::test::ExpectTensorNear<float>(*tmp_tensor,
                                       *net.GetOutput(output_name.data()),

--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -91,7 +91,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
  } else if (data_format_str == "OIHW") {
    return DataFormat::OIHW;
  } else {
-    return DataFormat::DF_NONE;
+    return DataFormat::NONE;
  }
 }