From 95b32c2445f5ec9d02221d38c3ad68c0dd3c1857 Mon Sep 17 00:00:00 2001 From: liuqi Date: Mon, 22 Apr 2019 09:49:48 +0800 Subject: [PATCH] Update the memory type choose logic and polish some code. 1. Change DataFormat from enum to enum class. --- mace/benchmark/benchmark_model.cc | 2 +- mace/core/arg_helper.cc | 5 +- mace/core/arg_helper.h | 2 +- mace/core/memory_optimizer.cc | 3 +- mace/core/net.cc | 2 +- mace/core/net_def_adapter.cc | 165 +++++++++--------- mace/core/net_def_adapter.h | 24 ++- mace/core/net_optimizer.cc | 6 +- mace/core/net_optimizer.h | 13 ++ mace/core/operator.cc | 55 ++++-- mace/core/operator.h | 14 ++ mace/core/runtime/opencl/opencl_util.cc | 2 +- mace/core/workspace.cc | 2 +- mace/examples/cli/example.cc | 2 +- mace/libmace/capability.cc | 6 +- mace/libmace/mace.cc | 38 ++-- mace/ops/activation.cc | 6 +- mace/ops/activation_test.cc | 6 +- mace/ops/addn.cc | 6 +- mace/ops/arm/fp32/deconv_2d.cc | 2 +- mace/ops/batch_norm.cc | 2 +- mace/ops/batch_norm_test.cc | 38 ++-- mace/ops/batch_to_space.cc | 2 +- mace/ops/bias_add.cc | 6 +- mace/ops/bias_add_benchmark.cc | 2 - mace/ops/bias_add_test.cc | 24 +-- mace/ops/buffer_to_image_benchmark.cc | 1 - mace/ops/buffer_to_image_test.cc | 12 +- mace/ops/buffer_transform.cc | 5 +- mace/ops/buffer_transform_test.cc | 6 +- mace/ops/channel_shuffle.cc | 4 +- mace/ops/channel_shuffle_test.cc | 8 +- mace/ops/common/conv_pool_2d_util.cc | 76 ++++---- mace/ops/concat.cc | 8 +- mace/ops/conv_2d.cc | 10 +- mace/ops/conv_2d_test.cc | 108 ++++++------ mace/ops/crop.cc | 6 +- mace/ops/crop_test.cc | 12 +- mace/ops/cumsum_test.cc | 8 +- mace/ops/deconv_2d.cc | 6 +- mace/ops/deconv_2d_test.cc | 19 +- mace/ops/depth_to_space.cc | 2 +- mace/ops/depth_to_space_test.cc | 16 +- mace/ops/depthwise_conv2d.cc | 22 ++- mace/ops/depthwise_conv2d_test.cc | 30 ++-- mace/ops/depthwise_deconv2d.cc | 4 +- mace/ops/depthwise_deconv2d_test.cc | 19 +- mace/ops/eltwise.cc | 2 +- mace/ops/eltwise_test.cc | 50 +++--- 
mace/ops/folded_batch_norm_test.cc | 42 ++--- mace/ops/fully_connected.cc | 2 +- mace/ops/fully_connected_test.cc | 19 +- mace/ops/local_response_norm_test.cc | 6 +- mace/ops/lstm_cell.cc | 2 +- mace/ops/opencl/buffer_transformer.h | 5 +- mace/ops/ops_test_util.cc | 45 +++-- mace/ops/ops_test_util.h | 16 +- mace/ops/pad.cc | 2 +- mace/ops/pad_test.cc | 30 ++-- mace/ops/pooling.cc | 10 +- mace/ops/pooling_test.cc | 68 ++++---- mace/ops/reduce.cc | 7 +- mace/ops/reduce_test.cc | 18 +- mace/ops/ref/deconv_2d.cc | 2 +- mace/ops/ref/depthwise_deconv_2d.cc | 4 +- mace/ops/resize_bicubic.cc | 2 +- mace/ops/resize_bicubic_test.cc | 32 ++-- mace/ops/resize_bilinear.cc | 2 +- mace/ops/resize_bilinear_test.cc | 32 ++-- mace/ops/resize_nearest_neighbor.cc | 2 +- mace/ops/resize_nearest_neighbor_test.cc | 24 +-- mace/ops/softmax.cc | 2 +- mace/ops/softmax_test.cc | 12 +- mace/ops/space_to_batch.cc | 2 +- mace/ops/space_to_batch_test.cc | 48 ++--- mace/ops/space_to_depth.cc | 2 +- mace/ops/space_to_depth_test.cc | 16 +- mace/ops/split.cc | 4 +- mace/ops/sqrdiff_mean.cc | 2 +- mace/ops/sqrdiff_mean_test.cc | 24 +-- mace/ops/strided_slice_test.cc | 16 +- mace/public/mace.h | 6 +- mace/python/tools/converter.py | 2 +- .../tools/converter_tool/base_converter.py | 8 +- .../tools/converter_tool/onnx_converter.py | 5 +- .../tools/converter_tool/transformer.py | 26 +-- mace/python/tools/model.jinja2 | 4 +- mace/test/mace_api_mt_test.cc | 2 +- mace/test/mace_api_test.cc | 2 +- mace/test/mace_api_test.h | 13 +- mace/tools/validation/mace_run.cc | 2 +- 91 files changed, 791 insertions(+), 648 deletions(-) diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index e0dac730..98807b67 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -83,7 +83,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return 
DataFormat::NONE; } } diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index f2a6467b..2cb1379b 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -123,14 +123,13 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f) \ MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i) \ MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i) \ - MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i) \ - MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, std::string, s) + MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i) MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef) MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef) #undef MACE_SET_OPTIONAL_ARGUMENT_FUNC -std::string OutputMemoryTypeTagName() { +const std::string OutputMemoryTypeTagName() { static const char *kOutputMemTypeArgName = "output_mem_type"; return kOutputMemTypeArgName; } diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 5512fb06..e3a6319a 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -65,7 +65,7 @@ void SetProtoArg(NetDef *op_def, const std::string &arg_name, const T&value); -std::string OutputMemoryTypeTagName(); +const std::string OutputMemoryTypeTagName(); bool IsQuantizedModel(const NetDef &def); diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 9b572071..b781682f 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -126,7 +126,8 @@ void MemoryOptimizer::Optimize( DataFormat data_format = static_cast( ProtoArgHelper::GetOptionalArg( - *op_def, "data_format", DataFormat::DF_NONE)); + *op_def, "data_format", + static_cast(DataFormat::NONE))); int output_size = op_def->output_size(); for (int i = 0; i < output_size; ++i) { if (i < op_def->output_type_size()) { diff --git a/mace/core/net.cc b/mace/core/net.cc index c6e676d2..8c301dc7 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -76,7 +76,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, #ifdef 
MACE_ENABLE_OPENCL if (target_device_->device_type() == DeviceType::GPU) { - // update the map : output_tensor -> Operation + // update the map : output_tensor -> MemoryType MemoryType out_mem_type = static_cast( ProtoArgHelper::GetOptionalArg( diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc index fe89e810..7c7bb865 100644 --- a/mace/core/net_def_adapter.cc +++ b/mace/core/net_def_adapter.cc @@ -37,7 +37,7 @@ DataFormat GetDefaultDataFormat(DeviceType device_type, return DataFormat::NHWC; } else { LOG(FATAL) << "MACE do not support the device " << device_type; - return DataFormat::DF_NONE; + return DataFormat::NONE; } } @@ -50,19 +50,21 @@ std::string TransformedName(const std::string &input_name, return ss.str(); } +#ifdef MACE_ENABLE_OPENCL bool TransformRequiredOp(const std::string &op_type) { static const std::unordered_set kNoTransformOp = { "Shape", "InferConv2dShape" }; return kNoTransformOp.count(op_type) == 0; } +#endif // MACE_ENABLE_OPENCL void BuildTransposeOpDef( const std::string &input_name, const std::string &output_name, - const std::vector &output_shape, + const std::vector &output_shape, const std::vector dst_dims, - const mace::DataType dt, + const DataType dt, DeviceType device_type, OperatorDef *op_def) { std::string op_name = "mace_node_" + output_name; @@ -89,21 +91,13 @@ void BuildTransposeOpDef( } // namespace -NetDefAdapter::NetDefAdapter(const mace::OpRegistryBase *op_registry, - const mace::Workspace *ws) +NetDefAdapter::NetDefAdapter(const OpRegistryBase *op_registry, + const Workspace *ws) : op_registry_(op_registry), ws_(ws) {} -// Adapt original net_def to a better net. -// 1. Adapt device: choose best device for every op in the net. -// 2. Adapt data type: Add data type related transform ops -// for mixing precision. -// 3. Adapt data format: confirm data format of every op -// and add transpose if necessary. -// 4. 
Adapt memory type: Add BufferTransform if necessary -// for transforming memory type between ops. MaceStatus NetDefAdapter::AdaptNetDef( - const mace::NetDef *net_def, - mace::Device *target_device, + const NetDef *net_def, + Device *target_device, NetDef *target_net_def) { MACE_LATENCY_LOGGER(1, "Adapting original NetDef"); // Copy from original op_def, leave ops alone. @@ -115,7 +109,7 @@ MaceStatus NetDefAdapter::AdaptNetDef( std::unique_ptr cpu_device = make_unique( target_device->cpu_runtime()->num_threads(), target_device->cpu_runtime()->policy(), - target_device->cpu_runtime()->use_gemmlowp()); + &(target_device->cpu_runtime()->thread_pool())); // quantize model flag bool is_quantized_model = IsQuantizedModel(*net_def); @@ -131,40 +125,40 @@ MaceStatus NetDefAdapter::AdaptNetDef( std::vector(tensor.dims().begin(), tensor.dims().end()); } + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (target_device->device_type() == DeviceType::CPU) { + mem_type = MemoryType::CPU_BUFFER; + } else if (target_device->device_type() == DeviceType::GPU) { + mem_type = MemoryType::GPU_BUFFER; + } else { + LOG(FATAL) << "MACE do not support the device type: " + << target_device->device_type(); + } + int input_size = target_net_def->input_info_size(); for (int i = 0; i < input_size; ++i) { auto input_info = target_net_def->mutable_input_info(i); - MemoryType mem_type = MemoryType::CPU_BUFFER; - if (target_device->device_type() == DeviceType::CPU) { - mem_type = MemoryType::CPU_BUFFER; - } else if (target_device->device_type() == DeviceType::GPU) { - mem_type = MemoryType::GPU_BUFFER; - } else { - LOG(FATAL) << "MACE do not support the device type: " - << target_device->device_type(); - } - DataFormat input_data_format = static_cast( + auto input_data_format = static_cast( input_info->data_format()); DataFormat expected_data_format = GetDefaultDataFormat( target_device->device_type(), is_quantized_model); - std::vector input_shape = - std::vector(input_info->dims().begin(), - 
input_info->dims().end()); - if (input_data_format != DataFormat::DF_NONE + std::vector input_shape(input_info->dims().begin(), + input_info->dims().end()); + if (input_data_format != DataFormat::NONE && input_data_format != expected_data_format && input_shape.size() == 4) { if (input_data_format == DataFormat::NHWC && expected_data_format == DataFormat::NCHW) { - std::vector dst_dims = {0, 3, 1, 2}; + std::vector dst_dims{0, 3, 1, 2}; input_data_format = DataFormat::NCHW; input_shape = TransposeShape(input_shape, dst_dims); } else if (input_data_format == DataFormat::NCHW && expected_data_format == DataFormat::NHWC) { - std::vector dst_dims = {0, 2, 3, 1}; + std::vector dst_dims{0, 2, 3, 1}; input_data_format = DataFormat::NHWC; input_shape = TransposeShape(input_shape, dst_dims); } - input_info->set_data_format(input_data_format); + input_info->set_data_format(static_cast(input_data_format)); int input_shape_size = input_shape.size(); for (int j = 0; j < input_shape_size; ++j) { input_info->set_dims(j, input_shape[j]); @@ -287,9 +281,10 @@ MaceStatus NetDefAdapter::AdaptNetDef( internal_output_info.data_format, transformed_op_def); // set data format arg - SetProtoArg(transformed_op_def, - "data_format", - internal_output_info.data_format); + SetProtoArg( + transformed_op_def, + "data_format", + static_cast(internal_output_info.data_format)); // set output memory type argument SetProtoArg(transformed_op_def, OutputMemoryTypeTagName(), @@ -309,7 +304,7 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, const TensorInfoMap &output_map, const NetDef *net_def, OperatorDef *op_def) { - VLOG(1) << "Adapt device for op " << op_def->name(); + VLOG(3) << "Adapt device for op " << op_def->name(); DeviceType target_device_type = target_device->device_type(); DeviceType device_type = DeviceType::CPU; context->set_device(cpu_device); @@ -335,15 +330,18 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, producer_devices); if (device_type 
== target_device_type) { context->set_device(target_device); + } else { + LOG(INFO) << "Op " << op_def->name() << " fall back to CPU"; } } op_def->set_device_type(device_type); return MaceStatus::MACE_SUCCESS; } -MaceStatus NetDefAdapter::AdaptDataType(mace::OpConditionContext *context, - mace::OperatorDef *op_def) { +MaceStatus NetDefAdapter::AdaptDataType(OpConditionContext *context, + OperatorDef *op_def) { MACE_UNUSED(context); + // Where to add logic to support mixing precision // Adjust data type of op ran on CPU DataType dtype = static_cast( ProtoArgHelper::GetOptionalArg( @@ -355,20 +353,20 @@ MaceStatus NetDefAdapter::AdaptDataType(mace::OpConditionContext *context, } MaceStatus NetDefAdapter::AdaptDataFormat( - mace::OpConditionContext *context, - mace::OperatorDef *op_def, + OpConditionContext *context, + OperatorDef *op_def, bool is_quantized_model, TensorInfoMap *output_map, std::unordered_set *transformed_set, DataFormat *op_output_df, - mace::NetDef *target_net_def) { - VLOG(1) << "Adapt data format for op " << op_def->name(); - MACE_UNUSED(context); + NetDef *target_net_def) { + VLOG(3) << "Adapt data format for op " << op_def->name(); DataFormat op_data_format = static_cast(ProtoArgHelper::GetOptionalArg( - *op_def, "data_format", 0)); + *op_def, "data_format", + static_cast(DataFormat::NONE))); // adjust the data format of operation - if (op_data_format == DataFormat::DF_AUTO) { + if (op_data_format == DataFormat::AUTO) { op_data_format = GetDefaultDataFormat( static_cast(op_def->device_type()), is_quantized_model); SetProtoArg(op_def, "data_format", static_cast(op_data_format)); @@ -376,14 +374,15 @@ MaceStatus NetDefAdapter::AdaptDataFormat( int output_shape_size = op_def->output_shape_size(); for (int i = 0; i < output_shape_size; ++i) { auto output_shape = op_def->mutable_output_shape(i); - if (output_shape->dims_size() == 4) { - // transpose output shape format from NHWC to NCHW - int64_t height = output_shape->dims(1); - int64_t width = 
output_shape->dims(2); - output_shape->set_dims(1, output_shape->dims(3)); - output_shape->set_dims(2, height); - output_shape->set_dims(3, width); - } + MACE_CHECK(output_shape->dims_size() == 4, + "Output shape should be 4D if the of has data format. ", + op_def->name()); + // transpose output shape format from NHWC to NCHW + int64_t height = output_shape->dims(1); + int64_t width = output_shape->dims(2); + output_shape->set_dims(1, output_shape->dims(3)); + output_shape->set_dims(2, height); + output_shape->set_dims(3, width); } } } @@ -394,8 +393,8 @@ MaceStatus NetDefAdapter::AdaptDataFormat( if (op_def->device_type() == DeviceType::GPU) { target_mem_type = MemoryType::GPU_BUFFER; } - // Use op's data format as inputs' data format for now. - // Could move the logic to OpRegistry if necessary. + auto inputs_data_format = op_registry_->InputsDataFormat(op_def->type(), + context); DataFormat src_df, dst_df; int input_size = op_def->input_size(); for (int i = 0; i < input_size; ++i) { @@ -408,20 +407,21 @@ MaceStatus NetDefAdapter::AdaptDataFormat( continue; } src_df = output_map->at(op_def->input(i)).data_format; - dst_df = op_data_format; - if (src_df == DataFormat::DF_NONE - || dst_df == DataFormat::DF_NONE + dst_df = inputs_data_format[i]; + if (src_df == DataFormat::NONE + || dst_df == DataFormat::NONE || output_map->at(op_def->input(i)).shape.size() != 4) { continue; } if (src_df != dst_df) { std::string transformed_name = TransformedName(op_def->input(i), - "data_format", dst_df); + "data_format", static_cast(dst_df)); if (transformed_set->count(transformed_name) == 0) { VLOG(1) << "Add Transpose operation " << op_def->name() << " to transpose tensor " << op_def->input(i) << "', from data format " - << src_df << " to " << dst_df; + << static_cast(src_df) << " to " + << static_cast(dst_df); // Only support transpose between NHWC and NCHW for now. 
std::vector dst_dims; if (src_df == DataFormat::NCHW && dst_df == DataFormat::NHWC) { @@ -430,7 +430,8 @@ MaceStatus NetDefAdapter::AdaptDataFormat( dst_dims = {0, 3, 1, 2}; } else { LOG(FATAL) << "Encounter unsupported data format transpose from " - << src_df << " to " << dst_df; + << static_cast(src_df) << " to " + << static_cast(dst_df); } auto &input_info = output_map->at(op_def->input(i)); auto output_shape = input_info.shape.empty() ? @@ -449,7 +450,7 @@ MaceStatus NetDefAdapter::AdaptDataFormat( // set data format arg SetProtoArg(transpose_op_def, "data_format", - dst_df); + static_cast(dst_df)); // set output memory type argument SetProtoArg(transpose_op_def, OutputMemoryTypeTagName(), @@ -475,20 +476,20 @@ MaceStatus NetDefAdapter::AdaptDataFormat( } MaceStatus NetDefAdapter::AdaptMemoryType( - mace::OpConditionContext *context, - mace::OperatorDef *op_def, - mace::NetDefAdapter::TensorInfoMap *output_map, + OpConditionContext *context, + OperatorDef *op_def, + NetDefAdapter::TensorInfoMap *output_map, std::unordered_set *transformed_set, MemoryType *op_output_mem_types, - mace::NetDef *target_net_def) { - VLOG(1) << "Adapt memory type for op " << op_def->name(); + NetDef *target_net_def) { + VLOG(3) << "Adapt memory type for op " << op_def->name(); // Get expected output memory type // (only support one kind of memory type for multiple outputs) op_registry_->GetInOutMemoryTypes(op_def->type(), context); #ifdef MACE_ENABLE_OPENCL - int input_size = op_def->input_size(); // if op is memory-unused op, no transformation if (TransformRequiredOp(op_def->type())) { + int input_size = op_def->input_size(); for (int i = 0; i < input_size; ++i) { if (output_map->count(op_def->input(i)) == 0) { MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr @@ -498,14 +499,14 @@ MaceStatus NetDefAdapter::AdaptMemoryType( continue; } auto &input_info = output_map->at(op_def->input(i)); - if (input_info.data_format == DataFormat::DF_NONE - || input_info.shape.size() != 4) { - 
continue; - } // check whether to do transform MemoryType src_mem_type = input_info.mem_type; MemoryType dst_mem_type = context->GetInputMemType(i); - if (src_mem_type != dst_mem_type) { + auto wanted_input_dtype = context->GetInputDataType(i); + if (src_mem_type != dst_mem_type || + (input_info.dtype != wanted_input_dtype && + (src_mem_type != MemoryType::CPU_BUFFER + || dst_mem_type != MemoryType::CPU_BUFFER))) { auto transformed_name = TransformedName(op_def->input(i), "mem_type", dst_mem_type); @@ -521,7 +522,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType( op_def->input(i), input_info.shape, transformed_name, - context->GetInputDataType(i), + wanted_input_dtype, context->GetInputOpenCLBufferType(i), dst_mem_type, input_info.data_format, @@ -529,7 +530,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType( // set data format arg SetProtoArg(transformed_op_def, "data_format", - input_info.data_format); + static_cast(input_info.data_format)); // set output memory type argument SetProtoArg(transformed_op_def, OutputMemoryTypeTagName(), @@ -564,7 +565,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType( return MaceStatus::MACE_SUCCESS; } -std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) { +std::string NetDefAdapter::DebugString(const NetDef *net_def) { std::stringstream sstream; auto DeviceTypeToStrFunc = [](DeviceType device_type) -> std::string { if (device_type == DeviceType::CPU) { @@ -591,10 +592,10 @@ std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) { return "NHWC"; } else if (type == DataFormat::NCHW) { return "NCHW"; - } else if (type == DataFormat::DF_NONE) { - return "DF_NONE"; - } else if (type == DataFormat::DF_AUTO) { - return "DT_AUTO"; + } else if (type == DataFormat::NONE) { + return "NONE"; + } else if (type == DataFormat::AUTO) { + return "AUTO"; } else if (type == DataFormat::OIHW) { return "OIHW"; } else { @@ -615,7 +616,7 @@ std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) { std::string data_format = 
DataFormatToStrFunc( static_cast( ProtoArgHelper::GetOptionalArg( - op, "data_format", 0))); + op, "data_format", static_cast(DataFormat::NONE)))); sstream << std::endl; sstream << "{" << std::endl; diff --git a/mace/core/net_def_adapter.h b/mace/core/net_def_adapter.h index 7f3a6754..d821ed81 100644 --- a/mace/core/net_def_adapter.h +++ b/mace/core/net_def_adapter.h @@ -32,16 +32,22 @@ class OpRegistryBase; class Workspace; class Device; -/** - * Conventions: - * 1. DataFormat::DT_AUTO stands for formatted (NHWC or NCHW) - * 2. if Op with DataFormat::DT_AUTO, the arguments of this op - * is formatted to NHWC - */ +/// Conventions: +/// 1. DataFormat::AUTO stands for formatted (NHWC or NCHW) +/// 2. if Op with DataFormat::AUTO, the arguments of this op +/// is formatted to NHWC class NetDefAdapter { public: NetDefAdapter(const OpRegistryBase *op_registry, const Workspace *ws); + // Adapt original net_def to a better net. + // 1. Adapt device: choose best device for every op in the net. + // 2. Adapt data type: Add data type related transform ops + // for mixing precision. + // 3. Adapt data format: confirm data format of every op + // and add transpose if necessary. + // 4. Adapt memory type: Add BufferTransform if necessary + // for transforming memory type between ops. 
MaceStatus AdaptNetDef( const NetDef *net_def, Device *target_device, NetDef *target_net_def); @@ -91,12 +97,12 @@ class NetDefAdapter { NetDef *target_net_def); MaceStatus AdaptMemoryType( - mace::OpConditionContext *context, - mace::OperatorDef *op_def, + OpConditionContext *context, + OperatorDef *op_def, TensorInfoMap *output_map, std::unordered_set *transformed_set, MemoryType *op_output_mem_types, - mace::NetDef *target_net_def); + NetDef *target_net_def); std::string DebugString(const NetDef *net_def); diff --git a/mace/core/net_optimizer.cc b/mace/core/net_optimizer.cc index 565a42c1..4382b51b 100644 --- a/mace/core/net_optimizer.cc +++ b/mace/core/net_optimizer.cc @@ -19,10 +19,10 @@ namespace mace { DeviceType NetOptimizer::SelectBestDevice( - const mace::OperatorDef *op_def, + const OperatorDef *op_def, DeviceType target_device_type, - const std::set &available_devices, - const std::vector &inputs_op_devices) { + const std::set &available_devices, + const std::vector &inputs_op_devices) { static const std::set kComputeIntensiveOps = { "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d", "FullyConnected" }; diff --git a/mace/core/net_optimizer.h b/mace/core/net_optimizer.h index 8ec8dc23..23f1897c 100644 --- a/mace/core/net_optimizer.h +++ b/mace/core/net_optimizer.h @@ -23,8 +23,21 @@ namespace mace { +/// Any optimization for Net could be put in here in the future. class NetOptimizer { public: + /// Select the best device for the op to support mixed usage of CPU and GPU. + /// Greedy strategy: one way to the end. If the op falls back to CPU, then + /// the follow-up ops will run on CPU too until meeting + /// some compute-intensive ops (Convolution) to + /// reduce the memory copy between CPU and GPU. + /// Simple but effective. 
+ /// + /// \param op_def the op + /// \param target_device target device to run on + /// \param available_devices available devices of the op + /// \param inputs_op_devices devices that the parent ops run on + /// \return Best device for the op_def DeviceType SelectBestDevice(const OperatorDef *op_def, DeviceType target_device, const std::set &available_devices, diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 275189a7..605ae3a7 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -21,22 +21,22 @@ namespace mace { OpConditionContext::OpConditionContext( - const mace::Workspace *ws, - mace::OpConditionContext::TensorShapeMap *info) + const Workspace *ws, + OpConditionContext::TensorShapeMap *info) : operator_def_(nullptr), ws_(ws), device_(nullptr), tensor_shape_info_(info) {} void OpConditionContext::set_operator_def( - const mace::OperatorDef *operator_def) { + const OperatorDef *operator_def) { operator_def_ = operator_def; input_data_types_.clear(); } void OpConditionContext::SetInputInfo(size_t idx, - mace::MemoryType mem_type, - mace::DataType dt) { + MemoryType mem_type, + DataType dt) { if (input_mem_types_.empty()) { // the default inputs' memory types are the same as the output memory type. 
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_); @@ -53,7 +53,7 @@ void OpConditionContext::SetInputInfo(size_t idx, input_data_types_[idx] = dt; } -void OpConditionContext::set_output_mem_type(mace::MemoryType type) { +void OpConditionContext::set_output_mem_type(MemoryType type) { MACE_CHECK(operator_def_ != nullptr); output_mem_type_ = type; input_mem_types_.clear(); @@ -106,7 +106,7 @@ OpConstructContext::OpConstructContext(Workspace *ws) device_(nullptr) {} void OpConstructContext::set_operator_def( - std::shared_ptr operator_def) { + std::shared_ptr operator_def) { operator_def_ = operator_def; } @@ -225,9 +225,20 @@ OpRegistrationInfo::OpRegistrationInfo() { context->set_output_mem_type(MemoryType::CPU_BUFFER); } }; + + data_format_selector = [](OpConditionContext *context) + -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return std::vector(context->operator_def()->input_size(), + op_data_format); + }; } -void OpRegistrationInfo::AddDevice(mace::DeviceType device) { +void OpRegistrationInfo::AddDevice(DeviceType device) { devices.insert(device); } @@ -239,9 +250,9 @@ void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) { MaceStatus OpRegistryBase::Register( const std::string &op_type, - const mace::DeviceType device_type, - const mace::DataType dt, - mace::OpRegistrationInfo::OpCreator creator) { + const DeviceType device_type, + const DataType dt, + OpRegistrationInfo::OpCreator creator) { if (registry_.count(op_type) == 0) { registry_[op_type] = std::unique_ptr( new OpRegistrationInfo); @@ -277,12 +288,20 @@ const std::set OpRegistryBase::AvailableDevices( void OpRegistryBase::GetInOutMemoryTypes( const std::string &op_type, - mace::OpConditionContext *context) const { + OpConditionContext *context) const { MACE_CHECK(registry_.count(op_type) != 0, op_type, " operation is not 
registered."); return registry_.at(op_type)->memory_type_setter(context); } +const std::vector OpRegistryBase::InputsDataFormat( + const std::string &op_type, + OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + return registry_.at(op_type)->data_format_selector(context); +} + std::unique_ptr OpRegistryBase::CreateOperation( OpConstructContext *context, DeviceType device_type) const { @@ -321,11 +340,17 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc( } OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter( - mace::OpRegistrationInfo::MemoryTypeSetter setter) { + OpRegistrationInfo::MemoryTypeSetter setter) { memory_type_setter_ = setter; return *this; } +OpConditionBuilder& OpConditionBuilder::SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector) { + data_format_selector_ = selector; + return *this; +} + void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { if (info != nullptr) { if (placer_) { @@ -334,6 +359,10 @@ void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { if (memory_type_setter_) { info->memory_type_setter = memory_type_setter_; } + + if (data_format_selector_) { + info->data_format_selector = data_format_selector_; + } } } diff --git a/mace/core/operator.h b/mace/core/operator.h index 35effdc5..9430d90d 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -117,6 +117,14 @@ class OpConstructContext { inline Device *device() const { return device_; } +#ifdef MACE_ENABLE_OPENCL + inline MemoryType GetOpMemoryType() const { + return static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, OutputMemoryTypeTagName(), + static_cast(MemoryType::CPU_BUFFER))); + } +#endif // MACE_ENABLE_OPENCL private: std::shared_ptr operator_def_; @@ -270,6 +278,9 @@ class OpConditionBuilder { OpConditionBuilder &SetInputMemoryTypeSetter( OpRegistrationInfo::MemoryTypeSetter setter); + 
OpConditionBuilder &SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector); + void Finalize(OpRegistrationInfo *info) const; private: @@ -297,6 +308,9 @@ class OpRegistryBase { void GetInOutMemoryTypes( const std::string &op_type, OpConditionContext *context) const; + const std::vector InputsDataFormat( + const std::string &op_type, OpConditionContext *context) const; + std::unique_ptr CreateOperation( OpConstructContext *context, DeviceType device_type) const; diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc index 9f9001f3..20ae6a2b 100644 --- a/mace/core/runtime/opencl/opencl_util.cc +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -173,7 +173,7 @@ void OpenCLUtil::BuildTransformOpDef( arg->set_i(static_cast(dt)); arg = op_def->add_arg(); arg->set_name("data_format"); - arg->set_i(data_format); + arg->set_i(static_cast(data_format)); if (!input_shape.empty()) { OutputShape *shape = op_def->add_output_shape(); for (auto value : input_shape) { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index aa482bee..f1740765 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -269,7 +269,7 @@ MaceStatus Workspace::PreallocateOutputTensor( tensor_mem.second.data_type, false, tensor_mem.first)); tensor->set_data_format(tensor_mem.second.data_format); - if (tensor_mem.second.data_format != DataFormat::DF_NONE) { + if (tensor_mem.second.data_format != DataFormat::NONE) { if (mem_blocks[tensor_mem.second.mem_id].mem_type() == MemoryType::GPU_IMAGE) { VLOG(1) << "Tensor: " << tensor_mem.first diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index bbb7c710..054231e9 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -94,7 +94,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } } diff 
--git a/mace/libmace/capability.cc b/mace/libmace/capability.cc index d37a62b6..46896fcd 100644 --- a/mace/libmace/capability.cc +++ b/mace/libmace/capability.cc @@ -143,7 +143,7 @@ void BMNet::SetUp() { // Add input and output information for (size_t i = 0; i < input_names_.size(); ++i) { InputOutputInfo *info = net_.add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names_[i]); for (auto d : input_shapes_[i]) { info->add_dims(static_cast(d)); @@ -244,7 +244,7 @@ void BMNet::AddConv(const std::string &conv_type, op_def->add_output(output_name); AddIntsArg(op_def, "strides", strides); AddIntArg(op_def, "padding", padding_type); - AddIntArg(op_def, "has_data_format", 1); + AddIntArg(op_def, "data_format", static_cast(DataFormat::AUTO)); AddIntArg(op_def, "T", DT_HALF); if (has_relu6) { AddStringArg(op_def, "activation", "RELUX"); @@ -271,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name, op_def->add_output(output); AddIntArg(op_def, "type", type); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "has_data_format", 1); + AddIntArg(op_def, "data_format", static_cast(DataFormat::AUTO)); OutputShape *shape = op_def->add_output_shape(); for (auto dim : output_shape) { shape->add_dims(dim); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index f00ce2e6..08aaf9f3 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -283,9 +283,9 @@ MaceTensor::MaceTensor(const std::vector &shape, std::shared_ptr data, const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); - MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC - || format == DataFormat::NCHW || format == OIHW, - "MACE only support DF_NONE, NHWC, NCHW and OIHW " + MACE_CHECK(format == DataFormat::NONE || format == DataFormat::NHWC + || format == DataFormat::NCHW || format == DataFormat::OIHW, + "MACE only support NONE, NHWC, NCHW and OIHW " "formats of input now."); impl_ = 
make_unique(); impl_->shape = shape; @@ -496,7 +496,7 @@ MaceStatus MaceEngine::Impl::Init( DataType output_dt = output_info_map_[output_name].data_type(); Tensor *output_tensor = ws_->CreateTensor(output_name, device_->allocator(), output_dt); - output_tensor->set_data_format(NHWC); + output_tensor->set_data_format(DataFormat::NHWC); #endif } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) @@ -585,14 +585,14 @@ MaceEngine::Impl::~Impl() { MaceStatus MaceEngine::Impl::TransposeInput( const std::pair &input, Tensor *input_tensor) { - bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE; - DataFormat data_format = DataFormat::DF_NONE; + bool has_data_format = input_tensor->data_format() != DataFormat::NONE; + DataFormat data_format = DataFormat::NONE; DataType input_dt = input_tensor->dtype(); if (has_data_format) { std::vector dst_dims; if (device_->device_type() == DeviceType::CPU && input.second.shape().size() == 4 && - input.second.data_format() == NHWC && + input.second.data_format() == DataFormat::NHWC && !is_quantized_model_) { VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; input_tensor->set_data_format(DataFormat::NCHW); @@ -654,28 +654,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput( DataType output_dt = output_tensor->dtype(); // save output if (output_tensor != nullptr && output->second.data() != nullptr) { - if (output_tensor->data_format() != DataFormat::DF_NONE && - output->second.data_format() != DataFormat::DF_NONE && + if (output_tensor->data_format() != DataFormat::NONE && + output->second.data_format() != DataFormat::NONE && output->second.shape().size() == 4 && output->second.data_format() != output_tensor->data_format()) { VLOG(1) << "Transform output " << output->first << " from " - << output_tensor->data_format() << " to " - << output->second.data_format(); + << static_cast(output_tensor->data_format()) << " to " + << static_cast(output->second.data_format()); std::vector dst_dims; - 
if (output_tensor->data_format() == NCHW && - output->second.data_format() == NHWC) { + if (output_tensor->data_format() == DataFormat::NCHW && + output->second.data_format() == DataFormat::NHWC) { dst_dims = {0, 2, 3, 1}; - } else if (output_tensor->data_format() == NHWC && - output->second.data_format() == NCHW) { + } else if (output_tensor->data_format() == DataFormat::NHWC && + output->second.data_format() == DataFormat::NCHW) { dst_dims = {0, 3, 1, 2}; } else { LOG(FATAL) << "Not supported output data format: " - << output->second.data_format() << " vs " - << output_tensor->data_format(); + << static_cast(output->second.data_format()) << " vs " + << static_cast(output_tensor->data_format()); } VLOG(1) << "Transform output " << output->first << " from " - << output_tensor->data_format() << " to " - << output->second.data_format(); + << static_cast(output_tensor->data_format()) << " to " + << static_cast(output->second.data_format()); std::vector shape = TransposeShape(output_tensor->shape(), dst_dims); diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 1d697488..6cb21b5c 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -96,7 +96,7 @@ class ActivationOp : public Operation { auto leakyrelu_coefficient = static_cast( Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( type, relux_max_limit, leakyrelu_coefficient); @@ -140,11 +140,13 @@ void RegisterActivation(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != 
op->output_size()) || op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f16cf060..c2c95882 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -207,7 +207,8 @@ void TestSimplePrelu() { // Run net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Activation", "PreluTest") .Input("InputNCHW") .Input("Alpha") @@ -217,7 +218,8 @@ void TestSimplePrelu() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor( diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index d5175180..27bce71b 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -69,7 +69,7 @@ class AddNOp : public Operation { public: explicit AddNOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -109,11 +109,13 @@ void RegisterAddN(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != op->output_size()) || op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/fp32/deconv_2d.cc index a80d6d64..41a01a6c 100644 --- a/mace/ops/arm/fp32/deconv_2d.cc +++ b/mace/ops/arm/fp32/deconv_2d.cc @@ -54,7 +54,7 @@ MaceStatus 
Deconv2dBase::ResizeOutAndPadOut( out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index c6559032..4e303d07 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -174,7 +174,7 @@ class BatchNormOp : public Operation { float leakyrelu_coefficient = Operation::GetOptionalArg( "leakyrelu_coefficient", 0.0f); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( epsilon, activation, relux_max_limit, leakyrelu_coefficient); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 495a2409..83c8219f 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -34,7 +34,8 @@ void Simple() { net.AddInputFromArray("Var", {1}, {11.67f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") .Input("Scale") @@ -47,7 +48,8 @@ void Simple() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BatchNorm", "BatchNormTest") .Input("Input") @@ -93,8 +95,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BatchNorm", "BatchNormTest") @@ -112,8 +114,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - 
net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -163,8 +165,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -179,8 +181,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -230,8 +232,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -246,8 +248,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -296,8 +298,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -312,8 +314,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // run cpu 
net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index c44501f1..03ac91ff 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -264,7 +264,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) : BatchToSpaceOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 7991a088..72e93fec 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -103,7 +103,7 @@ class BiasAddOp : public Operation { : Operation(context), has_data_format_(Operation::GetOptionalArg("has_data_format", 1)) { MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { @@ -151,11 +151,13 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != op->output_size()) || op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 34f6a713..8c51b703 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -27,9 +27,7 @@ void BiasAdd(int iters, int batch, 
int channels, int height, int width) { OpsTestNet net; // Add input data - DataFormat data_format = NHWC; if (D == DeviceType::CPU) { - data_format = NCHW; net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 2e4764ca..0126abb9 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -31,8 +31,8 @@ void BiasAddSimple() { net.AddInputFromArray("Bias", {1}, {0.5f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") @@ -41,8 +41,8 @@ void BiasAddSimple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("Input") @@ -83,8 +83,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { {batch, height, width, channels}); net.AddRandomInput("Bias", {channels}, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BiasAdd", "BiasAddTest") @@ -97,8 +97,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -132,8 +132,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { {batch, height, width, channels}); net.AddRandomInput("Bias", {channels}, true, true); - net.TransformDataFormat("Input", 
NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BiasAdd", "BiasAddTest") @@ -146,8 +146,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 92733d61..2a8c42b3 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -48,7 +48,6 @@ void FilterBufferToImage(int iters, OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE, 0, - DataFormat::NHWC, b2i_output); }; diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index a819b6a7..cb52eafe 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -37,14 +37,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -178,14 +178,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, 
net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DT_FLOAT); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -218,14 +218,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, // Transform OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index f8bf025d..7e59b339 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -39,14 +39,11 @@ class BufferTransformOp : public Operation { auto type = static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(CONV2D_FILTER))); - DataFormat data_format = static_cast( - Operation::GetOptionalArg("data_format", DataFormat::DF_NONE)); MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( - context, input, type, out_mem_type_, wino_blk_size_, - 
data_format, output); + context, input, type, out_mem_type_, wino_blk_size_, output); } private: diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc index b3f68a31..a9af4bc9 100644 --- a/mace/ops/buffer_transform_test.cc +++ b/mace/ops/buffer_transform_test.cc @@ -48,7 +48,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_BUFFER) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, bt_output); + type, MemoryType::GPU_BUFFER, 0, bt_output); // Inverse Transform Tensor *output = net.ws()->CreateTensor( @@ -57,7 +57,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_BUFFER) .Transform(&context, bt_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, output); + type, MemoryType::GPU_BUFFER, 0, output); if (DataTypeToEnum::value == DataTypeToEnum::value) { EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), @@ -94,7 +94,7 @@ void TestArgumentTransform(const index_t input_size) { MemoryType::GPU_BUFFER) .Transform(&context, net.ws()->GetTensor("Input"), OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, - 0, DataFormat::NHWC, output); + 0, output); index_t expected_size = RoundUp(input_size, 4); EXPECT_EQ(expected_size, output->buffer_shape()[0]); diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 09811828..d68ebbbe 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -82,7 +82,7 @@ class ChannelShuffleOp : public Operation { explicit ChannelShuffleOp(OpConstructContext *context) : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(groups); } else { MACE_NOT_IMPLEMENTED; @@ -119,7 +119,7 @@ void 
RegisterChannelShuffle(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU }; + return { DeviceType::CPU, DeviceType::GPU }; } int groups = ProtoArgHelper::GetOptionalArg( *op, "group", 1); diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index d59b45d8..4e25448b 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -28,8 +28,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { "Input", {1, 1, 2, 8}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") @@ -40,8 +40,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor( diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index 2ca95a7d..43988881 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -40,19 +40,19 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, index_t input_height = 0, input_width = 0; index_t kernel_height = 0, kernel_width = 0; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { input_height = input_shape[2]; input_width = input_shape[3]; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { input_height = input_shape[1]; input_width = input_shape[2]; } else { MACE_NOT_IMPLEMENTED; } - if (filter_format == OIHW) { + if (filter_format == DataFormat::OIHW) { kernel_height = filter_shape[2]; kernel_width = filter_shape[3]; - } else if 
(filter_format == OHWI) { + } else if (filter_format == DataFormat::OHWI) { kernel_height = filter_shape[1]; kernel_width = filter_shape[2]; } else { @@ -97,11 +97,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, 0, (output_width - 1) * strides[1] + k_extent_width - input_width); output_shape[0] = input_shape[0]; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { output_shape[1] = output_channels; output_shape[2] = output_height; output_shape[3] = output_width; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { output_shape[1] = output_height; output_shape[2] = output_width; output_shape[3] = output_channels; @@ -117,7 +117,8 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW Padding padding, index_t *output_shape, int *padding_size) { - CalcPaddingAndOutputSize(input_shape, NCHW, filter_shape, OIHW, dilations, + CalcPaddingAndOutputSize(input_shape, DataFormat::NCHW, filter_shape, + DataFormat::OIHW, dilations, strides, padding, output_shape, padding_size); } @@ -128,7 +129,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC Padding padding, index_t *output_shape, int *padding_size) { - CalcPaddingAndOutputSize(input_shape, NHWC, filter_shape, OIHW, dilations, + CalcPaddingAndOutputSize(input_shape, DataFormat::NHWC, filter_shape, + DataFormat::OIHW, dilations, strides, padding, output_shape, padding_size); } @@ -151,19 +153,19 @@ void CalcOutputSize(const index_t *input_shape, index_t input_height = 0, input_width = 0; index_t kernel_height = 0, kernel_width = 0; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { input_height = input_shape[2]; input_width = input_shape[3]; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { input_height = input_shape[1]; input_width = input_shape[2]; } else { MACE_NOT_IMPLEMENTED; } - if (filter_format == OIHW) { + if (filter_format == DataFormat::OIHW) { 
kernel_height = filter_shape[2]; kernel_width = filter_shape[3]; - } else if (filter_format == OHWI) { + } else if (filter_format == DataFormat::OHWI) { kernel_height = filter_shape[1]; kernel_width = filter_shape[2]; } else { @@ -195,11 +197,11 @@ void CalcOutputSize(const index_t *input_shape, } output_shape[0] = input_shape[0]; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { output_shape[1] = output_channels; output_shape[2] = output_height; output_shape[3] = output_width; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { output_shape[1] = output_height; output_shape[2] = output_width; output_shape[3] = output_channels; @@ -215,7 +217,8 @@ void CalcOutputSize(const index_t *input_shape, // NHWC const int *strides, const RoundType round_type, index_t *output_shape) { - CalcOutputSize(input_shape, NHWC, filter_shape, OIHW, padding_size, dilations, + CalcOutputSize(input_shape, DataFormat::NHWC, filter_shape, + DataFormat::OIHW, padding_size, dilations, strides, round_type, output_shape); } @@ -226,7 +229,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW const int *strides, const RoundType round_type, index_t *output_shape) { - CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations, + CalcOutputSize(input_shape, DataFormat::NCHW, filter_shape, + DataFormat::OIHW, padding_size, dilations, strides, round_type, output_shape); } @@ -241,14 +245,18 @@ void CalcDeconvShape_TF(const std::vector &input_shape, std::vector *padded_out_shape, DataFormat data_format) { const index_t - in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + in_height = + data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1]; const index_t - in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + in_width = + data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2]; const index_t - out_height = data_format == NCHW ? 
output_shape[2] : output_shape[1]; + out_height = + data_format == DataFormat::NCHW ? output_shape[2] : output_shape[1]; const index_t - out_width = data_format == NCHW ? output_shape[3] : output_shape[2]; + out_width = + data_format == DataFormat::NCHW ? output_shape[3] : output_shape[2]; const index_t extended_in_height = (in_height - 1) * strides[0] + 1; const index_t extended_in_width = (in_width - 1) * strides[1] + 1; @@ -307,11 +315,11 @@ void CalcDeconvShape_TF(const std::vector &input_shape, padded_out_shape->resize(4); (*padded_out_shape)[0] = output_shape[0]; (*padded_out_shape)[1] = - data_format == NCHW ? output_channel : padded_out_height; + data_format == DataFormat::NCHW ? output_channel : padded_out_height; (*padded_out_shape)[2] = - data_format == NCHW ? padded_out_height : padded_out_width; + data_format == DataFormat::NCHW ? padded_out_height : padded_out_width; (*padded_out_shape)[3] = - data_format == NCHW ? padded_out_width : output_channel; + data_format == DataFormat::NCHW ? padded_out_width : output_channel; } } @@ -325,9 +333,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, std::vector *padded_out_shape, DataFormat data_format) { const index_t - in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + in_height = + data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1]; const index_t - in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + in_width = + data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2]; const index_t output_channel = filter_shape[0] * group; @@ -351,11 +361,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, padded_out_shape->resize(4); (*padded_out_shape)[0] = input_shape[0]; (*padded_out_shape)[1] = - data_format == NCHW ? output_channel : padded_out_height; + data_format == DataFormat::NCHW ? output_channel : padded_out_height; (*padded_out_shape)[2] = - data_format == NCHW ? 
padded_out_height : padded_out_width; + data_format == DataFormat::NCHW ? padded_out_height : padded_out_width; (*padded_out_shape)[3] = - data_format == NCHW ? padded_out_width : output_channel; + data_format == DataFormat::NCHW ? padded_out_width : output_channel; } if (out_shape != nullptr) { @@ -363,9 +373,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, index_t out_width = padded_out_width - out_pad_size[1]; out_shape->resize(4); (*out_shape)[0] = input_shape[0]; - (*out_shape)[1] = data_format == NCHW ? output_channel : out_height; - (*out_shape)[2] = data_format == NCHW ? out_height : out_width; - (*out_shape)[3] = data_format == NCHW ? out_width : output_channel; + (*out_shape)[1] = + data_format == DataFormat::NCHW ? output_channel : out_height; + (*out_shape)[2] = data_format == DataFormat::NCHW ? out_height : out_width; + (*out_shape)[3] = + data_format == DataFormat::NCHW ? out_width : output_channel; } } @@ -385,7 +397,7 @@ void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, MACE_CHECK(output_shape->size() == 4, "deconv output shape shoud be 4-dims"); std::vector &out_shape = *output_shape; - if (data_format == NCHW) { + if (data_format == DataFormat::NCHW) { const index_t t = out_shape[1]; out_shape[1] = out_shape[3]; out_shape[3] = out_shape[2]; diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index d2bb5713..518e9cc2 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -199,7 +199,7 @@ class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -243,9 +243,11 @@ void RegisterConcat(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + 
return { DeviceType::CPU, DeviceType::GPU }; + } auto tensor_shape_info = context->tensor_shape_info(); - if (op->output_shape_size() != op->output_size() || - op->output_shape(0).dims_size() != 4) { + if (op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } else { int has_data_format = diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 80e8fe78..cc84b963 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -231,9 +231,9 @@ class Conv2dOp : public ConvPool2dOpBase { std::vector paddings(2); if (paddings_.empty()) { CalcPaddingAndOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, filter->shape().data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -242,9 +242,9 @@ class Conv2dOp : public ConvPool2dOpBase { } else { paddings = paddings_; CalcOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, filter->shape().data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -459,7 +459,7 @@ class Conv2dOp : public ConvPool2dOpBase { "leakyrelu_coefficient", 0.0f)), wino_block_size_(Operation::GetOptionalArg("wino_block_size", 0)) { MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 7fb85478..42929057 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -47,8 +47,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -60,8 +60,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { 
.Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -105,8 +105,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { const std::vector output_shape = {1, 3, 3, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -118,8 +118,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -189,8 +189,8 @@ void TestNHWCSimple3x3WithoutBias() { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -203,8 +203,8 @@ void TestNHWCSimple3x3WithoutBias() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -256,8 +256,8 @@ void TestNHWCCombined3x3() { net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); 
OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -270,8 +270,8 @@ void TestNHWCCombined3x3() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -321,8 +321,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -336,8 +336,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -376,8 +376,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -391,8 +391,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -459,8 +459,8 @@ void TestConv1x1() { net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, 
true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -472,8 +472,8 @@ void TestConv1x1() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -532,8 +532,8 @@ void TestComplexConvNxNS12(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true, false); net.AddRandomInput("Bias", {output_channels}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") @@ -552,8 +552,8 @@ void TestComplexConvNxNS12(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -651,8 +651,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, float_bias_data, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -667,8 +667,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -811,8 +811,8 @@ void TestDilationConvNxN(const 
std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") @@ -828,8 +828,8 @@ void TestDilationConvNxN(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -900,8 +900,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -916,8 +916,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -979,8 +979,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -994,8 +994,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, // run on cpu net.RunOp(); - 
net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -1118,12 +1118,12 @@ void TestQuant(const index_t batch, net.AddRandomInput("Filter", {out_channels, k_height, k_width, in_channels}, true); net.AddRandomInput("Bias", {out_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.TransformFilterDataFormat("Filter", - OHWI, + DataFormat::OHWI, "FilterOIHW", - OIHW); + DataFormat::OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -1136,8 +1136,8 @@ void TestQuant(const index_t batch, .AddIntArg("T", static_cast(DT_FLOAT)) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeFilter") .Input("Filter") diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 9cb836ee..20146c8d 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -117,7 +117,7 @@ class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( Operation::GetRepeatedArgs("offset")); } else { @@ -151,11 +151,13 @@ void RegisterCrop(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != op->output_size()) || op->output_shape(0).dims_size() != 4) { 
return { DeviceType::CPU }; } diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index 213b8ce8..0fd0026b 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -42,13 +42,13 @@ void RunCrop(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); } else if (D == CPU) { net.TransformDataFormat("Input0", - NHWC, + DataFormat::NHWC, "InputNCHW0", - NCHW); + DataFormat::NCHW); net.TransformDataFormat("Input1", - NHWC, + DataFormat::NHWC, "InputNCHW1", - NCHW); + DataFormat::NCHW); OpDefBuilder("Crop", "CropTest") .Input("InputNCHW0") .Input("InputNCHW1") @@ -62,8 +62,8 @@ void RunCrop(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check auto expected = net.CreateTensor(expected_shape, expected_data); diff --git a/mace/ops/cumsum_test.cc b/mace/ops/cumsum_test.cc index 8b111540..69e62965 100644 --- a/mace/ops/cumsum_test.cc +++ b/mace/ops/cumsum_test.cc @@ -32,8 +32,8 @@ void SimpleTestWithDataFormat(const std::vector &shape, OpsTestNet net; net.AddInputFromArray("Input", shape, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Cumsum", "CumsumTest") .Input("InputNCHW") @@ -48,8 +48,8 @@ void SimpleTestWithDataFormat(const std::vector &shape, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 3ac54186..2b7623e6 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -173,7 +173,7 @@ class Deconv2dOp : public Deconv2dOpBase { explicit 
Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -240,7 +240,7 @@ class Deconv2dOp : public Deconv2dOpBase { &out_paddings, nullptr, model_type_, - NHWC); + DataFormat::NHWC); return kernel_->Compute(context, input, filter, bias, strides_.data(), in_paddings.data(), activation_, @@ -276,7 +276,7 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_NOT_IMPLEMENTED; } FrameworkType framework_type = - static_cast( + static_cast( ProtoArgHelper::GetOptionalArg( *(context->operator_def()), "framework_type", FrameworkType::TENSORFLOW)); diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 25aa7eee..9ea8161e 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -47,7 +47,8 @@ void RunTestSimple(const std::vector &input_shape, net.AddInputFromArray("Filter", filter_shape, filter_data, true); net.AddInputFromArray("Bias", {out_channels}, bias_data, true); // TODO(liutuo): remove the unused transform - net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.TransformFilterDataFormat( + "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW); if (D == DeviceType::GPU) { if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") @@ -77,8 +78,8 @@ void RunTestSimple(const std::vector &input_shape, } net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") @@ -109,8 +110,8 @@ void RunTestSimple(const std::vector &input_shape, // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + 
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(expected_shape, expected_data); @@ -380,8 +381,8 @@ void TestComplexDeconvNxN(const int batch, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true, false); net.AddRandomInput("Bias", {output_channels}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); int out_h = 0; int out_w = 0; @@ -440,8 +441,8 @@ void TestComplexDeconvNxN(const int batch, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 09208e7a..a57ddecf 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -96,7 +96,7 @@ class DepthToSpaceOp : public Operation { explicit DepthToSpaceOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 2719619f..65fb7d39 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -32,8 +32,8 @@ void RunDepthToSpace(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); // Construct graph if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -41,8 +41,8 @@ void 
RunDepthToSpace(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("DepthToSpace", "DepthToSpaceTest") @@ -114,8 +114,8 @@ void RandomTest(const int block_size, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) @@ -125,8 +125,8 @@ void RandomTest(const int block_size, // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("Input") diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 7d389766..ae2a4dfd 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -188,9 +188,9 @@ class DepthwiseConv2dOp filter->dim(2) * filter->dim(3), filter->dim(0), filter->dim(1), 1}; if (paddings_.empty()) { CalcPaddingAndOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, ohwi_shape.data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -199,9 +199,9 @@ class DepthwiseConv2dOp } else { paddings = paddings_; CalcOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, ohwi_shape.data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -375,7 +375,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == 
MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { @@ -459,6 +459,18 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { context->set_output_mem_type(mem_type); })); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("DepthwiseConv2d") + .SetInputsDataFormatSelector( + [](OpConditionContext *context) -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; + })); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 58852a01..d34722a5 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -39,8 +39,8 @@ void SimpleValidTest() { true); net.AddInputFromArray("Bias", {2}, {.1f, .2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -52,8 +52,8 @@ void SimpleValidTest() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") @@ -127,8 +127,8 @@ void ComplexValidTest(index_t batch, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -141,8 +141,8 @@ void ComplexValidTest(index_t batch, 
.Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") @@ -249,8 +249,8 @@ void TestNxNS12(const index_t height, const index_t width) { {multiplier * channel}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -267,8 +267,8 @@ void TestNxNS12(const index_t height, const index_t width) { // Run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -389,9 +389,9 @@ void TestQuant(const index_t batch, "Filter", {k_height, k_width, in_channels, multiplier}, true, false); net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.TransformFilterDataFormat( - "Filter", HWIO, "FilterOIHW", OIHW); + "Filter", DataFormat::HWIO, "FilterOIHW", DataFormat::OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") @@ -405,7 +405,7 @@ void TestQuant(const index_t batch, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeFilter") .Input("Filter") diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 6111ea30..31b634af 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -190,7 +190,7 @@ class 
DepthwiseDeconv2dOp : public Deconv2dOpBase { explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -230,7 +230,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { &out_paddings, nullptr, CAFFE, - NHWC); + DataFormat::NHWC); return kernel_->Compute(context, input, diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index 0cf3de95..fda0cf59 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -39,7 +39,8 @@ void RunTestSimple(const int group, // Add input data net.AddInputFromArray("Input", input_shape, input_data); net.AddInputFromArray("Filter", filter_shape, filter_data, true); - net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.TransformFilterDataFormat( + "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW); const index_t out_channels = expected_shape[3]; net.AddInputFromArray("Bias", {out_channels}, bias_data, true); @@ -56,8 +57,8 @@ void RunTestSimple(const int group, net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, - "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .Input("InputNCHW") .Input("FilterOIHW") @@ -69,8 +70,8 @@ void RunTestSimple(const int group, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(expected_shape, expected_data); @@ -193,8 +194,8 @@ void RandomTest(index_t batch, {channel * multiplier}, bias_data, true, false); - net.TransformDataFormat("Input", 
NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .Input("InputNCHW") .Input("Filter") @@ -210,8 +211,8 @@ void RandomTest(index_t batch, .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 04c0e10e..bfe00742 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1145,7 +1145,7 @@ class EltwiseOp : public Operation { int32_t scalar_input_index = Operation::GetOptionalArg( "scalar_input_index", 1); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( type, coeff, scalar_input, scalar_input_index); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 58306b62..08dc11d0 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -69,7 +69,8 @@ void SimpleTensorScalar(const ops::EltwiseType type, net.AddInputFromArray("Input", shape, input); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "TInput", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput") .AddIntArg("T", DataTypeToEnum::v()) @@ -81,7 +82,8 @@ void SimpleTensorScalar(const ops::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input") @@ -124,13 +126,15 @@ void SimpleTensorEltwise(const ops::EltwiseType type, 
.OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput"); if (shape0.size() > 1) { - net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); op_builder.Input("TInput0"); } else { op_builder.Input("Input0"); } if (shape1.size() > 1) { - net.TransformDataFormat("Input1", NHWC, "TInput1", NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); op_builder.Input("TInput1"); } else { op_builder.Input("Input1"); @@ -139,7 +143,8 @@ void SimpleTensorEltwise(const ops::EltwiseType type, // Run net.RunOp(D); - net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input0") @@ -560,7 +565,8 @@ void GPUOverflowTest(const ops::EltwiseType type, net.AddInputFromArray( "Filter", {output_shape.back(), shape0.back(), 3, 3}, - std::vector(output_shape.back() * shape0.back() * 9, 1)); + std::vector(output_shape.back() * shape0.back() * 9, 1), + true); OpDefBuilder("Conv2D", "Conv2D") .AddIntArg("T", DataTypeToEnum::v()) .Input("EltOutput") @@ -636,8 +642,8 @@ void RandomTensorScalar(const ops::EltwiseType type, // Add input data net.AddRandomInput("Input", shape, false, true, true); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput") .AddIntArg("type", static_cast(type)) @@ -647,8 +653,8 @@ void RandomTensorScalar(const ops::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -690,10 
+696,10 @@ void RandomTensorEltwise(const ops::EltwiseType type, true, true); - net.TransformDataFormat("Input0", NHWC, "TInput0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "TInput1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") @@ -705,8 +711,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -746,10 +752,10 @@ void Quantized(const std::vector &shape, true, true); - net.TransformDataFormat("Input0", NHWC, "TInput0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "TInput1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") @@ -761,8 +767,8 @@ void Quantized(const std::vector &shape, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput0") .Input("Input0") diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 5be44e05..fb0c45bb 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -49,7 +49,8 @@ void Simple() { net.AddInputFromArray("Offset", {1}, offset, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); 
OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") @@ -58,7 +59,8 @@ void Simple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("Input") @@ -100,8 +102,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -113,8 +115,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -151,8 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -164,8 +166,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -205,8 +207,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - 
net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -218,8 +220,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -254,11 +256,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -270,8 +272,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 64765d9c..9a371b16 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -190,7 +190,7 @@ class FullyConnectedOp : public FullyConnectedOpBase { explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { diff --git a/mace/ops/fully_connected_test.cc 
b/mace/ops/fully_connected_test.cc index 64fead6e..586eb166 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -48,7 +48,8 @@ void Simple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") @@ -129,8 +130,8 @@ void Random(const index_t batch, net.AddRandomInput("Bias", {out_channel}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") .Input("Weight") @@ -143,7 +144,8 @@ void Random(const index_t batch, // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -215,8 +217,10 @@ void QuantRandom(const index_t batch, net.AddRandomInput( "Weight", {out_channel, height, width, channels}, true); net.AddRandomInput("Bias", {out_channel}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformFilterDataFormat("Weight", OHWI, "WeightOIHW", OIHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + net.TransformFilterDataFormat( + "Weight", DataFormat::OHWI, "WeightOIHW", DataFormat::OIHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") @@ -226,7 +230,8 @@ void QuantRandom(const index_t batch, .AddIntArg("T", DT_FLOAT) .Finalize(net.NewOperatorDef()); net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); 
OpDefBuilder("Quantize", "QuantizeWeight") .Input("Weight") diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index e3597006..9a2d2cdf 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -29,7 +29,8 @@ void Simple() { {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest") .Input("InputNCHW") @@ -41,7 +42,8 @@ void Simple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index 82ed9053..d43dbf6b 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -36,7 +36,7 @@ class LSTMCellOp : public Operation { Operation::GetOptionalArg("scalar_input", 0.0)); MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(forget_bias); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index d5dca3d7..d2ef5058 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -47,7 +47,6 @@ class OpenCLBufferTransformer { const OpenCLBufferType type, const MemoryType out_mem_type, const int wino_blk_size, - DataFormat data_format, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -66,7 +65,6 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform CPU Buffer " << input->name() << " to GPU Buffer " << internal_tensor->name() << " with data type " << dt; - 
MACE_CHECK(data_format == DataFormat::NHWC); internal_tensor->Resize(input->shape()); const uint8_t *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); @@ -88,7 +86,6 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() << " to CPU Buffer " << output->name() << " with data type " << dt; - MACE_CHECK(data_format == DataFormat::NHWC); Tensor::MappingGuard guard(&internal_tensor); const T *internal_ptr = internal_tensor.data(); output->Resize(internal_tensor.shape()); @@ -135,7 +132,7 @@ MaceStatus TransformFilter( input->MarkUnused(); return OpenCLBufferTransformer(input->memory_type(), mem_type). Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, - DataFormat::DF_NONE, output); + output); } } // namespace ops diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index ab61e8c6..aa98275c 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -15,6 +15,7 @@ #include "mace/ops/ops_test_util.h" #include "mace/core/memory_optimizer.h" #include "mace/utils/memory.h" +#include "mace/core/net_def_adapter.h" namespace mace { namespace ops { @@ -164,26 +165,27 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() { bool OpsTestNet::Setup(mace::DeviceType device) { NetDef net_def; for (auto &op_def : op_defs_) { - net_def.add_op()->CopyFrom(op_def); - + auto target_op = net_def.add_op(); + target_op->CopyFrom(op_def); + + auto has_data_format = ProtoArgHelper::GetOptionalArg( + op_def, "has_data_format", 0); + auto is_quantized_op = ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DT_FLOAT)) + == static_cast(DT_UINT8); for (auto input : op_def.input()) { if (ws_.GetTensor(input) != nullptr && !ws_.GetTensor(input)->is_weight()) { auto input_info = net_def.add_input_info(); input_info->set_name(input); - auto has_data_format = ProtoArgHelper::GetOptionalArg( - op_def, "has_data_format", 1); - auto is_quantized_op = ProtoArgHelper::GetOptionalArg( - op_def, 
"T", static_cast(DT_FLOAT)) - == static_cast(DT_UINT8); if (has_data_format) { if (is_quantized_op || device == DeviceType::GPU) { - input_info->set_data_format(NHWC); + input_info->set_data_format(static_cast(DataFormat::NHWC)); } else { - input_info->set_data_format(NCHW); + input_info->set_data_format(static_cast(DataFormat::NCHW)); } } else { - input_info->set_data_format(DataFormat::DF_NONE); + input_info->set_data_format(static_cast(DataFormat::NONE)); } auto &shape = ws_.GetTensor(input)->shape(); for (auto d : shape) { @@ -191,6 +193,10 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } + if (has_data_format) { + SetProtoArg(target_op, "data_format", + static_cast(DataFormat::AUTO)); + } } if (!op_defs_.empty()) { auto op_def = op_defs_.back(); @@ -205,15 +211,21 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), &ws_); + net_def_adapter.AdaptNetDef(&net_def, + OpTestContext::Get()->GetDevice(device), + &adapted_net_def); + MemoryOptimizer mem_optimizer; net_ = make_unique( op_registry_.get(), - &net_def, + &adapted_net_def, &ws_, OpTestContext::Get()->GetDevice(device), &mem_optimizer); MaceStatus status = (ws_.PreallocateOutputTensor( - net_def, + adapted_net_def, &mem_optimizer, OpTestContext::Get()->GetDevice(device))); if (status != MaceStatus::MACE_SUCCESS) return false; @@ -252,15 +264,20 @@ MaceStatus OpsTestNet::RunOp() { MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, const mace::DeviceType device) { device_type_ = device; + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), &ws_); + net_def_adapter.AdaptNetDef(&net_def, + OpTestContext::Get()->GetDevice(device), + &adapted_net_def); MemoryOptimizer mem_optimizer; net_ = make_unique( op_registry_.get(), - &net_def, + &adapted_net_def, &ws_, OpTestContext::Get()->GetDevice(device), &mem_optimizer); MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( - net_def, + 
adapted_net_def, &mem_optimizer, OpTestContext::Get()->GetDevice(device))); MACE_RETURN_IF_ERROR(net_->Init()); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index e9ef4d90..8d94f51f 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -216,7 +216,7 @@ class OpsTestNet { const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); - if (src_format == NHWC && dst_format == NCHW) { + if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) { index_t batch = input_shape[0]; index_t height = input_shape[1]; index_t width = input_shape[2]; @@ -236,7 +236,8 @@ class OpsTestNet { } } } - } else if (src_format == NCHW && dst_format == NHWC) { + } else if (src_format == DataFormat::NCHW && + dst_format == DataFormat::NHWC) { index_t batch = input_shape[0]; index_t channels = input_shape[1]; index_t height = input_shape[2]; @@ -274,7 +275,7 @@ class OpsTestNet { input->is_weight()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); - if (src_format == HWOI && dst_format == OIHW) { + if (src_format == DataFormat::HWOI && dst_format == DataFormat::OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t out_channels = input_shape[2]; @@ -292,7 +293,8 @@ class OpsTestNet { input_data[j * out_channels * in_channels + i]; } } - } else if (src_format == OIHW && dst_format == HWOI) { + } else if (src_format == DataFormat::OIHW && + dst_format == DataFormat::HWOI) { index_t out_channels = input_shape[0]; index_t in_channels = input_shape[1]; index_t height = input_shape[2]; @@ -310,7 +312,8 @@ class OpsTestNet { input_data[j * height * width + i]; } } - } else if (src_format == HWIO && dst_format == OIHW) { + } else if (src_format == DataFormat::HWIO && + dst_format == DataFormat::OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t in_channels = input_shape[2]; @@ -330,7 +333,8 @@ 
class OpsTestNet { } } } - } else if (src_format == OHWI && dst_format == OIHW) { + } else if (src_format == DataFormat::OHWI && + dst_format == DataFormat::OIHW) { index_t out_channels = input_shape[0]; index_t height = input_shape[1]; index_t width = input_shape[2]; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index e0a94f4a..24130d7a 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -179,7 +179,7 @@ class PadOp : public Operation { std::vector paddings = Operation::GetRepeatedArgs("paddings"); float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( type, paddings, constant_value); } else { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index e68e8eb8..97730559 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -45,8 +45,8 @@ void SimpleConstant() { // Run net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -58,8 +58,8 @@ void SimpleConstant() { // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto output = net.GetTensor("Output"); @@ -93,7 +93,8 @@ void Result(const std::vector &input_shape, if (D == DeviceType::CPU) { t_input = "TInput"; t_output = "TOutput"; - net.TransformDataFormat(input, NHWC, t_input, NCHW); + net.TransformDataFormat( + input, DataFormat::NHWC, t_input, DataFormat::NCHW); } OpDefBuilder("Pad", "PadTest") @@ -108,7 +109,8 @@ void Result(const std::vector &input_shape, net.RunOp(D); if (D == DeviceType::CPU) { - net.TransformDataFormat(t_output, NCHW, output, NHWC); + net.TransformDataFormat( + t_output, DataFormat::NCHW, output, DataFormat::NHWC); } 
auto actual = net.GetTensor(output.c_str()); @@ -172,8 +174,8 @@ TEST_F(PadTest, ComplexCPU) { // Add input data net.AddRepeatedInput("Input", {1, 1, 1, 2}, 2); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -184,8 +186,8 @@ TEST_F(PadTest, ComplexCPU) { // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto output = net.GetTensor("Output"); @@ -209,8 +211,8 @@ void Complex(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -222,8 +224,8 @@ void Complex(const std::vector &input_shape, // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 21d02e14..ce726dcb 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -270,9 +270,9 @@ class PoolingOp : public PoolingOpBase { std::vector paddings(2); if (paddings_.empty()) { CalcPaddingAndOutputSize(input_tensor->shape().data(), - NHWC, + DataFormat::NHWC, filter_shape.data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -281,9 +281,9 @@ class PoolingOp : public PoolingOpBase { } else { paddings = paddings_; CalcOutputSize(input_tensor->shape().data(), - NHWC, + DataFormat::NHWC, filter_shape.data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -477,7 +477,7 
@@ class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) : PoolingOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { kernel_ = make_unique>(); diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 104b67bc..037cf8cf 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -34,8 +34,8 @@ TEST_F(PoolingOpTest, MAX_VALID) { {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -50,8 +50,8 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = @@ -68,8 +68,8 @@ TEST_F(PoolingOpTest, MAX_SAME) { net.AddInputFromArray("Input", {1, 3, 3, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -84,8 +84,8 @@ TEST_F(PoolingOpTest, MAX_SAME) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); @@ -102,8 +102,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", 
DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -118,8 +118,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); @@ -136,8 +136,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -152,8 +152,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); @@ -174,8 +174,8 @@ void SimpleMaxPooling3S2() { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Run OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -187,8 +187,8 @@ void SimpleMaxPooling3S2() { .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") @@ -224,8 +224,8 @@ void MaxPooling3S2(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - 
NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -240,8 +240,8 @@ void MaxPooling3S2(const std::vector &input_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -304,8 +304,8 @@ TEST_F(PoolingOpTest, AVG_VALID) { {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -320,8 +320,8 @@ TEST_F(PoolingOpTest, AVG_VALID) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor( @@ -373,8 +373,8 @@ void AvgPoolingTest(const std::vector &shape, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -389,8 +389,8 @@ void AvgPoolingTest(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -563,7 +563,7 @@ void TestQuant(const index_t batch, net.AddRandomInput( "Input", input_shape, false, false); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", 
DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddRandomInput( "OutputNCHW", input_shape, false, true, true); @@ -580,7 +580,7 @@ void TestQuant(const index_t batch, net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 86964ed9..27b34a91 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -873,7 +873,7 @@ class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(reduce_type_, axis_, keep_dims_); @@ -914,6 +914,9 @@ void RegisterReduce(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } bool keep_dims = ProtoArgHelper::GetOptionalArg( *op, "keepdims", false); @@ -923,7 +926,7 @@ void RegisterReduce(OpRegistryBase *op_registry) { auto axis = ProtoArgHelper::GetRepeatedArgs( *op, "axis"); - if (axis.size() != 2 || axis[0] != 1 || axis[1] == 2) { + if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { return { DeviceType::CPU }; } auto tensor_shape_info = context->tensor_shape_info(); diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc index ccf38fea..21a2dc13 100644 --- a/mace/ops/reduce_test.cc +++ b/mace/ops/reduce_test.cc @@ -38,7 +38,8 @@ void Simple(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Reduce", "ReduceTest") 
.Input("InputNCHW") .AddIntsArg("axis", axis) @@ -49,7 +50,8 @@ void Simple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Reduce", "ReduceTest") .Input("Input") @@ -289,8 +291,8 @@ void RandomTest(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Reduce", "ReduceTest") .Input("InputNCHW") .AddIntsArg("axis", axis) @@ -301,8 +303,8 @@ void RandomTest(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Reduce", "ReduceTest") .Input("Input") .AddIntsArg("axis", axis) @@ -353,7 +355,7 @@ void TestQuant(const std::vector &input_shape, net.AddRandomInput( "Input", input_shape, false, false); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddRandomInput( "OutputNCHW", input_shape, false, true, true); @@ -368,7 +370,7 @@ void TestQuant(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc index 6044af3b..d06c6634 100644 --- a/mace/ops/ref/deconv_2d.cc +++ b/mace/ops/ref/deconv_2d.cc @@ -51,7 +51,7 @@ MaceStatus Deconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - 
NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc index 0da81faa..63b3aa69 100644 --- a/mace/ops/ref/depthwise_deconv_2d.cc +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -50,7 +50,7 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); @@ -185,7 +185,7 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index f06692b9..349f6423 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -212,7 +212,7 @@ class ResizeBicubicOp : public Operation { std::vector size = Operation::GetRepeatedArgs( "size", {-1, -1}); MACE_CHECK(size.size() == 2); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners, size[0], size[1]); } else { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 035ddfcf..e9c5e4d1 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -31,8 +31,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -42,8 +42,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + 
net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -60,8 +60,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { std::vector input(48); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 4, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -71,8 +71,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 3, 3}, @@ -92,8 +92,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -104,8 +104,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -133,8 +133,8 @@ void TestRandomResizeBicubic() { net.AddRandomInput("Input", {batch, in_height, in_width, channels}, false, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") 
.Input("InputNCHW") @@ -144,8 +144,8 @@ void TestRandomResizeBicubic() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); Tensor expected; expected.Copy(*net.GetOutput("Output")); diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 1fe13f42..09df62d8 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -346,7 +346,7 @@ class ResizeBilinearOp : public Operation { std::vector size = Operation::GetRepeatedArgs( "size", {-1, -1}); MACE_CHECK(size.size() == 2); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners, size[0], size[1]); } else { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 9252e81f..c9c86427 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -31,8 +31,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -42,8 +42,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -60,8 +60,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 
4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -72,8 +72,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -100,8 +100,8 @@ void TestRandomResizeBilinear() { // Add input data net.AddRandomInput("Input", {batch, in_height, in_width, channels}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -111,8 +111,8 @@ void TestRandomResizeBilinear() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -155,8 +155,8 @@ void TestQuantizedResizeBilinear() { true, -1.f, 1.f); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -166,8 +166,8 @@ void TestQuantizedResizeBilinear() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") diff --git a/mace/ops/resize_nearest_neighbor.cc 
b/mace/ops/resize_nearest_neighbor.cc index 8840458f..9e98e75e 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -149,7 +149,7 @@ class ResizeNearestNeighborOp : public Operation { : Operation(context) { bool align_corners = Operation::GetOptionalArg( "align_corners", false); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners); } else { diff --git a/mace/ops/resize_nearest_neighbor_test.cc b/mace/ops/resize_nearest_neighbor_test.cc index b9500472..842c44c6 100644 --- a/mace/ops/resize_nearest_neighbor_test.cc +++ b/mace/ops/resize_nearest_neighbor_test.cc @@ -32,8 +32,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) { std::iota(begin(input), end(input), 0); std::vector size = {1, 2}; net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") @@ -45,8 +45,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -64,8 +64,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) { std::iota(begin(input), end(input), 0); std::vector size = {1, 2}; net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", 
"ResizeNearestNeighborTest") @@ -78,8 +78,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -105,8 +105,8 @@ void TestRandomResizeNearestNeighbor() { std::vector size = {20, 40}; net.AddRandomInput("Input", {batch, in_height, in_width, channels}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") .Input("InputNCHW") @@ -116,8 +116,8 @@ void TestRandomResizeNearestNeighbor() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index d5fcbc02..e3241098 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -414,7 +414,7 @@ class SoftmaxOp : public Operation { : Operation(context) { bool use_log = ( Operation::GetOptionalArg("use_log", false)); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(use_log); } else { kernel_ = make_unique>(use_log); diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index ab818ac8..eb3398db 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -50,7 +50,8 @@ void Simple(bool use_log = false) { if (D == DeviceType::CPU) { // test 4d softmax - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + 
net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -59,7 +60,8 @@ void Simple(bool use_log = false) { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -109,7 +111,8 @@ void Complex(const std::vector &logits_shape, net.AddRandomInput("Input", logits_shape); if (logits_shape.size() == 4) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") @@ -127,7 +130,8 @@ void Complex(const std::vector &logits_shape, net.RunOp(); if (logits_shape.size() == 4) { - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(); diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index b239193c..50de3fc7 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -307,7 +307,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 95b9fafc..045d6ece 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -39,8 +39,8 @@ void RunSpaceToBatch(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { - 
net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -53,8 +53,8 @@ void RunSpaceToBatch(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check ExpectTensorNear(*expected, *net.GetOutput("Output")); @@ -78,8 +78,8 @@ void RunBatchToSpace(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -92,8 +92,8 @@ void RunBatchToSpace(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check ExpectTensorNear(*expected, *net.GetOutput("Output")); @@ -155,8 +155,8 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, net.RunOp(GPU); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -164,8 +164,8 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", 
DataFormat::NHWC); // Check ExpectTensorNear(*net.GetOutput("OutputCPU"), @@ -188,8 +188,8 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, net.RunOp(GPU); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -197,8 +197,8 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // Check ExpectTensorNear(*net.GetOutput("OutputCPU"), @@ -218,8 +218,8 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, 1.f); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -227,8 +227,8 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") @@ -279,8 +279,8 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, 1.f); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -288,8 +288,8 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, 
.AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 918ae678..9584ddb8 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -94,7 +94,7 @@ class SpaceToDepthOp : public Operation { explicit SpaceToDepthOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index 23daaa55..6d023b88 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -32,8 +32,8 @@ void RunSpaceToDepth(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); // Construct graph if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -41,8 +41,8 @@ void RunSpaceToDepth(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") @@ -107,8 +107,8 @@ void RandomTest(const int block_size, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + 
net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) @@ -118,8 +118,8 @@ void RandomTest(const int block_size, // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("Input") diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 6b646270..b08d72c5 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -106,7 +106,7 @@ class SplitOp : public Operation { explicit SplitOp(OpConstructContext *context) : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(axis); } else { MACE_NOT_IMPLEMENTED; @@ -147,7 +147,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index d58191c4..cd2fb174 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -83,7 +83,7 @@ class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/sqrdiff_mean_test.cc b/mace/ops/sqrdiff_mean_test.cc index 34257479..3257987c 100644 --- a/mace/ops/sqrdiff_mean_test.cc +++ b/mace/ops/sqrdiff_mean_test.cc @@ -36,13 +36,13 @@ 
void Simple(const std::vector &input_shape0, net.AddInputFromArray("Input1", input_shape1, input1); net.TransformDataFormat("Input0", - NHWC, + DataFormat::NHWC, "InputNCHW0", - NCHW); + DataFormat::NCHW); net.TransformDataFormat("Input1", - NHWC, + DataFormat::NHWC, "InputNCHW1", - NCHW); + DataFormat::NCHW); if (D == DeviceType::CPU) { OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") @@ -54,9 +54,9 @@ void Simple(const std::vector &input_shape0, net.RunOp(D); net.TransformDataFormat("OutputNCHW", - NCHW, + DataFormat::NCHW, "Output", - NHWC); + DataFormat::NHWC); } else { OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("Input0") @@ -107,10 +107,10 @@ void RandomTest(const std::vector &input_shape0, net.AddRandomInput("Input0", input_shape0); net.AddRandomInput("Input1", input_shape1); - net.TransformDataFormat("Input0", NHWC, "InputNCHW0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "InputNCHW1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "InputNCHW0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "InputNCHW1", DataFormat::NCHW); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("InputNCHW0") .Input("InputNCHW1") @@ -118,8 +118,8 @@ void RandomTest(const std::vector &input_shape0, .Finalize(net.NewOperatorDef()); // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("Input0") .Input("Input1") diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index 8b085fe5..f8dd06f5 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -86,8 +86,8 @@ void TestStridedSliceWithDataFormat(const std::vector &input_shape, net.AddInputFromArray( "Strides", {static_cast(strides.size())}, strides); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + 
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("StridedSlice", "StridedSliceOpTest") .Input("InputNCHW") @@ -105,8 +105,8 @@ void TestStridedSliceWithDataFormat(const std::vector &input_shape, net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", output_shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), *net.GetOutput("Output")); @@ -154,8 +154,8 @@ void TestSliceWithDataFormat(const std::vector &input_shape, net.AddInputFromArray( "IndicesSize", {static_cast(indices_size.size())}, indices_size); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("StridedSlice", "StridedSliceOpTest") .Input("InputNCHW") @@ -168,8 +168,8 @@ void TestSliceWithDataFormat(const std::vector &input_shape, net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", output_shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), *net.GetOutput("Output")); diff --git a/mace/public/mace.h b/mace/public/mace.h index dd559249..72e96d1e 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -34,10 +34,10 @@ class NetDef; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 }; -enum DataFormat { - DF_NONE = 0, NHWC = 1, NCHW = 2, +enum class DataFormat { + NONE = 0, NHWC = 1, NCHW = 2, HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103, - DF_AUTO = 1000, + AUTO = 1000, }; enum GPUPerfHint { diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 446321a4..58658dd8 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -41,7 +41,7 @@ device_type_map = {'cpu': 
cvt.DeviceType.CPU.value, 'cpu+gpu': cvt.DeviceType.CPU.value} data_format_map = { - 'NONE': cvt.DataFormat.DF_NONE, + 'NONE': cvt.DataFormat.NONE, 'NHWC': cvt.DataFormat.NHWC, 'NCHW': cvt.DataFormat.NCHW, 'OIHW': cvt.DataFormat.OIHW, diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 8162f008..61e65bae 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -26,14 +26,14 @@ class DeviceType(Enum): class DataFormat(Enum): - DF_NONE = 0 + NONE = 0 NHWC = 1 NCHW = 2 HWIO = 100 OIHW = 101 HWOI = 102 OHWI = 103 - DF_AUTO = 1000 + AUTO = 1000 # SAME_LOWER: if the amount of paddings to be added is odd, @@ -598,8 +598,8 @@ class ConverterUtil(object): return DataFormat.NHWC elif arg.i == DataFormat.NCHW.value: return DataFormat.NCHW - elif arg.i == DataFormat.DF_AUTO.value: - return DataFormat.DF_AUTO + elif arg.i == DataFormat.AUTO.value: + return DataFormat.AUTO else: return None diff --git a/mace/python/tools/converter_tool/onnx_converter.py b/mace/python/tools/converter_tool/onnx_converter.py index 8974489c..70e855d5 100644 --- a/mace/python/tools/converter_tool/onnx_converter.py +++ b/mace/python/tools/converter_tool/onnx_converter.py @@ -387,7 +387,8 @@ class OnnxConverter(base_converter.ConverterInterface): self._mace_net_def = mace_pb2.NetDef() self._data_format = DataFormat.NCHW ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) - ConverterUtil.add_data_format_arg(self._mace_net_def, self._data_format) + ConverterUtil.add_data_format_arg(self._mace_net_def, + self._data_format) onnx_model = onnx.load(src_model_file) ir_version = onnx_model.ir_version @@ -403,7 +404,7 @@ class OnnxConverter(base_converter.ConverterInterface): print("constains ops domain: ", domain, "version:", version) if 'kaldi2onnx' in domain: polish_available = False - self._data_format = DataFormat.DF_NONE + self._data_format = 
DataFormat.NONE self._isKaldi = True if polish_available: onnx_model = onnx.utils.polish_model(onnx_model) diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 65c456c9..51806961 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -27,7 +27,7 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp -from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps +from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps # noqa from mace.python.tools.converter_tool.base_converter import MaceMayHasDataFormatOps # noqa from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import ReduceType @@ -200,15 +200,15 @@ class Transformer(base_converter.ConverterInterface): op.output.extend([input_node.name]) output_shape = op.output_shape.add() output_shape.dims.extend(input_node.shape) - if input_node.data_format != DataFormat.DF_NONE: + if input_node.data_format != DataFormat.NONE: if input_node.data_format == DataFormat.NCHW: self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) ConverterUtil.add_data_format_arg(op, - DataFormat.DF_AUTO) + DataFormat.AUTO) else: ConverterUtil.add_data_format_arg(op, - DataFormat.DF_NONE) + DataFormat.NONE) self._producer[op.output[0]] = op @staticmethod @@ -261,7 +261,7 @@ class Transformer(base_converter.ConverterInterface): producer = self._producer[tensor] return ConverterUtil.data_format(producer) else: - return DataFormat.DF_NONE + return DataFormat.NONE def consumer_count(self, tensor_name): return len(self._consumers.get(tensor_name, [])) @@ -1021,7 +1021,6 @@ class 
Transformer(base_converter.ConverterInterface): filter_format.name) return False - def add_winograd_arg(self): if self._wino_arg == 0: return False @@ -1350,20 +1349,21 @@ class Transformer(base_converter.ConverterInterface): df_arg = op.arg.add() df_arg.name = MaceKeyword.mace_data_format_str if op.type in MaceHasDataFormatOps: - df_arg.i = DataFormat.DF_AUTO.value + df_arg.i = DataFormat.AUTO.value elif op.type in MaceMayHasDataFormatOps: - input_df = DataFormat.DF_AUTO.value + input_df = DataFormat.AUTO.value for input_tensor in op.input: if input_tensor in self._consts: continue - mace_check(input_tensor in self._producer, - "Input tensor %s not in producer" % input_tensor) + mace_check( + input_tensor in self._producer, + "Input tensor %s not in producer" % input_tensor) father_op = self._producer[input_tensor] temp_input_df = ConverterUtil.get_arg( father_op, MaceKeyword.mace_data_format_str) - if temp_input_df.i != DataFormat.DF_AUTO.value: + if temp_input_df.i != DataFormat.AUTO.value: input_df = temp_input_df.i - if input_df == DataFormat.DF_AUTO.value: + if input_df == DataFormat.AUTO.value: df_arg.i = input_df # add flag to mark the ops may has data format has_data_format_arg = op.arg.add() @@ -1379,7 +1379,7 @@ class Transformer(base_converter.ConverterInterface): src_data_format = ConverterUtil.data_format(net) for op in net.op: has_data_format = ConverterUtil.data_format(op) == \ - DataFormat.DF_AUTO + DataFormat.AUTO # transpose args if op.type == MaceOp.Pad.name: for arg in op.arg: diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 89bee8d8..0d1396c4 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -80,7 +80,7 @@ void CreateInputInfo(NetDef *net_def) { input_info = net_def->add_input_info(); input_info->set_name({{ net.input_info[idx].name|tojson }}); input_info->set_data_type(static_cast({{ net.input_info[idx].data_type }})); - input_info->set_data_format(static_cast({{ 
net.input_info[idx].data_format }})); + input_info->set_data_format({{ net.input_info[idx].data_format }}); input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }}); {% for dim in net.input_info[idx].dims %} input_info->add_dims({{ dim }}); @@ -97,7 +97,7 @@ void CreateOutputInfo(NetDef *net_def) { output_info = net_def->add_output_info(); output_info->set_name({{ net.output_info[idx].name|tojson }}); output_info->set_data_type(static_cast({{ net.output_info[idx].data_type }})); - output_info->set_data_format(static_cast({{ net.output_info[idx].data_format }})); + output_info->set_data_format({{ net.output_info[idx].data_format }}); output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }}); {% for dim in net.output_info[idx].dims %} output_info->add_dims({{dim}}); diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 4bf5f40b..a06ce493 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -48,7 +48,7 @@ void MaceRunFunc(const int in_out_size) { for (size_t i = 0; i < input_names.size(); ++i) { InputOutputInfo *info = net_def->add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names[i]); for (auto d : input_shapes[0]) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 0a852a17..6cad55b9 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -45,7 +45,7 @@ void MaceRun(const int in_out_size, for (size_t i = 0; i < input_names.size(); ++i) { InputOutputInfo *info = net_def->add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names[i]); for (auto d : max_shape) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h index 9cc1402f..faaf1443 100644 --- a/mace/test/mace_api_test.h +++ 
b/mace/test/mace_api_test.h @@ -76,7 +76,7 @@ void Conv3x3(const std::string &input_name, .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("has_data_format", 1) + .AddIntArg("data_format", static_cast(DataFormat::AUTO)) .Finalize(&operator_def); OutputShape *shape = operator_def.add_output_shape(); @@ -99,7 +99,7 @@ void Relu(const std::string &input_name, .AddStringArg("activation", "RELU") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddIntArg("device", static_cast(device_type)) - .AddIntArg("has_data_format", 1) + .AddIntArg("data_format", static_cast(DataFormat::AUTO)) .Finalize(&operator_def); net_def->add_op()->CopyFrom(operator_def); @@ -139,7 +139,8 @@ void CheckOutputs(const NetDef &net_def, if (D == DeviceType::CPU) { std::string input_name = input.first + "NHWC"; net.AddInputFromArray(input_name, input_shape, input_data); - net.TransformDataFormat(input_name, NHWC, input.first, NCHW); + net.TransformDataFormat( + input_name, DataFormat::NHWC, input.first, DataFormat::NCHW); } else { net.AddInputFromArray(input.first, input_shape, input_data); } @@ -154,7 +155,7 @@ void CheckOutputs(const NetDef &net_def, memcpy(data.data(), reinterpret_cast(tensor_data.data()) + tensor.offset(), tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); + net.AddInputFromArray(tensor.name(), shape, data, true); } net.RunNet(net_def, D); @@ -175,9 +176,9 @@ void CheckOutputs(const NetDef &net_def, if (D == DeviceType::CPU) { output_name = output.first + "NHWC"; net.TransformDataFormat(output.first, - NCHW, + DataFormat::NCHW, output_name, - NHWC); + DataFormat::NHWC); } ops::test::ExpectTensorNear(*tmp_tensor, *net.GetOutput(output_name.data()), diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 7fc0690d..fca4a0fd 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -91,7 +91,7 
@@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } } -- GitLab