Commit 3e795fa3 authored by 叶剑武

Merge branch 'data-format' into 'master'

Refactor: remove data_format and add has_data_format flag in Operation

See merge request !1006
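The core of the refactor shows up in the op constructors further down: each operation used to read an enum-valued "data_format" argument and now reads an integer "has_data_format" flag instead, leaving the concrete layout (NCHW on CPU, NHWC on GPU) to the runtime. Below is a minimal self-contained sketch of that argument pattern; it is not part of the commit, and the ArgReader stand-in is hypothetical, only mimicking Operation::GetOptionalArg<int> as used in the hunks that follow.

#include <map>
#include <string>

// Stand-in for the small piece of mace::Operation used here: optional integer
// arguments looked up by name with a default value. Hypothetical, for
// illustration only.
struct ArgReader {
  std::map<std::string, int> args;
  int GetOptionalArg(const std::string &name, int default_value) const {
    auto it = args.find(name);
    return it == args.end() ? default_value : it->second;
  }
};

int main() {
  ArgReader op_args{{{"has_data_format", 1}}};
  // Before this commit an op kept an enum:
  //   data_format_ = static_cast<DataFormat>(
  //       GetOptionalArg("data_format", NHWC));
  // After it, only a flag is kept; the actual layout is decided elsewhere.
  int has_data_format = op_args.GetOptionalArg("has_data_format", 0);
  return has_data_format ? 0 : 1;
}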
......@@ -90,6 +90,16 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
DataFormat ParseDataFormat(const std::string &data_format_str) {
if (data_format_str == "NHWC") {
return DataFormat::NHWC;
} else if (data_format_str == "NCHW") {
return DataFormat::NCHW;
} else {
return DataFormat::DF_NONE;
}
}
bool RunInference(MaceEngine *engine,
const std::map<std::string, mace::MaceTensor> &input_infos,
std::map<std::string, mace::MaceTensor> *output_infos,
......@@ -168,6 +178,12 @@ DEFINE_string(output_node, "output_node0,output_node1",
"output nodes, separated by comma");
DEFINE_string(input_shape, "", "input shape, separated by colon and comma");
DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
DEFINE_string(input_data_format,
"NHWC",
"input data formats, NONE|NHWC|NCHW");
DEFINE_string(output_data_format,
"NHWC",
"output data formats, NONE|NHWC|NCHW");
DEFINE_string(input_file, "", "input file name");
DEFINE_int32(max_num_runs, 100, "max number of runs");
DEFINE_double(max_seconds, 10.0, "max number of seconds to run");
......@@ -233,6 +249,19 @@ int Main(int argc, char **argv) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
std::vector<std::string> raw_input_data_formats =
str_util::Split(FLAGS_input_data_format, ',');
std::vector<std::string> raw_output_data_formats =
str_util::Split(FLAGS_output_data_format, ',');
std::vector<DataFormat> input_data_formats(input_count);
std::vector<DataFormat> output_data_formats(output_count);
for (size_t i = 0; i < input_count; ++i) {
input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
}
mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
// configuration
......@@ -333,7 +362,8 @@ int Main(int argc, char **argv) {
LOG(INFO) << "Open input file failed";
return -1;
}
inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in);
inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in,
input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
......@@ -344,7 +374,8 @@ int Main(int argc, char **argv) {
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
buffer_out);
buffer_out,
output_data_formats[i]);
}
int64_t warmup_time_us = 0;
......
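On the public API side, the parsed formats reach MaceTensor, which (as the hunks above show) now takes the data format as a third constructor argument. A hedged usage sketch, assuming the public header mace/public/mace.h declares MaceTensor and DataFormat as they are used in this diff:

#include <cstdint>
#include <memory>
#include <vector>

#include "mace/public/mace.h"  // assumed location of MaceTensor and DataFormat

int main() {
  // Illustrative shape; the tools above read it from --input_shape.
  std::vector<int64_t> shape = {1, 224, 224, 3};
  const int64_t size = 1 * 224 * 224 * 3;
  auto buffer = std::shared_ptr<float>(new float[size],
                                       std::default_delete<float[]>());
  // The third argument is the per-tensor format parsed from
  // --input_data_format / --output_data_format.
  mace::MaceTensor input(shape, buffer, mace::DataFormat::NHWC);
  (void)input;
  return 0;
}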
......@@ -115,6 +115,8 @@ void MemoryOptimizer::Optimize(
op_def->output_type_size());
DataType dt;
bool has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "has_data_format", 0) != 0;
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) {
......@@ -134,7 +136,7 @@ void MemoryOptimizer::Optimize(
MemoryBlock best_mem_block;
if (IsMemoryReuseOp(op_def->type())) {
if (tensor_mem_map_.count(op_def->input(0)) == 1) {
best_mem_id = tensor_mem_map_[op_def->input(0)].first;
best_mem_id = tensor_mem_map_.at(op_def->input(0)).mem_id;
}
} else {
auto shape = std::vector<int64_t>(
......@@ -204,7 +206,8 @@ void MemoryOptimizer::Optimize(
} else {
mem_ref_count_[best_mem_id] = 1;
}
tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id,
dt, has_data_format));
}
}
......@@ -216,7 +219,7 @@ void MemoryOptimizer::Optimize(
tensor_ref_count_[input_name] -= 1;
if (tensor_ref_count_.at(input_name) == 0 &&
tensor_mem_map_.count(input_name) == 1) {
int mem_id = tensor_mem_map_.at(input_name).first;
int mem_id = tensor_mem_map_.at(input_name).mem_id;
mem_ref_count_[mem_id] -= 1;
if (mem_ref_count_.at(mem_id) == 0) {
idle_blocks_.insert(mem_id);
......@@ -236,7 +239,7 @@ const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
return mem_blocks_;
}
const std::unordered_map<std::string, std::pair<int, DataType>>&
const std::unordered_map<std::string, MemoryOptimizer::TensorMemInfo>&
MemoryOptimizer::tensor_mem_map() const {
return tensor_mem_map_;
}
......
......@@ -77,6 +77,17 @@ class MemoryBlock {
};
class MemoryOptimizer {
public:
struct TensorMemInfo {
int mem_id;
DataType data_type;
bool has_data_format;
TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) :
mem_id(mem_id), data_type(data_type), has_data_format(has_data_format)
{}
};
public:
static bool IsMemoryReuseOp(const std::string &op_type);
void UpdateTensorRef(const std::string &tensor_name);
......@@ -86,8 +97,7 @@ class MemoryOptimizer {
const std::vector<MemoryBlock> &mem_blocks() const;
const std::unordered_map<std::string,
std::pair<int, DataType>> &tensor_mem_map() const;
const std::unordered_map<std::string, TensorMemInfo> &tensor_mem_map() const;
std::string DebugInfo() const;
......@@ -101,7 +111,7 @@ class MemoryOptimizer {
std::vector<MemoryBlock> mem_blocks_;
// tensor name : <mem_id, data_type>
// Buffer memory does not differentiate data types, so the data type is stored here.
std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
std::unordered_map<std::string, TensorMemInfo> tensor_mem_map_;
std::unordered_map<int, int> mem_ref_count_;
std::set<int> idle_blocks_;
};
......
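With this change the optimizer's map value goes from std::pair<int, DataType> to the named TensorMemInfo, so the .first/.second accesses become named fields and the format flag travels with each tensor. A self-contained sketch mirroring the struct added above; the DataType stand-in and the sample values are illustrative only:

#include <string>
#include <unordered_map>

// Local stand-in for mace::DataType, to keep the sketch self-contained.
enum class DataType { DT_FLOAT };

// Mirrors the TensorMemInfo struct added in memory_optimizer.h above.
struct TensorMemInfo {
  int mem_id;
  DataType data_type;
  bool has_data_format;
  TensorMemInfo(int mem_id, DataType data_type, bool has_data_format)
      : mem_id(mem_id), data_type(data_type),
        has_data_format(has_data_format) {}
};

int main() {
  std::unordered_map<std::string, TensorMemInfo> tensor_mem_map;
  // Was: tensor_mem_map["out0"] = std::make_pair(3, DataType::DT_FLOAT);
  tensor_mem_map.emplace("out0", TensorMemInfo(3, DataType::DT_FLOAT, true));
  // Was: .first / .second; now named fields, plus the new flag.
  const TensorMemInfo &info = tensor_mem_map.at("out0");
  return (info.mem_id == 3 && info.has_data_format) ? 0 : 1;
}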
......@@ -70,7 +70,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat data_format_flag,
bool has_data_format,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
......@@ -100,8 +100,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (data_format_flag == NHWC &&
op_def->output_shape(out_idx).dims_size() == 4) {
if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
......@@ -160,7 +159,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
tensor.dims().end()));
}
DataFormat data_format_flag = NHWC;
bool has_data_format = false;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
......@@ -170,15 +169,15 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
// Can only be NONE or NHWC
auto input_data_format = static_cast<DataFormat>(
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (!is_quantize_model && input_data_format == NHWC &&
has_data_format = has_data_format ||
(input_data_format != DataFormat::DF_NONE);
if (!is_quantize_model && input_data_format == DataFormat::NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
} else if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
......@@ -189,11 +188,8 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
auto input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
has_data_format = static_cast<DataFormat>(
input_info.data_format()) == NHWC;
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
......@@ -212,7 +208,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
data_format_flag,
has_data_format,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
......@@ -259,13 +255,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
wanted_in_dt, wanted_in_mem_type, data_format_flag);
wanted_in_dt, wanted_in_mem_type, has_data_format);
OpConstructContext t_construct_context(ws_);
auto transform_op = CreateOperation(
op_registry,
&t_construct_context,
transform_op_def,
data_format_flag);
has_data_format);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
......@@ -340,7 +336,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
output_mem_map[output_info.name()] = target_mem_type;
}
}
auto output_data_format =
bool output_has_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
......@@ -348,12 +344,12 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
output_info.name(),
output_info.data_type(),
target_mem_type,
data_format_flag);
output_has_data_format);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_data_format);
output_has_data_format);
operators_.emplace_back(std::move(transform_op));
// This is where the graph reference count is updated.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
......
......@@ -59,7 +59,7 @@ class SerialNet : public NetBase {
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat input_format,
bool has_data_format,
bool is_quantize_model = false);
protected:
......
......@@ -152,7 +152,7 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
const std::string &output_name,
const mace::DataType dt,
const mace::MemoryType mem_type,
const DataFormat data_format) {
bool has_data_format) {
std::unique_ptr<OperatorDef> op(new OperatorDef);
std::string op_name = "mace_node_" + output_name;
op->set_name(op_name);
......@@ -169,8 +169,8 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg();
arg->set_name("data_format");
arg->set_i(data_format);
arg->set_name("has_data_format");
arg->set_i(has_data_format);
if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape();
for (auto value : input_shape) {
......
......@@ -49,7 +49,7 @@ class OpenCLUtil {
const std::string &output_name,
const mace::DataType dt,
const MemoryType mem_type,
const DataFormat data_format);
bool has_data_format);
};
} // namespace mace
......
......@@ -264,31 +264,35 @@ MaceStatus Workspace::PreallocateOutputTensor(
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first),
tensor_mem.second.second,
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id),
tensor_mem.second.data_type,
false, tensor_mem.first));
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< " Image shape: "
<< tensor->UnderlyingBuffer()->shape()[0]
<< ", "
<< tensor->UnderlyingBuffer()->shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
if (tensor_mem.second.has_data_format) {
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype()
<< " Image shape: "
<< tensor->UnderlyingBuffer()->shape()[0]
<< ", "
<< tensor->UnderlyingBuffer()->shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
} else {
tensor->set_data_format(DataFormat::DF_NONE);
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
......
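The branch added to Workspace::PreallocateOutputTensor reduces to a small rule: no format flag means DF_NONE; GPU image memory is always NHWC; GPU buffers and quantized models also keep NHWC; the remaining case (float CPU buffers) gets NCHW. A hedged restatement of that rule with local stand-in enums rather than the real MACE types:

// Local stand-ins; the real MemoryType/DataFormat live in the MACE headers.
enum class MemoryType { CPU_BUFFER, GPU_BUFFER, GPU_IMAGE };
enum class DataFormat { DF_NONE, NHWC, NCHW };

DataFormat ChooseDataFormat(bool has_data_format,
                            MemoryType mem_type,
                            bool is_quantize_model) {
  if (!has_data_format) return DataFormat::DF_NONE;
  if (mem_type == MemoryType::GPU_IMAGE) return DataFormat::NHWC;
  if (mem_type == MemoryType::GPU_BUFFER || is_quantize_model) {
    return DataFormat::NHWC;
  }
  return DataFormat::NCHW;  // float CPU buffer
}

int main() {
  return ChooseDataFormat(true, MemoryType::CPU_BUFFER, false)
             == DataFormat::NCHW ? 0 : 1;
}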
......@@ -170,6 +170,15 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
DataFormat ParseDataFormat(const std::string &data_format_str) {
if (data_format_str == "NHWC") {
return DataFormat::NHWC;
} else if (data_format_str == "NCHW") {
return DataFormat::NCHW;
} else {
return DataFormat::DF_NONE;
}
}
DEFINE_string(model_name,
"",
......@@ -186,6 +195,12 @@ DEFINE_string(output_node,
DEFINE_string(output_shape,
"1,224,224,2:1,1,1,10",
"output shapes, separated by colon and comma");
DEFINE_string(input_data_format,
"NHWC",
"input data formats, NONE|NHWC|NCHW");
DEFINE_string(output_data_format,
"NHWC",
"output data formats, NONE|NHWC|NCHW");
DEFINE_string(input_file,
"",
"input file name | input file prefix for multiple inputs.");
......@@ -222,8 +237,10 @@ DEFINE_int32(cpu_affinity_policy, 1,
bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<DataFormat> &input_data_formats,
const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
const std::vector<std::vector<int64_t>> &output_shapes,
const std::vector<DataFormat> &output_data_formats) {
// load model
DeviceType device_type = ParseDeviceType(FLAGS_device);
// configuration
......@@ -324,7 +341,8 @@ bool RunModel(const std::vector<std::string> &input_names,
inputs_size[input_names[i]] = input_size;
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
......@@ -333,7 +351,8 @@ bool RunModel(const std::vector<std::string> &input_names,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
output_data_formats[i]);
}
if (!FLAGS_input_dir.empty()) {
......@@ -485,11 +504,25 @@ int Main(int argc, char **argv) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
std::vector<std::string> raw_input_data_formats =
str_util::Split(FLAGS_input_data_format, ',');
std::vector<std::string> raw_output_data_formats =
str_util::Split(FLAGS_output_data_format, ',');
std::vector<DataFormat> input_data_formats(input_count);
std::vector<DataFormat> output_data_formats(output_count);
for (size_t i = 0; i < input_count; ++i) {
input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
}
bool ret = false;
for (int i = 0; i < FLAGS_restart_round; ++i) {
std::cout << "restart round " << i << std::endl;
ret =
RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
RunModel(input_names, input_shape_vec, input_data_formats,
output_names, output_shape_vec, output_data_formats);
}
if (ret) {
return 0;
......
......@@ -143,6 +143,7 @@ void BMNet::SetUp() {
// Add input and output information
for (size_t i = 0; i < input_names_.size(); ++i) {
InputInfo *info = net_.add_input_info();
info->set_data_format(DataFormat::NHWC);
info->set_name(input_names_[i]);
for (auto d : input_shapes_[i]) {
info->add_dims(static_cast<int>(d));
......@@ -243,8 +244,8 @@ void BMNet::AddConv(const std::string &conv_type,
op_def->add_output(output_name);
AddIntsArg(op_def, "strides", strides);
AddIntArg(op_def, "padding", padding_type);
AddIntArg(op_def, "has_data_format", 1);
AddIntArg(op_def, "T", DT_HALF);
AddIntArg(op_def, "data_format", 1);
if (has_relu6) {
AddStringArg(op_def, "activation", "RELUX");
AddFloatArg(op_def, "max_limit", 6);
......@@ -270,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name,
op_def->add_output(output);
AddIntArg(op_def, "type", type);
AddIntArg(op_def, "T", DT_HALF);
AddIntArg(op_def, "data_format", 1);
AddIntArg(op_def, "has_data_format", 1);
OutputShape *shape = op_def->add_output_shape();
for (auto dim : output_shape) {
shape->add_dims(dim);
......
......@@ -470,6 +470,9 @@ MaceStatus MaceEngine::Impl::Init(
shape[i] = input_info_map_[input_name].dims(i);
}
input_tensor->Resize(shape);
// Set to the default data format
input_tensor->set_data_format(static_cast<DataFormat>(
input_info_map_[input_name].data_format()));
}
for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) {
......@@ -477,7 +480,9 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's outputs "
<< MakeString(MapKeys(output_info_map_));
}
#ifdef MACE_ENABLE_HEXAGON
ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
#endif
}
#ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) {
......@@ -559,47 +564,51 @@ MaceEngine::Impl::~Impl() {
MaceStatus MaceEngine::Impl::TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor) {
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else if (
(is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
input.second.shape().size() == 4 &&
input.second.data_format() == DataFormat::NCHW) {
VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
input_tensor->set_data_format(DataFormat::NHWC);
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else {
input_tensor->set_data_format(input.second.data_format());
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
DataFormat data_format = DataFormat::DF_NONE;
if (has_data_format) {
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else if (
(is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
input.second.shape().size() == 4 &&
input.second.data_format() == DataFormat::NCHW) {
VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
input_tensor->set_data_format(DataFormat::NHWC);
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
}
data_format = input.second.data_format();
}
input_tensor->set_data_format(data_format);
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
MaceStatus MaceEngine::Impl::TransposeOutput(
......@@ -607,38 +616,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
std::pair<const std::string, mace::MaceTensor> *output) {
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (device_->device_type() == DeviceType::CPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
MACE_CHECK(output_tensor->data_format() == NCHW);
VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(output_size <= output->second.impl_->buffer_size)
<< "Output size exceeds buffer size: shape"
<< MakeString<int64_t>(shape) << " vs buffer size "
<< output->second.impl_->buffer_size;
output->second.impl_->shape = shape;
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else if (device_->device_type() == DeviceType::GPU &&
if (output_tensor->data_format() != DataFormat::DF_NONE &&
output->second.data_format() != DataFormat::DF_NONE &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
std::vector<int> dst_dims = {0, 3, 1, 2};
if (output_tensor->data_format() == NCHW) {
std::vector<int> dst_dims;
if (output_tensor->data_format() == NCHW &&
output->second.data_format() == NHWC) {
dst_dims = {0, 2, 3, 1};
} else if (output_tensor->data_format() == NHWC &&
output->second.data_format() == NCHW) {
dst_dims = {0, 3, 1, 2};
} else {
LOG(FATAL) <<"Not supported output data format: "
<< output->second.data_format() << " vs "
<< output_tensor->data_format();
}
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
......
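Both TransposeInput and TransposeOutput above rely on the same two permutations. The sketch below uses a local stand-in with the same behaviour as the TransposeShape<index_t, index_t> calls (the real helper is templated over index types):

#include <cassert>
#include <cstdint>
#include <vector>

// Local stand-in reproducing the behaviour of the TransposeShape calls above.
std::vector<int64_t> TransposeShape(const std::vector<int64_t> &shape,
                                    const std::vector<int> &dst_dims) {
  std::vector<int64_t> out(shape.size());
  for (size_t i = 0; i < shape.size(); ++i) {
    out[i] = shape[dst_dims[i]];
  }
  return out;
}

int main() {
  // NHWC -> NCHW uses {0, 3, 1, 2}; NCHW -> NHWC uses {0, 2, 3, 1}.
  std::vector<int64_t> nhwc = {1, 224, 224, 3};
  std::vector<int64_t> nchw = TransposeShape(nhwc, {0, 3, 1, 2});
  assert((nchw == std::vector<int64_t>{1, 3, 224, 224}));
  assert(TransposeShape(nchw, {0, 2, 3, 1}) == nhwc);
  return 0;
}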
......@@ -35,8 +35,8 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) {}
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 0))
{}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -57,7 +57,7 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
const float *bias_ptr = bias->data<float>();
float *output_ptr = output->mutable_data<float>();
if (input->dim_size() == 4 && data_format_ == NCHW) {
if (input->dim_size() == 4 && has_data_format_) {
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height_width = input->dim(2) * input->dim(3);
......@@ -90,7 +90,7 @@ class BiasAddOp<DeviceType::CPU, float> : public Operation {
}
private:
DataFormat data_format_;
int has_data_format_;
};
#ifdef MACE_ENABLE_OPENCL
......@@ -99,8 +99,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
public:
explicit BiasAddOp(OpConstructContext *context)
: Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) {
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 1)) {
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
......@@ -121,13 +120,13 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
Tensor *output = this->Output(0);
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC,
MACE_CHECK(input->dim_size() == 4 && has_data_format_,
"gpu only support biasadd for 4-dimensional NHWC format tensor");
return kernel_->Compute(context, input, bias, output);
}
private:
DataFormat data_format_;
int has_data_format_;
std::unique_ptr<OpenCLBiasAddKernel> kernel_;
};
#endif // MACE_ENABLE_OPENCL
......
......@@ -42,7 +42,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input")
.Input("Bias")
.AddIntArg("data_format", data_format)
.AddIntArg("has_data_format", 1)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......
......@@ -36,7 +36,7 @@ void BiasAddSimple() {
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.AddIntArg("data_format", NCHW)
.AddIntArg("has_data_format", 1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
......@@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.AddIntArg("data_format", NCHW)
.AddIntArg("has_data_format", 1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
......@@ -139,7 +139,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.AddIntArg("data_format", NCHW)
.AddIntArg("has_data_format", 1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
......
......@@ -39,14 +39,14 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
auto data_format = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
bool has_data_format = Operation::GetOptionalArg<int>("has_data_format", 0)
!= 0;
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_,
data_format, output);
has_data_format, output);
}
private:
......
......@@ -60,9 +60,9 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MACE_UNUSED(context);
if (!checked_) {
Validate();
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) {
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
......@@ -251,9 +251,12 @@ void RegisterConcat(OpRegistryBase *op_registry) {
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
if (axis != 3) {
if (!has_data_format || axis != 3) {
return { DeviceType::CPU };
}
bool divisible_four = true;
......
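The registration hunk above now gates GPU placement on the flag: without has_data_format, or with a concat axis other than 3, the op falls back to CPU, and the truncated divisible_four check suggests the remaining condition is that each input's channel count be divisible by 4. A hedged restatement follows; the channel-divisibility detail is inferred from the variable name, not shown in full here.

#include <cstdint>
#include <vector>

// Hedged restatement of the placement rule; not the actual registration code.
bool ConcatPrefersGpu(int output_dims, bool has_data_format, int axis,
                      const std::vector<int64_t> &input_channels) {
  if (output_dims != 4 || !has_data_format || axis != 3) {
    return false;  // CPU fallback, as in the hunk above
  }
  for (int64_t c : input_channels) {
    if (c % 4 != 0) {
      return false;  // inferred from the divisible_four check
    }
  }
  return true;
}

int main() {
  return ConcatPrefersGpu(4, true, 3, {32, 64}) ? 0 : 1;
}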
......@@ -91,6 +91,7 @@ void OpenCLConcatHelper(int iters,
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", concat_dim)
.AddIntArg("has_data_format", 1)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......
......@@ -100,11 +100,12 @@ TEST_F(ConcatOpTest, CPUSimpleVertical) {
}
}
TEST_F(ConcatOpTest, CPURandom) {
namespace {
void CPURandomTest(int input_dim, int has_data_format) {
static unsigned int seed = time(NULL);
int dim = 5;
int dim = input_dim;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = 1;
int axis = 3;
// Construct graph
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -112,9 +113,13 @@ TEST_F(ConcatOpTest, CPURandom) {
builder = builder.Input(MakeString("Input", i));
}
builder.AddIntArg("axis", axis)
.AddIntArg("has_data_format", has_data_format)
.Output("Output")
.Finalize(net.NewOperatorDef());
if (has_data_format) {
axis = 1;
}
std::vector<index_t> shape_data;
GenerateRandomIntTypeData<index_t>({dim}, &shape_data, 1, dim);
std::vector<std::vector<index_t>> input_shapes(num_inputs, shape_data);
......@@ -152,6 +157,13 @@ TEST_F(ConcatOpTest, CPURandom) {
}
}
}
} // namespace
TEST_F(ConcatOpTest, CPURandom) {
CPURandomTest(5, 0);
CPURandomTest(4, 0);
CPURandomTest(4, 1);
}
TEST_F(ConcatOpTest, QuantizedCPURandom) {
static unsigned int seed = time(NULL);
......@@ -186,7 +198,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
builder = builder.Input(MakeString("Input", i));
}
builder.AddIntArg("axis", axis_arg)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
......@@ -248,7 +260,7 @@ namespace {
template <typename T>
void OpenCLRandomTest(const std::vector<std::vector<index_t>> &shapes,
const int axis,
DataFormat data_format) {
bool has_data_format) {
srand(time(nullptr));
int num_inputs = shapes.size();
int concat_axis_size = 0;
......@@ -275,7 +287,7 @@ void OpenCLRandomTest(const std::vector<std::vector<index_t>> &shapes,
builder.AddIntArg("axis", axis)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("data_format", data_format)
.AddIntArg("has_data_format", has_data_format)
.OutputShape(expected_shape)
.Finalize(net.NewOperatorDef());
......@@ -309,38 +321,37 @@ void OpenCLRandomTest(const std::vector<std::vector<index_t>> &shapes,
} // namespace
TEST_F(ConcatOpTest, OPENCLAligned) {
OpenCLRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3,
DataFormat::NHWC);
OpenCLRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1);
}
TEST_F(ConcatOpTest, OPENCLHalfAligned) {
OpenCLRandomTest<half>({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3,
DataFormat::NHWC);
OpenCLRandomTest<half>({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1);
}
TEST_F(ConcatOpTest, OPENCLUnAligned) {
OpenCLRandomTest<float>({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3,
DataFormat::NHWC);
OpenCLRandomTest<float>({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, 1);
}
TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenCLRandomTest<float>(
{{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}},
3, DataFormat::NHWC);
3, 1);
}
TEST_F(ConcatOpTest, GPUFallbackToCPU2DInput) {
OpenCLRandomTest<float>({{3, 4}, {3, 4}}, 1, DataFormat::DF_NONE);
OpenCLRandomTest<float>({{3, 4}, {3, 4}}, 1, 0);
}
TEST_F(ConcatOpTest, GPUFallbackToCPUChanNotDivisibleBy4) {
OpenCLRandomTest<float>({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3,
DataFormat::DF_NONE);
OpenCLRandomTest<float>({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, 0);
}
TEST_F(ConcatOpTest, GPUFallbackToCPUNoDataFormat) {
OpenCLRandomTest<float>({{1, 1, 4, 4}, {1, 1, 4, 4}}, 3, 0);
}
TEST_F(ConcatOpTest, GPUFallbackToCPUAxis2) {
OpenCLRandomTest<float>({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2,
DataFormat::DF_NONE);
OpenCLRandomTest<float>({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, 0);
}
} // namespace test
......
......@@ -897,8 +897,8 @@ class EltwiseOp : public Operation {
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1)),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", 0))) {}
has_data_format_(Operation::GetOptionalArg<int>(
"has_data_format", 0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -940,7 +940,7 @@ class EltwiseOp : public Operation {
// check if we can broadcast tensor
uint32_t rank_diff =
static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
if (data_format_ == NCHW) {
if (has_data_format_) {
MACE_CHECK(
(input0->dim_size() == 4) &&
((input1->dim_size() == 0) ||
......@@ -965,7 +965,7 @@ class EltwiseOp : public Operation {
const T *input0_ptr = input0->data<T>();
const T *input1_ptr = input1->data<T>();
if (data_format_ == NCHW && input1->dim_size() > 0) {
if (has_data_format_ && input1->dim_size() > 0) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
Tensor::MappingGuard output_guard(output);
DstType *output_ptr = output->mutable_data<DstType>();
......@@ -1027,7 +1027,7 @@ class EltwiseOp : public Operation {
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
int has_data_format_;
Tensor scalar_tensor_;
};
......@@ -1042,9 +1042,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1)),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", 0)))
"scalar_input_index", 1))
#ifdef MACE_ENABLE_NEON
, eltwise_(static_cast<ops::EltwiseType>(Operation::GetOptionalArg<int>(
"type", static_cast<int>(ops::EltwiseType::NONE))))
......@@ -1139,7 +1137,6 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
std::vector<float> coeff_;
float scalar_input_;
int32_t scalar_input_index_;
DataFormat data_format_;
Tensor scalar_tensor_;
#ifdef MACE_ENABLE_NEON
arm::q8::Eltwise eltwise_;
......
......@@ -44,6 +44,7 @@ void EltwiseBenchmark(
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", {1.2, 2.1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("has_data_format", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
......
......@@ -75,7 +75,7 @@ void SimpleTensorScalar(const ops::EltwiseType type,
.AddIntArg("T", DataTypeToEnum<T>::v())
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("scalar_input", x)
.AddIntArg("data_format", DataFormat::NCHW)
.AddIntArg("has_data_format", 1)
.OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("TOutput")
.Finalize(net.NewOperatorDef());
......@@ -120,7 +120,7 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
.AddIntArg("T", DataTypeToEnum<T>::v())
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.AddIntArg("data_format", DataFormat::NCHW)
.AddIntArg("has_data_format", 1)
.OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("TOutput");
if (shape0.size() > 1) {
......@@ -642,7 +642,7 @@ void RandomTensorScalar(const ops::EltwiseType type,
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("scalar_input", 0.1)
.AddIntArg("data_format", DataFormat::NCHW)
.AddIntArg("has_data_format", 1)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
......@@ -699,7 +699,7 @@ void RandomTensorEltwise(const ops::EltwiseType type,
.Input("TInput1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.AddIntArg("data_format", DataFormat::NCHW)
.AddIntArg("has_data_format", 1)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
......@@ -755,7 +755,7 @@ void Quantized(const std::vector<index_t> &shape,
.Input("TInput0")
.Input("TInput1")
.AddIntArg("type", static_cast<int>(type))
.AddIntArg("data_format", DataFormat::NCHW)
.AddIntArg("has_data_format", 1)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
......
......@@ -34,9 +34,9 @@ class InferConv2dShapeOp : public Operation {
Tensor::MappingGuard output_guard(output);
int32_t *output_data = output->mutable_data<int32_t>();
const int32_t data_format =
Operation::GetOptionalArg<int>("data_format", 0);
const bool isNCHW = data_format == 1;
auto has_data_format =
Operation::GetOptionalArg<int>("has_data_format", 0);
const bool isNCHW = (has_data_format && D == DeviceType::CPU);
Padding padding_type =
static_cast<Padding>(Operation::GetOptionalArg<int>(
......
......@@ -57,8 +57,8 @@ void TestInferConv2dShapeOp(const std::vector<index_t> &input_shape,
} // namespace
TEST_F(InferConv2dShapeOpTest, TestInferConv2dShape) {
TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3});
TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3});
TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3});
TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3});
}
} // namespace test
......
......@@ -48,7 +48,7 @@ class OpenCLBufferTransformer {
const OpenCLBufferType type,
const MemoryType out_mem_type,
const int wino_blk_size,
const DataFormat data_format,
bool has_data_format,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
......@@ -67,13 +67,14 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform CPU Buffer " << input->name()
<< " to GPU Buffer " << internal_tensor->name()
<< " with data type " << dt;
if (data_format == DataFormat::NHWC && input->shape().size() == 4) {
if (has_data_format && input->shape().size() == 4) {
// 1. (NCHW -> NHWC)
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(input->shape(),
dst_dims);
internal_tensor->Resize(output_shape);
internal_tensor->set_data_format(DataFormat::NHWC);
// TODO(liuqi): Only support float now
const float *input_ptr = input->data<float>();
Tensor::MappingGuard guard(internal_tensor);
......@@ -105,13 +106,13 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform GPU Buffer " << internal_tensor.name()
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
if (data_format == DataFormat::NHWC &&
internal_tensor.shape().size() == 4) {
if (has_data_format && internal_tensor.shape().size() == 4) {
// NHWC -> NCHW
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(internal_tensor.shape(),
dst_dims);
output->set_data_format(DataFormat::NCHW);
Tensor::MappingGuard guard(&internal_tensor);
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(output_shape);
......
......@@ -166,9 +166,20 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
!ws_.GetTensor(input)->is_weight()) {
auto input_info = net_def.add_input_info();
input_info->set_name(input);
auto data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "data_format", DataFormat::DF_NONE);
input_info->set_data_format(data_format);
auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "has_data_format", 1);
auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "T", static_cast<int>(DT_FLOAT))
== static_cast<int>(DT_UINT8);
if (has_data_format) {
if (is_quantized_op || device == DeviceType::GPU) {
input_info->set_data_format(NHWC);
} else {
input_info->set_data_format(NCHW);
}
} else {
input_info->set_data_format(DataFormat::DF_NONE);
}
auto &shape = ws_.GetTensor(input)->shape();
for (auto d : shape) {
input_info->add_dims(static_cast<int>(d));
......
......@@ -40,9 +40,9 @@ class PadOp<DeviceType::CPU, T> : public Operation {
constant_value_(Operation::GetOptionalArg<float>(
"constant_value", 0.0)) {
MACE_CHECK(paddings_.size() == 8);
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
if (df == DataFormat::NHWC) {
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df) {
paddings_ = TransposeShape<int, int>(paddings_, {0, 1, 6, 7, 2, 3, 4, 5});
}
}
......@@ -55,11 +55,9 @@ class PadOp<DeviceType::CPU, T> : public Operation {
this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
auto input_shape = input->shape();
for (size_t i = 0; i < paddings_.size(); ++i) {
if (type_ == PadType::REFLECT) {
MACE_CHECK(paddings_[i] < input_shape[i / 2]);
} else if (type_ == PadType::SYMMETRIC) {
MACE_CHECK(paddings_[i] <= input_shape[i / 2]);
if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) {
MACE_CHECK(paddings_[i] < input_shape[i / 2], paddings_[i],
" vs ", input_shape[i / 2]);
}
MACE_CHECK(paddings_[i] >= 0);
}
......
......@@ -29,7 +29,11 @@ void Pad(int iters, int batch, int height,
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
const std::vector<int> paddings = {0, 0, pad, pad, pad, pad, 0, 0};
OpDefBuilder("Pad", "PadTest")
......@@ -37,6 +41,7 @@ void Pad(int iters, int batch, int height,
.Output("Output")
.AddIntsArg("paddings", paddings)
.AddIntArg("pad_type", pad_type)
.AddIntArg("has_data_format", 1)
.AddFloatArg("constant_value", 1.0)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......
......@@ -39,7 +39,7 @@ void SimpleConstant() {
.Output("Output")
.AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0})
.AddFloatArg("constant_value", 1.0)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......@@ -52,7 +52,7 @@ void SimpleConstant() {
.Output("TOutput")
.AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0})
.AddFloatArg("constant_value", 1.0)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......@@ -101,7 +101,7 @@ void Result(const std::vector<index_t> &input_shape,
.Output(t_output)
.AddIntsArg("paddings", paddings)
.AddIntArg("pad_type", static_cast<int>(pad_type))
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......@@ -179,7 +179,7 @@ TEST_F(PadTest, ComplexCPU) {
.Output("TOutput")
.AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1})
.AddFloatArg("constant_value", 1.0)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......@@ -217,7 +217,7 @@ void Complex(const std::vector<index_t> &input_shape,
.AddIntsArg("paddings", paddings)
.AddIntArg("pad_type", pad_type)
.AddFloatArg("constant_value", 1.0)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......@@ -234,7 +234,7 @@ void Complex(const std::vector<index_t> &input_shape,
.AddIntsArg("paddings", paddings)
.AddIntArg("pad_type", pad_type)
.AddFloatArg("constant_value", 1.0)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......
......@@ -94,9 +94,9 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
int index = axis_[i] >= 0 ?
axis_[i] :
axis_[i] + input->dim_size();
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
if (df == DataFormat::NHWC && DataTypeToEnum<T>::value != DT_UINT8
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && DataTypeToEnum<T>::value != DT_UINT8
&& input->dim_size() == 4) {
if (index == 1 || index == 2) index = index + 1;
else if (index == 3) index = 1;
......
......@@ -38,6 +38,7 @@ void Reduce(int iters, int batch, int channels,
.Input("Input")
.AddIntsArg("axis", axis)
.Output("OutputImage")
.AddIntArg("has_data_format", 1)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......
......@@ -44,7 +44,7 @@ void Simple(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", keepdims ? 1 : 0)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
......@@ -56,7 +56,7 @@ void Simple(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", keepdims ? 1 : 0)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
......@@ -84,7 +84,7 @@ void Simple3D(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", keepdims ? 1 : 0)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
......@@ -588,7 +588,7 @@ void RandomTest(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", 1)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
......@@ -600,7 +600,7 @@ void RandomTest(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", 1)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("OPENCLOutput")
.Finalize(net.NewOperatorDef());
// Run
......@@ -662,7 +662,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", 1)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("OutputNCHW")
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
......@@ -687,7 +687,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", 1)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
......
......@@ -77,9 +77,9 @@ class ReshapeOp : public Operation {
}
Tensor *output = this->Output(OUTPUT);
// NHWC -> NCHW
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
if (df == DataFormat::NHWC && D == DeviceType::CPU
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && D == DeviceType::CPU
&& out_shape.size() == 4 && shape->is_weight()) {
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> out_shape_gpu = TransposeShape<index_t, index_t>(
......
......@@ -35,11 +35,10 @@ class ShapeOp : public Operation {
Tensor::MappingGuard output_guard(output);
int32_t *output_data = output->mutable_data<int32_t>();
const int data_format =
Operation::GetOptionalArg<int>("data_format", 0);
if (input->dim_size() == 4 &&
D == DeviceType::CPU &&
data_format == DataFormat::NCHW) {
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (D == DeviceType::CPU &&
has_df && input->dim_size() == 4) {
// transpose NCHW to NHWC for cpu runtime
output_data[0] = static_cast<int32_t>(input->dim(0));
output_data[1] = static_cast<int32_t>(input->dim(2));
......
......@@ -36,9 +36,9 @@ class SplitOp<DeviceType::CPU, T> : public Operation {
checked_(false) {}
void Validate() {
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) {
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
......
......@@ -44,6 +44,7 @@ void BMSplitHelper(int iters,
}
builder
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Warm-up
......
......@@ -54,7 +54,7 @@ void RandomTest(const int num_outputs, int axis) {
builder = builder.Output(MakeString("Output", i));
}
builder.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Finalize(net.NewOperatorDef());
// Run
......
......@@ -32,9 +32,9 @@ class SqueezeOp : public Operation {
MACE_UNUSED(context);
if (!checked_ && D == DeviceType::CPU
&& DataTypeToEnum<T>::value != DT_UINT8) {
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) {
auto has_df = Operation::GetOptionalArg<int>(
"has_data_format", 0);
if (has_df && this->Input(0)->dim_size() == 4) {
if (axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2) {
axis_[0] = 2;
axis_[1] = 3;
......
......@@ -30,7 +30,7 @@ void TestSqueeze(const std::vector<index_t> &org_shape,
OpDefBuilder("Squeeze", "SqueezeTest")
.Input("Input")
.AddIntsArg("axis", axis)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("has_data_format", 1)
.Output("Output")
.Finalize(net.NewOperatorDef());
......
......@@ -42,6 +42,7 @@ device_type_map = {'cpu': cvt.DeviceType.CPU.value,
data_format_map = {
'NONE': cvt.DataFormat.DF_NONE,
'NHWC': cvt.DataFormat.NHWC,
'NCHW': cvt.DataFormat.NCHW,
}
......@@ -74,6 +75,13 @@ def parse_float_array_from_str(ints_str):
return [float(int_str) for int_str in ints_str.split(',')]
def transpose_shape(shape, dst_order):
t_shape = [0] * len(shape)
for i in range(len(shape)):
t_shape[i] = shape[dst_order[i]]
return t_shape
def main(unused_args):
if not os.path.isfile(FLAGS.model_file):
six.print_("Input graph file '" +
......@@ -139,6 +147,10 @@ def main(unused_args):
else:
input_node.data_format = data_format_map[input_node_formats[i]]
input_node.shape = parse_int_array_from_str(input_node_shapes[i])
if input_node.data_format == cvt.DataFormat.NCHW and\
len(input_node.shape) == 4:
input_node.shape = transpose_shape(input_node.shape, [0, 2, 3, 1])
input_node.data_format = cvt.DataFormat.NHWC
if len(input_node_ranges) > i:
input_node.range = parse_float_array_from_str(input_node_ranges[i])
option.add_input_node(input_node)
......@@ -156,6 +168,11 @@ def main(unused_args):
else:
output_node.data_format = data_format_map[output_node_formats[i]]
output_node.shape = parse_int_array_from_str(output_node_shapes[i])
if output_node.data_format == cvt.DataFormat.NCHW and\
len(output_node.shape) == 4:
output_node.shape = transpose_shape(output_node.shape,
[0, 2, 3, 1])
output_node.data_format = cvt.DataFormat.NHWC
option.add_output_node(output_node)
if FLAGS.check_node != '':
......
......@@ -181,6 +181,7 @@ class MaceKeyword(object):
mace_global_pooling_str = 'global_pooling'
mace_kernel_str = 'kernels'
mace_data_format_str = 'data_format'
mace_has_data_format_str = 'has_data_format'
mace_filter_format_str = 'filter_format'
mace_element_type_str = 'type'
mace_activation_type_str = 'activation'
......@@ -525,6 +526,16 @@ class ConverterUtil(object):
return arg
return None
@staticmethod
def del_arg(op, arg_name):
found_idx = -1
for idx in range(len(op.arg)):
if op.arg[idx].name == arg_name:
found_idx = idx
break
if found_idx != -1:
del op.arg[found_idx]
@staticmethod
def add_data_format_arg(op, data_format):
data_format_arg = op.arg.add()
......
......@@ -1406,21 +1406,17 @@ class Transformer(base_converter.ConverterInterface):
def update_data_format(self):
print("update data format")
data_format_flag = DataFormat.NHWC.value
data_format_flag = 1
for input_node in self._option.input_nodes.values():
if input_node.data_format.value == DataFormat.DF_NONE.value:
data_format_flag = DataFormat.DF_NONE.value
data_format_flag = 0
net = self._model
for op in net.op:
data_format_arg = ConverterUtil.get_arg(
ConverterUtil.del_arg(
op, MaceKeyword.mace_data_format_str)
if not data_format_arg:
data_format_arg = op.arg.add()
data_format_arg.name = MaceKeyword.mace_data_format_str
data_format_arg.i = data_format_flag
elif data_format_arg.i != data_format_flag:
data_format_arg.i = data_format_flag
has_data_format_arg = op.arg.add()
has_data_format_arg.name = MaceKeyword.mace_has_data_format_str
has_data_format_arg.i = data_format_flag
return False
def quantize_nodes(self):
......
......@@ -46,6 +46,7 @@ void MaceRunFunc(const int in_out_size) {
for (size_t i = 0; i < input_names.size(); ++i) {
InputInfo *info = net_def->add_input_info();
info->set_data_format(DataFormat::NHWC);
info->set_name(input_names[i]);
for (auto d : input_shapes[0]) {
info->add_dims(static_cast<int>(d));
......
......@@ -45,6 +45,7 @@ void MaceRun(const int in_out_size,
for (size_t i = 0; i < input_names.size(); ++i) {
InputInfo *info = net_def->add_input_info();
info->set_data_format(DataFormat::NHWC);
info->set_name(input_names[i]);
for (auto d : max_shape) {
info->add_dims(static_cast<int>(d));
......
......@@ -76,6 +76,7 @@ void Conv3x3(const std::string &input_name,
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("has_data_format", 1)
.Finalize(&operator_def);
OutputShape *shape = operator_def.add_output_shape();
......@@ -98,6 +99,7 @@ void Relu(const std::string &input_name,
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.AddIntArg("has_data_format", 1)
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
......
......@@ -103,6 +103,16 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
DataFormat ParseDataFormat(const std::string &data_format_str) {
if (data_format_str == "NHWC") {
return DataFormat::NHWC;
} else if (data_format_str == "NCHW") {
return DataFormat::NCHW;
} else {
return DataFormat::DF_NONE;
}
}
struct mallinfo LogMallinfoChange(struct mallinfo prev) {
struct mallinfo curr = mallinfo();
if (prev.arena != curr.arena) {
......@@ -168,6 +178,12 @@ DEFINE_string(output_node,
DEFINE_string(output_shape,
"1,224,224,2:1,1,1,10",
"output shapes, separated by colon and comma");
DEFINE_string(input_data_format,
"NHWC",
"input data formats, NONE|NHWC|NCHW");
DEFINE_string(output_data_format,
"NHWC",
"output data formats, NONE|NHWC|NCHW");
DEFINE_string(input_file,
"",
"input file name | input file prefix for multiple inputs.");
......@@ -206,8 +222,10 @@ DEFINE_int32(cpu_affinity_policy, 1,
bool RunModel(const std::string &model_name,
const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<DataFormat> &input_data_formats,
const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
const std::vector<std::vector<int64_t>> &output_shapes,
const std::vector<DataFormat> &output_data_formats) {
DeviceType device_type = ParseDeviceType(FLAGS_device);
int64_t t0 = NowMicros();
......@@ -325,7 +343,8 @@ bool RunModel(const std::string &model_name,
LOG(INFO) << "Open input file failed";
return -1;
}
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
......@@ -334,7 +353,8 @@ bool RunModel(const std::string &model_name,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
output_data_formats[i]);
}
LOG(INFO) << "Warm up run";
......@@ -498,13 +518,27 @@ int Main(int argc, char **argv) {
for (size_t i = 0; i < output_count; ++i) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
std::vector<std::string> raw_input_data_formats =
str_util::Split(FLAGS_input_data_format, ',');
std::vector<std::string> raw_output_data_formats =
str_util::Split(FLAGS_output_data_format, ',');
std::vector<DataFormat> input_data_formats(input_count);
std::vector<DataFormat> output_data_formats(output_count);
for (size_t i = 0; i < input_count; ++i) {
input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
}
for (size_t i = 0; i < output_count; ++i) {
output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
}
bool ret = false;
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
ret =
RunModel(FLAGS_model_name, input_names, input_shape_vec,
output_names, output_shape_vec);
RunModel(FLAGS_model_name,
input_names, input_shape_vec, input_data_formats,
output_names, output_shape_vec, output_data_formats);
}
if (ret) {
return 0;
......
......@@ -131,6 +131,12 @@ class DeviceType(object):
HEXAGON = 'HEXAGON'
class DataFormat(object):
NONE = "NONE"
NHWC = "NHWC"
NCHW = "NCHW"
################################
# Argument types
################################
......
......@@ -96,14 +96,10 @@ WinogradParameters = [0, 2, 4]
DataFormatStrs = [
"NONE",
"NHWC",
"NCHW",
]
class DataFormat(object):
NONE = "NONE"
NHWC = "NHWC"
class DefaultValues(object):
mace_lib_type = MACELibType.static
    omp_num_threads = -1
......@@ -371,6 +367,15 @@ def format_model_config(flags):
if not isinstance(value, list):
subgraph[key] = [value]
subgraph[key] = [str(v) for v in subgraph[key]]
input_size = len(subgraph[YAMLKeyword.input_tensors])
output_size = len(subgraph[YAMLKeyword.output_tensors])
mace_check(len(subgraph[YAMLKeyword.input_shapes]) == input_size,
ModuleName.YAML_CONFIG,
"input shapes' size not equal inputs' size.")
mace_check(len(subgraph[YAMLKeyword.output_shapes]) == output_size,
ModuleName.YAML_CONFIG,
"output shapes' size not equal outputs' size.")
for key in [YAMLKeyword.check_tensors,
YAMLKeyword.check_shapes]:
......@@ -399,13 +404,13 @@ def format_model_config(flags):
if input_data_formats:
if not isinstance(input_data_formats, list):
subgraph[YAMLKeyword.input_data_formats] =\
[input_data_formats]
[input_data_formats] * input_size
else:
mace_check(len(input_data_formats)
== len(subgraph[YAMLKeyword.input_tensors]),
== input_size,
ModuleName.YAML_CONFIG,
"input_data_formats should match"
" the size of input")
" the size of input.")
for input_data_format in\
subgraph[YAMLKeyword.input_data_formats]:
mace_check(input_data_format in DataFormatStrs,
......@@ -414,17 +419,18 @@ def format_model_config(flags):
+ str(DataFormatStrs) + ", but got "
+ input_data_format)
else:
subgraph[YAMLKeyword.input_data_formats] = [DataFormat.NHWC]
subgraph[YAMLKeyword.input_data_formats] = \
[DataFormat.NHWC] * input_size
output_data_formats = subgraph.get(YAMLKeyword.output_data_formats,
[])
if output_data_formats:
if not isinstance(output_data_formats, list):
subgraph[YAMLKeyword.output_data_formats] = \
[output_data_formats]
[output_data_formats] * output_size
else:
mace_check(len(output_data_formats)
== len(subgraph[YAMLKeyword.output_tensors]),
== output_size,
ModuleName.YAML_CONFIG,
"output_data_formats should match"
" the size of output")
......@@ -435,7 +441,8 @@ def format_model_config(flags):
"'output_data_formats' must be in "
+ str(DataFormatStrs))
else:
subgraph[YAMLKeyword.output_data_formats] = [DataFormat.NHWC]
subgraph[YAMLKeyword.output_data_formats] =\
[DataFormat.NHWC] * output_size
validation_threshold = subgraph.get(
YAMLKeyword.validation_threshold, {})
......
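The hunks above normalize the YAML data-format fields: a single string is broadcast to one entry per tensor, while a list must match the tensor count. A minimal sketch of that broadcast rule, using hypothetical tensor names and a made-up format value:

# Sketch of the broadcast rule above (tensor names and format are made up).
input_tensors = ["in0", "in1"]
input_data_formats = "NCHW"                      # a single scalar value in the YAML
if not isinstance(input_data_formats, list):
    input_data_formats = [input_data_formats] * len(input_tensors)
assert input_data_formats == ["NCHW", "NCHW"]    # one format per input tensor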
......@@ -154,7 +154,9 @@ class DeviceWrapper:
input_nodes,
output_nodes,
input_shapes,
input_data_formats,
output_shapes,
output_data_formats,
mace_model_dir,
model_tag,
device_type,
......@@ -216,6 +218,8 @@ class DeviceWrapper:
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
"--output_shape=%s" % ":".join(output_shapes),
"--input_data_format=%s" % ",".join(input_data_formats),
"--output_data_format=%s" % ",".join(output_data_formats),
"--input_file=%s/%s" % (model_output_dir,
input_file_name),
"--output_file=%s/%s" % (model_output_dir,
......@@ -307,6 +311,8 @@ class DeviceWrapper:
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
"--output_shape=%s" % ":".join(output_shapes),
"--input_data_format=%s" % ",".join(input_data_formats),
"--output_data_format=%s" % ",".join(output_data_formats),
"--input_file=%s/%s" % (self.data_dir, input_file_name),
"--output_file=%s/%s" % (self.data_dir, output_file_name),
"--input_dir=%s" % input_dir,
......@@ -394,6 +400,8 @@ class DeviceWrapper:
output_nodes=subgraphs[0][YAMLKeyword.output_tensors],
input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
output_shapes=subgraphs[0][YAMLKeyword.output_shapes],
input_data_formats=subgraphs[0][YAMLKeyword.input_data_formats],
output_data_formats=subgraphs[0][YAMLKeyword.output_data_formats],
mace_model_dir=mace_model_dir,
model_tag=model_name,
device_type=DeviceType.GPU,
......@@ -587,6 +595,10 @@ class DeviceWrapper:
YAMLKeyword.output_tensors],
input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
output_shapes=output_config[YAMLKeyword.output_shapes],
input_data_formats=subgraphs[0][
YAMLKeyword.input_data_formats],
output_data_formats=subgraphs[0][
YAMLKeyword.output_data_formats],
mace_model_dir=mace_model_dir,
model_tag=model_name,
device_type=device_type,
......@@ -652,6 +664,10 @@ class DeviceWrapper:
YAMLKeyword.input_shapes],
output_shapes=output_config[
YAMLKeyword.output_shapes],
input_data_formats=subgraphs[0][
YAMLKeyword.input_data_formats],
output_data_formats=subgraphs[0][
YAMLKeyword.output_data_formats],
model_output_dir=model_output_dir,
input_data_types=subgraphs[0][
YAMLKeyword.input_data_types],
......@@ -750,6 +766,8 @@ class DeviceWrapper:
output_nodes,
input_shapes,
output_shapes,
input_data_formats,
output_data_formats,
max_num_runs,
max_seconds,
model_tag,
......@@ -790,6 +808,8 @@ class DeviceWrapper:
'--output_node=%s' % ','.join(output_nodes),
'--input_shape=%s' % ':'.join(input_shapes),
'--output_shape=%s' % ':'.join(output_shapes),
"--input_data_format=%s" % ",".join(input_data_formats),
"--output_data_format=%s" % ",".join(output_data_formats),
'--input_file=%s/%s' % (model_output_dir, input_file_name),
"--model_data_file=%s" % model_data_file,
'--max_num_runs=%d' % max_num_runs,
......@@ -845,6 +865,8 @@ class DeviceWrapper:
'--output_node=%s' % ','.join(output_nodes),
'--input_shape=%s' % ':'.join(input_shapes),
'--output_shape=%s' % ':'.join(output_shapes),
"--input_data_format=%s" % ",".join(input_data_formats),
"--output_data_format=%s" % ",".join(output_data_formats),
'--input_file=%s/%s' % (self.data_dir, input_file_name),
"--model_data_file=%s" % model_data_file,
'--max_num_runs=%d' % max_num_runs,
......@@ -961,6 +983,10 @@ class DeviceWrapper:
output_nodes=output_nodes,
input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
output_shapes=output_shapes,
input_data_formats=subgraphs[0][
YAMLKeyword.input_data_formats],
output_data_formats=subgraphs[0][
YAMLKeyword.output_data_formats],
max_num_runs=flags.max_num_runs,
max_seconds=flags.max_seconds,
mace_model_dir=mace_model_dir,
......@@ -974,8 +1000,7 @@ class DeviceWrapper:
opencl_binary_file=opencl_output_bin_path,
opencl_parameter_file=opencl_parameter_path,
libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH,
link_dynamic=link_dynamic
)
link_dynamic=link_dynamic)
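For reference, the device wrapper hands the per-tensor formats to mace_run and the benchmark tool as one comma-separated flag value, mirroring the joins in the hunks above. A hedged sketch with made-up values:

# Illustrative only: a list of per-tensor formats becomes one flag value.
input_data_formats = ["NHWC", "NCHW"]
flag = "--input_data_format=%s" % ",".join(input_data_formats)
print(flag)   # --input_data_format=NHWC,NCHW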
def run(self,
abi,
......
......@@ -649,6 +649,8 @@ def validate_model(abi,
output_nodes,
input_shapes,
output_shapes,
input_data_formats,
output_data_formats,
model_output_dir,
input_data_types,
caffe_env,
......@@ -671,20 +673,12 @@ def validate_model(abi,
sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name))
device.pull_from_data_dir(formatted_name, model_output_dir)
if platform == "tensorflow":
validate(platform, model_file_path, "",
"%s/%s" % (model_output_dir, input_file_name),
"%s/%s" % (model_output_dir, output_file_name), device_type,
":".join(input_shapes), ":".join(output_shapes),
",".join(input_nodes), ",".join(output_nodes),
validation_threshold, ",".join(input_data_types), backend,
validation_outputs_data,
log_file)
elif platform == "onnx":
if platform == "tensorflow" or platform == "onnx":
validate(platform, model_file_path, "",
"%s/%s" % (model_output_dir, input_file_name),
"%s/%s" % (model_output_dir, output_file_name), device_type,
":".join(input_shapes), ":".join(output_shapes),
",".join(input_data_formats), ",".join(output_data_formats),
",".join(input_nodes), ",".join(output_nodes),
validation_threshold, ",".join(input_data_types), backend,
validation_outputs_data,
......@@ -703,6 +697,8 @@ def validate_model(abi,
"%s/%s" % (model_output_dir, output_file_name),
device_type,
":".join(input_shapes), ":".join(output_shapes),
",".join(input_data_formats),
",".join(output_data_formats),
",".join(input_nodes), ",".join(output_nodes),
validation_threshold, ",".join(input_data_types), backend,
validation_outputs_data,
......@@ -770,6 +766,8 @@ def validate_model(abi,
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
"--output_shape=%s" % ":".join(output_shapes),
"--input_data_format=%s" % ",".join(input_data_formats),
"--output_data_format=%s" % ",".join(output_data_formats),
"--validation_threshold=%f" % validation_threshold,
"--input_data_type=%s" % ",".join(input_data_types),
"--backend=%s" % ",".join(backend),
......
......@@ -148,10 +148,11 @@ def validate_with_file(platform, device_type,
value, validation_threshold, log_file)
def validate_tf_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes,
output_names, validation_threshold, input_data_types,
log_file):
def validate_tf_model(platform, device_type, model_file,
input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, input_data_types, log_file):
import tensorflow as tf
if not os.path.isfile(model_file):
common.MaceLogger.error(
......@@ -174,6 +175,9 @@ def validate_tf_model(platform, device_type, model_file, input_file,
common.formatted_file_name(input_file, input_names[i]),
input_data_types[i])
input_value = input_value.reshape(input_shapes[i])
if input_data_formats[i] == common.DataFormat.NCHW and\
len(input_shapes[i]) == 4:
input_value = input_value.transpose((0, 2, 3, 1))
input_node = graph.get_tensor_by_name(
normalize_tf_tensor_name(input_names[i]))
input_dict[input_node] = input_value
......@@ -188,15 +192,20 @@ def validate_tf_model(platform, device_type, model_file, input_file,
output_file_name = common.formatted_file_name(
mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name)
if output_data_formats[i] == common.DataFormat.NCHW and\
len(output_shapes[i]) == 4:
mace_out_value = mace_out_value.\
reshape(output_shapes[i]).transpose((0, 2, 3, 1))
compare_output(platform, device_type, output_names[i],
mace_out_value, output_values[i],
validation_threshold, log_file)
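The validators convert between layouts with fixed axis permutations: NCHW to NHWC uses (0, 2, 3, 1) and NHWC to NCHW uses (0, 3, 1, 2). A small numpy sketch of both directions; the shape is illustrative only:

import numpy as np

# Illustrative layout round-trip used by the validators (shape is made up).
nhwc = np.zeros((1, 224, 224, 3))            # N, H, W, C
nchw = nhwc.transpose((0, 3, 1, 2))          # NHWC -> NCHW, shape (1, 3, 224, 224)
back = nchw.transpose((0, 2, 3, 1))          # NCHW -> NHWC, back to (1, 224, 224, 3)
assert back.shape == nhwc.shape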
def validate_caffe_model(platform, device_type, model_file, input_file,
mace_out_file, weight_file, input_names, input_shapes,
output_names, output_shapes, validation_threshold,
log_file):
mace_out_file, weight_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, log_file):
    os.environ['GLOG_minloglevel'] = '1'  # suppress Caffe verbose prints
import caffe
if not os.path.isfile(model_file):
......@@ -215,8 +224,10 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
for i in range(len(input_names)):
input_value = load_data(
common.formatted_file_name(input_file, input_names[i]))
input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
2))
input_value = input_value.reshape(input_shapes[i])
if input_data_formats[i] == common.DataFormat.NHWC and \
len(input_shapes[i]) == 4:
input_value = input_value.transpose((0, 3, 1, 2))
input_blob_name = input_names[i]
try:
if input_names[i] in net.top_names:
......@@ -232,22 +243,23 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
for i in range(len(output_names)):
value = net.blobs[output_names[i]].data
out_shape = output_shapes[i]
if len(out_shape) == 4:
out_shape[1], out_shape[2], out_shape[3] = \
out_shape[3], out_shape[1], out_shape[2]
value = value.reshape(out_shape).transpose((0, 2, 3, 1))
output_file_name = common.formatted_file_name(
mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name)
if output_data_formats[i] == common.DataFormat.NHWC and \
len(output_shapes[i]) == 4:
mace_out_value = mace_out_value.reshape(output_shapes[i])\
.transpose((0, 3, 1, 2))
compare_output(platform, device_type, output_names[i], mace_out_value,
value, validation_threshold, log_file)
def validate_onnx_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes,
output_names, output_shapes, validation_threshold,
input_data_types, backend, log_file):
def validate_onnx_model(platform, device_type, model_file,
input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, input_data_types,
backend, log_file):
import onnx
if backend == "tensorflow":
from onnx_tf.backend import prepare
......@@ -269,13 +281,16 @@ def validate_onnx_model(platform, device_type, model_file, input_file,
input_value = load_data(common.formatted_file_name(input_file,
input_names[i]),
input_data_types[i])
input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
2))
input_value = input_value.reshape(input_shapes[i])
if input_data_formats[i] == common.DataFormat.NHWC and \
len(input_shapes[i]) == 4:
input_value = input_value.transpose((0, 3, 1, 2))
input_dict[input_names[i]] = input_value
onnx_outputs = []
for i in range(len(output_names)):
out_shape = output_shapes[i]
if len(out_shape) == 4:
if output_data_formats[i] == common.DataFormat.NHWC and\
len(out_shape) == 4:
out_shape[1], out_shape[2], out_shape[3] = \
out_shape[3], out_shape[1], out_shape[2]
onnx_outputs.append(
......@@ -289,25 +304,32 @@ def validate_onnx_model(platform, device_type, model_file, input_file,
for i in range(len(output_names)):
out_name = output_names[i]
value = output_values[out_name].flatten()
out_shape = output_shapes[i]
if len(out_shape) == 4:
value = value.reshape(out_shape).transpose((0, 2, 3, 1))
output_file_name = common.formatted_file_name(mace_out_file,
output_names[i])
mace_out_value = load_data(output_file_name)
if output_data_formats[i] == common.DataFormat.NHWC and \
len(output_shapes[i]) == 4:
mace_out_value = mace_out_value.reshape(output_shapes[i]) \
.transpose((0, 3, 1, 2))
compare_output(platform, device_type, output_names[i],
mace_out_value, value,
validation_threshold, log_file)
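The in-place swap a few lines above turns an NHWC output shape into the NCHW shape expected by the ONNX runner. A worked example with assumed dimensions:

# Assumed dimensions, for illustration only.
out_shape = [1, 224, 224, 3]                 # N, H, W, C
out_shape[1], out_shape[2], out_shape[3] = \
    out_shape[3], out_shape[1], out_shape[2]
print(out_shape)                             # [1, 3, 224, 224] = N, C, H, W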
def validate(platform, model_file, weight_file, input_file, mace_out_file,
device_type, input_shape, output_shape, input_node, output_node,
device_type, input_shape, output_shape, input_data_format_str,
output_data_format_str, input_node, output_node,
validation_threshold, input_data_type, backend,
validation_outputs_data, log_file):
input_names = [name for name in input_node.split(',')]
input_shape_strs = [shape for shape in input_shape.split(':')]
input_shapes = [[int(x) for x in shape.split(',')]
for shape in input_shape_strs]
output_shape_strs = [shape for shape in output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
input_data_formats = [df for df in input_data_format_str.split(',')]
output_data_formats = [df for df in output_data_format_str.split(',')]
if input_data_type:
input_data_types = [data_type
for data_type in input_data_type.split(',')]
......@@ -323,32 +345,27 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file,
else:
validation_outputs = validation_outputs_data
if validation_outputs:
output_shape_strs = [shape for shape in output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
validate_with_file(platform, device_type, output_names, output_shapes,
mace_out_file, validation_outputs,
validation_threshold, log_file)
elif platform == 'tensorflow':
validate_tf_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes,
output_names, validation_threshold, input_data_types,
validate_tf_model(platform, device_type,
model_file, input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, input_data_types,
log_file)
elif platform == 'caffe':
output_shape_strs = [shape for shape in output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
validate_caffe_model(platform, device_type, model_file, input_file,
mace_out_file, weight_file, input_names,
input_shapes, output_names, output_shapes,
validate_caffe_model(platform, device_type, model_file,
input_file, mace_out_file, weight_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold, log_file)
elif platform == 'onnx':
output_shape_strs = [shape for shape in output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
validate_onnx_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes,
output_names, output_shapes,
validate_onnx_model(platform, device_type, model_file,
input_file, mace_out_file,
input_names, input_shapes, input_data_formats,
output_names, output_shapes, output_data_formats,
validation_threshold,
input_data_types, backend, log_file)
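validate() receives shapes as a colon-and-comma encoded string and data formats as a comma-separated string. A hedged decoding example with made-up flag values, following the same splitting as above:

# Made-up flag values, decoded the same way as in validate() above.
input_shape = "1,224,224,3:1,1,1,10"
input_data_format = "NHWC,NCHW"
input_shapes = [[int(x) for x in s.split(',')] for s in input_shape.split(':')]
input_data_formats = input_data_format.split(',')
# input_shapes       -> [[1, 224, 224, 3], [1, 1, 1, 10]]
# input_data_formats -> ['NHWC', 'NCHW']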
......@@ -379,8 +396,14 @@ def parse_args():
"--device_type", type=str, default="", help="mace runtime device.")
parser.add_argument(
"--input_shape", type=str, default="1,64,64,3", help="input shape.")
parser.add_argument(
"--input_data_format", type=str, default="NHWC",
help="input data format.")
parser.add_argument(
"--output_shape", type=str, default="1,64,64,2", help="output shape.")
parser.add_argument(
"--output_data_format", type=str, default="NHWC",
help="output data format.")
parser.add_argument(
"--input_node", type=str, default="input_node", help="input node")
parser.add_argument(
......@@ -417,6 +440,8 @@ if __name__ == '__main__':
FLAGS.device_type,
FLAGS.input_shape,
FLAGS.output_shape,
FLAGS.input_data_format,
FLAGS.output_data_format,
FLAGS.input_node,
FLAGS.output_node,
FLAGS.validation_threshold,
......