Commit 0caade30 authored by Bin Li

Add Reduce for quantized CPU and DSP

Parent c23719f2
- // Copyright 2018 Xiaomi, Inc. All rights reserved.
+ // Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
- // Copyright 2018 Xiaomi, Inc. All rights reserved.
+ // Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......
......@@ -73,6 +73,9 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
Simplify(input);
// Use the same scale and zero point for input and output.
output->SetScale(input->scale());
output->SetZeroPoint(input->zero_point());
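// Mean, min and max of uint8 values always fall inside the input's value
// range, so the input's quantization parameters stay valid for the output
// and no requantization pass is needed.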
output->Resize(out_shape_);
Compute(input, output);
return MaceStatus::MACE_SUCCESS;
......@@ -92,7 +95,8 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
axis_[i] + input->dim_size();
auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", DataFormat::DF_NONE));
- if (df == DataFormat::NHWC && input->dim_size() == 4) {
+ if (df == DataFormat::NHWC && DataTypeToEnum<T>::value != DT_UINT8
+     && input->dim_size() == 4) {
if (index == 1 || index == 2) index = index + 1;
else if (index == 3) index = 1;
}
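// Quantized (uint8) tensors are kept in NHWC, so the NCHW axis remapping
// above does not apply to them.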
......@@ -132,7 +136,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
}
- void compute_reduce_1(const T *input, ReduceType type, T *output) {
+ void Reduce1Dims(const T *input, ReduceType type, T *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
T tmp = 0;
......@@ -166,7 +170,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
}
- void compute_reduce_2(const T *input, ReduceType type, T *output) {
+ void Reduce2Dims(const T *input, ReduceType type, T *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
#pragma omp parallel for schedule(runtime)
......@@ -250,7 +254,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
}
- void compute_reduce_3(const T *input, ReduceType type, T *output) {
+ void Reduce3Dims(const T *input, ReduceType type, T *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
#pragma omp parallel for collapse(1) schedule(runtime)
......@@ -364,7 +368,7 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
}
- void compute_reduce_4(const T *input, ReduceType type, T *output) {
+ void Reduce4Dims(const T *input, ReduceType type, T *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
#pragma omp parallel for collapse(2) schedule(runtime)
......@@ -498,7 +502,6 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
}
}
void Compute(const Tensor *input, Tensor *output) {
Tensor::MappingGuard input_mapper(input);
const T *input_ptr = input->data<T>();
......@@ -507,16 +510,16 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
memset(output_ptr, 0, output->size() * sizeof(T));
switch (data_reshape_.size()) {
case 1:
- compute_reduce_1(input_ptr, reduce_type_, output_ptr);
+ Reduce1Dims(input_ptr, reduce_type_, output_ptr);
break;
case 2:
- compute_reduce_2(input_ptr, reduce_type_, output_ptr);
+ Reduce2Dims(input_ptr, reduce_type_, output_ptr);
break;
case 3:
- compute_reduce_3(input_ptr, reduce_type_, output_ptr);
+ Reduce3Dims(input_ptr, reduce_type_, output_ptr);
break;
case 4:
- compute_reduce_4(input_ptr, reduce_type_, output_ptr);
+ Reduce4Dims(input_ptr, reduce_type_, output_ptr);
break;
default:
MACE_CHECK(false, "not implemented in mace")
......@@ -532,6 +535,311 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
std::vector<index_t> out_shape_;
};
#ifdef MACE_ENABLE_QUANTIZE
template <>
void ReduceOp<DeviceType::CPU, uint8_t>::Reduce1Dims(
const uint8_t *input, ReduceType type, uint8_t *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
uint32_t tmp = 0;
for (int i = 0; i < data_reshape_[0]; ++i) {
tmp = tmp + input[i];
}
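// Adding half the divisor before dividing rounds the integer mean to the
// nearest value instead of truncating.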
output[0] = static_cast<uint8_t>(
(tmp + data_reshape_[0] / 2) / data_reshape_[0]);
} else if (type == ReduceType::MIN) {
uint8_t tmp = input[0];
for (int i = 1; i < data_reshape_[0]; ++i) {
tmp = std::min<uint8_t>(tmp, input[i]);
}
output[0] = tmp;
} else if (type == ReduceType::MAX) {
uint8_t tmp = input[0];
for (int i = 1; i < data_reshape_[0]; ++i) {
tmp = std::max<uint8_t>(tmp, input[i]);
}
output[0] = tmp;
} else {
MACE_NOT_IMPLEMENTED;
}
} else {
memcpy(output, input, data_reshape_[0] * sizeof(uint8_t));
}
}
template <>
void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
const uint8_t *input, ReduceType type, uint8_t *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
uint32_t tmp = 0;
for (int j = 0; j < data_reshape_[0]; ++j) {
tmp += input[j * data_reshape_[1] + i];
}
output[i] = static_cast<uint8_t>(
(tmp + data_reshape_[0] / 2) / data_reshape_[0]);
}
} else if (type == ReduceType::MIN) {
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
uint8_t tmp = input[i];
for (int j = 1; j < data_reshape_[0]; ++j) {
tmp = std::min(tmp, input[j * data_reshape_[1] + i]);
}
output[i] = tmp;
}
} else if (type == ReduceType::MAX) {
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
uint8_t tmp = input[i];
for (int j = 1; j < data_reshape_[0]; ++j) {
tmp = std::max(tmp, input[j * data_reshape_[1] + i]);
}
output[i] = tmp;
}
} else {
MACE_NOT_IMPLEMENTED;
}
} else {
if (type == ReduceType::MEAN) {
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
uint32_t tmp = 0;
for (int j = 0; j < data_reshape_[1]; ++j) {
tmp += input[i * data_reshape_[1] + j];
}
output[i] = static_cast<uint8_t>(
(tmp + data_reshape_[1] / 2) / data_reshape_[1]);
}
} else if (type == ReduceType::MIN) {
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
uint8_t tmp = input[i * data_reshape_[1]];
for (int j = 1; j < data_reshape_[1]; ++j) {
tmp = std::min(tmp, input[i * data_reshape_[1] + j]);
}
output[i] = tmp;
}
} else if (type == ReduceType::MAX) {
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
uint8_t tmp = input[i * data_reshape_[1]];
for (int j = 1; j < data_reshape_[1]; ++j) {
tmp = std::max(tmp, input[i * data_reshape_[1] + j]);
}
output[i] = tmp;
}
} else {
MACE_NOT_IMPLEMENTED;
}
}
}
template <>
void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
const uint8_t *input, ReduceType type, uint8_t *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
#pragma omp parallel for collapse(1) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
uint32_t tmp = 0;
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
}
}
index_t dim = data_reshape_[0] * data_reshape_[2];
output[i] = static_cast<uint8_t>((tmp + dim / 2) / dim);
}
} else if (type == ReduceType::MIN) {
#pragma omp parallel for collapse(1) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
uint8_t tmp = input[i * data_reshape_[2]];
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
tmp = std::min(tmp,
input[(k * data_reshape_[1] + i) * data_reshape_[2]
+ j]);
}
}
output[i] = tmp;
}
} else if (type == ReduceType::MAX) {
#pragma omp parallel for collapse(1) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
uint8_t tmp = input[i * data_reshape_[2]];
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
tmp =
std::max(tmp,
input[(k * data_reshape_[1] + i)
* data_reshape_[2] + j]);
}
}
output[i] = tmp;
}
} else {
MACE_NOT_IMPLEMENTED;
}
} else {
if (type == ReduceType::MEAN) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
uint32_t tmp = 0;
for (int k = 0; k < data_reshape_[1]; ++k) {
tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j];
}
output[i * data_reshape_[2] + j] =
static_cast<uint8_t>((tmp + data_reshape_[1] / 2) /
data_reshape_[1]);
}
}
} else if (type == ReduceType::MIN) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j];
for (int k = 1; k < data_reshape_[1]; ++k) {
tmp = std::min(tmp,
input[(i * data_reshape_[1] + k) *
data_reshape_[2] + j]);
}
output[i * data_reshape_[2] + j] = tmp;
}
}
} else if (type == ReduceType::MAX) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j];
for (int k = 1; k < data_reshape_[1]; ++k) {
tmp = std::max(tmp,
input[(i * data_reshape_[1] + k) *
data_reshape_[2] + j]);
}
output[i * data_reshape_[2] + j] = tmp;
}
}
} else {
MACE_NOT_IMPLEMENTED;
}
}
}
template <>
void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
const uint8_t *input, ReduceType type, uint8_t *output) {
if (reduce_first_axis_) {
if (type == ReduceType::MEAN) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[3]; ++j) {
uint32_t tmp = 0;
for (int k = 0; k < data_reshape_[2]; ++k) {
for (int t = 0; t < data_reshape_[0]; ++t) {
tmp += input[((t * data_reshape_[1] + i) *
data_reshape_[2] + k)*data_reshape_[3] + j];
}
}
index_t dim = data_reshape_[0] * data_reshape_[2];
output[i * data_reshape_[3] + j] =
static_cast<uint8_t>((tmp + dim / 2) / dim);
}
}
} else if (type == ReduceType::MIN) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[3]; ++j) {
uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j];
for (int k = 0; k < data_reshape_[2]; ++k) {
for (int t = 0; t < data_reshape_[0]; ++t) {
tmp = std::min(tmp,
input[((t * data_reshape_[1] + i) *
data_reshape_[2] + k)*data_reshape_[3] + j]);
}
}
output[i * data_reshape_[3] + j] = tmp;
}
}
} else if (type == ReduceType::MAX) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[3]; ++j) {
uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j];
for (int k = 0; k < data_reshape_[2]; ++k) {
for (int t = 0; t < data_reshape_[0]; ++t) {
tmp = std::max(tmp,
input[((t * data_reshape_[1] + i) *
data_reshape_[2] + k)*data_reshape_[3] + j]);
}
}
output[i * data_reshape_[3] + j] = tmp;
}
}
} else {
MACE_NOT_IMPLEMENTED;
}
} else {
if (type == ReduceType::MEAN) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
uint32_t tmp = 0;
for (int k = 0; k < data_reshape_[1]; ++k) {
for (int t = 0; t < data_reshape_[3]; ++t) {
tmp += input[((i * data_reshape_[1] + k) *
data_reshape_[2] + j)*data_reshape_[3] + t];
}
}
index_t dim = data_reshape_[1] * data_reshape_[3];
output[i * data_reshape_[2] + j] =
static_cast<uint8_t>((tmp + dim / 2) / dim);
}
}
} else if (type == ReduceType::MIN) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
uint8_t tmp = input[(i * data_reshape_[1] *
data_reshape_[2] + j)*data_reshape_[3]];
for (int k = 0; k < data_reshape_[1]; ++k) {
for (int t = 0; t < data_reshape_[3]; ++t) {
tmp =
std::min(tmp,
input[((i * data_reshape_[1] + k) *
data_reshape_[2] + j)*data_reshape_[3] + t]);
}
}
output[i * data_reshape_[2] + j] = tmp;
}
}
} else if (type == ReduceType::MAX) {
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
uint8_t tmp = input[(i * data_reshape_[1] *
data_reshape_[2] + j)*data_reshape_[3]];
for (int k = 0; k < data_reshape_[1]; ++k) {
for (int t = 0; t < data_reshape_[3]; ++t) {
tmp =
std::max(tmp,
input[((i * data_reshape_[1] + k) *
data_reshape_[2] + j)*data_reshape_[3] + t]);
}
}
output[i * data_reshape_[2] + j] = tmp;
}
}
} else {
MACE_NOT_IMPLEMENTED;
}
}
}
#endif // MACE_ENABLE_QUANTIZE
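For reference, a minimal standalone sketch (not part of this commit) of the round-to-nearest integer division used by the quantized MEAN specializations above; the helper name rounded_mean_u8 is hypothetical:

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical helper mirroring the quantized MEAN reduction: accumulate
// in 32 bits, then add half the divisor so the division rounds to the
// nearest integer instead of truncating. Assumes v is non-empty.
static uint8_t rounded_mean_u8(const std::vector<uint8_t> &v) {
  uint32_t sum = 0;
  for (uint8_t x : v) sum += x;
  return static_cast<uint8_t>((sum + v.size() / 2) / v.size());
}

int main() {
  const std::vector<uint8_t> v = {1, 2, 2};  // exact mean is 5/3 ~= 1.67
  std::printf("truncating: %u\n", 5u / 3u);                                    // 1
  std::printf("rounding:   %u\n", static_cast<unsigned>(rounded_mean_u8(v)));  // 2
  return 0;
}

Because the output reuses the input's scale and zero point, this integer rounding is the only extra quantization error the MEAN path introduces (at most half a quantization step).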
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
......@@ -562,7 +870,10 @@ class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, float);
......
......@@ -644,6 +644,89 @@ TEST_F(ReduceOpTest, GPURandomHalf) {
RandomTest<DeviceType::GPU, half>({1, 511, 561, 11}, {1, 2});
}
namespace {
void TestQuant(const std::vector<index_t> &input_shape,
const std::vector<int> &axis) {
auto func = [&](ReduceType type) {
OpsTestNet net;
net.AddRandomInput<CPU, float>(
"Input", input_shape, false, false);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
net.AddRandomInput<DeviceType::CPU, float>(
"OutputNCHW", input_shape, false, true, true);
OpDefBuilder("Reduce", "ReduceTest")
.Input("InputNCHW")
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", 1)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.Output("OutputNCHW")
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
net.AddRandomInput<DeviceType::CPU, uint8_t>("QuantizedOutput",
input_shape);
OpDefBuilder("Reduce", "ReduceTest")
.Input("QuantizedInput")
.Output("QuantizedOutput")
.AddIntsArg("axis", axis)
.AddIntArg("keepdims", 1)
.AddIntArg("reduce_type", type)
.AddIntArg("data_format", DataFormat::NHWC)
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("DequantizedOutput"), 0.01);
};
for (ReduceType type : {MEAN, MIN, MAX}) {
func(type);
}
}
} // namespace
TEST_F(ReduceOpTest, Quant) {
// reduce 1, first axis
TestQuant({1, 1, 3, 4}, {2, 3});
// reduce 2, first axis
TestQuant({1, 4, 4, 320}, {1, 2});
// reduce 2, not first axis
TestQuant({16, 320, 4, 4}, {2, 3});
// reduce 3, first axis
TestQuant({1, 4, 323, 4}, {1, 3});
// reduce 3, not first axis
TestQuant({15, 117, 15, 32}, {2});
// reduce 4, first axis
TestQuant({4, 323, 4, 4}, {0, 2});
// reduce 4, not first axis
TestQuant({32, 4, 323, 16}, {1, 3});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,6 +25,7 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import PoolingType
from mace.python.tools.converter_tool.base_converter import ReduceType
from mace.python.tools.convert_util import mace_check
from mace.python.tools import graph_util
......@@ -63,6 +64,7 @@ class HexagonOps(object):
MaceOp.Quantize.name: HexagonOp.QuantizeINPUT_f_to_8.name,
MaceOp.Pooling.name: [HexagonOp.QuantizedAvgPool_8.name,
HexagonOp.QuantizedMaxPool_8.name],
MaceOp.Reduce.name: HexagonOp.QuantizedAvgPool_8.name,
MaceOp.ResizeBilinear.name:
HexagonOp.QuantizedResizeBilinear_8.name,
MaceOp.SpaceToBatchND.name: HexagonOp.SpaceToBatchND_8.name,
......@@ -222,6 +224,43 @@ class HexagonConverter(base_converter.ConverterInterface):
strides_tensor.dims.extend(
[1, strides_arg.ints[0], strides_arg.ints[1], 1])
op.input.extend([window_tensor.name, strides_tensor.name])
elif op.type == MaceOp.Reduce.name:
self.add_min_max_const_node(op, op.input[0])
reduce_type_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_reduce_type_str)
mace_check(reduce_type_arg.i == ReduceType.MEAN.value,
"Hexagon Reduce only supports Mean now.")
keep_dims_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_keepdims_str)
mace_check(keep_dims_arg.i == 1,
"Hexagon Reduce Mean only supports keep dims now.")
axis_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str)
mace_check(1 <= len(axis_arg.ints) <= 2,
"Hexagon Reduce Mean only supports spatial now.")
for i in axis_arg.ints:
mace_check(1 <= i <= 2,
"Hexagon Reduce Mean only supports spatial now")
producer_op_name, _ = get_op_and_port_from_tensor(op.input[0])
input_dims = None
for producer_op in self._model.op:
if producer_op.name == producer_op_name:
input_dims = producer_op.output_shape[0].dims
break
mace_check(input_dims is not None, "Missing input shape.")
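# A spatial mean is lowered to quantized average pooling: the window
# covers the reduced spatial dims and the strides equal the window, so a
# single pooled value is produced per reduced region.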
window_tensor = self._model.tensors.add()
window_tensor.name = op.name + '/window:0'
window_tensor.data_type = mace_pb2.DT_INT32
if len(axis_arg.ints) == 1:
dim1, dim2 = (input_dims[1], 1) \
if axis_arg.ints[0] == 1 else (1, input_dims[2])
else:
dim1, dim2 = input_dims[1], input_dims[2]
window_tensor.dims.extend([1, dim1, dim2, 1])
strides_tensor = self._model.tensors.add()
strides_tensor.name = op.name + '/strides:0'
strides_tensor.data_type = mace_pb2.DT_INT32
strides_tensor.dims.extend([1, dim1, dim2, 1])
op.input.extend([window_tensor.name, strides_tensor.name])
elif op.type == MaceOp.ResizeBilinear.name:
newdim_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_resize_size_str)
......
......@@ -113,7 +113,6 @@ class Transformer(base_converter.ConverterInterface):
self._consts = {}
self._consumers = {}
self._producer = {}
- self._target_data_format = DataFormat.NHWC
self._quantize_activation_info = {}
self._quantized_tensor = set()
......@@ -996,8 +995,7 @@ class Transformer(base_converter.ConverterInterface):
if arg.name == MaceKeyword.mace_paddings_str:
mace_check(len(arg.ints) == 8,
"pad dim rank should be 8.")
- if ConverterUtil.data_format(op) == DataFormat.NCHW \
-         and self._target_data_format == DataFormat.NHWC:  # noqa
+ if ConverterUtil.data_format(op) == DataFormat.NCHW:
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints,
......@@ -1006,7 +1004,6 @@ class Transformer(base_converter.ConverterInterface):
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if (ConverterUtil.data_format(op) == DataFormat.NCHW
-         and self._target_data_format == DataFormat.NHWC
and len(op.output_shape[0].dims) == 4):
print("Transpose concat/split args: %s(%s)"
% (op.name, op.type))
......@@ -1023,8 +1020,7 @@ class Transformer(base_converter.ConverterInterface):
len(input_shape) == 2:
axis_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_axis_str)
- if axis_arg.i == 1 \
-         and self._target_data_format == DataFormat.NHWC:  # noqa
+ if axis_arg.i == 1:
axis_arg.i = 3
elif op.type == MaceOp.Squeeze.name:
......@@ -1041,8 +1037,7 @@ class Transformer(base_converter.ConverterInterface):
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if ConverterUtil.data_format(
-         op) == DataFormat.NCHW \
-         and self._target_data_format == DataFormat.NHWC:  # noqa
+         op) == DataFormat.NCHW:
print("Transpose reduce args: %s(%s)"
% (op.name, op.type))
reduce_axises = list(arg.ints)
......@@ -1062,15 +1057,12 @@ class Transformer(base_converter.ConverterInterface):
# transpose op output shape
data_format = ConverterUtil.data_format(op)
if data_format is not None \
-         and data_format != self._target_data_format:
+         and data_format != DataFormat.NHWC:
print("Transpose output shapes: %s(%s)" % (op.name, op.type))
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 2, 3, 1])
- ConverterUtil.get_arg(op,
-                       MaceKeyword.mace_data_format_str).i = \
-     self._target_data_format.value
return False
......@@ -1683,6 +1675,7 @@ class Transformer(base_converter.ConverterInterface):
print("Add default quantize info for ops like Pooling, Softmax")
for op in self._model.op:
if op.type in [MaceOp.Pooling.name,
MaceOp.Reduce.name,
MaceOp.Squeeze.name,
MaceOp.Reshape.name,
MaceOp.ResizeBilinear.name,
......