diff --git a/mace/ops/quantization_util.cc b/mace/ops/quantization_util.cc
index d34e77455b7389ff0bfc30fe85196d5128a6991d..9df5c6fdf2b0e87a08088799793941f47eb6f922 100644
--- a/mace/ops/quantization_util.cc
+++ b/mace/ops/quantization_util.cc
@@ -1,4 +1,4 @@
-// Copyright 2018 Xiaomi, Inc. All rights reserved.
+// Copyright 2018 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/quantization_util.h b/mace/ops/quantization_util.h
index 2e8806efc67b173d7c5845ecf3c244ba5fca0579..3e6beeb0eea9439b54e8d9f90ecd8b3a74ac675b 100644
--- a/mace/ops/quantization_util.h
+++ b/mace/ops/quantization_util.h
@@ -1,4 +1,4 @@
-// Copyright 2018 Xiaomi, Inc. All rights reserved.
+// Copyright 2018 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc
index 88e909c0f5a52705b2d3a9a486ca4c5445bbf91f..f4a147cc7b8191f5323cf38acd532830a44948c9 100644
--- a/mace/ops/reduce.cc
+++ b/mace/ops/reduce.cc
@@ -73,6 +73,9 @@ class ReduceOp : public ReduceOpBase {
     const Tensor *input = this->Input(0);
     Tensor *output = this->Output(0);
     Simplify(input);
+    // Use the same scale and zero point as the input for the output.
+    output->SetScale(input->scale());
+    output->SetZeroPoint(input->zero_point());
     output->Resize(out_shape_);
     Compute(input, output);
     return MaceStatus::MACE_SUCCESS;
@@ -92,7 +95,8 @@ class ReduceOp : public ReduceOpBase {
                         axis_[i] + input->dim_size();
       auto df = static_cast<DataFormat>(Operation::GetOptionalArg<int>(
           "data_format", DataFormat::DF_NONE));
-      if (df == DataFormat::NHWC && input->dim_size() == 4) {
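+      // Quantized (uint8) tensors stay in NHWC layout, so their axes
+      // need no NCHW remapping.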
+      if (df == DataFormat::NHWC && DataTypeToEnum<T>::value != DT_UINT8
+          && input->dim_size() == 4) {
         if (index == 1 || index == 2) index = index + 1;
         else if (index == 3) index = 1;
       }
@@ -132,7 +136,7 @@ class ReduceOp : public ReduceOpBase {
     }
   }
 
-  void compute_reduce_1(const T *input, ReduceType type, T *output) {
+  void Reduce1Dims(const T *input, ReduceType type, T *output) {
     if (reduce_first_axis_) {
       if (type == ReduceType::MEAN) {
         T tmp = 0;
@@ -166,7 +170,7 @@ class ReduceOp : public ReduceOpBase {
     }
   }
 
-  void compute_reduce_2(const T *input, ReduceType type, T *output) {
+  void Reduce2Dims(const T *input, ReduceType type, T *output) {
     if (reduce_first_axis_) {
       if (type == ReduceType::MEAN) {
 #pragma omp parallel for schedule(runtime)
@@ -250,7 +254,7 @@ class ReduceOp : public ReduceOpBase {
     }
   }
 
-  void compute_reduce_3(const T *input, ReduceType type, T *output) {
+  void Reduce3Dims(const T *input, ReduceType type, T *output) {
     if (reduce_first_axis_) {
       if (type == ReduceType::MEAN) {
 #pragma omp parallel for collapse(1) schedule(runtime)
@@ -364,7 +368,7 @@ class ReduceOp : public ReduceOpBase {
     }
   }
 
-  void compute_reduce_4(const T *input, ReduceType type, T *output) {
+  void Reduce4Dims(const T *input, ReduceType type, T *output) {
     if (reduce_first_axis_) {
       if (type == ReduceType::MEAN) {
 #pragma omp parallel for collapse(2) schedule(runtime)
@@ -498,7 +502,6 @@ class ReduceOp : public ReduceOpBase {
     }
   }
 
-
   void Compute(const Tensor *input, Tensor *output) {
     Tensor::MappingGuard input_mapper(input);
     const T *input_ptr = input->data<T>();
@@ -507,16 +510,16 @@ class ReduceOp : public ReduceOpBase {
     memset(output_ptr, 0, output->size() * sizeof(T));
     switch (data_reshape_.size()) {
       case 1:
-        compute_reduce_1(input_ptr, reduce_type_, output_ptr);
+        Reduce1Dims(input_ptr, reduce_type_, output_ptr);
         break;
       case 2:
-        compute_reduce_2(input_ptr, reduce_type_, output_ptr);
+        Reduce2Dims(input_ptr, reduce_type_, output_ptr);
         break;
       case 3:
-        compute_reduce_3(input_ptr, reduce_type_, output_ptr);
+        Reduce3Dims(input_ptr, reduce_type_, output_ptr);
         break;
       case 4:
-        compute_reduce_4(input_ptr, reduce_type_, output_ptr);
+        Reduce4Dims(input_ptr, reduce_type_, output_ptr);
         break;
       default:
         MACE_CHECK(false, "not implemented in mace")
@@ -532,6 +535,311 @@ class ReduceOp : public ReduceOpBase {
   std::vector<index_t> out_shape_;
 };
 
+#ifdef MACE_ENABLE_QUANTIZE
+template <>
+void ReduceOp<DeviceType::CPU, uint8_t>::Reduce1Dims(
+    const uint8_t *input, ReduceType type, uint8_t *output) {
+  if (reduce_first_axis_) {
+    if (type == ReduceType::MEAN) {
+      uint32_t tmp = 0;
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        tmp = tmp + input[i];
+      }
+      output[0] = static_cast<uint8_t>(
+          (tmp + data_reshape_[0] / 2) / data_reshape_[0]);
+    } else if (type == ReduceType::MIN) {
+      uint8_t tmp = input[0];
+      for (int i = 1; i < data_reshape_[0]; ++i) {
+        tmp = std::min(tmp, input[i]);
+      }
+      output[0] = tmp;
+    } else if (type == ReduceType::MAX) {
+      uint8_t tmp = input[0];
+      for (int i = 1; i < data_reshape_[0]; ++i) {
+        tmp = std::max(tmp, input[i]);
+      }
+      output[0] = tmp;
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  } else {
+    memcpy(output, input, data_reshape_[0] * sizeof(uint8_t));
+  }
+}
+
+template <>
+void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
+    const uint8_t *input, ReduceType type, uint8_t *output) {
+  if (reduce_first_axis_) {
+    if (type == ReduceType::MEAN) {
+#pragma omp parallel for schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        uint32_t tmp = 0;
+        for (int j = 0; j < data_reshape_[0]; ++j) {
+          tmp += input[j * data_reshape_[1] + i];
+        }
+        output[i] = static_cast<uint8_t>(
+            (tmp + data_reshape_[0] / 2) / data_reshape_[0]);
+      }
+    } else if (type == ReduceType::MIN) {
+#pragma omp parallel for schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        uint8_t tmp = input[i];
+        for (int j = 1; j < data_reshape_[0]; ++j) {
+          tmp = std::min(tmp, input[j * data_reshape_[1] + i]);
+        }
+        output[i] = tmp;
+      }
+    } else if (type == ReduceType::MAX) {
+#pragma omp parallel for schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        uint8_t tmp = input[i];
+        for (int j = 1; j < data_reshape_[0]; ++j) {
+          tmp = std::max(tmp, input[j * data_reshape_[1] + i]);
+        }
+        output[i] = tmp;
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  } else {
+    if (type == ReduceType::MEAN) {
+#pragma omp parallel for schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        uint32_t tmp = 0;
+        for (int j = 0; j < data_reshape_[1]; ++j) {
+          tmp += input[i * data_reshape_[1] + j];
+        }
+        output[i] = static_cast<uint8_t>(
+            (tmp + data_reshape_[1] / 2) / data_reshape_[1]);
+      }
+    } else if (type == ReduceType::MIN) {
+#pragma omp parallel for schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        uint8_t tmp = input[i * data_reshape_[1]];
+        for (int j = 1; j < data_reshape_[1]; ++j) {
+          tmp = std::min(tmp, input[i * data_reshape_[1] + j]);
+        }
+        output[i] = tmp;
+      }
+    } else if (type == ReduceType::MAX) {
+#pragma omp parallel for schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        uint8_t tmp = input[i * data_reshape_[1]];
+        for (int j = 1; j < data_reshape_[1]; ++j) {
+          tmp = std::max(tmp, input[i * data_reshape_[1] + j]);
+        }
+        output[i] = tmp;
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+}
+
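+// For the 3-D reshaped case, reduce_first_axis_ means axes 0 and 2 are
+// reduced; otherwise only axis 1 is. MEAN accumulates into uint32 and
+// rounds the final integer division.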
+template <>
+void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
+    const uint8_t *input, ReduceType type, uint8_t *output) {
+  if (reduce_first_axis_) {
+    if (type == ReduceType::MEAN) {
+#pragma omp parallel for collapse(1) schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        uint32_t tmp = 0;
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          for (int k = 0; k < data_reshape_[0]; ++k) {
+            tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
+          }
+        }
+        index_t dim = data_reshape_[0] * data_reshape_[2];
+        output[i] = static_cast<uint8_t>((tmp + dim / 2) / dim);
+      }
+    } else if (type == ReduceType::MIN) {
+#pragma omp parallel for collapse(1) schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        uint8_t tmp = input[i * data_reshape_[2]];
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          for (int k = 0; k < data_reshape_[0]; ++k) {
+            tmp = std::min(tmp,
+                           input[(k * data_reshape_[1] + i) * data_reshape_[2]
+                               + j]);
+          }
+        }
+        output[i] = tmp;
+      }
+    } else if (type == ReduceType::MAX) {
+#pragma omp parallel for collapse(1) schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        uint8_t tmp = input[i * data_reshape_[2]];
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          for (int k = 0; k < data_reshape_[0]; ++k) {
+            tmp =
+                std::max(tmp,
+                         input[(k * data_reshape_[1] + i)
+                             * data_reshape_[2] + j]);
+          }
+        }
+        output[i] = tmp;
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  } else {
+    if (type == ReduceType::MEAN) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          uint32_t tmp = 0;
+          for (int k = 0; k < data_reshape_[1]; ++k) {
+            tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j];
+          }
+          output[i * data_reshape_[2] + j] =
+              static_cast<uint8_t>((tmp + data_reshape_[1] / 2) /
+                  data_reshape_[1]);
+        }
+      }
+    } else if (type == ReduceType::MIN) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j];
+          for (int k = 1; k < data_reshape_[1]; ++k) {
+            tmp = std::min(tmp,
+                           input[(i * data_reshape_[1] + k) *
+                               data_reshape_[2] + j]);
+          }
+          output[i * data_reshape_[2] + j] = tmp;
+        }
+      }
+    } else if (type == ReduceType::MAX) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          uint8_t tmp = input[i * data_reshape_[1] * data_reshape_[2] + j];
+          for (int k = 1; k < data_reshape_[1]; ++k) {
+            tmp = std::max(tmp,
+                           input[(i * data_reshape_[1] + k) *
+                               data_reshape_[2] + j]);
+          }
+          output[i * data_reshape_[2] + j] = tmp;
+        }
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+}
+
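+// For the 4-D reshaped case, reduce_first_axis_ selects axes {0, 2};
+// otherwise axes {1, 3} are reduced.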
+template <>
+void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
+    const uint8_t *input, ReduceType type, uint8_t *output) {
+  if (reduce_first_axis_) {
+    if (type == ReduceType::MEAN) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        for (int j = 0; j < data_reshape_[3]; ++j) {
+          uint32_t tmp = 0;
+          for (int k = 0; k < data_reshape_[2]; ++k) {
+            for (int t = 0; t < data_reshape_[0]; ++t) {
+              tmp += input[((t * data_reshape_[1] + i) *
+                  data_reshape_[2] + k) * data_reshape_[3] + j];
+            }
+          }
+          index_t dim = data_reshape_[0] * data_reshape_[2];
+          output[i * data_reshape_[3] + j] =
+              static_cast<uint8_t>((tmp + dim / 2) / dim);
+        }
+      }
+    } else if (type == ReduceType::MIN) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        for (int j = 0; j < data_reshape_[3]; ++j) {
+          uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j];
+          for (int k = 0; k < data_reshape_[2]; ++k) {
+            for (int t = 0; t < data_reshape_[0]; ++t) {
+              tmp = std::min(tmp,
+                             input[((t * data_reshape_[1] + i) *
+                                 data_reshape_[2] + k) * data_reshape_[3] + j]);
+            }
+          }
+          output[i * data_reshape_[3] + j] = tmp;
+        }
+      }
+    } else if (type == ReduceType::MAX) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[1]; ++i) {
+        for (int j = 0; j < data_reshape_[3]; ++j) {
+          uint8_t tmp = input[i * data_reshape_[2] * data_reshape_[3] + j];
+          for (int k = 0; k < data_reshape_[2]; ++k) {
+            for (int t = 0; t < data_reshape_[0]; ++t) {
+              tmp = std::max(tmp,
+                             input[((t * data_reshape_[1] + i) *
+                                 data_reshape_[2] + k) * data_reshape_[3] + j]);
+            }
+          }
+          output[i * data_reshape_[3] + j] = tmp;
+        }
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  } else {
+    if (type == ReduceType::MEAN) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          uint32_t tmp = 0;
+          for (int k = 0; k < data_reshape_[1]; ++k) {
+            for (int t = 0; t < data_reshape_[3]; ++t) {
+              tmp += input[((i * data_reshape_[1] + k) *
+                  data_reshape_[2] + j) * data_reshape_[3] + t];
+            }
+          }
+          index_t dim = data_reshape_[1] * data_reshape_[3];
+          output[i * data_reshape_[2] + j] =
+              static_cast<uint8_t>((tmp + dim / 2) / dim);
+        }
+      }
+    } else if (type == ReduceType::MIN) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          uint8_t tmp = input[(i * data_reshape_[1] *
+              data_reshape_[2] + j) * data_reshape_[3]];
+          for (int k = 0; k < data_reshape_[1]; ++k) {
+            for (int t = 0; t < data_reshape_[3]; ++t) {
+              tmp =
+                  std::min(tmp,
+                           input[((i * data_reshape_[1] + k) *
+                               data_reshape_[2] + j) * data_reshape_[3] + t]);
+            }
+          }
+          output[i * data_reshape_[2] + j] = tmp;
+        }
+      }
+    } else if (type == ReduceType::MAX) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        for (int j = 0; j < data_reshape_[2]; ++j) {
+          uint8_t tmp = input[(i * data_reshape_[1] *
+              data_reshape_[2] + j) * data_reshape_[3]];
+          for (int k = 0; k < data_reshape_[1]; ++k) {
+            for (int t = 0; t < data_reshape_[3]; ++t) {
+              tmp =
+                  std::max(tmp,
+                           input[((i * data_reshape_[1] + k) *
+                               data_reshape_[2] + j) * data_reshape_[3] + t]);
+            }
+          }
+          output[i * data_reshape_[2] + j] = tmp;
+        }
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+}
+#endif  // MACE_ENABLE_QUANTIZE
+
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
@@ -562,7 +870,10 @@ class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
 void RegisterReduce(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
                    DeviceType::CPU, float);
-
+#ifdef MACE_ENABLE_QUANTIZE
+  MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
+                   DeviceType::CPU, uint8_t);
+#endif  // MACE_ENABLE_QUANTIZE
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
                    DeviceType::GPU, float);
diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc
index 62d6cf4c23b96a508eef9c98ff2f61ddecd7904e..78a9f9345a8ca4da9eae0a0beedcb8dd1fbed49c 100644
--- a/mace/ops/reduce_test.cc
+++ b/mace/ops/reduce_test.cc
@@ -644,6 +644,89 @@ TEST_F(ReduceOpTest, GPURandomHalf) {
   RandomTest<DeviceType::GPU, half>({1, 511, 561, 11}, {1, 2});
 }
 
+namespace {
+
+void TestQuant(const std::vector<index_t> &input_shape,
+               const std::vector<int> &axis) {
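+  // Runs the float Reduce as a reference, then the uint8 path on the
+  // quantized input, and checks the dequantized result against it.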
+  auto func = [&](ReduceType type) {
+    OpsTestNet net;
+    net.AddRandomInput<DeviceType::CPU, float>(
+        "Input", input_shape, false, false);
+    net.TransformDataFormat<DeviceType::CPU, float>(
+        "Input", NHWC, "InputNCHW", NCHW);
+    net.AddRandomInput<DeviceType::CPU, float>(
+        "OutputNCHW", input_shape, false, true, true);
+
+    OpDefBuilder("Reduce", "ReduceTest")
+        .Input("InputNCHW")
+        .AddIntsArg("axis", axis)
+        .AddIntArg("keepdims", 1)
+        .AddIntArg("reduce_type", type)
+        .AddIntArg("data_format", DataFormat::NHWC)
+        .Output("OutputNCHW")
+        .AddIntArg("T", DT_FLOAT)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(CPU);
+    net.TransformDataFormat<DeviceType::CPU, float>(
+        "OutputNCHW", NCHW, "Output", NHWC);
+
+    OpDefBuilder("Quantize", "QuantizeInput")
+        .Input("Input")
+        .Output("QuantizedInput")
+        .OutputType({DT_UINT8})
+        .AddIntArg("T", DT_UINT8)
+        .AddIntArg("non_zero", true)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
+    net.AddRandomInput<DeviceType::CPU, uint8_t>("QuantizedOutput",
+                                                 input_shape);
+    OpDefBuilder("Reduce", "ReduceTest")
+        .Input("QuantizedInput")
+        .Output("QuantizedOutput")
+        .AddIntsArg("axis", axis)
+        .AddIntArg("keepdims", 1)
+        .AddIntArg("reduce_type", type)
+        .AddIntArg("data_format", DataFormat::NHWC)
+        .AddIntArg("T", DT_UINT8)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
+    OpDefBuilder("Dequantize", "DeQuantizeTest")
+        .Input("QuantizedOutput")
+        .Output("DequantizedOutput")
+        .OutputType({DT_FLOAT})
+        .AddIntArg("T", DT_UINT8)
+        .Finalize(net.NewOperatorDef());
+    net.RunOp();
+
+    // Check
+    ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                               *net.GetTensor("DequantizedOutput"), 0.01);
+  };
+
+  for (ReduceType type : {MEAN, MIN, MAX}) {
+    func(type);
+  }
+}
+}  // namespace
+
+TEST_F(ReduceOpTest, Quant) {
+  // reduce 1, first axis
+  TestQuant({1, 1, 3, 4}, {2, 3});
+  // reduce 2, first axis
+  TestQuant({1, 4, 4, 320}, {1, 2});
+  // reduce 2, not first axis
+  TestQuant({16, 320, 4, 4}, {2, 3});
+  // reduce 3, first axis
+  TestQuant({1, 4, 323, 4}, {1, 3});
+  // reduce 3, not first axis
+  TestQuant({15, 117, 15, 32}, {2});
+  // reduce 4, first axis
+  TestQuant({4, 323, 4, 4}, {0, 2});
+  // reduce 4, not first axis
+  TestQuant({32, 4, 323, 16}, {1, 3});
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/python/tools/converter_tool/hexagon_converter.py b/mace/python/tools/converter_tool/hexagon_converter.py
index f37ab7ba15027cc021d55d381352c6ec436ddfb3..c3e590f658b6b24227d2cfb8d4e01bf8b60f30e3 100644
--- a/mace/python/tools/converter_tool/hexagon_converter.py
+++ b/mace/python/tools/converter_tool/hexagon_converter.py
@@ -25,6 +25,7 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
 from mace.python.tools.converter_tool.base_converter import PoolingType
+from mace.python.tools.converter_tool.base_converter import ReduceType
 from mace.python.tools.convert_util import mace_check
 from mace.python.tools import graph_util
 
@@ -63,6 +64,7 @@ class HexagonOps(object):
             MaceOp.Quantize.name: HexagonOp.QuantizeINPUT_f_to_8.name,
             MaceOp.Pooling.name: [HexagonOp.QuantizedAvgPool_8.name,
                                   HexagonOp.QuantizedMaxPool_8.name],
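+            # Reduce(MEAN) lowers to a quantized average pool whose window
+            # covers the reduced spatial dimensions.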
+            MaceOp.Reduce.name: HexagonOp.QuantizedAvgPool_8.name,
             MaceOp.ResizeBilinear.name:
                 HexagonOp.QuantizedResizeBilinear_8.name,
             MaceOp.SpaceToBatchND.name: HexagonOp.SpaceToBatchND_8.name,
@@ -222,6 +224,43 @@ class HexagonConverter(base_converter.ConverterInterface):
                 strides_tensor.dims.extend(
                     [1, strides_arg.ints[0], strides_arg.ints[1], 1])
                 op.input.extend([window_tensor.name, strides_tensor.name])
+            elif op.type == MaceOp.Reduce.name:
+                self.add_min_max_const_node(op, op.input[0])
+                reduce_type_arg = ConverterUtil.get_arg(
+                    op, MaceKeyword.mace_reduce_type_str)
+                mace_check(reduce_type_arg.i == ReduceType.MEAN.value,
+                           "Hexagon Reduce only supports Mean now.")
+                keep_dims_arg = ConverterUtil.get_arg(
+                    op, MaceKeyword.mace_keepdims_str)
+                mace_check(keep_dims_arg.i == 1,
+                           "Hexagon Reduce Mean only supports keep dims now.")
+                axis_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str)
+                mace_check(1 <= len(axis_arg.ints) <= 2,
+                           "Hexagon Reduce Mean only supports spatial now.")
+                for i in axis_arg.ints:
+                    mace_check(1 <= i <= 2,
+                               "Hexagon Reduce Mean only supports spatial now.")
+                producer_op_name, _ = get_op_and_port_from_tensor(op.input[0])
+                input_dims = None
+                for producer_op in self._model.op:
+                    if producer_op.name == producer_op_name:
+                        input_dims = producer_op.output_shape[0].dims
+                        break
+                mace_check(input_dims is not None, "Missing input shape.")
+                window_tensor = self._model.tensors.add()
+                window_tensor.name = op.name + '/window:0'
+                window_tensor.data_type = mace_pb2.DT_INT32
+                if len(axis_arg.ints) == 1:
+                    dim1, dim2 = (input_dims[1], 1) \
+                        if axis_arg.ints[0] == 1 else (1, input_dims[2])
+                else:
+                    dim1, dim2 = input_dims[1], input_dims[2]
+                window_tensor.dims.extend([1, dim1, dim2, 1])
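+                # Strides equal the window so each reduced region is pooled
+                # exactly once.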
+                strides_tensor = self._model.tensors.add()
+                strides_tensor.name = op.name + '/strides:0'
+                strides_tensor.data_type = mace_pb2.DT_INT32
+                strides_tensor.dims.extend([1, dim1, dim2, 1])
+                op.input.extend([window_tensor.name, strides_tensor.name])
             elif op.type == MaceOp.ResizeBilinear.name:
                 newdim_arg = ConverterUtil.get_arg(
                     op, MaceKeyword.mace_resize_size_str)
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 9ea6b6d83628cfef70c58db0b401472d29565776..cf426941b933add7e5f5e5b7e6627a9a290c99de 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -113,7 +113,6 @@ class Transformer(base_converter.ConverterInterface):
         self._consts = {}
         self._consumers = {}
         self._producer = {}
-        self._target_data_format = DataFormat.NHWC
         self._quantize_activation_info = {}
         self._quantized_tensor = set()
 
@@ -996,8 +995,7 @@ class Transformer(base_converter.ConverterInterface):
                 if arg.name == MaceKeyword.mace_paddings_str:
                     mace_check(len(arg.ints) == 8,
                                "pad dim rank should be 8.")
-                    if ConverterUtil.data_format(op) == DataFormat.NCHW \
-                            and self._target_data_format == DataFormat.NHWC:  # noqa
+                    if ConverterUtil.data_format(op) == DataFormat.NCHW:
                         print("Transpose pad args: %s(%s)"
                               % (op.name, op.type))
                         self.transpose_shape(arg.ints,
@@ -1006,7 +1004,6 @@ class Transformer(base_converter.ConverterInterface):
             for arg in op.arg:
                 if arg.name == MaceKeyword.mace_axis_str:
                     if (ConverterUtil.data_format(op) == DataFormat.NCHW
-                            and self._target_data_format == DataFormat.NHWC
                             and len(op.output_shape[0].dims) == 4):
                         print("Transpose concat/split args: %s(%s)"
                               % (op.name, op.type))
@@ -1023,8 +1020,7 @@ class Transformer(base_converter.ConverterInterface):
                         len(input_shape) == 2:
                     axis_arg = ConverterUtil.get_arg(
                         op, MaceKeyword.mace_axis_str)
-                    if axis_arg.i == 1 \
-                            and self._target_data_format == DataFormat.NHWC:  # noqa
+                    if axis_arg.i == 1:
                         axis_arg.i = 3
 
             elif op.type == MaceOp.Squeeze.name:
@@ -1041,8 +1037,7 @@ class Transformer(base_converter.ConverterInterface):
             for arg in op.arg:
                 if arg.name == MaceKeyword.mace_axis_str:
                     if ConverterUtil.data_format(
-                            op) == DataFormat.NCHW \
-                            and self._target_data_format == DataFormat.NHWC:  # noqa
+                            op) == DataFormat.NCHW:
                         print("Transpose reduce args: %s(%s)"
                               % (op.name, op.type))
                         reduce_axises = list(arg.ints)
@@ -1062,15 +1057,12 @@ class Transformer(base_converter.ConverterInterface):
         # transpose op output shape
         data_format = ConverterUtil.data_format(op)
         if data_format is not None \
-                and data_format != self._target_data_format:
+                and data_format != DataFormat.NHWC:
             print("Transpose output shapes: %s(%s)" % (op.name, op.type))
             for output_shape in op.output_shape:
                 if len(output_shape.dims) == 4:
                     self.transpose_shape(output_shape.dims,
                                          [0, 2, 3, 1])
-            ConverterUtil.get_arg(op,
-                                  MaceKeyword.mace_data_format_str).i = \
-                self._target_data_format.value
 
         return False
 
@@ -1683,6 +1675,7 @@ class Transformer(base_converter.ConverterInterface):
         print("Add default quantize info for ops like Pooling, Softmax")
         for op in self._model.op:
             if op.type in [MaceOp.Pooling.name,
+                           MaceOp.Reduce.name,
                            MaceOp.Squeeze.name,
                            MaceOp.Reshape.name,
                            MaceOp.ResizeBilinear.name,