diff --git a/docs/index.rst b/docs/index.rst index 1441180820bb48b8c52f0f3d33e0f5e3eec71563..f839a13f7cf8d04c39d63280306ee3fb8dff513b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,6 +28,7 @@ The main documentation is organized into the following sections: user_guide/basic_usage user_guide/advanced_usage user_guide/op_lists + user_guide/quantization_usage .. toctree:: :maxdepth: 1 diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index 131837ebb660e4f027fd2dd5f7ebd1c7687028c6..eb067c49a5b42fc2f33d8a614da6f067eb5452ff 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -28,7 +28,7 @@ Here we use the mobilenet-v2 model as an example. .. note:: - It's highly recommanded to use a release version instead of master branch. + It's highly recommended to use a release version instead of master branch. 2. Pull `MACE Model Zoo `__ project. diff --git a/docs/user_guide/quantization_usage.rst b/docs/user_guide/quantization_usage.rst new file mode 100644 index 0000000000000000000000000000000000000000..11f8bee2205537ed515c3329532a2974d3ff3621 --- /dev/null +++ b/docs/user_guide/quantization_usage.rst @@ -0,0 +1,62 @@ +Quantization +=============== + +MACE supports two kinds of quantization mechanisms, i.e., + +* **Quantization-aware training (Recommended)** + +After pre-training the model using floating point, insert simulated quantization operations into the model. Fine-tune the new model. +Refer to `Tensorflow quantization-aware training `__. + +* **Post training quantization** + +After pre-training the model using floating point, estimate the output range of each activation layer using sample inputs. + + +Quantization-aware training +---------------------------- +It is recommended that developers fine-tune the fixed-point model, as experiments show that in this way accuracy can be improved, especially for lightweight +models, e.g., MobileNet. ���
The only thing you need to make it run using MACE is to add the following config to the model yaml file: + + 1. `input_ranges`: the ranges of model's inputs, e.g., -1.0,1.0. + + 2. `quantize`: set `quantize` to be 1. + + .. note:: + You need to set `runtime` to be `cpu` because we only support this quantization method to run on CPU for now (soon DSP will be supported). + + +Post training quantization +--------------------------- +MACE supports post-training quantization if you want to quantize the model directly without fine tuning. +This method requires developers to calculate the tensor range of each activation layer statistically using sample inputs. +MACE provides tools to do statistics with the following steps: + + 1. Convert the original model to run on CPU host without obfuscation (by setting `target_abis` to `host`, `runtime` to `cpu`, and `obfuscate` to `0`, appending `:0` to `output_tensors` if missing in yaml config). + E.g., + + .. code:: sh + + python tools/converter.py convert --config ../mace-models/inception-v3/inception-v3.yml + + + 2. Log the tensor range of each activation layer by inferring several samples. + + .. code:: sh + + python tools/converter.py run --config ../mace-models/inception-v3/inception-v3.yml --example --quantize_stat --input_dir samples > range_log + + + 3. Calculate the overall range of each activation layer by specifying a percentage cutoff. + + .. code:: sh + + python mace/python/tools/quantization/quantize_stat.py --log_file range_log --percentile 5 > overall_range + + + 4. Convert the quantized model (by setting `quantize` to `1` and `quantize_range_file` to the overall_range file path in yaml config). + + +.. note:: + + `quantize_weights` and `quantize_nodes` should not be specified when using the `TransformGraph` tool if using MACE quantization. ���
diff --git a/mace/core/net.cc b/mace/core/net.cc index 2766de115cfd8d745633f0ee182044010552fde6..d71a14826a5ae8b907a56526ba79c6bce245e12d 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -133,7 +133,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { VLOG(3) << "Operator " << op->debug_def().name() << " has shape: " << MakeString(op->Output(0)->shape()); - if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) { + if (EnvEnabled("MACE_LOG_TENSOR_RANGE")) { for (int i = 0; i < op->OutputSize(); ++i) { if (op->debug_def().quantize_info_size() == 0) { int data_type = op->GetOptionalArg("T", static_cast(DT_FLOAT)); diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 6679c17e3f876eff0db7604a5efded5fc3d424df..204be499ebd0c1500072da106f39800e2fca1384 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -305,7 +305,7 @@ bool RunModel(const std::vector &input_names, out_file.flush(); out_file.close(); } else { - std::cerr << "Open output file failed"; + std::cerr << "Open output file failed" << std::endl; return -1; } } @@ -315,7 +315,8 @@ bool RunModel(const std::vector &input_names, closedir(dir_parent); } else { - std::cerr << "Directory " << FLAGS_input_dir << " does not exist."; + std::cerr << "Directory " << FLAGS_input_dir << " does not exist." + << std::endl; } } else { for (size_t i = 0; i < input_count; ++i) { @@ -346,7 +347,7 @@ bool RunModel(const std::vector &input_names, out_file.flush(); out_file.close(); } else { - std::cerr << "Open output file failed"; + std::cerr << "Open output file failed" << std::endl; return -1; } } diff --git a/mace/ops/quantize_benchmark.cc b/mace/ops/quantize_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..62a534b721894360b922270fe03833be60ad582a --- /dev/null +++ b/mace/ops/quantize_benchmark.cc @@ -0,0 +1,117 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void Quantize(int iters, int count) { + mace::testing::StopTiming(); + + OpsTestNet net; + // Add input data + net.AddRandomInput("Input", {count}); + + OpDefBuilder("Quantize", "QuantizeBM") + .Input("Input") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 2; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +template +void Dequantize(int iters, int count) { + mace::testing::StopTiming(); + + OpsTestNet net; + // Add input data + net.AddRandomInput("Input", {count}); + + OpDefBuilder("Dequantize", "DequantizeBM") + .Input("Input") + .Output("Output") + .OutputType({DT_FLOAT}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 2; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_QUANTIZE_MACRO(N, TYPE, DEVICE) \ + static void \ + MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N; \ + 
mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + Quantize(iters, N); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE) + +#define MACE_BM_QUANTIZE(N) \ + MACE_BM_QUANTIZE_MACRO(N, uint8_t, CPU); + +#define MACE_BM_DEQUANTIZE_MACRO(N, TYPE, DEVICE) \ + static void \ + MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + Dequantize(iters, N); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE) + +#define MACE_BM_DEQUANTIZE(N) \ + MACE_BM_DEQUANTIZE_MACRO(N, uint8_t, CPU); + +MACE_BM_QUANTIZE(256); +MACE_BM_QUANTIZE(1470000); +MACE_BM_DEQUANTIZE(256); +MACE_BM_DEQUANTIZE(1470000); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/tools/sh_commands.py b/tools/sh_commands.py index b920d43fbe85fc85bdd23831173e31e1e1e282b1..367da8f9d551717dd508a4a0ab0d26f8ec98eeb9 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -721,6 +721,8 @@ def tuning_run(abi, (model_tag, running_round, restart_round, str(tuning), str(out_of_range_check), omp_num_threads, cpu_affinity_policy, gpu_perf_hint, gpu_priority_hint)) + sys.stdout.flush() + mace_model_path = "" if model_graph_format == ModelFormat.file: mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag) @@ -880,6 +882,7 @@ def tuning_run(abi, six.print_("Running finished!\n") + sys.stdout.flush() return stdout