Merge branch 'quantize' into 'master'

Add quantization docs See merge request !845

Merge branch 'quantize' into 'master'
Add quantization docs See merge request !845
474645fb · 李滨 · e8553972 · 46bbadc4 · 474645fb · 474645fb
7 changed file
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -28,6 +28,7 @@ The main documentation is organized into the following sections:
   user_guide/basic_usage
   user_guide/advanced_usage
   user_guide/op_lists
+   user_guide/quantization_usage

 .. toctree::
   :maxdepth: 1

--- a/docs/user_guide/basic_usage.rst
+++ b/docs/user_guide/basic_usage.rst
@@ -28,7 +28,7 @@ Here we use the mobilenet-v2 model as an example.

    .. note::

-        It's highly recommanded to use a release version instead of master branch.
+        It's highly recommended to use a release version instead of master branch.


    2. Pull `MACE Model Zoo <https://github.com/XiaoMi/mace-models>`__ project.

--- a/docs/user_guide/quantization_usage.rst
+++ b/docs/user_guide/quantization_usage.rst
+Quantization
+===============
+
+MACE supports two kinds of quantization mechanisms, i.e.,
+
+* **Quantization-aware training (Recommended)**
+
+After pre-training the model using floating point, insert simulated quantization operations into the model and fine-tune the new model.
+Refer to `Tensorflow quantization-aware training <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize>`__.
+
+* **Post training quantization**
+
+After pre-training the model using floating point, estimate the output range of each activation layer using sample inputs.
+
+
+Quantization-aware training
+----------------------------
+It is recommended that developers fine-tune the fixed-point model, as experiments show that accuracy can be improved this way, especially for lightweight
+models, e.g., MobileNet. The only thing you need to do to run it with MACE is to add the following config to the model yaml file:
+
+	1. `input_ranges`: the ranges of model's inputs, e.g., -1.0,1.0.
+
+	2. `quantize`: set `quantize` to be 1.
+
+    .. note::
+	You need to set `runtime` to `cpu` because this quantization method is currently only supported on CPU (DSP support is coming soon).
+
+
+Post training quantization
+---------------------------
+MACE supports post-training quantization if you want to try quantizing a model directly without fine-tuning.
+This method requires the developer to statistically calculate the tensor range of each activation layer using sample inputs.
+MACE provides tools to collect these statistics with the following steps:
+
+	1. Convert original model to run on CPU host without obfuscation (by setting `target_abis` to `host`, `runtime` to `cpu`, and `obfuscate` to `0`, appending `:0` to `output_tensors` if missing in yaml config).
+	E.g.,
+
+	.. code:: sh
+
+		python tools/converter.py convert --config ../mace-models/inception-v3/inception-v3.yml
+
+
+	2. Log tensor range of each activation layer by inferring several samples.
+
+	.. code:: sh
+
+		python tools/converter.py run --config ../mace-models/inception-v3/inception-v3.yml --example --quantize_stat --input_dir samples > range_log
+
+
+	3. Calculate overall range of each activation layer by specifying percentage cutoff.
+
+	.. code:: sh
+
+		python mace/python/tools/quantization/quantize_stat.py --log_file range_log --percentile 5 > overall_range
+
+
+	4. Convert quantized model (by setting `quantize` to `1` and `quantize_range_file` to the overall_range file path in yaml config).
+
+
+.. note::
+
+	`quantize_weights` and `quantize_nodes` should not be specified when using `TransformGraph` tool if using MACE quantization.
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -133,7 +133,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
    VLOG(3) << "Operator " << op->debug_def().name()
            << " has shape: " << MakeString(op->Output(0)->shape());

-    if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
+    if (EnvEnabled("MACE_LOG_TENSOR_RANGE")) {
      for (int i = 0; i < op->OutputSize(); ++i) {
        if (op->debug_def().quantize_info_size() == 0) {
          int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT));

--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -305,7 +305,7 @@ bool RunModel(const std::vector<std::string> &input_names,
                out_file.flush();
                out_file.close();
              } else {
-                std::cerr << "Open output file failed";
+                std::cerr << "Open output file failed" << std::endl;
                return -1;
              }
            }
@@ -315,7 +315,8 @@ bool RunModel(const std::vector<std::string> &input_names,

      closedir(dir_parent);
    } else {
-      std::cerr << "Directory " << FLAGS_input_dir << " does not exist.";
+      std::cerr << "Directory " << FLAGS_input_dir << " does not exist."
+                << std::endl;
    }
  } else {
    for (size_t i = 0; i < input_count; ++i) {
@@ -346,7 +347,7 @@ bool RunModel(const std::vector<std::string> &input_names,
        out_file.flush();
        out_file.close();
      } else {
-        std::cerr << "Open output file failed";
+        std::cerr << "Open output file failed" << std::endl;
        return -1;
      }
    }

--- a/mace/ops/quantize_benchmark.cc
+++ b/mace/ops/quantize_benchmark.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+template <DeviceType D, typename T>  // D: device passed to RunOp; T: op's "T" arg.
+void Quantize(int iters, int count) {  // Runs the "Quantize" op `iters` times.
+  mace::testing::StopTiming();  // Presumably keeps the setup below untimed.
+
+  OpsTestNet net;
+  // Add input data: a float tensor named "Input" with shape {count}
+  net.AddRandomInput<D, float>("Input", {count});
+
+  OpDefBuilder("Quantize", "QuantizeBM")
+      .Input("Input")
+      .Output("Output")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up: two runs of the op before timing is started
+  for (int i = 0; i < 2; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();  // Timed region: the loop and Sync below.
+  while (iters--) {
+    net.RunOp(D);
+  }
+  net.Sync();
+}
+
+template <DeviceType D, typename T>  // D: device passed to RunOp; T: input type.
+void Dequantize(int iters, int count) {  // Runs the "Dequantize" op `iters` times.
+  mace::testing::StopTiming();  // Presumably keeps the setup below untimed.
+
+  OpsTestNet net;
+  // Add input data: a tensor of type T named "Input" with shape {count}
+  net.AddRandomInput<D, T>("Input", {count});
+
+  OpDefBuilder("Dequantize", "DequantizeBM")
+      .Input("Input")
+      .Output("Output")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up: two runs of the op before timing is started
+  for (int i = 0; i < 2; ++i) {
+    net.RunOp(D);
+  }
+  net.Sync();
+
+  mace::testing::StartTiming();  // Timed region: the loop and Sync below.
+  while (iters--) {
+    net.RunOp(D);
+  }
+  net.Sync();
+}
+}  // namespace
+// Defines MACE_BM_QUANTIZE_<N>_<TYPE>_<DEVICE>(iters), passed to MACE_BENCHMARK.
+#define MACE_BM_QUANTIZE_MACRO(N, TYPE, DEVICE)            \
+  static void                                              \
+    MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE(              \
+      int iters) {                                         \
+    const int64_t tot = static_cast<int64_t>(iters) * N;   \
+    mace::testing::MaccProcessed(tot);                     \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));    \
+    Quantize<DEVICE, TYPE>(iters, N);                      \
+  }                                                        \
+  MACE_BENCHMARK(                                          \
+    MACE_BM_QUANTIZE_##N##_##TYPE##_##DEVICE)
+
+#define MACE_BM_QUANTIZE(N)                                \
+  MACE_BM_QUANTIZE_MACRO(N, uint8_t, CPU);
+// Defines MACE_BM_DEQUANTIZE_<N>_<TYPE>_<DEVICE>(iters), passed to MACE_BENCHMARK.
+#define MACE_BM_DEQUANTIZE_MACRO(N, TYPE, DEVICE)          \
+  static void                                              \
+    MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE(            \
+      int iters) {                                         \
+    const int64_t tot = static_cast<int64_t>(iters) * N;   \
+    mace::testing::MaccProcessed(tot);                     \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));    \
+    Dequantize<DEVICE, TYPE>(iters, N);                    \
+  }                                                        \
+  MACE_BENCHMARK(                                          \
+    MACE_BM_DEQUANTIZE_##N##_##TYPE##_##DEVICE)
+
+#define MACE_BM_DEQUANTIZE(N)                              \
+  MACE_BM_DEQUANTIZE_MACRO(N, uint8_t, CPU);
+
+MACE_BM_QUANTIZE(256);  // uint8_t on CPU, N = 256 elements
+MACE_BM_QUANTIZE(1470000);  // uint8_t on CPU, N = 1470000 elements
+MACE_BM_DEQUANTIZE(256);  // uint8_t on CPU, N = 256 elements
+MACE_BM_DEQUANTIZE(1470000);  // uint8_t on CPU, N = 1470000 elements
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -721,6 +721,8 @@ def tuning_run(abi,
               (model_tag, running_round, restart_round, str(tuning),
                str(out_of_range_check), omp_num_threads, cpu_affinity_policy,
                gpu_perf_hint, gpu_priority_hint))
+    sys.stdout.flush()
+
    mace_model_path = ""
    if model_graph_format == ModelFormat.file:
        mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag)
@@ -880,6 +882,7 @@ def tuning_run(abi,

        six.print_("Running finished!\n")

+    sys.stdout.flush()
    return stdout