From 7e9bb98a37cb703b320eff078128a3d82aec9b11 Mon Sep 17 00:00:00 2001 From: Yanzhan Yang Date: Wed, 9 Oct 2019 21:20:40 +0800 Subject: [PATCH] add n-fold quantification algorithm (#2164) * 1. add quantification_fold parameter. 2. support quantification test in run.py. * implement n-fold quantification --- mobile/src/framework/executor.cpp | 84 ++++++++--------- mobile/src/framework/loader.cpp | 25 +++-- mobile/src/framework/loader.h | 19 ++-- mobile/src/framework/program/program.h | 1 + mobile/src/io/paddle_inference_api.h | 1 + mobile/src/io/paddle_mobile.cpp | 27 +++--- mobile/src/io/paddle_mobile.h | 7 +- mobile/test/net/test_net.cpp | 6 +- mobile/test/net/test_op_in_net.cpp | 2 +- mobile/tools/pre-commit.hooks/cpplint.hook | 2 +- mobile/tools/python/fluidtools/run.py | 27 +++++- mobile/tools/quantification/convert.cpp | 105 ++++++++++++--------- 12 files changed, 181 insertions(+), 125 deletions(-) diff --git a/mobile/src/framework/executor.cpp b/mobile/src/framework/executor.cpp index c1ff6ee29b..0d25596af0 100644 --- a/mobile/src/framework/executor.cpp +++ b/mobile/src/framework/executor.cpp @@ -173,24 +173,33 @@ void Executor::InitFeedFetchList() { } template -static void LoadMemInternal(void **data, LoDTensor *tensor, - bool quant_uint8 = false) { - char **data_buf = reinterpret_cast(data); - int64_t size = tensor->numel(); - T *tensor_data = tensor->mutable_data(); +static void LoadMemInternal(void **in_data, void *out_data, int64_t size, + bool quant_uint8 = false, int quant_fold = 1) { + char **data_buf = reinterpret_cast(in_data); + T *tensor_data = reinterpret_cast(out_data); if (quant_uint8) { - // should be moved into operator init function - float min_value; - float max_value; - memory::Copy(&min_value, *data_buf, sizeof(float)); - memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float)); - *data_buf += 2 * sizeof(float); - const float factor = (max_value - min_value) / 255.0; - const uint8_t *uint8_data = reinterpret_cast(*data_buf); - for (int k = 0; k < size; ++k) { - tensor_data[k] = uint8_data[k] * factor + min_value; + int step = fmax(size / quant_fold, 1); + int visited_fold = 0; + while (visited_fold * step < size) { + // should be moved into operator init function + float min_value; + float max_value; + memory::Copy(&min_value, *data_buf, sizeof(float)); + memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float)); + *data_buf += 2 * sizeof(float); + const float factor = (max_value - min_value) / 255.0; + const uint8_t *uint8_data = reinterpret_cast(*data_buf); + int k = 0; + for (; k < step; ++k) { + int tensor_data_idx = visited_fold * step + k; + if (tensor_data_idx >= size) { + break; + } + tensor_data[tensor_data_idx] = uint8_data[k] * factor + min_value; + } + *data_buf += k * sizeof(uint8_t); + visited_fold++; } - *data_buf += size * sizeof(uint8_t); } else { memory::Copy(tensor_data, *data_buf, size * sizeof(T)); *data_buf += size * sizeof(T); @@ -235,14 +244,20 @@ void Executor::LoadMemory(void **data, // parse tensor from stream switch (tensor_desc.DataType()) { case VARTYPE_TYPE_FP32: - LoadMemInternal(reinterpret_cast(data_buf), tensor, - program_.quantification); + LoadMemInternal( + reinterpret_cast(data_buf), + reinterpret_cast(tensor->mutable_data()), tensor->numel(), + program_.quantification, program_.quantification_fold); break; case VARTYPE_TYPE_INT8: - LoadMemInternal(reinterpret_cast(data_buf), tensor); + LoadMemInternal( + reinterpret_cast(data_buf), + reinterpret_cast(tensor->mutable_data()), tensor->numel()); break; case 
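For reference, the n-fold dequantization that the reworked LoadMemInternal performs can be sketched as a standalone function. This is an illustrative rewrite, not the library code: the name DequantizeNFold is hypothetical, and the byte layout (per fold, two floats for min/max followed by up to `step` uint8 codes) is taken from the hunk above.

#include <algorithm>
#include <cstdint>
#include <cstring>

// Illustrative sketch of the n-fold dequantization done by LoadMemInternal.
// `buf` points at the quantized blob written by the converter: for each fold,
// two floats (min, max) followed by up to `step` uint8 codes.
static void DequantizeNFold(const char *buf, float *out, int64_t size,
                            int quant_fold) {
  const int64_t step = std::max<int64_t>(size / quant_fold, 1);
  const char *p = buf;
  int64_t written = 0;
  while (written < size) {
    float min_value, max_value;
    std::memcpy(&min_value, p, sizeof(float));
    std::memcpy(&max_value, p + sizeof(float), sizeof(float));
    p += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0f;
    const uint8_t *codes = reinterpret_cast<const uint8_t *>(p);
    const int64_t n = std::min(step, size - written);  // last fold may be short
    for (int64_t k = 0; k < n; ++k) {
      out[written + k] = codes[k] * factor + min_value;
    }
    p += n;       // advance past this fold's uint8 codes
    written += n;
  }
}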
VARTYPE_TYPE_INT32: - LoadMemInternal(reinterpret_cast(data_buf), tensor); + LoadMemInternal(reinterpret_cast(data_buf), + reinterpret_cast(tensor->mutable_data()), + tensor->numel()); break; default: LOG(kLOG_ERROR) << "data type is not supported"; @@ -944,31 +959,10 @@ void Executor::LoadMemory(const VarDesc var_desc, void *memory = nullptr; int type_size = 4; memory = tensorInput; - if (program_.quantification) { - float min_value; - float max_value; - - memcpy(&min_value, *data, sizeof(float)); - memcpy(&max_value, *data + sizeof(float), sizeof(float)); - *data += 2 * sizeof(float); - const float factor = (max_value - min_value) / 255.0; - uint8_t *uint8_data = reinterpret_cast(*data); - for (int k = 0; k < memory_size; ++k) { - static_cast(memory)[k] = uint8_data[k] * factor + min_value; - } - *data += (memory_size * sizeof(uint8_t)); - } else { - for (int n = 0; n < memory_size; n++) { - float value; - memcpy(&value, *data + n * type_size, type_size); - if (value < 1e-30 && value > -1e-30) { - static_cast(memory)[n] = 0.0; - } else { - static_cast(memory)[n] = value; - } - } - (*data) += (sizeof(char) * memory_size * type_size); - } + + LoadMemInternal(reinterpret_cast(data), + reinterpret_cast(memory), memory_size, + program_.quantification, program_.quantification_fold); } template <> diff --git a/mobile/src/framework/loader.cpp b/mobile/src/framework/loader.cpp index 4350fda969..34cf6253cb 100644 --- a/mobile/src/framework/loader.cpp +++ b/mobile/src/framework/loader.cpp @@ -87,7 +87,8 @@ void Loader::InitMemoryFromProgram( template <> const Program Loader::LoadCombinedMemory( size_t read_size, const uint8_t *buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize, bool quantification) { + uint8_t *combined_params_buf, bool optimize, bool quantification, + int quantification_fold) { bool can_add_split = false; PaddleMobile__Framework__Proto__ProgramDesc *c_program; @@ -109,6 +110,7 @@ const Program Loader::LoadCombinedMemory( program.quantification = quantification; program.combined_params_len = combined_params_len; program.combined_params_buf = combined_params_buf; + program.quantification_fold = quantification_fold; auto scope = std::make_shared(); program.scope = scope; @@ -187,9 +189,11 @@ template const Program Loader::Load(const std::string &dirname, bool optimize, bool quantification, - bool can_add_split) { - auto program = this->LoadProgram(dirname + "/__model__", optimize, - quantification, can_add_split); + bool can_add_split, + int quantification_fold) { + auto program = + this->LoadProgram(dirname + "/__model__", optimize, quantification, + can_add_split, quantification_fold); program.model_path = dirname; return program; } @@ -198,8 +202,10 @@ template const Program Loader::Load(const std::string &model_path, const std::string ¶_path, bool optimize, - bool quantification) { - auto program = this->LoadProgram(model_path, optimize, quantification); + bool quantification, + int quantification_fold) { + auto program = this->LoadProgram(model_path, optimize, quantification, false, + quantification_fold); program.para_path = para_path; program.combined = true; @@ -210,7 +216,7 @@ const Program Loader::Load(const std::string &model_path, template const Program Loader::LoadProgram( const std::string &model_path, bool optimize, bool quantification, - bool can_add_split) { + bool can_add_split, int quantification_fold) { std::string model_filename = model_path; PaddleMobile__Framework__Proto__ProgramDesc *c_program; uint8_t *buf = NULL; @@ -232,6 +238,7 
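The extra folds cost very little on disk: each fold only adds its own (min, max) pair, eight bytes, on top of the one byte per weight. A back-of-the-envelope helper, with a hypothetical name, under the same step arithmetic as the patch:

#include <algorithm>
#include <cstdint>

// Rough size of one quantized tensor blob under the n-fold scheme:
// per fold, 2 floats (min, max) plus up to `step` uint8 codes,
// where step = max(numel / fold, 1).
static int64_t QuantizedBlobBytes(int64_t numel, int fold) {
  const int64_t step = std::max<int64_t>(numel / fold, 1);
  const int64_t folds_used = (numel + step - 1) / step;  // ceil(numel / step)
  return folds_used * 2 * static_cast<int64_t>(sizeof(float)) + numel;
}
// Example: 1,000,000 weights with fold = 1000 take 1,000,000 + 1000 * 8
// = 1,008,000 bytes, versus 1,000,008 bytes for the original single range.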
@@ const Program Loader::LoadProgram( program.quantification = quantification; program.combined_params_len = 0; program.combined_params_buf = nullptr; + program.quantification_fold = quantification_fold; auto scope = std::make_shared(); program.scope = scope; @@ -248,7 +255,8 @@ const Program Loader::LoadProgram( template const Program Loader::LoadCombinedMemory( size_t read_size, const uint8_t *buf, size_t combined_params_len, - uint8_t *combined_params_buf, bool optimize, bool quantification) { + uint8_t *combined_params_buf, bool optimize, bool quantification, + int quantification_fold) { bool can_add_split = false; PaddleMobile__Framework__Proto__ProgramDesc *c_program; @@ -270,6 +278,7 @@ const Program Loader::LoadCombinedMemory( program.quantification = quantification; program.combined_params_len = combined_params_len; program.combined_params_buf = combined_params_buf; + program.quantification_fold = quantification_fold; auto scope = std::make_shared(); program.scope = scope; diff --git a/mobile/src/framework/loader.h b/mobile/src/framework/loader.h index bd4dfa1556..40ded643d5 100644 --- a/mobile/src/framework/loader.h +++ b/mobile/src/framework/loader.h @@ -32,7 +32,8 @@ class Loader { const Program Load(const std::string &dirname, bool optimize = false, bool quantification = false, - bool can_add_split = false); + bool can_add_split = false, + int quantification_fold = 1); /* * @b load combine format fluid mode @@ -41,20 +42,20 @@ class Loader { const Program Load(const std::string &model_path, const std::string ¶_path, bool optimize = false, - bool quantification = false); + bool quantification = false, + int quantification_fold = 1); - const Program LoadCombinedMemory(size_t model_len, - const uint8_t *model_buf, - size_t combined_params_len, - uint8_t *combined_params_buf, - bool optimize = false, - bool quantification = false); + const Program LoadCombinedMemory( + size_t model_len, const uint8_t *model_buf, size_t combined_params_len, + uint8_t *combined_params_buf, bool optimize = false, + bool quantification = false, int quantification_fold = 1); private: const Program LoadProgram(const std::string &model_path, bool optimize = false, bool quantification = false, - bool can_add_split = false); + bool can_add_split = false, + int quantification_fold = 1); void InitMemoryFromProgram( const std::shared_ptr &originProgramDesc, diff --git a/mobile/src/framework/program/program.h b/mobile/src/framework/program/program.h index f05aba8565..b6d1d96279 100644 --- a/mobile/src/framework/program/program.h +++ b/mobile/src/framework/program/program.h @@ -34,6 +34,7 @@ class Program { bool quantification = false; size_t combined_params_len; uint8_t *combined_params_buf; + int quantification_fold = 1; }; } // namespace framework diff --git a/mobile/src/io/paddle_inference_api.h b/mobile/src/io/paddle_inference_api.h index 5c104db41f..dccfd1ceca 100644 --- a/mobile/src/io/paddle_inference_api.h +++ b/mobile/src/io/paddle_inference_api.h @@ -216,6 +216,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { int batch_size = 1; bool optimize = true; bool quantification = false; + int quantification_fold = 1; bool lod_mode = false; int thread_num = 1; bool load_when_predict = false; diff --git a/mobile/src/io/paddle_mobile.cpp b/mobile/src/io/paddle_mobile.cpp index 95ae3763a2..be69ce0f63 100644 --- a/mobile/src/io/paddle_mobile.cpp +++ b/mobile/src/io/paddle_mobile.cpp @@ -37,7 +37,8 @@ void PaddleMobile::SetThreadNum(int thread_num, template PMStatus PaddleMobile::Load(const 
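Every new parameter defaults to quantification_fold = 1, which reduces to the previous single-range format: the step becomes the whole tensor, so exactly one (min, max) pair is read or written per parameter. A minimal check of that arithmetic, assuming the integer division used in the patch:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int64_t size = 123457;                          // any tensor size
  const int64_t step = std::max<int64_t>(size / 1, 1);  // fold = 1
  assert(step == size);                   // one fold covers the whole tensor
  assert((size + step - 1) / step == 1);  // exactly one (min, max) pair
  return 0;
}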
std::string &dirname, bool optimize, bool quantification, - int batch_size, bool lod_mode) { + int batch_size, bool lod_mode, + int quantification_fold) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -46,8 +47,9 @@ PMStatus PaddleMobile::Load(const std::string &dirname, if (executor_.get() == nullptr) { executor_ = std::make_shared>( - loader_->Load(dirname, optimize, quantification), config_, batch_size, - optimize, lod_mode); + loader_->Load(dirname, optimize, quantification, false, + quantification_fold), + config_, batch_size, optimize, lod_mode); } else { LOG(kLOG_INFO) << "executor inited"; } @@ -59,7 +61,8 @@ template PMStatus PaddleMobile::Load(const std::string &model_path, const std::string ¶_path, bool optimize, bool quantification, - int batch_size, bool lod_mode) { + int batch_size, bool lod_mode, + int quantification_fold) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -69,8 +72,9 @@ PMStatus PaddleMobile::Load(const std::string &model_path, if (executor_.get() == nullptr) { executor_ = std::make_shared>( - loader_->Load(model_path, para_path, optimize, quantification), config_, - batch_size, optimize, lod_mode); + loader_->Load(model_path, para_path, optimize, quantification, + quantification_fold), + config_, batch_size, optimize, lod_mode); } else { LOG(kLOG_INFO) << "executor inited"; } @@ -82,11 +86,12 @@ template PMStatus PaddleMobile::Load(const PaddleMobileConfig &config) { if (!config.model_dir.empty()) { return this->Load(config.model_dir, config.optimize, config.quantification, - config.batch_size, config.lod_mode); + config.batch_size, config.lod_mode, + config.quantification_fold); } else if (!config.prog_file.empty() && !config.param_file.empty()) { return this->Load(config.prog_file, config.param_file, config.optimize, - config.quantification, config.batch_size, - config.lod_mode); + config.quantification, config.batch_size, config.lod_mode, + config.quantification_fold); } else { LOG(kLOG_ERROR) << "Failed to load inference model"; return PMNotInitialized; @@ -97,7 +102,7 @@ template bool PaddleMobile::LoadCombinedMemory( size_t model_len, const uint8_t *model_buf, size_t combined_params_len, uint8_t *combined_params_buf, bool optimize, bool quantification, - int batch_size, bool lod_mode) { + int batch_size, bool lod_mode, int quantification_fold) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -107,7 +112,7 @@ bool PaddleMobile::LoadCombinedMemory( executor_ = std::make_shared>( loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, combined_params_buf, optimize, - quantification), + quantification, quantification_fold), config_, batch_size, optimize, lod_mode); } else { LOG(kLOG_INFO) << "executor inited"; diff --git a/mobile/src/io/paddle_mobile.h b/mobile/src/io/paddle_mobile.h index e39d712447..8b8f0683ab 100644 --- a/mobile/src/io/paddle_mobile.h +++ b/mobile/src/io/paddle_mobile.h @@ -50,10 +50,11 @@ class PaddleMobile { PMStatus Load(const std::string &dirname, const bool optimize = false, const bool quantification = false, const int batch_size = 1, - const bool lod_mode = false); + const bool lod_mode = false, const int quantification_fold = 1); PMStatus Load(const std::string &model_path, const std::string ¶_path, const bool optimize = false, const bool quantification = false, - const int batch_size = 1, const bool lod_mode = false); + const int batch_size = 1, const bool lod_mode = false, + const int quantification_fold = 1); PMStatus Load(const 
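A caller-side sketch of passing the fold count through the public Load overload added above. The CPU/float template arguments, the default-constructed object, and the include path are assumptions inferred from the surrounding diff rather than verified against the tree; only the parameter order comes directly from the new signature.

#include "io/paddle_mobile.h"  // assumed include path

void LoadQuantizedModel() {
  // Assumed CPU/float instantiation of the PaddleMobile template.
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> mobile;
  // Combined model/params files produced by the quantification tool.
  mobile.Load("./checked_model/model", "./checked_model/params",
              /*optimize=*/true, /*quantification=*/true,
              /*batch_size=*/1, /*lod_mode=*/true,
              /*quantification_fold=*/1000);
}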
PaddleMobileConfig &config); @@ -84,7 +85,7 @@ class PaddleMobile { size_t combined_params_len, uint8_t *combined_params_buf, bool optimize = false, bool quantification = false, int batch_size = 1, - bool lod_mode = false); + bool lod_mode = false, int quantification_fold = 1); void SetThreadNum(int thread_num, PowerMode power_mode = PERFORMANCE_PRIORITY); diff --git a/mobile/test/net/test_net.cpp b/mobile/test/net/test_net.cpp index a1c234dbca..74a124e46c 100644 --- a/mobile/test/net/test_net.cpp +++ b/mobile/test/net/test_net.cpp @@ -31,6 +31,10 @@ void test(int argc, char *argv[]) { arg_index++; bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1; arg_index++; + bool quantification = std::stoi(argv[arg_index]) == 1; + arg_index++; + int quantification_fold = std::stoi(argv[arg_index]); + arg_index++; paddle_mobile::PaddleMobileConfigInternal config; config.memory_optimization_level = enable_memory_optimization ? MemoryOptimizationWithoutFeeds @@ -98,7 +102,7 @@ void test(int argc, char *argv[]) { auto time1 = time(); if (paddle_mobile.Load("./checked_model/model", "./checked_model/params", - fuse, false, 1, true)) { + fuse, quantification, 1, true, quantification_fold)) { auto time2 = time(); std::cout << "auto-test" << " load-time-cost :" << time_diff(time1, time2) << "ms" diff --git a/mobile/test/net/test_op_in_net.cpp b/mobile/test/net/test_op_in_net.cpp index 4666f4133c..9425c02762 100644 --- a/mobile/test/net/test_op_in_net.cpp +++ b/mobile/test/net/test_op_in_net.cpp @@ -58,7 +58,7 @@ void test(int argc, char *argv[]) { auto time1 = time(); if (paddle_mobile.Load("./checked_model/model", "./checked_model/params", - fuse, false, 1, true)) { + fuse, false, 1, true, 1)) { auto time2 = time(); std::cout << "auto-test" << " load-time-cost :" << time_diff(time1, time2) << "ms" diff --git a/mobile/tools/pre-commit.hooks/cpplint.hook b/mobile/tools/pre-commit.hooks/cpplint.hook index 78ca3cfcdd..3740e64c73 100644 --- a/mobile/tools/pre-commit.hooks/cpplint.hook +++ b/mobile/tools/pre-commit.hooks/cpplint.hook @@ -5,7 +5,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \ grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \ - grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do + grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "^mobile/tools/quantification"); do cpplint $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done diff --git a/mobile/tools/python/fluidtools/run.py b/mobile/tools/python/fluidtools/run.py index a77943e2af..6fa5842009 100644 --- a/mobile/tools/python/fluidtools/run.py +++ b/mobile/tools/python/fluidtools/run.py @@ -22,6 +22,8 @@ checked_encrypt_model_path = "checked_encrypt_model" output_var_filter = [] output_key_filter = {} check_shape = False +quantification = False +quantification_fold = 1000 architecture = "arm-v7a" # architecture = "arm-v8a" @@ -107,7 +109,8 @@ def resave_model(feed_kv): for name in p_names: v = fluid.framework._get_var(name, prog) v.persistable = False - fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params") + if not quantification: + fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params") if has_found_wrong_shape: 
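test_net now expects two extra positional arguments, the quantification flag and the fold count, ahead of the feed description; the run.py change in the next hunk supplies them. The test binary reads them unconditionally, so a hedged sketch of a more defensive reader (hypothetical helper, same order and defaults) looks like this:

#include <string>

struct QuantArgs {
  bool quantification = false;   // default: quantification disabled
  int quantification_fold = 1;   // default: single-range behaviour
};

// Reads the two new positional test-net arguments if they are present.
static QuantArgs ParseQuantArgs(int argc, char *argv[], int &arg_index) {
  QuantArgs args;
  if (arg_index < argc) args.quantification = std::stoi(argv[arg_index++]) == 1;
  if (arg_index < argc) args.quantification_fold = std::stoi(argv[arg_index++]);
  return args;
}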
pp_red("has found wrong shape", 1) else: @@ -392,7 +395,7 @@ for op in ops: pp_tab("op types : {}".format(op_types), 1) def check_mobile_results(args, fuse, mem_opt): - args = "{} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", args) + args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args) res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net {}\"".format(mobile_exec_root, args)) lines = res.split("\n") # for line in lines: @@ -425,6 +428,26 @@ def check_mobile_results(args, fuse, mem_opt): fetch_names = [] for fetch in fetches: fetch_names.append(fetch.name) + fetch_diff = 0.0 + fetch_count = 0 + for index in op_cache: + op_output_var_name, op = op_cache[index] + if not op_output_var_name in output_var_cache: + continue + if not op_output_var_name in mobile_var_cache: + continue + if op_output_var_name not in fetch_names: + continue + values1 = output_var_cache[op_output_var_name] + values2 = mobile_var_cache[op_output_var_name] + shape = get_var_shape(op_output_var_name) if check_shape else [] + for i in range(len(values1)): + v1 = values1[i] + v2 = values2[len(shape) + i] + fetch_diff += abs(v1 - v2) + fetch_count += 1 + if fetch_count != 0: + pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1) for index in op_cache: op_output_var_name, op = op_cache[index] if mem_opt: diff --git a/mobile/tools/quantification/convert.cpp b/mobile/tools/quantification/convert.cpp index 3473f9a118..22be4ce5b9 100644 --- a/mobile/tools/quantification/convert.cpp +++ b/mobile/tools/quantification/convert.cpp @@ -68,7 +68,7 @@ std::shared_ptr loadParams(const std::string &model_path) { } -void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) { +void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file, int quantification_fold) { // 1. 
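The new run.py check accumulates the absolute difference between the fluid reference values and the on-device values for every fetch output and prints the average. The same metric in C++ form, as an illustrative helper only; the offset parameter stands in for the len(shape) prefix that the mobile dump carries when shape checking is enabled.

#include <cmath>
#include <cstddef>
#include <vector>

// Mean absolute difference between reference and on-device fetch outputs,
// mirroring the "output avg diff" metric added to run.py.
static double OutputAvgDiff(const std::vector<float> &reference,
                            const std::vector<float> &mobile,
                            size_t mobile_offset) {
  double total = 0.0;
  size_t count = 0;
  for (size_t i = 0;
       i < reference.size() && mobile_offset + i < mobile.size(); ++i) {
    total += std::fabs(reference[i] - mobile[i + mobile_offset]);
    ++count;
  }
  return count ? total / count : 0.0;
}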
version uint32_t version = *reinterpret_cast(*dataP); @@ -162,27 +162,33 @@ void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char } *dataP += tensorSize; - // for float 32 - float min_value = std::numeric_limits::max(); - float max_value = std::numeric_limits::min(); + int step = std::max(memory_size / quantification_fold, 1); - for (int k = 0; k < memory_size; ++k) { - min_value = std::min(min_value, static_cast (memory)[k]); - max_value = std::max(max_value, static_cast (memory)[k]); - } + int visited_fold = 0; + while (visited_fold * step < memory_size) { + // for float 32 + float min_value = std::numeric_limits::max(); + float max_value = std::numeric_limits::min(); + + for (int k = visited_fold * step; k < std::min((visited_fold + 1) * step, memory_size); ++k) { + min_value = std::min(min_value, static_cast (memory)[k]); + max_value = std::max(max_value, static_cast (memory)[k]); + } - fwrite(&min_value, sizeof(float), 1, out_file); - fwrite(&max_value, sizeof(float), 1, out_file); + fwrite(&min_value, sizeof(float), 1, out_file); + fwrite(&max_value, sizeof(float), 1, out_file); - for (int g = 0; g < memory_size; ++g) { - float value = static_cast (memory)[g]; - auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255); - fwrite(&factor, sizeof(uint8_t), 1, out_file); + for (int g = visited_fold * step; g < std::min((visited_fold + 1) * step, memory_size); ++g) { + float value = static_cast (memory)[g]; + auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255); + fwrite(&factor, sizeof(uint8_t), 1, out_file); + } + visited_fold++; } } void -quantificate_combined_int8(const std::string &model_path, const std::string ¶m_path, const std::string ¶m_min_path) { +quantificate_combined_int8(const std::string &model_path, const std::string ¶m_path, const std::string ¶m_min_path, int quantification_fold) { auto program = loadParams(model_path); char *origin_data = Get_binary_data(param_path); char *data = origin_data; @@ -193,7 +199,7 @@ quantificate_combined_int8(const std::string &model_path, const std::string &par if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadWithDumpForInt8(*var_desc, &data, out_file); + LoadWithDumpForInt8(*var_desc, &data, out_file, quantification_fold); } } } @@ -201,7 +207,7 @@ quantificate_combined_int8(const std::string &model_path, const std::string &par delete origin_data; } -void quantificate_seperated_int8(const std::string model_dir, const std::string param_min_path) { +void quantificate_seperated_int8(const std::string model_dir, const std::string param_min_path, int quantification_fold) { auto program = loadParams(model_dir + "/__model__"); std::string shell_command = "mkdir " + param_min_path; @@ -217,7 +223,7 @@ void quantificate_seperated_int8(const std::string model_dir, const std::string FILE *out_file = fopen(file_name.c_str(), "wb"); char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name()); char *data = origin_data; - LoadWithDumpForInt8(*var_desc, &data, out_file); + LoadWithDumpForInt8(*var_desc, &data, out_file, quantification_fold); delete origin_data; fclose(out_file); } @@ -225,7 +231,7 @@ void quantificate_seperated_int8(const std::string model_dir, const std::string } } -void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) { +void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file, int quantification_fold) { 
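The converter mirrors the loader: it walks the tensor in step-sized folds, writes the fold's min and max, then one rounded uint8 code per weight. A standalone sketch of that encoder follows; the name is hypothetical, min/max are seeded from the first element instead of numeric_limits, and the degenerate all-equal fold is guarded here even though the patch divides unconditionally.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative n-fold uint8 encoder matching LoadWithDumpForInt8's inner loop.
static void QuantizeNFold(const float *data, int memory_size, int fold,
                          FILE *out_file) {
  const int step = std::max(memory_size / fold, 1);
  for (int start = 0; start < memory_size; start += step) {
    const int end = std::min(start + step, memory_size);
    float min_value = data[start];
    float max_value = data[start];
    for (int k = start; k < end; ++k) {
      min_value = std::min(min_value, data[k]);
      max_value = std::max(max_value, data[k]);
    }
    fwrite(&min_value, sizeof(float), 1, out_file);
    fwrite(&max_value, sizeof(float), 1, out_file);
    const float range = max_value - min_value;
    for (int g = start; g < end; ++g) {
      // Guarded scaling; a constant fold maps every value to code 0.
      const float scaled = range > 0.f ? (data[g] - min_value) / range : 0.f;
      const uint8_t code = static_cast<uint8_t>(std::round(scaled * 255.f));
      fwrite(&code, sizeof(uint8_t), 1, out_file);
    }
  }
}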
// 1. version uint32_t version = *reinterpret_cast(*dataP); @@ -319,30 +325,36 @@ void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, c } *dataP += tensorSize; - // for float 32 - float min_value = std::numeric_limits::max(); - float max_value = std::numeric_limits::min(); + int step = std::max(memory_size / quantification_fold, 1); - for (int k = 0; k < memory_size; ++k) { - min_value = std::min(min_value, static_cast (memory)[k]); - max_value = std::max(max_value, static_cast (memory)[k]); - } + int visited_fold = 0; + while (visited_fold * step < memory_size) { + // for float 32 + float min_value = std::numeric_limits::max(); + float max_value = std::numeric_limits::min(); - float diff = 0.0; - for (int g = 0; g < memory_size; ++g) { - float value = static_cast (memory)[g]; - auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255); - float value_quantized = min_value + (factor / 255.0) * (max_value - min_value); - diff += fabs(value - value_quantized); - fwrite(&value_quantized, sizeof(float), 1, out_file); - } - if (memory_size > 0) { - std::cout << "avg diff caused by quantization for var " << var_desc.Name() << " is: " << (diff / memory_size) << std::endl; + for (int k = visited_fold * step; k < std::min((visited_fold + 1) * step, memory_size); ++k) { + min_value = std::min(min_value, static_cast (memory)[k]); + max_value = std::max(max_value, static_cast (memory)[k]); + } + + float diff = 0.0; + for (int g = visited_fold * step; g < std::min((visited_fold + 1) * step, memory_size); ++g) { + float value = static_cast (memory)[g]; + auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255); + float value_quantized = min_value + (factor / 255.0) * (max_value - min_value); + diff += fabs(value - value_quantized); + fwrite(&value_quantized, sizeof(float), 1, out_file); + } + if (memory_size > 0) { + std::cout << "avg diff caused by quantization for var " << var_desc.Name() << " is: " << (diff / memory_size) << std::endl; + } + visited_fold++; } } void -quantificate_combined_float32(const std::string &model_path, const std::string ¶m_path, const std::string ¶m_min_path) { +quantificate_combined_float32(const std::string &model_path, const std::string ¶m_path, const std::string ¶m_min_path, int quantification_fold) { auto program = loadParams(model_path); char *origin_data = Get_binary_data(param_path); char *data = origin_data; @@ -353,7 +365,7 @@ quantificate_combined_float32(const std::string &model_path, const std::string & if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadWithDumpForFloat32(*var_desc, &data, out_file); + LoadWithDumpForFloat32(*var_desc, &data, out_file, quantification_fold); } } } @@ -361,7 +373,7 @@ quantificate_combined_float32(const std::string &model_path, const std::string & delete origin_data; } -void quantificate_seperated_float32(const std::string model_dir, const std::string param_min_path) { +void quantificate_seperated_float32(const std::string model_dir, const std::string param_min_path, int quantification_fold) { auto program = loadParams(model_dir + "/__model__"); std::string shell_command = "mkdir " + param_min_path; @@ -377,7 +389,7 @@ void quantificate_seperated_float32(const std::string model_dir, const std::stri FILE *out_file = fopen(file_name.c_str(), "wb"); char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name()); char *data = origin_data; - LoadWithDumpForFloat32(*var_desc, &data, out_file); + 
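LoadWithDumpForFloat32 keeps the parameters in float32 but replaces every value with its quantized-then-dequantized counterpart, and reports the per-tensor average error; raising quantification_fold shrinks each fold's (min, max) range and therefore that error. A sketch of the same round-trip measurement for one fold, as a hypothetical helper:

#include <algorithm>
#include <cmath>

// Quantize one fold's values to uint8 and back, returning the summed absolute
// error, which is the quantity LoadWithDumpForFloat32 averages per tensor.
static double FoldRoundTripError(const float *data, int start, int end) {
  float min_value = data[start];
  float max_value = data[start];
  for (int k = start; k < end; ++k) {
    min_value = std::min(min_value, data[k]);
    max_value = std::max(max_value, data[k]);
  }
  const float range = max_value - min_value;
  double diff = 0.0;
  for (int g = start; g < end; ++g) {
    const float scaled = range > 0.f ? (data[g] - min_value) / range : 0.f;
    const float code = std::round(scaled * 255.f);
    const float dequantized = min_value + (code / 255.f) * range;
    diff += std::fabs(data[g] - dequantized);
  }
  return diff;
}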
LoadWithDumpForFloat32(*var_desc, &data, out_file, quantification_fold); delete origin_data; fclose(out_file); } @@ -402,10 +414,15 @@ int main(int argc, char **argv) { PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str()); std::string output_path = argv[3]; + int quantification_fold = 1; + if (argc > 4) { + quantification_fold = std::stoi(argv[4]); + } + if (action_type == "0") { // for seperated const std::string &seperated_min_dir = output_path; - quantificate_seperated_int8(base_path, seperated_min_dir); + quantificate_seperated_int8(base_path, seperated_min_dir, quantification_fold); return 0; } @@ -414,14 +431,14 @@ int main(int argc, char **argv) { const std::string &combined_min_dir = output_path; std::string model_path = base_path + "/model"; std::string param_path = base_path + "/params"; - quantificate_combined_int8(model_path, param_path, combined_min_dir); + quantificate_combined_int8(model_path, param_path, combined_min_dir, quantification_fold); return 0; } if (action_type == "2") { // for seperated const std::string &seperated_min_dir = output_path; - quantificate_seperated_float32(base_path, seperated_min_dir); + quantificate_seperated_float32(base_path, seperated_min_dir, quantification_fold); return 0; } @@ -430,7 +447,7 @@ int main(int argc, char **argv) { const std::string &combined_min_dir = output_path; std::string model_path = base_path + "/model"; std::string param_path = base_path + "/params"; - quantificate_combined_float32(model_path, param_path, combined_min_dir); + quantificate_combined_float32(model_path, param_path, combined_min_dir, quantification_fold); return 0; } -- GitLab
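Note that the fold count is not recorded in the emitted params file, so the value passed to the converter (argv[4], defaulting to 1) has to match the quantification_fold handed to PaddleMobile::Load or run.py at inference time; otherwise the loader's step size, and hence the byte offsets of each fold's min/max pair, will not line up. One way to keep the two sides in sync is a shared constant; the constant name and the converter binary name below are illustrative only.

// The converter and the runtime must agree on the fold count, since it is not
// stored in the quantized params file.
constexpr int kQuantificationFold = 1000;  // matches run.py's default

// Offline (int8, combined format, hypothetical binary name):
//   ./convert 1 <model_dir> <output_dir> 1000
// Runtime:
//   paddle_mobile.Load(model_path, param_path, optimize,
//                      /*quantification=*/true, /*batch_size=*/1,
//                      /*lod_mode=*/false, kQuantificationFold);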