Unverified commit 7e9bb98a authored by Yanzhan Yang, committed by GitHub

add n-fold quantification algorithm (#2164)

* 1. add quantification_fold parameter. 2. support quantification test in run.py.

* implement n-fold quantification
Parent 4b9df8fb
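
This commit replaces the single global min/max quantization range per tensor with one range per fold: a tensor of `size` elements is split into segments of `step = max(size / quantification_fold, 1)` elements, each segment stores its own float min/max followed by its uint8 codes, and the loader rebuilds the floats segment by segment. The standalone sketch below is illustrative only (the type and function names are mine, not the repo's) and shows the round trip implied by LoadWithDumpForInt8 and LoadMemInternal:

    // Illustrative sketch of the n-fold scheme: the tensor is cut into
    // segments of step = max(size / fold, 1) elements, each segment is
    // quantized to uint8 against its own [min, max] range, and
    // dequantization rebuilds the floats segment by segment.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    struct QuantizedFold {
      float min_value;
      float max_value;
      std::vector<uint8_t> codes;  // one uint8 code per element of the segment
    };

    std::vector<QuantizedFold> QuantizeNFold(const std::vector<float> &data,
                                             int fold) {
      const int size = static_cast<int>(data.size());
      const int step = std::max(size / fold, 1);
      std::vector<QuantizedFold> folds;
      for (int begin = 0; begin < size; begin += step) {
        const int end = std::min(begin + step, size);
        QuantizedFold f;
        f.min_value = *std::min_element(data.begin() + begin, data.begin() + end);
        f.max_value = *std::max_element(data.begin() + begin, data.begin() + end);
        const float range = f.max_value - f.min_value;
        for (int i = begin; i < end; ++i) {
          // Guard against a constant segment (range == 0), which the original
          // code does not special-case.
          const float scaled = range > 0.f ? (data[i] - f.min_value) / range : 0.f;
          f.codes.push_back(static_cast<uint8_t>(std::round(scaled * 255.0f)));
        }
        folds.push_back(f);
      }
      return folds;
    }

    std::vector<float> DequantizeNFold(const std::vector<QuantizedFold> &folds) {
      std::vector<float> out;
      for (const auto &f : folds) {
        const float factor = (f.max_value - f.min_value) / 255.0f;
        for (const uint8_t code : f.codes) {
          out.push_back(code * factor + f.min_value);  // mirrors LoadMemInternal
        }
      }
      return out;
    }

Raising the fold count (run.py now defaults it to 1000) stores more (min, max) pairs but shrinks each segment's range, which is why the "avg diff caused by quantization" printed by the tool, and the new "output avg diff" check in run.py, go down.
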
......@@ -173,24 +173,33 @@ void Executor<Device, T>::InitFeedFetchList() {
}
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
bool quant_uint8 = false) {
char **data_buf = reinterpret_cast<char **>(data);
int64_t size = tensor->numel();
T *tensor_data = tensor->mutable_data<T>();
static void LoadMemInternal(void **in_data, void *out_data, int64_t size,
bool quant_uint8 = false, int quant_fold = 1) {
char **data_buf = reinterpret_cast<char **>(in_data);
T *tensor_data = reinterpret_cast<T *>(out_data);
if (quant_uint8) {
// should be moved into operator init function
float min_value;
float max_value;
memory::Copy(&min_value, *data_buf, sizeof(float));
memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
*data_buf += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
for (int k = 0; k < size; ++k) {
tensor_data[k] = uint8_data[k] * factor + min_value;
int step = fmax(size / quant_fold, 1);
int visited_fold = 0;
while (visited_fold * step < size) {
// should be moved into operator init function
float min_value;
float max_value;
memory::Copy(&min_value, *data_buf, sizeof(float));
memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
*data_buf += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
int k = 0;
for (; k < step; ++k) {
int tensor_data_idx = visited_fold * step + k;
if (tensor_data_idx >= size) {
break;
}
tensor_data[tensor_data_idx] = uint8_data[k] * factor + min_value;
}
*data_buf += k * sizeof(uint8_t);
visited_fold++;
}
*data_buf += size * sizeof(uint8_t);
} else {
memory::Copy(tensor_data, *data_buf, size * sizeof(T));
*data_buf += size * sizeof(T);
......@@ -235,14 +244,20 @@ void Executor<Device, T>::LoadMemory(void **data,
// parse tensor from stream
switch (tensor_desc.DataType()) {
case VARTYPE_TYPE_FP32:
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
program_.quantification);
LoadMemInternal<float>(
reinterpret_cast<void **>(data_buf),
reinterpret_cast<void *>(tensor->mutable_data<T>()), tensor->numel(),
program_.quantification, program_.quantification_fold);
break;
case VARTYPE_TYPE_INT8:
LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
LoadMemInternal<int8_t>(
reinterpret_cast<void **>(data_buf),
reinterpret_cast<void *>(tensor->mutable_data<T>()), tensor->numel());
break;
case VARTYPE_TYPE_INT32:
LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
LoadMemInternal<int>(reinterpret_cast<void **>(data_buf),
reinterpret_cast<void *>(tensor->mutable_data<T>()),
tensor->numel());
break;
default:
LOG(kLOG_ERROR) << "data type is not supported";
......@@ -944,31 +959,10 @@ void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
void *memory = nullptr;
int type_size = 4;
memory = tensorInput;
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size; n++) {
float value;
memcpy(&value, *data + n * type_size, type_size);
if (value < 1e-30 && value > -1e-30) {
static_cast<float *>(memory)[n] = 0.0;
} else {
static_cast<float *>(memory)[n] = value;
}
}
(*data) += (sizeof(char) * memory_size * type_size);
}
LoadMemInternal<float>(reinterpret_cast<void **>(data),
reinterpret_cast<void *>(memory), memory_size,
program_.quantification, program_.quantification_fold);
}
template <>
......
......@@ -87,7 +87,8 @@ void Loader<GPU_CL, float>::InitMemoryFromProgram(
template <>
const Program<GPU_CL, float> Loader<GPU_CL, float>::LoadCombinedMemory(
size_t read_size, const uint8_t *buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize, bool quantification) {
uint8_t *combined_params_buf, bool optimize, bool quantification,
int quantification_fold) {
bool can_add_split = false;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
......@@ -109,6 +110,7 @@ const Program<GPU_CL, float> Loader<GPU_CL, float>::LoadCombinedMemory(
program.quantification = quantification;
program.combined_params_len = combined_params_len;
program.combined_params_buf = combined_params_buf;
program.quantification_fold = quantification_fold;
auto scope = std::make_shared<Scope>();
program.scope = scope;
......@@ -187,9 +189,11 @@ template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::Load(const std::string &dirname,
bool optimize,
bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
bool can_add_split,
int quantification_fold) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, quantification,
can_add_split, quantification_fold);
program.model_path = dirname;
return program;
}
......@@ -198,8 +202,10 @@ template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::Load(const std::string &model_path,
const std::string &para_path,
bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize, quantification);
bool quantification,
int quantification_fold) {
auto program = this->LoadProgram(model_path, optimize, quantification, false,
quantification_fold);
program.para_path = para_path;
program.combined = true;
......@@ -210,7 +216,7 @@ const Program<Device, T> Loader<Device, T>::Load(const std::string &model_path,
template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::LoadProgram(
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
bool can_add_split, int quantification_fold) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -232,6 +238,7 @@ const Program<Device, T> Loader<Device, T>::LoadProgram(
program.quantification = quantification;
program.combined_params_len = 0;
program.combined_params_buf = nullptr;
program.quantification_fold = quantification_fold;
auto scope = std::make_shared<Scope>();
program.scope = scope;
......@@ -248,7 +255,8 @@ const Program<Device, T> Loader<Device, T>::LoadProgram(
template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::LoadCombinedMemory(
size_t read_size, const uint8_t *buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize, bool quantification) {
uint8_t *combined_params_buf, bool optimize, bool quantification,
int quantification_fold) {
bool can_add_split = false;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
......@@ -270,6 +278,7 @@ const Program<Device, T> Loader<Device, T>::LoadCombinedMemory(
program.quantification = quantification;
program.combined_params_len = combined_params_len;
program.combined_params_buf = combined_params_buf;
program.quantification_fold = quantification_fold;
auto scope = std::make_shared<Scope>();
program.scope = scope;
......
......@@ -32,7 +32,8 @@ class Loader {
const Program<Device, T> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
bool can_add_split = false,
int quantification_fold = 1);
/*
* @b load combine format fluid mode
......@@ -41,20 +42,20 @@ class Loader {
const Program<Device, T> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false,
bool quantification = false);
bool quantification = false,
int quantification_fold = 1);
const Program<Device, T> LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf,
bool optimize = false,
bool quantification = false);
const Program<Device, T> LoadCombinedMemory(
size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize = false,
bool quantification = false, int quantification_fold = 1);
private:
const Program<Device, T> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
bool can_add_split = false,
int quantification_fold = 1);
void InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
......
......@@ -34,6 +34,7 @@ class Program {
bool quantification = false;
size_t combined_params_len;
uint8_t *combined_params_buf;
int quantification_fold = 1;
};
} // namespace framework
......
......@@ -216,6 +216,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
int batch_size = 1;
bool optimize = true;
bool quantification = false;
int quantification_fold = 1;
bool lod_mode = false;
int thread_num = 1;
bool load_when_predict = false;
......
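
PaddleMobileConfig grows a matching field, so an application that already goes through PaddleMobile::Load(config) only has to set it. Below is a hedged usage sketch, not part of this diff: the header path, the paddle_mobile::CPU device tag, and the model directory are assumptions about this repo's layout. The fold value must equal the one the quantification tool was run with, since the parameter file's per-fold (min, max) layout depends on it.

    // Hypothetical usage sketch (assumed header and device tag, not from the diff).
    #include "io/paddle_mobile.h"  // assumed header for PaddleMobile / PaddleMobileConfig

    int main() {
      paddle_mobile::PaddleMobileConfig config;
      config.model_dir = "./quantified_model";  // hypothetical output of the quantification tool
      config.quantification = true;             // params are stored as per-fold uint8
      config.quantification_fold = 1000;        // must match the fold count used when quantizing
      paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> engine;
      engine.Load(config);                      // forwards quantification_fold down to the Loader
      return 0;
    }
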
......@@ -37,7 +37,8 @@ void PaddleMobile<Device, T>::SetThreadNum(int thread_num,
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
bool optimize, bool quantification,
int batch_size, bool lod_mode) {
int batch_size, bool lod_mode,
int quantification_fold) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
......@@ -46,8 +47,9 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->Load(dirname, optimize, quantification), config_, batch_size,
optimize, lod_mode);
loader_->Load(dirname, optimize, quantification, false,
quantification_fold),
config_, batch_size, optimize, lod_mode);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......@@ -59,7 +61,8 @@ template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
const std::string &para_path,
bool optimize, bool quantification,
int batch_size, bool lod_mode) {
int batch_size, bool lod_mode,
int quantification_fold) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
......@@ -69,8 +72,9 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->Load(model_path, para_path, optimize, quantification), config_,
batch_size, optimize, lod_mode);
loader_->Load(model_path, para_path, optimize, quantification,
quantification_fold),
config_, batch_size, optimize, lod_mode);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......@@ -82,11 +86,12 @@ template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Load(const PaddleMobileConfig &config) {
if (!config.model_dir.empty()) {
return this->Load(config.model_dir, config.optimize, config.quantification,
config.batch_size, config.lod_mode);
config.batch_size, config.lod_mode,
config.quantification_fold);
} else if (!config.prog_file.empty() && !config.param_file.empty()) {
return this->Load(config.prog_file, config.param_file, config.optimize,
config.quantification, config.batch_size,
config.lod_mode);
config.quantification, config.batch_size, config.lod_mode,
config.quantification_fold);
} else {
LOG(kLOG_ERROR) << "Failed to load inference model";
return PMNotInitialized;
......@@ -97,7 +102,7 @@ template <typename Device, typename T>
bool PaddleMobile<Device, T>::LoadCombinedMemory(
size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize, bool quantification,
int batch_size, bool lod_mode) {
int batch_size, bool lod_mode, int quantification_fold) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
......@@ -107,7 +112,7 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
combined_params_buf, optimize,
quantification),
quantification, quantification_fold),
config_, batch_size, optimize, lod_mode);
} else {
LOG(kLOG_INFO) << "executor inited";
......
......@@ -50,10 +50,11 @@ class PaddleMobile {
PMStatus Load(const std::string &dirname, const bool optimize = false,
const bool quantification = false, const int batch_size = 1,
const bool lod_mode = false);
const bool lod_mode = false, const int quantification_fold = 1);
PMStatus Load(const std::string &model_path, const std::string &para_path,
const bool optimize = false, const bool quantification = false,
const int batch_size = 1, const bool lod_mode = false);
const int batch_size = 1, const bool lod_mode = false,
const int quantification_fold = 1);
PMStatus Load(const PaddleMobileConfig &config);
......@@ -84,7 +85,7 @@ class PaddleMobile {
size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize = false,
bool quantification = false, int batch_size = 1,
bool lod_mode = false);
bool lod_mode = false, int quantification_fold = 1);
void SetThreadNum(int thread_num,
PowerMode power_mode = PERFORMANCE_PRIORITY);
......
......@@ -31,6 +31,10 @@ void test(int argc, char *argv[]) {
arg_index++;
bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1;
arg_index++;
bool quantification = std::stoi(argv[arg_index]) == 1;
arg_index++;
int quantification_fold = std::stoi(argv[arg_index]);
arg_index++;
paddle_mobile::PaddleMobileConfigInternal config;
config.memory_optimization_level = enable_memory_optimization
? MemoryOptimizationWithoutFeeds
......@@ -98,7 +102,7 @@ void test(int argc, char *argv[]) {
auto time1 = time();
if (paddle_mobile.Load("./checked_model/model", "./checked_model/params",
fuse, false, 1, true)) {
fuse, quantification, 1, true, quantification_fold)) {
auto time2 = time();
std::cout << "auto-test"
<< " load-time-cost :" << time_diff(time1, time2) << "ms"
......
......@@ -58,7 +58,7 @@ void test(int argc, char *argv[]) {
auto time1 = time();
if (paddle_mobile.Load("./checked_model/model", "./checked_model/params",
fuse, false, 1, true)) {
fuse, false, 1, true, 1)) {
auto time2 = time();
std::cout << "auto-test"
<< " load-time-cost :" << time_diff(time1, time2) << "ms"
......
......@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "^mobile/tools/quantification"); do
cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
......
......@@ -22,6 +22,8 @@ checked_encrypt_model_path = "checked_encrypt_model"
output_var_filter = []
output_key_filter = {}
check_shape = False
quantification = False
quantification_fold = 1000
architecture = "arm-v7a"
# architecture = "arm-v8a"
......@@ -107,7 +109,8 @@ def resave_model(feed_kv):
for name in p_names:
v = fluid.framework._get_var(name, prog)
v.persistable = False
fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
if not quantification:
fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
if has_found_wrong_shape:
pp_red("has found wrong shape", 1)
else:
......@@ -392,7 +395,7 @@ for op in ops:
pp_tab("op types : {}".format(op_types), 1)
def check_mobile_results(args, fuse, mem_opt):
args = "{} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", args)
args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args)
res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net {}\"".format(mobile_exec_root, args))
lines = res.split("\n")
# for line in lines:
......@@ -425,6 +428,26 @@ def check_mobile_results(args, fuse, mem_opt):
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
fetch_diff = 0.0
fetch_count = 0
for index in op_cache:
op_output_var_name, op = op_cache[index]
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
fetch_diff += abs(v1 - v2)
fetch_count += 1
if fetch_count != 0:
pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
......
......@@ -68,7 +68,7 @@ std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
}
void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) {
void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file, int quantification_fold) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
......@@ -162,27 +162,33 @@ void LoadWithDumpForInt8(const paddle_mobile::framework::VarDesc &var_desc, char
}
*dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
int step = std::max(memory_size / quantification_fold, 1);
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
int visited_fold = 0;
while (visited_fold * step < memory_size) {
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
for (int k = visited_fold * step; k < std::min((visited_fold + 1) * step, memory_size); ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
for (int g = visited_fold * step; g < std::min((visited_fold + 1) * step, memory_size); ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
}
visited_fold++;
}
}
void
quantificate_combined_int8(const std::string &model_path, const std::string &param_path, const std::string &param_min_path) {
quantificate_combined_int8(const std::string &model_path, const std::string &param_path, const std::string &param_min_path, int quantification_fold) {
auto program = loadParams(model_path);
char *origin_data = Get_binary_data(param_path);
char *data = origin_data;
......@@ -193,7 +199,7 @@ quantificate_combined_int8(const std::string &model_path, const std::string &par
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDumpForInt8(*var_desc, &data, out_file);
LoadWithDumpForInt8(*var_desc, &data, out_file, quantification_fold);
}
}
}
......@@ -201,7 +207,7 @@ quantificate_combined_int8(const std::string &model_path, const std::string &par
delete origin_data;
}
void quantificate_seperated_int8(const std::string model_dir, const std::string param_min_path) {
void quantificate_seperated_int8(const std::string model_dir, const std::string param_min_path, int quantification_fold) {
auto program = loadParams(model_dir + "/__model__");
std::string shell_command = "mkdir " + param_min_path;
......@@ -217,7 +223,7 @@ void quantificate_seperated_int8(const std::string model_dir, const std::string
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDumpForInt8(*var_desc, &data, out_file);
LoadWithDumpForInt8(*var_desc, &data, out_file, quantification_fold);
delete origin_data;
fclose(out_file);
}
......@@ -225,7 +231,7 @@ void quantificate_seperated_int8(const std::string model_dir, const std::string
}
}
void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) {
void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file, int quantification_fold) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
......@@ -319,30 +325,36 @@ void LoadWithDumpForFloat32(const paddle_mobile::framework::VarDesc &var_desc, c
}
*dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
int step = std::max(memory_size / quantification_fold, 1);
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
int visited_fold = 0;
while (visited_fold * step < memory_size) {
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
float diff = 0.0;
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
float value_quantized = min_value + (factor / 255.0) * (max_value - min_value);
diff += fabs(value - value_quantized);
fwrite(&value_quantized, sizeof(float), 1, out_file);
}
if (memory_size > 0) {
std::cout << "avg diff caused by quantization for var " << var_desc.Name() << " is: " << (diff / memory_size) << std::endl;
for (int k = visited_fold * step; k < std::min((visited_fold + 1) * step, memory_size); ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
float diff = 0.0;
for (int g = visited_fold * step; g < std::min((visited_fold + 1) * step, memory_size); ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
float value_quantized = min_value + (factor / 255.0) * (max_value - min_value);
diff += fabs(value - value_quantized);
fwrite(&value_quantized, sizeof(float), 1, out_file);
}
if (memory_size > 0) {
std::cout << "avg diff caused by quantization for var " << var_desc.Name() << " is: " << (diff / memory_size) << std::endl;
}
visited_fold++;
}
}
void
quantificate_combined_float32(const std::string &model_path, const std::string &param_path, const std::string &param_min_path) {
quantificate_combined_float32(const std::string &model_path, const std::string &param_path, const std::string &param_min_path, int quantification_fold) {
auto program = loadParams(model_path);
char *origin_data = Get_binary_data(param_path);
char *data = origin_data;
......@@ -353,7 +365,7 @@ quantificate_combined_float32(const std::string &model_path, const std::string &
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDumpForFloat32(*var_desc, &data, out_file);
LoadWithDumpForFloat32(*var_desc, &data, out_file, quantification_fold);
}
}
}
......@@ -361,7 +373,7 @@ quantificate_combined_float32(const std::string &model_path, const std::string &
delete origin_data;
}
void quantificate_seperated_float32(const std::string model_dir, const std::string param_min_path) {
void quantificate_seperated_float32(const std::string model_dir, const std::string param_min_path, int quantification_fold) {
auto program = loadParams(model_dir + "/__model__");
std::string shell_command = "mkdir " + param_min_path;
......@@ -377,7 +389,7 @@ void quantificate_seperated_float32(const std::string model_dir, const std::stri
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDumpForFloat32(*var_desc, &data, out_file);
LoadWithDumpForFloat32(*var_desc, &data, out_file, quantification_fold);
delete origin_data;
fclose(out_file);
}
......@@ -402,10 +414,15 @@ int main(int argc, char **argv) {
PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
std::string output_path = argv[3];
int quantification_fold = 1;
if (argc > 4) {
quantification_fold = std::stoi(argv[4]);
}
if (action_type == "0") {
// for seperated
const std::string &seperated_min_dir = output_path;
quantificate_seperated_int8(base_path, seperated_min_dir);
quantificate_seperated_int8(base_path, seperated_min_dir, quantification_fold);
return 0;
}
......@@ -414,14 +431,14 @@ int main(int argc, char **argv) {
const std::string &combined_min_dir = output_path;
std::string model_path = base_path + "/model";
std::string param_path = base_path + "/params";
quantificate_combined_int8(model_path, param_path, combined_min_dir);
quantificate_combined_int8(model_path, param_path, combined_min_dir, quantification_fold);
return 0;
}
if (action_type == "2") {
// for seperated
const std::string &seperated_min_dir = output_path;
quantificate_seperated_float32(base_path, seperated_min_dir);
quantificate_seperated_float32(base_path, seperated_min_dir, quantification_fold);
return 0;
}
......@@ -430,7 +447,7 @@ int main(int argc, char **argv) {
const std::string &combined_min_dir = output_path;
std::string model_path = base_path + "/model";
std::string param_path = base_path + "/params";
quantificate_combined_float32(model_path, param_path, combined_min_dir);
quantificate_combined_float32(model_path, param_path, combined_min_dir, quantification_fold);
return 0;
}
......