Unverified commit 4be6a2c1 authored by WangLiu, committed by GitHub

Merge pull request #614 from cocodark/develop

add quantification tool to compress binary size
......@@ -9,6 +9,7 @@ option(LOG_PROFILE "log profile" ON)
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
option(QUANTI "quantification" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
......@@ -152,3 +153,7 @@ if(DEBUGING)
endif()
endif()
if (QUANTI)
add_subdirectory(tools/quantification)
endif ()
......@@ -30,6 +30,7 @@ class Program {
std::string model_path;
std::string para_path;
bool combined = false;
bool quantification = false;
private:
};
......
......@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
......@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = (uint8_t *)(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
}
(*data) += (sizeof(char) * memory_size * type_size);
}
template <typename Dtype, Precision P>
......
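The branch added to Executor::LoadMemory above reads a float min and a float max ahead of the weight bytes and maps each uint8 back to float as byte * (max - min) / 255 + min. A minimal standalone sketch of that decode step; the helper name and signature are illustrative, not part of the patch:

#include <cstdint>
#include <vector>

// Decode a quantified payload: byte b maps back to b * (max - min) / 255 + min,
// mirroring the new quantification branch in Executor::LoadMemory.
std::vector<float> DequantizeUint8(const uint8_t *bytes, int count,
                                   float min_value, float max_value) {
  const float factor = (max_value - min_value) / 255.0f;
  std::vector<float> values(count);
  for (int k = 0; k < count; ++k) {
    values[k] = bytes[k] * factor + min_value;
  }
  return values;
}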
......@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
const std::string &dirname, bool optimize, bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
program.model_path = dirname;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path,
bool optimize) {
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
......
......@@ -30,6 +30,7 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
......@@ -38,11 +39,13 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
bool optimize = false,
bool quantification = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
};
......
......@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(dirname, optimize), batch_size, optimize);
loader_->Load(dirname, optimize, quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize), batch_size, optimize);
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......
......@@ -39,15 +39,15 @@ class PaddleMobile {
* @b load a fluid model whose model and param files are stored separately
* */
bool Load(const std::string &dirname, bool optimize = false,
int batch_size = 1);
bool quantification = false, int batch_size = 1);
/*
* @b load the combined-format fluid model
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, int batch_size = 1);
bool optimize = false, bool quantification = false,
int batch_size = 1);
/*
* @b set the thread count; takes effect when openmp is enabled in cmake
* */
......
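With the widened Load signatures above, callers opt in by passing quantification = true. A hypothetical caller, assuming the FP32 precision used elsewhere in this pull request; the variable name and model directory are placeholders:

#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> mobile;
  // The directory stands in for a model whose params were rewritten by the
  // convert tool; optimize = true, quantification = true, batch_size = 1.
  bool ok = mobile.Load("../models/quantified_model", true, true, 1);
  return ok ? 0 : 1;
}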
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
ADD_EXECUTABLE(convert convert.cpp)
target_link_libraries(convert paddle-mobile)
\ No newline at end of file
#include "io/paddle_mobile.h"
#include <cstdlib>
using std::string;
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_googlenet = "../models/googlenet";
using paddle_mobile::Executor;
using paddle_mobile::framework::Program;
char *Get_binary_data(std::string filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
DLOG << "Get_binary_data end";
fclose(file);
return data;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc var_desc,
paddle_mobile::framework::LoDTensor *tensor, char **data, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
// write version
fwrite(&version, sizeof(uint32_t), 1, out_file );
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = 0;
// write lod information (always written as 0; the level read above is discarded)
fwrite(&lod_level, sizeof(uint64_t), 1, out_file);
delete lod_level_ptr;
(*data) += sizeof(uint64_t);
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
// write lod size
fwrite(&size, sizeof(uint64_t), 1, out_file);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
// write lod size vector
fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
lod[i] = tmp;
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
// write tensor version
fwrite(&tensor_version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
}
fwrite(buf.get(), sizeof(char), size, out_file);
(*data) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
tensor->Resize(paddle_mobile::framework::make_ddim(desc.Dims()));
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
// for float32 weights: record the value range, then quantize each weight to one byte
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::lowest();
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *>(memory)[k]);
max_value = std::max(max_value, static_cast<float *>(memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *>(memory)[g];
uint8_t quantized = static_cast<uint8_t>(
    round((value - min_value) / (max_value - min_value) * 255));
fwrite(&quantized, sizeof(uint8_t), 1, out_file);
}
}
void quantificate_combined(std::string model_path, std::string param_path, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32> loader;
bool optimize = true;
auto program = loader.Load(model_path, param_path, optimize);
char *origin_data = Get_binary_data(program.para_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, tensor, &data, out_file);
}
}
}
fclose(out_file);
delete[] origin_data;  // allocated with new[] in Get_binary_data
}
void quantificate_seperated(std::string model_dir, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32> loader;
bool optimize = true;
auto program = loader.Load(model_dir, optimize);
std::string shell_command = "mkdir " + param_min_path;
system(shell_command.c_str());
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path + "/" + var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data =
Get_binary_data(program.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, tensor, &data, out_file);
delete[] origin_data;  // allocated with new[] in Get_binary_data
fclose(out_file);
}
}
}
}
int main() {
std::string filename = "params_min";
std::string model_path = g_googlenet_combine + "/model";
std::string param_path = g_googlenet_combine + "/params";
std::string dirname = "param_min_dir";
std::string model_dir = g_googlenet;
// quantificate_combined(model_path, param_path,filename);
quantificate_seperated(model_dir, dirname);
return 0;
}
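For each persistable tensor, the tool keeps the original header fields and replaces the FP32 payload with the float min, the float max, and memory_size uint8 codes, each weight w stored as round((w - min) / (max - min) * 255). A hedged sketch of that encode step as a standalone helper; the name and signature are illustrative, not part of the tool:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// Encode an FP32 buffer the way LoadWithDump does: scan the value range, then
// map each value onto 255 evenly spaced levels between min and max.
std::vector<uint8_t> QuantizeToUint8(const float *values, int count,
                                     float *min_out, float *max_out) {
  float min_value = std::numeric_limits<float>::max();
  float max_value = std::numeric_limits<float>::lowest();
  for (int k = 0; k < count; ++k) {
    min_value = std::min(min_value, values[k]);
    max_value = std::max(max_value, values[k]);
  }
  std::vector<uint8_t> codes(count);
  for (int k = 0; k < count; ++k) {
    codes[k] = static_cast<uint8_t>(
        std::round((values[k] - min_value) / (max_value - min_value) * 255));
  }
  *min_out = min_value;
  *max_out = max_value;
  return codes;
}

For example, with min = -1 and max = 1 a weight of 0.5 is stored as round(1.5 / 2 * 255) = 191 and decoded back to 191 * 2 / 255 - 1 ≈ 0.498, so the per-weight error is bounded by half a quantization step, (max - min) / 510.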