提交 6ef17f09 编写于 作者: W wangliu

add quantification tool to compress binary size

上级 f2f0217c
......@@ -9,6 +9,7 @@ option(LOG_PROFILE "log profile" ON)
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
option(QUANTI "quantification" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
......@@ -153,3 +154,7 @@ if(DEBUGING)
endif ()
......@@ -30,6 +30,7 @@ class Program {
std::string model_path;
std::string para_path;
bool combined = false;
bool quantification = false;
......@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
void *memory = tensor;
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
......@@ -179,11 +179,26 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float) , sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = (uint8_t *) (*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
(*data) += (sizeof(char) * memory_size * type_size);
template <typename Dtype, Precision P>
......@@ -44,9 +44,9 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool can_add_split) {
const std::string &dirname, bool optimize,bool quantification, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
this->LoadProgram(dirname + "/__model__", optimize,quantification, can_add_split);
program.model_path = dirname;
return program;
......@@ -54,16 +54,17 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path,
bool optimize) {
bool optimize, bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
return program;
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
const std::string &model_path, bool optimize, bool quantification, bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -82,6 +83,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
......@@ -30,6 +30,7 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
......@@ -38,11 +39,13 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
bool optimize = false,
bool quantification = false);
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
......@@ -25,7 +25,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize, bool quantification,
int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
......@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(dirname, optimize), batch_size, optimize);
loader_->Load(dirname, optimize,quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
......@@ -45,7 +45,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
const std::string &para_path, bool optimize, bool quantification,
int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
......@@ -55,7 +55,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize), batch_size, optimize);
loader_->Load(model_path, para_path, optimize, quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
......@@ -38,7 +38,7 @@ class PaddleMobile {
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* */
bool Load(const std::string &dirname, bool optimize = false,
bool Load(const std::string &dirname, bool optimize = false, bool quantification = false,
int batch_size = 1);
......@@ -46,7 +46,7 @@ class PaddleMobile {
* @b 加载结合在一起格式的模型
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, int batch_size = 1);
bool optimize = false,bool quantification = false, int batch_size = 1);
void SetThreadNum(int num);
......@@ -21,7 +21,7 @@ int main() {
bool optimize = true;
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
if (paddle_mobile.Load(g_googlenet, optimize, true)) {
auto time2 = time();
DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::vector<float> input;
# Build the `convert` executable from convert.cpp and link it against
# paddle-mobile.
ADD_EXECUTABLE(convert convert.cpp)
target_link_libraries(convert paddle-mobile)
\ No newline at end of file
#include "io/paddle_mobile.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <vector>
using std::string;
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_googlenet = "../models/googlenet";
using paddle_mobile::Executor;
using paddle_mobile::framework::Program;
/**
 * Reads the complete contents of `filename` into a newly allocated buffer.
 *
 * @param filename path of the binary file to read.
 * @return pointer to a `new char[]` buffer holding the file bytes; the caller
 *         owns it and must release it with `delete[]`.
 *
 * Open failures, empty files and short reads are reported through
 * PADDLE_MOBILE_ENFORCE.
 */
char *Get_binary_data(std::string filename) {
  FILE *file = fopen(filename.c_str(), "rb");
  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
                        filename.c_str());

  // Determine the file size by seeking to the end ...
  fseek(file, 0, SEEK_END);
  int64_t size = ftell(file);
  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
  // ... and go back to the beginning, otherwise fread() starts at EOF and
  // returns no data.
  rewind(file);

  char *data = new char[size];
  size_t bytes_read = fread(data, 1, size, file);
  // The handle is no longer needed once the bytes are in memory.
  fclose(file);
  PADDLE_MOBILE_ENFORCE(bytes_read == static_cast<size_t>(size),
                        "read binary file bytes do not match with fseek");
  DLOG << "Get_binary_data end";
  return data;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc var_desc,
paddle_mobile::framework::LoDTensor *tensor, char **data, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
// write version
fwrite(&version, sizeof(uint32_t), 1, out_file );
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, sizeof(uint64_t), 1, out_file);
delete lod_level_ptr;
(*data) += sizeof(uint64_t);
auto &lod = *tensor->mutable_lod();
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
// write lod size
fwrite(&size, sizeof(uint64_t), 1, out_file);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
// write lod size vector
fwrite(&tmp, sizeof(size_t), tmp.size(), out_file );
lod[i] = tmp;
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
// write tensor version
fwrite(&tensor_version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
fwrite(buf.get(), sizeof(char), size, out_file);
(*data) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
(*data) += (sizeof(char) * memory_size * type_size);
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
uint8_t factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
void quantificate_combined(std::string model_path, std::string param_path, std::string param_min_path){
paddle_mobile::Loader<paddle_mobile::CPU,paddle_mobile::Precision::FP32 > loader;
bool optimize = true;
auto program = loader.Load(model_path, param_path, optimize);
char *origin_data = Get_binary_data(program.para_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if(var_desc ->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
LoadWithDump(*var_desc, tensor, &data,out_file);
delete origin_data;
void quantificate_seperated(std::string model_dir, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU,paddle_mobile::Precision::FP32 > loader;
bool optimize = true;
auto program = loader.Load(model_dir, optimize);
std::string shell_command = "mkdir "+param_min_path;
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if(var_desc ->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
std::string file_name = param_min_path +"/"+ var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data =
Get_binary_data(program.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, tensor, &data,out_file);
delete origin_data;
// Entry point of the conversion tool. Only the separated-format conversion is
// enabled; the combined-format call is kept commented out together with the
// arguments it would use.
int main() {
  // Arguments for the (disabled) combined-format conversion.
  const std::string combined_output = "params_min";
  const std::string combined_model = g_googlenet_combine + "/model";
  const std::string combined_params = g_googlenet_combine + "/params";

  // Arguments for the separated-format conversion.
  const std::string separated_output = "param_min_dir";
  const std::string separated_model = g_googlenet;

  // quantificate_combined(combined_model, combined_params, combined_output);
  quantificate_seperated(separated_model, separated_output);
  return 0;
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册