Unverified commit 23231af8 authored by cc, committed by GitHub

Optimize weight quantization (#3374)

* Optimize weight quantization, test=develop
Parent 40a31442
......@@ -27,6 +27,9 @@
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(optimized_model_path,
"",
"the path of the model that is optimized by opt.");
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
......@@ -61,10 +64,7 @@ DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
"result.txt",
"save the inference time to the file.");
DEFINE_bool(run_model_optimize,
false,
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
DEFINE_bool(show_output, false, "Whether to show the output in shell.");
namespace paddle {
namespace lite_api {
......@@ -100,15 +100,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
int64_t num = 1;
for (auto i : shape) {
num *= i;
}
return num;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<int64_t>& input_shape,
const std::string& model_dir,
const std::string& model_path,
const std::string model_name) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_from_file(model_dir + ".nb");
config.set_model_from_file(model_path);
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -116,10 +124,7 @@ void Run(const std::vector<int64_t>& input_shape,
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(input_shape);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
int64_t input_num = ShapeProduction(input_shape);
if (FLAGS_input_img_path.empty()) {
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
......@@ -167,26 +172,73 @@ void Run(const std::vector<int64_t>& input_shape,
ofs << "average = " << std::setw(12) << avg_res;
ofs << std::endl;
ofs.close();
if (FLAGS_show_output) {
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
LOG(INFO) << "max_value:" << max_value;
LOG(INFO) << "max_index:" << max_index;
}
}
#endif
} // namespace lite_api
} // namespace paddle
void print_usage() {
std::string help_info =
"Usage: \n"
"./benchmark_bin \n"
" --optimized_model_path (the path of the model that is optimized\n"
" by opt.) type: string \n"
" --model_dir (the path of the model that is not optimized by opt,\n"
" the model and param files is under model_dir.) type: string \n"
" --model_filename (the filename of model file. When the model is\n "
" combined formate, please set model_file. Otherwise, it is not\n"
" necessary to set it.) type: string \n"
" --param_filename (the filename of param file, set param_file when\n"
" the model is combined formate. Otherwise, it is not necessary\n"
" to set it.) type: string \n"
" --input_shape (set input shapes according to the model, separated by\n"
" colon and comma, such as 1,3,244,244) type: string\n"
" default: 1,3,224,224 \n"
" --input_img_path (the path of input image, if not set\n"
" input_img_path, the input will be 1.0.) type: string \n "
" --power_mode (arm power mode: 0 for big cluster, 1 for little\n"
" cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
" --repeats (repeats times) type: int32 default: 1 \n"
" --result_filename (save the inference time to the file.) type: \n"
" string default: result.txt \n"
" --threads (threads num) type: int32 default: 1 \n"
" --warmup (warmup times) type: int32 default: 0 \n"
"Note that: \n"
" If load the optimized model, set optimized_model_path, or set\n"
" model_dir, model_filename and param_filename according to the\n"
" model. \n";
LOG(INFO) << help_info;
}
int main(int argc, char** argv) {
// Check inputs
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage.";
bool is_opt_model = (FLAGS_optimized_model_path != "");
bool is_origin_model = (FLAGS_model_dir != "");
if (!is_origin_model && !is_opt_model) {
LOG(INFO) << "Input error, the model path should not be empty.\n";
print_usage();
exit(0);
}
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
std::string model_name = FLAGS_model_dir.substr(found + 1);
std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
// Get input shape
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
......@@ -202,19 +254,31 @@ int main(int argc, char** argv) {
}
return shape;
};
std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);
// Output optimized model if needed
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(save_optimized_model_dir);
// Get model_name and run_model_path
std::string model_name;
std::string run_model_path;
if (is_origin_model) {
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
model_name = FLAGS_model_dir.substr(found + 1);
std::string optimized_model_path = FLAGS_model_dir + "_opt2";
paddle::lite_api::OutputOptModel(optimized_model_path);
run_model_path = optimized_model_path + ".nb";
} else {
size_t found1 = FLAGS_optimized_model_path.find_last_of("/");
size_t found2 = FLAGS_optimized_model_path.find_last_of(".");
size_t len = found2 - found1 - 1;
model_name = FLAGS_optimized_model_path.substr(found1 + 1, len);
run_model_path = FLAGS_optimized_model_path;
}
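  // Illustrative example (hypothetical path, not from the source): with
  //   --optimized_model_path=/data/local/tmp/mobilenet_v1.nb
  // found1 indexes the last '/', found2 the last '.', so model_name becomes
  // "mobilenet_v1" and run_model_path is the .nb file itself.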
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shape, run_model_dir, model_name);
// Run test
paddle::lite_api::Run(input_shape, run_model_path, model_name);
#endif
return 0;
}
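For reference, the input shape string (e.g. "1,3,224,224") is parsed by the get_shape lambda above, whose body is collapsed in this diff. The following is only a minimal stand-alone sketch of such a parser, assuming std::stoll-based tokenization and a single comma-separated shape (the colon-separated multi-shape case mentioned in the usage text is omitted); the actual lambda may differ.

// Hypothetical stand-alone parser for shape strings such as "1,3,224,224".
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

std::vector<int64_t> ParseShape(const std::string& str_shape) {
  std::vector<int64_t> shape;
  std::stringstream ss(str_shape);
  std::string item;
  while (std::getline(ss, item, ',')) {  // split on ','
    shape.push_back(std::stoll(item));   // convert each token to int64_t
  }
  return shape;
}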
......@@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file,
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
}
  // For weight quantization of post training, load the int8/16 weights
  // of the optimized model and dequantize them to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch();
}
......@@ -182,58 +185,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
}
void LightPredictor::DequantizeWeight() {
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < h; ++i) { \
for (int64_t j = 0; j < w; ++j) { \
fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \
} \
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \
} \
}
#define PROCESS_FC_DATA() \
for (int i = 0; i < input_tensor->numel(); i++) { \
*fp_data = scale_list[0] * (*int_data); \
++fp_data; \
++int_data; \
#define PROCESS_FC_DATA() \
for (int64_t i = 0; i < chin; i++) { \
for (int64_t j = 0; j < chout; j++) { \
fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \
} \
}
auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) {
bool result = false;
if (op_desc->HasAttr("quantization_type")) {
std::string type = op_desc->GetAttr<std::string>("quantization_type");
result = (type == "post_weight_abs_max") ||
(type == "post_weight_channel_wise_abs_max");
} else {
result = op_desc->HasAttr("quantize_weight_bits");
}
return result;
};
Tensor tmp_tensor;
CHECK(cpp_program_desc_.BlocksSize());
auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
for (size_t k = 0; k < main_block->OpsSize(); ++k) {
auto* op_desc = main_block->GetOp<cpp::OpDesc>(k);
if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op
auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits");
float* fp_data = input_tensor->mutable_data<float>();
std::string op_type = op_desc->Type();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t h = input_tensor->dims()[0];
int64_t w = input_tensor->numel() / h;
CHECK_EQ(scale_list.size(), h);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_CONV2D_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_CONV2D_DATA()
}
} else if (op_type == "fc" || op_type == "mul") {
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits");
CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16);
float* fp_data = input_tensor->mutable_data<float>();
std::string op_type = op_desc->Type();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t ch = input_tensor->dims()[0];
int64_t offset = input_tensor->numel() / ch;
CHECK_EQ(scale_list.size(), ch);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_CONV2D_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_CONV2D_DATA()
}
} else if (op_type == "fc" || op_type == "mul") {
int64_t chin = input_tensor->dims()[0];
int64_t chout = input_tensor->dims()[1];
CHECK_EQ(scale_list.size(), chout);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
}
}
}
}
......
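For clarity, the per-channel dequantization implemented by the two macros above can be written as stand-alone functions. This is only a sketch of the arithmetic (int8 case shown; the int16 path is identical apart from the element type), not part of the LightPredictor API: for conv2d/depthwise_conv2d the scale is indexed by the output channel stored in the first weight dimension, while for fc/mul it is indexed by the output channel stored in the second dimension.

// Minimal dequantization sketch, assuming row-major weights.
// ConvDequant: weights of shape [ch, offset], one scale per row (output channel).
// FcDequant:   weights of shape [chin, chout], one scale per column (output channel).
#include <cstdint>
#include <vector>

void ConvDequant(const int8_t* int_data, const std::vector<float>& scale_list,
                 int64_t ch, int64_t offset, float* fp_data) {
  for (int64_t i = 0; i < ch; ++i) {
    for (int64_t j = 0; j < offset; ++j) {
      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];
    }
  }
}

void FcDequant(const int8_t* int_data, const std::vector<float>& scale_list,
               int64_t chin, int64_t chout, float* fp_data) {
  for (int64_t i = 0; i < chin; ++i) {
    for (int64_t j = 0; j < chout; ++j) {
      fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j];
    }
  }
}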
......@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
}
size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool is_weight_quantization =
conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
// compute BN alpha and beta
Tensor alpha_tensor, beta_tensor;
......
......@@ -22,9 +22,29 @@ namespace paddle {
namespace lite {
namespace mir {
bool IsAbsMaxQuantizedOp(const OpInfo& op_info) {
bool result = false;
if (op_info.HasAttr("quantization_type") &&
op_info.GetAttr<std::string>("quantization_type") ==
"post_weight_abs_max") {
result = true;
} else if (!op_info.HasAttr("quantization_type") &&
op_info.HasAttr("quantize_weight_bits")) { // Support older model,
// save this for now
result = true;
}
return result;
}
/*
* For abs_max method in WeightQuantization, this pass obtains the scale value
* of conv2d, depthwise_conv2d and mul, expands the scale list, and saves the
* list in the quantized ops.
*/
void WeightQuantizationPreprocessPass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::string> weight_quantized_op = {"conv2d", "depthwise_conv2d"};
std::vector<std::string> weight_quantized_op = {
"conv2d", "depthwise_conv2d", "mul"};
for (auto& node : graph->StmtTopologicalOrder()) {
if (node->IsStmt() &&
std::find(weight_quantized_op.begin(),
......@@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply(
node->AsStmt().op_type()) != weight_quantized_op.end()) {
auto* scope = node->stmt()->op()->scope();
auto* op_desc = node->stmt()->mutable_op_info();
if (op_desc->HasAttr("quantize_weight_bits")) {
if (IsAbsMaxQuantizedOp(*op_desc)) {
for (auto& input_name : op_desc->input_vars()) {
std::string scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(scale_name)) {
VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name;
VLOG(0) << " WeightQuantizationPreprocessPass op:"
<< op_desc->Type() << " input_name:" << input_name;
auto input_tensor =
scope->FindVar(input_name)->GetMutable<lite::Tensor>();
int weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
int weight_out_channel;
if (op_desc->Type() == "mul") {
weight_out_channel = static_cast<int>(input_tensor->dims()[1]);
} else {
weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
}
auto input_scale = op_desc->GetAttr<std::vector<float>>(scale_name);
// scale length is equal to weight out channel
std::vector<float> scale_list(weight_out_channel, input_scale[0]);
......
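As a worked illustration of the scale expansion performed by the pass above (a hedged sketch under the stated assumption, not the pass itself): for abs_max weight quantization a single scale is stored per weight, and it is replicated once per output channel, where the output channel count is dims()[1] for mul and dims()[0] for the conv ops.

// Hypothetical helper mirroring the expansion: replicate one abs_max scale
// to one entry per output channel.
#include <cstdint>
#include <string>
#include <vector>

std::vector<float> ExpandScale(const std::string& op_type,
                               const std::vector<int64_t>& weight_dims,
                               float single_scale) {
  // For "mul" the weight is [in_features, out_features]; otherwise the first
  // dimension is the output channel (conv2d, depthwise_conv2d).
  int64_t out_channel = (op_type == "mul") ? weight_dims[1] : weight_dims[0];
  return std::vector<float>(static_cast<size_t>(out_channel), single_scale);
}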
......@@ -25,8 +25,9 @@ namespace mir {
* If the model is quantized by WeightQuantization in PostTrainingQuantization,
* the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is
* int, and the scale is saved in the quantized ops.
* WeightQuantizationPreprocessPass obtains the scale value, expands the
* scale value to a list, and saves the list in the quantized ops.
* For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass
* obtains the scale value of conv2d, depthwise_conv2d and mul, expands the
* scale list, and saves the list in the quantized ops.
*/
class WeightQuantizationPreprocessPass : public ProgramPass {
public:
......
......@@ -2,12 +2,12 @@
set -e
# Check input
if [ $# -lt 2 ];
if [ $# -lt 3 ];
then
echo "Input error"
echo "Usage:"
echo " sh benchmark.sh benchmark_bin_path benchmark_models_path <result_filename> <input_shape> <power_mode: [0|1|2|3]> <is_run_model_optimize: [true|false]> <is_run_quantized_model: [trur|false]>"
echo "\npower_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind."
echo " sh benchmark.sh <benchmark_bin_path> <benchmark_models_path> <result_filename>"
echo " sh benchmark.sh <benchmark_bin_path> <benchmark_models_path> <result_filename> <is_run_model_optimize: [true|false]>"
exit
fi
......@@ -15,10 +15,8 @@ fi
ANDROID_DIR=/data/local/tmp
BENCHMARK_BIN=$1
MODELS_DIR=$2
RESULT_FILENAME=$3
RESULT_FILENAME=result.txt
INPUT_SHAPE=1,3,244,244
POWER_MODE=3
WARMUP=10
REPEATS=30
IS_RUN_MODEL_OPTIMIZE=false
......@@ -27,25 +25,9 @@ NUM_THREADS_LIST=(1 2 4)
MODELS_LIST=$(ls $MODELS_DIR)
# Check input
if [ $# -gt 2 ];
then
RESULT_FILENAME=$3
fi
if [ $# -gt 3 ];
then
INPUT_SHAPE=$4
fi
if [ $# -gt 4 ];
then
POWER_MODE=$5
fi
if [ $# -gt 5 ];
then
IS_RUN_MODEL_OPTIMIZE=$6
fi
if [ $# -gt 6 ];
then
IS_RUN_QUANTIZED_MODEL=$7
IS_RUN_MODEL_OPTIMIZE=$4
fi
# Adb push benchmark_bin, models
......@@ -54,26 +36,31 @@ adb shell chmod +x $ANDROID_DIR/benchmark_bin
adb push $MODELS_DIR $ANDROID_DIR
# Run benchmark
adb shell "echo 'PaddleLite Benchmark (in ms)\n' > $ANDROID_DIR/$RESULT_FILENAME"
adb shell "echo 'PaddleLite Benchmark' > $ANDROID_DIR/$RESULT_FILENAME"
for threads in ${NUM_THREADS_LIST[@]}; do
adb shell "echo threads=$threads warmup=$WARMUP repeats=$REPEATS input_shape=$INPUT_SHAPE power_mode=$POWER_MODE >> $ANDROID_DIR/$RESULT_FILENAME"
adb shell "echo Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
for model_name in ${MODELS_LIST[@]}; do
echo "Model=$model_name Threads=$threads"
adb shell "$ANDROID_DIR/benchmark_bin \
if [ "$IS_RUN_MODEL_OPTIMIZE" = true ];
then
adb shell "$ANDROID_DIR/benchmark_bin \
--model_dir=$ANDROID_DIR/${MODELS_DIR}/$model_name \
--input_shape=$INPUT_SHAPE \
--warmup=$WARMUP \
--repeats=$REPEATS \
--threads=$threads \
--power_mode=$POWER_MODE \
--result_filename=$ANDROID_DIR/$RESULT_FILENAME \
--run_model_optimize=$IS_RUN_MODEL_OPTIMIZE \
--is_quantized_model=$IS_RUN_QUANTIZED_MODEL"
--result_filename=$ANDROID_DIR/$RESULT_FILENAME"
else
adb shell "$ANDROID_DIR/benchmark_bin \
--optimized_model_path=$ANDROID_DIR/${MODELS_DIR}/$model_name \
--warmup=$WARMUP \
--repeats=$REPEATS \
--threads=$threads \
--result_filename=$ANDROID_DIR/$RESULT_FILENAME"
fi
done
adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
done
adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
adb shell "echo power_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind >> $ANDROID_DIR/$RESULT_FILENAME"
# Adb pull benchmark result, show result
adb pull $ANDROID_DIR/$RESULT_FILENAME .
echo "\n--------------------------------------"
......