From 23231af8fd8c3720fd3ecc199f8ab0680d510acf Mon Sep 17 00:00:00 2001
From: cc <52520497+juncaipeng@users.noreply.github.com>
Date: Fri, 10 Apr 2020 20:22:49 +0800
Subject: [PATCH] Optimize weight quantization (#3374)

* Optimize weight quantization, test=develop
---
 lite/api/benchmark.cc                         | 118 ++++++++++++++----
 lite/api/light_api.cc                         | 115 ++++++++++-------
 lite/core/mir/fusion/conv_bn_fuser.cc         |   3 +-
 .../weight_quantization_preprocess_pass.cc    |  34 ++++-
 .../mir/weight_quantization_preprocess_pass.h |   5 +-
 lite/tools/benchmark.sh                       |  53 +++-----
 6 files changed, 213 insertions(+), 115 deletions(-)

diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc
index 0843faf0d6..81708bc625 100644
--- a/lite/api/benchmark.cc
+++ b/lite/api/benchmark.cc
@@ -27,6 +27,9 @@
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
 
+DEFINE_string(optimized_model_path,
+              "",
+              "the path of the model that is optimized by opt.");
 DEFINE_string(model_dir,
               "",
               "the path of the model, the model and param files is under "
@@ -61,10 +64,7 @@ DEFINE_int32(threads, 1, "threads num");
 DEFINE_string(result_filename,
               "result.txt",
               "save the inference time to the file.");
-DEFINE_bool(run_model_optimize,
-            false,
-            "if set true, apply model_optimize_tool to "
-            "model and use optimized model to test. ");
+DEFINE_bool(show_output, false, "Whether to show the output in shell.");
 
 namespace paddle {
 namespace lite_api {
@@ -100,15 +100,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
   LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
 }
 
+int64_t ShapeProduction(const std::vector<int64_t>& shape) {
+  int64_t num = 1;
+  for (auto i : shape) {
+    num *= i;
+  }
+  return num;
+}
+
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 void Run(const std::vector<int64_t>& input_shape,
-         const std::string& model_dir,
+         const std::string& model_path,
          const std::string model_name) {
   // set config and create predictor
   lite_api::MobileConfig config;
   config.set_threads(FLAGS_threads);
   config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
-  config.set_model_from_file(model_dir + ".nb");
+  config.set_model_from_file(model_path);
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
@@ -116,10 +124,7 @@ void Run(const std::vector<int64_t>& input_shape,
   auto input_tensor = predictor->GetInput(0);
   input_tensor->Resize(input_shape);
   auto input_data = input_tensor->mutable_data<float>();
-  int input_num = 1;
-  for (size_t i = 0; i < input_shape.size(); ++i) {
-    input_num *= input_shape[i];
-  }
+  int64_t input_num = ShapeProduction(input_shape);
   if (FLAGS_input_img_path.empty()) {
     for (int i = 0; i < input_num; ++i) {
       input_data[i] = 1.f;
@@ -167,26 +172,73 @@ void Run(const std::vector<int64_t>& input_shape,
   ofs << "average = " << std::setw(12) << avg_res;
   ofs << std::endl;
   ofs.close();
+
+  if (FLAGS_show_output) {
+    auto out_tensor = predictor->GetOutput(0);
+    auto* out_data = out_tensor->data<float>();
+    int64_t output_num = ShapeProduction(out_tensor->shape());
+    float max_value = out_data[0];
+    int max_index = 0;
+    for (int i = 0; i < output_num; i++) {
+      if (max_value < out_data[i]) {
+        max_value = out_data[i];
+        max_index = i;
+      }
+    }
+    LOG(INFO) << "max_value:" << max_value;
+    LOG(INFO) << "max_index:" << max_index;
+  }
 }
 #endif
 
 }  // namespace lite_api
 }  // namespace paddle
 
+void print_usage() {
+  std::string help_info =
+      "Usage: \n"
+      "./benchmark_bin \n"
+      "  --optimized_model_path (the path of the model that is optimized\n"
+      "    by opt.) type: string \n"
+      "  --model_dir (the path of the model that is not optimized by opt,\n"
+      "    the model and param files are under model_dir.) type: string \n"
+      "  --model_filename (the filename of model file. When the model is\n"
+      "    combined format, please set model_filename. Otherwise, it is not\n"
+      "    necessary to set it.) type: string \n"
+      "  --param_filename (the filename of param file, set param_filename\n"
+      "    when the model is combined format. Otherwise, it is not necessary\n"
+      "    to set it.) type: string \n"
+      "  --input_shape (set input shapes according to the model, separated by\n"
+      "    colon and comma, such as 1,3,244,244) type: string\n"
+      "    default: 1,3,224,224 \n"
+      "  --input_img_path (the path of input image, if not set\n"
+      "    input_img_path, the input will be 1.0.) type: string \n"
+      "  --power_mode (arm power mode: 0 for big cluster, 1 for little\n"
+      "    cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
+      "  --repeats (repeats times) type: int32 default: 1 \n"
+      "  --result_filename (save the inference time to the file.) type: \n"
+      "    string default: result.txt \n"
+      "  --threads (threads num) type: int32 default: 1 \n"
+      "  --warmup (warmup times) type: int32 default: 0 \n"
+      "Note that: \n"
+      "  If loading an optimized model, set optimized_model_path. Otherwise,\n"
+      "    set model_dir, model_filename and param_filename according to the\n"
+      "    model. \n";
+  LOG(INFO) << help_info;
+}
+
 int main(int argc, char** argv) {
+  // Check inputs
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_model_dir == "") {
-    LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage.";
+  bool is_opt_model = (FLAGS_optimized_model_path != "");
+  bool is_origin_model = (FLAGS_model_dir != "");
+  if (!is_origin_model && !is_opt_model) {
+    LOG(INFO) << "Input error, the model path should not be empty.\n";
+    print_usage();
     exit(0);
   }
-  if (FLAGS_model_dir.back() == '/') {
-    FLAGS_model_dir.pop_back();
-  }
-  std::size_t found = FLAGS_model_dir.find_last_of("/");
-  std::string model_name = FLAGS_model_dir.substr(found + 1);
-  std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
-
+  // Get input shape
   auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
     std::vector<int64_t> shape;
     std::string tmp_str = str_shape;
@@ -202,19 +254,31 @@ int main(int argc, char** argv) {
     }
     return shape;
   };
-
   std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);
 
-  // Output optimized model if needed
-  if (FLAGS_run_model_optimize) {
-    paddle::lite_api::OutputOptModel(save_optimized_model_dir);
+  // Get model_name and run_model_path
+  std::string model_name;
+  std::string run_model_path;
+  if (is_origin_model) {
+    if (FLAGS_model_dir.back() == '/') {
+      FLAGS_model_dir.pop_back();
+    }
+    std::size_t found = FLAGS_model_dir.find_last_of("/");
+    model_name = FLAGS_model_dir.substr(found + 1);
+    std::string optimized_model_path = FLAGS_model_dir + "_opt2";
+    paddle::lite_api::OutputOptModel(optimized_model_path);
+    run_model_path = optimized_model_path + ".nb";
+  } else {
+    size_t found1 = FLAGS_optimized_model_path.find_last_of("/");
+    size_t found2 = FLAGS_optimized_model_path.find_last_of(".");
+    size_t len = found2 - found1 - 1;
+    model_name = FLAGS_optimized_model_path.substr(found1 + 1, len);
+    run_model_path = FLAGS_optimized_model_path;
   }
 
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-  // Run inference using optimized model
-  std::string run_model_dir =
-      FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
-  paddle::lite_api::Run(input_shape, run_model_dir, model_name);
+  // Run test
+  paddle::lite_api::Run(input_shape, run_model_path, model_name);
 #endif
   return 0;
 }
diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc
index d82869dbef..01f8853cb9 100644
--- a/lite/api/light_api.cc
+++ b/lite/api/light_api.cc
@@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file,
     LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
   }
 
+  // For post-training weight quantization, load the int8/16 weights
+  // of the optimized model and dequantize them to fp32.
   DequantizeWeight();
+
   BuildRuntimeProgram(cpp_program_desc_);
   PrepareFeedFetch();
 }
@@ -182,58 +185,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
 }
 
 void LightPredictor::DequantizeWeight() {
-#define PROCESS_CONV2D_DATA()                                   \
-  for (int64_t i = 0; i < h; ++i) {                             \
-    for (int64_t j = 0; j < w; ++j) {                           \
-      fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \
-    }                                                           \
+#define PROCESS_CONV2D_DATA()                                             \
+  for (int64_t i = 0; i < ch; ++i) {                                      \
+    for (int64_t j = 0; j < offset; ++j) {                                \
+      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \
+    }                                                                     \
   }
 
-#define PROCESS_FC_DATA()                           \
-  for (int i = 0; i < input_tensor->numel(); i++) { \
-    *fp_data = scale_list[0] * (*int_data);         \
-    ++fp_data;                                      \
-    ++int_data;                                     \
+#define PROCESS_FC_DATA()                                               \
+  for (int64_t i = 0; i < chin; i++) {                                  \
+    for (int64_t j = 0; j < chout; j++) {                               \
+      fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \
+    }                                                                   \
   }
 
+  auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) {
+    bool result = false;
+    if (op_desc->HasAttr("quantization_type")) {
+      std::string type = op_desc->GetAttr<std::string>("quantization_type");
+      result = (type == "post_weight_abs_max") ||
+               (type == "post_weight_channel_wise_abs_max");
+    } else {
+      result = op_desc->HasAttr("quantize_weight_bits");
+    }
+    return result;
+  };
+
   Tensor tmp_tensor;
-  CHECK(cpp_program_desc_.BlocksSize());
-  auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
-  for (size_t k = 0; k < main_block->OpsSize(); ++k) {
-    auto* op_desc = main_block->GetOp<cpp::OpDesc>(k);
-    if (op_desc->HasAttr("quantize_weight_bits")) {  // weight quantized op
-      auto input_names = op_desc->input_vars();
-      for (auto& input_name : input_names) {
-        std::string input_scale_name = input_name + "_quant_scale";
-        if (op_desc->HasAttr(input_scale_name)) {  // the input is quantized
-          auto input_tensor =
-              scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
-          tmp_tensor.CopyDataFrom(*input_tensor);
-          auto scale_list =
-              op_desc->GetAttr<std::vector<float>>(input_scale_name);
-          int quantize_weight_bits =
-              op_desc->GetAttr<int>("quantize_weight_bits");
-          float* fp_data = input_tensor->mutable_data<float>();
-
-          std::string op_type = op_desc->Type();
-          if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
-            int64_t h = input_tensor->dims()[0];
-            int64_t w = input_tensor->numel() / h;
-            CHECK_EQ(scale_list.size(), h);
-            if (quantize_weight_bits == 8) {
-              const int8_t* int_data = tmp_tensor.data<int8_t>();
-              PROCESS_CONV2D_DATA()
-            } else {
-              const int16_t* int_data = tmp_tensor.data<int16_t>();
-              PROCESS_CONV2D_DATA()
-            }
-          } else if (op_type == "fc" || op_type == "mul") {
-            if (quantize_weight_bits == 8) {
-              const int8_t* int_data = tmp_tensor.data<int8_t>();
-              PROCESS_FC_DATA()
-            } else {
-              const int16_t* int_data = tmp_tensor.data<int16_t>();
-              PROCESS_FC_DATA()
+  for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
+    auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
+    for (size_t k = 0; k < block->OpsSize(); ++k) {
+      auto* op_desc = block->GetOp<cpp::OpDesc>(k);
+      if (is_weight_quantized_op(op_desc)) {
+        auto input_names = op_desc->input_vars();
+        for (auto& input_name : input_names) {
+          std::string input_scale_name = input_name + "_quant_scale";
+          if (op_desc->HasAttr(input_scale_name)) {  // the input is quantized
+            auto input_tensor =
+                scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
+            tmp_tensor.CopyDataFrom(*input_tensor);
+            auto scale_list =
+                op_desc->GetAttr<std::vector<float>>(input_scale_name);
+
+            int quantize_weight_bits =
+                op_desc->GetAttr<int>("quantize_weight_bits");
+            CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16);
+            float* fp_data = input_tensor->mutable_data<float>();
+
+            std::string op_type = op_desc->Type();
+            if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
+              int64_t ch = input_tensor->dims()[0];
+              int64_t offset = input_tensor->numel() / ch;
+              CHECK_EQ(scale_list.size(), ch);
+              if (quantize_weight_bits == 8) {
+                const int8_t* int_data = tmp_tensor.data<int8_t>();
+                PROCESS_CONV2D_DATA()
+              } else {
+                const int16_t* int_data = tmp_tensor.data<int16_t>();
+                PROCESS_CONV2D_DATA()
+              }
+            } else if (op_type == "fc" || op_type == "mul") {
+              int64_t chin = input_tensor->dims()[0];
+              int64_t chout = input_tensor->dims()[1];
+              CHECK_EQ(scale_list.size(), chout);
+              if (quantize_weight_bits == 8) {
+                const int8_t* int_data = tmp_tensor.data<int8_t>();
+                PROCESS_FC_DATA()
+              } else {
+                const int16_t* int_data = tmp_tensor.data<int16_t>();
+                PROCESS_FC_DATA()
+              }
             }
           }
         }
diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc
index 150a6e68d8..43869beddd 100644
--- a/lite/core/mir/fusion/conv_bn_fuser.cc
+++ b/lite/core/mir/fusion/conv_bn_fuser.cc
@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   }
   size_t weight_num = conv_weight_t->data_size();
   bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
-  bool is_weight_quantization =
-      conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
+  bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
 
   // comupte BN alpha and beta
   Tensor alpha_tensor, beta_tensor;
diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc
index c7889a5490..2bb247871b 100644
--- a/lite/core/mir/weight_quantization_preprocess_pass.cc
+++ b/lite/core/mir/weight_quantization_preprocess_pass.cc
@@ -22,9 +22,29 @@ namespace paddle {
 namespace lite {
 namespace mir {
 
+bool IsAbsMaxQuantizedOp(const OpInfo& op_info) {
+  bool result = false;
+  if (op_info.HasAttr("quantization_type") &&
+      op_info.GetAttr<std::string>("quantization_type") ==
+          "post_weight_abs_max") {
+    result = true;
+  } else if (!op_info.HasAttr("quantization_type") &&
+             op_info.HasAttr("quantize_weight_bits")) {  // Support older model,
+                                                         // save this for now
+    result = true;
+  }
+  return result;
+}
+
+/*
+ * For abs_max method in WeightQuantization, this pass obtains the scale value
+ * of conv2d, depthwise_conv2d and mul, expands the scale list, and saves the
+ * list in the quantized ops.
+*/
 void WeightQuantizationPreprocessPass::Apply(
     const std::unique_ptr<SSAGraph>& graph) {
-  std::vector<std::string> weight_quantized_op = {"conv2d", "depthwise_conv2d"};
+  std::vector<std::string> weight_quantized_op = {
+      "conv2d", "depthwise_conv2d", "mul"};
   for (auto& node : graph->StmtTopologicalOrder()) {
     if (node->IsStmt() &&
         std::find(weight_quantized_op.begin(),
@@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply(
                   node->AsStmt().op_type()) != weight_quantized_op.end()) {
       auto* scope = node->stmt()->op()->scope();
       auto* op_desc = node->stmt()->mutable_op_info();
-      if (op_desc->HasAttr("quantize_weight_bits")) {
+      if (IsAbsMaxQuantizedOp(*op_desc)) {
         for (auto& input_name : op_desc->input_vars()) {
           std::string scale_name = input_name + "_quant_scale";
           if (op_desc->HasAttr(scale_name)) {
-            VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name;
+            VLOG(0) << " WeightQuantizationPreprocessPass op:"
+                    << op_desc->Type() << " input_name:" << input_name;
             auto input_tensor =
                 scope->FindVar(input_name)->GetMutable<lite::Tensor>();
-            int weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
+            int weight_out_channel;
+            if (op_desc->Type() == "mul") {
+              weight_out_channel = static_cast<int>(input_tensor->dims()[1]);
+            } else {
+              weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
+            }
             auto input_scale = op_desc->GetAttr<std::vector<float>>(scale_name);
             // scale length is equal to weight out channel
             std::vector<float> scale_list(weight_out_channel, input_scale[0]);
diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h
index 76a35c6b44..e7c9f03eef 100644
--- a/lite/core/mir/weight_quantization_preprocess_pass.h
+++ b/lite/core/mir/weight_quantization_preprocess_pass.h
@@ -25,8 +25,9 @@ namespace mir {
  * If the model is quantized by WeightQuantization in PostTrainingQuantization,
  * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is
  * int, and the scale is save in the quantized ops.
- * WeightQuantizationPreprocessPass obtains the scale value, expands the
- * scale value to a list, and save the list in the quantized ops.
+ * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass
+ * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the
+ * scale list, and saves the list in the quantized ops.
  */
 class WeightQuantizationPreprocessPass : public ProgramPass {
  public:
diff --git a/lite/tools/benchmark.sh b/lite/tools/benchmark.sh
index 23bb183ec9..3af8176f97 100644
--- a/lite/tools/benchmark.sh
+++ b/lite/tools/benchmark.sh
@@ -2,12 +2,12 @@
 set -e
 
 # Check input
-if [ $# -lt 2 ];
+if [ $# -lt 3 ];
 then
     echo "Input error"
     echo "Usage:"
-    echo "  sh benchmark.sh benchmark_bin_path benchmark_models_path "
-    echo "\npower_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind."
+    echo "  sh benchmark.sh <benchmark_bin_path> <benchmark_models_path> <result_filename>"
+    echo "  sh benchmark.sh <benchmark_bin_path> <benchmark_models_path> <result_filename> <is_run_model_optimize: [true|false]>"
     exit
 fi
 
@@ -15,10 +15,8 @@ fi
 ANDROID_DIR=/data/local/tmp
 BENCHMARK_BIN=$1
 MODELS_DIR=$2
+RESULT_FILENAME=$3
 
-RESULT_FILENAME=result.txt
-INPUT_SHAPE=1,3,244,244
-POWER_MODE=3
 WARMUP=10
 REPEATS=30
 IS_RUN_MODEL_OPTIMIZE=false
@@ -27,25 +25,9 @@ NUM_THREADS_LIST=(1 2 4)
 MODELS_LIST=$(ls $MODELS_DIR)
 
 # Check input
-if [ $# -gt 2 ];
-then
-    RESULT_FILENAME=$3
-fi
 if [ $# -gt 3 ];
 then
-    INPUT_SHAPE=$4
-fi
-if [ $# -gt 4 ];
-then
-    POWER_MODE=$5
-fi
-if [ $# -gt 5 ];
-then
-    IS_RUN_MODEL_OPTIMIZE=$6
-fi
-if [ $# -gt 6 ];
-then
-    IS_RUN_QUANTIZED_MODEL=$7
+    IS_RUN_MODEL_OPTIMIZE=$4
 fi
 
 # Adb push benchmark_bin, models
@@ -54,26 +36,31 @@ adb shell chmod +x $ANDROID_DIR/benchmark_bin
 adb push $MODELS_DIR $ANDROID_DIR
 
 # Run benchmark
-adb shell "echo 'PaddleLite Benchmark (in ms)\n' > $ANDROID_DIR/$RESULT_FILENAME"
+adb shell "echo 'PaddleLite Benchmark' > $ANDROID_DIR/$RESULT_FILENAME"
 for threads in ${NUM_THREADS_LIST[@]}; do
-  adb shell "echo threads=$threads warmup=$WARMUP repeats=$REPEATS input_shape=$INPUT_SHAPE power_mode=$POWER_MODE >> $ANDROID_DIR/$RESULT_FILENAME"
+  adb shell "echo Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
   for model_name in ${MODELS_LIST[@]}; do
     echo "Model=$model_name Threads=$threads"
-    adb shell "$ANDROID_DIR/benchmark_bin \
+    if [ "$IS_RUN_MODEL_OPTIMIZE" = true ];
+    then
+      adb shell "$ANDROID_DIR/benchmark_bin \
         --model_dir=$ANDROID_DIR/${MODELS_DIR}/$model_name \
-        --input_shape=$INPUT_SHAPE \
         --warmup=$WARMUP \
         --repeats=$REPEATS \
        --threads=$threads \
-        --power_mode=$POWER_MODE \
-        --result_filename=$ANDROID_DIR/$RESULT_FILENAME \
-        --run_model_optimize=$IS_RUN_MODEL_OPTIMIZE \
-        --is_quantized_model=$IS_RUN_QUANTIZED_MODEL"
+        --result_filename=$ANDROID_DIR/$RESULT_FILENAME"
+    else
+      adb shell "$ANDROID_DIR/benchmark_bin \
+        --optimized_model_path=$ANDROID_DIR/${MODELS_DIR}/$model_name \
+        --warmup=$WARMUP \
+        --repeats=$REPEATS \
+        --threads=$threads \
+        --result_filename=$ANDROID_DIR/$RESULT_FILENAME"
+    fi
   done
   adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
 done
-adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
-adb shell "echo power_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind >> $ANDROID_DIR/$RESULT_FILENAME"
+
 # Adb pull benchmark result, show result
 adb pull $ANDROID_DIR/$RESULT_FILENAME .
 echo "\n--------------------------------------"
-- 
GitLab
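A minimal usage sketch (not part of the patch itself) of the two benchmark_bin modes this change introduces. The flag names come from the patch above; the device path and the mobilenet_v1 model name are placeholder assumptions, and warmup/repeats/threads values are arbitrary:

  # Benchmark an original (non-optimized) model: benchmark_bin runs the opt
  # conversion itself, writes <model_dir>_opt2.nb, then benchmarks that file.
  adb shell "/data/local/tmp/benchmark_bin \
      --model_dir=/data/local/tmp/mobilenet_v1 \
      --warmup=10 --repeats=30 --threads=1 \
      --result_filename=/data/local/tmp/result.txt"

  # Benchmark a model that has already been optimized by opt (a .nb file),
  # skipping the conversion step.
  adb shell "/data/local/tmp/benchmark_bin \
      --optimized_model_path=/data/local/tmp/mobilenet_v1.nb \
      --warmup=10 --repeats=30 --threads=1 \
      --result_filename=/data/local/tmp/result.txt"

benchmark.sh automates the same two cases: it passes --model_dir when its fourth argument (is_run_model_optimize) is true and --optimized_model_path otherwise.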