PaddlePaddle / Paddle-Lite

Commit 23231af8 (unverified), authored Apr 10, 2020 by cc, committed via GitHub on Apr 10, 2020
Optimize weight quantizaion (#3374)
* Optimize weight quantizaion, test=develop
Parent: 40a31442

Showing 6 changed files with 213 additions and 115 deletions (+213 −115)
lite/api/benchmark.cc                                  +91  -27
lite/api/light_api.cc                                  +68  -47
lite/core/mir/fusion/conv_bn_fuser.cc                   +1   -2
lite/core/mir/weight_quantization_preprocess_pass.cc   +30   -4
lite/core/mir/weight_quantization_preprocess_pass.h     +3   -2
lite/tools/benchmark.sh                                +20  -33
lite/api/benchmark.cc

@@ -27,6 +27,9 @@
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
+DEFINE_string(optimized_model_path,
+              "",
+              "the path of the model that is optimized by opt.");
 DEFINE_string(model_dir,
               "",
               "the path of the model, the model and param files is under "
@@ -61,10 +64,7 @@ DEFINE_int32(threads, 1, "threads num");
 DEFINE_string(result_filename,
               "result.txt",
               "save the inference time to the file.");
-DEFINE_bool(run_model_optimize,
-            false,
-            "if set true, apply model_optimize_tool to "
-            "model and use optimized model to test. ");
+DEFINE_bool(show_output, false, "Wether to show the output in shell.");

 namespace paddle {
 namespace lite_api {
@@ -100,15 +100,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
   LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
 }

+int64_t ShapeProduction(const std::vector<int64_t>& shape) {
+  int64_t num = 1;
+  for (auto i : shape) {
+    num *= i;
+  }
+  return num;
+}
+
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 void Run(const std::vector<int64_t>& input_shape,
-         const std::string& model_dir,
+         const std::string& model_path,
          const std::string model_name) {
   // set config and create predictor
   lite_api::MobileConfig config;
   config.set_threads(FLAGS_threads);
   config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
-  config.set_model_from_file(model_dir + ".nb");
+  config.set_model_from_file(model_path);

   auto predictor = lite_api::CreatePaddlePredictor(config);
@@ -116,10 +124,7 @@ void Run(const std::vector<int64_t>& input_shape,
   auto input_tensor = predictor->GetInput(0);
   input_tensor->Resize(input_shape);
   auto input_data = input_tensor->mutable_data<float>();
-  int input_num = 1;
-  for (size_t i = 0; i < input_shape.size(); ++i) {
-    input_num *= input_shape[i];
-  }
+  int64_t input_num = ShapeProduction(input_shape);
   if (FLAGS_input_img_path.empty()) {
     for (int i = 0; i < input_num; ++i) {
       input_data[i] = 1.f;
@@ -167,26 +172,73 @@ void Run(const std::vector<int64_t>& input_shape,
   ofs << "average = " << std::setw(12) << avg_res;
   ofs << std::endl;
   ofs.close();
+
+  if (FLAGS_show_output) {
+    auto out_tensor = predictor->GetOutput(0);
+    auto* out_data = out_tensor->data<float>();
+    int64_t output_num = ShapeProduction(out_tensor->shape());
+    float max_value = out_data[0];
+    int max_index = 0;
+    for (int i = 0; i < output_num; i++) {
+      if (max_value < out_data[i]) {
+        max_value = out_data[i];
+        max_index = i;
+      }
+    }
+    LOG(INFO) << "max_value:" << max_value;
+    LOG(INFO) << "max_index:" << max_index;
+  }
 }
 #endif

 }  // namespace lite_api
 }  // namespace paddle

+void print_usage() {
+  std::string help_info =
+      "Usage: \n"
+      "./benchmark_bin \n"
+      "  --optimized_model_path (the path of the model that is optimized \n"
+      "    by opt.) type: string \n"
+      "  --model_dir (the path of the model that is not optimized by opt, \n"
+      "    the model and param files is under model_dir.) type: string \n"
+      "  --model_filename (the filename of model file. When the model is \n"
+      "    combined formate, please set model_file. Otherwise, it is not \n"
+      "    necessary to set it.) type: string \n"
+      "  --param_filename (the filename of param file, set param_file when \n"
+      "    the model is combined formate. Otherwise, it is not necessary \n"
+      "    to set it.) type: string \n"
+      "  --input_shape (set input shapes according to the model, separated by \n"
+      "    colon and comma, such as 1,3,244,244) type: string \n"
+      "    default: 1,3,224,224 \n"
+      "  --input_img_path (the path of input image, if not set \n"
+      "    input_img_path, the input will be 1.0.) type: string \n"
+      "  --power_mode (arm power mode: 0 for big cluster, 1 for little \n"
+      "    cluster, 2 for all cores, 3 for no bind) type: int32 default: 3 \n"
+      "  --repeats (repeats times) type: int32 default: 1 \n"
+      "  --result_filename (save the inference time to the file.) type: \n"
+      "    string default: result.txt \n"
+      "  --threads (threads num) type: int32 default: 1 \n"
+      "  --warmup (warmup times) type: int32 default: 0 \n"
+      "Note that: \n"
+      "  If load the optimized model, set optimized_model_path, or set \n"
+      "    model_dir, model_filename and param_filename according to the \n"
+      "    model. \n";
+  LOG(INFO) << help_info;
+}
+
 int main(int argc, char** argv) {
   // Check inputs
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_model_dir == "") {
-    LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage.";
+
+  bool is_opt_model = (FLAGS_optimized_model_path != "");
+  bool is_origin_model = (FLAGS_model_dir != "");
+  if (!is_origin_model && !is_opt_model) {
+    LOG(INFO) << "Input error, the model path should not be empty.\n";
+    print_usage();
     exit(0);
   }

-  if (FLAGS_model_dir.back() == '/') {
-    FLAGS_model_dir.pop_back();
-  }
-  std::size_t found = FLAGS_model_dir.find_last_of("/");
-  std::string model_name = FLAGS_model_dir.substr(found + 1);
-  std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
-
   // Get input shape
   auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
     std::vector<int64_t> shape;
     std::string tmp_str = str_shape;
@@ -202,19 +254,31 @@ int main(int argc, char** argv) {
     }
     return shape;
   };
   std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);

-  // Output optimized model if needed
-  if (FLAGS_run_model_optimize) {
-    paddle::lite_api::OutputOptModel(save_optimized_model_dir);
+  // Get model_name and run_model_path
+  std::string model_name;
+  std::string run_model_path;
+  if (is_origin_model) {
+    if (FLAGS_model_dir.back() == '/') {
+      FLAGS_model_dir.pop_back();
+    }
+    std::size_t found = FLAGS_model_dir.find_last_of("/");
+    model_name = FLAGS_model_dir.substr(found + 1);
+    std::string optimized_model_path = FLAGS_model_dir + "_opt2";
+    paddle::lite_api::OutputOptModel(optimized_model_path);
+    run_model_path = optimized_model_path + ".nb";
+  } else {
+    size_t found1 = FLAGS_optimized_model_path.find_last_of("/");
+    size_t found2 = FLAGS_optimized_model_path.find_last_of(".");
+    size_t len = found2 - found1 - 1;
+    model_name = FLAGS_optimized_model_path.substr(found1 + 1, len);
+    run_model_path = FLAGS_optimized_model_path;
   }

 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-  // Run inference using optimized model
-  std::string run_model_dir =
-      FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
-  paddle::lite_api::Run(input_shape, run_model_dir, model_name);
+  // Run test
+  paddle::lite_api::Run(input_shape, run_model_path, model_name);
 #endif
   return 0;
 }
lite/api/light_api.cc

@@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file,
     LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
   }

+  // For weight quantization of post training, load the int8/16 weights
+  // for optimized model, and dequant it to fp32.
+  DequantizeWeight();
+
   BuildRuntimeProgram(cpp_program_desc_);
   PrepareFeedFetch();
 }
@@ -182,58 +185,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
 }

 void LightPredictor::DequantizeWeight() {
-#define PROCESS_CONV2D_DATA()                                     \
-  for (int64_t i = 0; i < h; ++i) {                               \
-    for (int64_t j = 0; j < w; ++j) {                             \
-      fp_data[i * w + j] = scale_list[i] * int_data[i * w + j];   \
-    }                                                             \
-  }
+#define PROCESS_CONV2D_DATA()                                               \
+  for (int64_t i = 0; i < ch; ++i) {                                        \
+    for (int64_t j = 0; j < offset; ++j) {                                  \
+      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];   \
+    }                                                                       \
+  }

-#define PROCESS_FC_DATA()                                         \
-  for (int i = 0; i < input_tensor->numel(); i++) {               \
-    *fp_data = scale_list[0] * (*int_data);                       \
-    ++fp_data;                                                    \
-    ++int_data;                                                   \
-  }
+#define PROCESS_FC_DATA()                                                   \
+  for (int64_t i = 0; i < chin; i++) {                                      \
+    for (int64_t j = 0; j < chout; j++) {                                   \
+      fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j];     \
+    }                                                                       \
+  }
+
+  auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) {
+    bool result = false;
+    if (op_desc->HasAttr("quantization_type")) {
+      std::string type = op_desc->GetAttr<std::string>("quantization_type");
+      result = (type == "post_weight_abs_max") ||
+               (type == "post_weight_channel_wise_abs_max");
+    } else {
+      result = op_desc->HasAttr("quantize_weight_bits");
+    }
+    return result;
+  };

   Tensor tmp_tensor;
-  CHECK(cpp_program_desc_.BlocksSize());
-  auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
-  for (size_t k = 0; k < main_block->OpsSize(); ++k) {
-    auto* op_desc = main_block->GetOp<cpp::OpDesc>(k);
-    if (op_desc->HasAttr("quantize_weight_bits")) {  // weight quantized op
-      auto input_names = op_desc->input_vars();
-      for (auto& input_name : input_names) {
-        std::string input_scale_name = input_name + "_quant_scale";
-        if (op_desc->HasAttr(input_scale_name)) {  // the input is quantized
-          auto input_tensor =
-              scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
-          tmp_tensor.CopyDataFrom(*input_tensor);
-          auto scale_list =
-              op_desc->GetAttr<std::vector<float>>(input_scale_name);
-          int quantize_weight_bits =
-              op_desc->GetAttr<int>("quantize_weight_bits");
-          float* fp_data = input_tensor->mutable_data<float>();
-
-          std::string op_type = op_desc->Type();
-          if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
-            int64_t h = input_tensor->dims()[0];
-            int64_t w = input_tensor->numel() / h;
-            CHECK_EQ(scale_list.size(), h);
-            if (quantize_weight_bits == 8) {
-              const int8_t* int_data = tmp_tensor.data<int8_t>();
-              PROCESS_CONV2D_DATA()
-            } else {
-              const int16_t* int_data = tmp_tensor.data<int16_t>();
-              PROCESS_CONV2D_DATA()
-            }
-          } else if (op_type == "fc" || op_type == "mul") {
-            if (quantize_weight_bits == 8) {
-              const int8_t* int_data = tmp_tensor.data<int8_t>();
-              PROCESS_FC_DATA()
-            } else {
-              const int16_t* int_data = tmp_tensor.data<int16_t>();
-              PROCESS_FC_DATA()
+  for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
+    auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
+    for (size_t k = 0; k < block->OpsSize(); ++k) {
+      auto* op_desc = block->GetOp<cpp::OpDesc>(k);
+      if (is_weight_quantized_op(op_desc)) {
+        auto input_names = op_desc->input_vars();
+        for (auto& input_name : input_names) {
+          std::string input_scale_name = input_name + "_quant_scale";
+          if (op_desc->HasAttr(input_scale_name)) {  // the input is quantized
+            auto input_tensor =
+                scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
+            tmp_tensor.CopyDataFrom(*input_tensor);
+            auto scale_list =
+                op_desc->GetAttr<std::vector<float>>(input_scale_name);
+            int quantize_weight_bits =
+                op_desc->GetAttr<int>("quantize_weight_bits");
+            CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16);
+            float* fp_data = input_tensor->mutable_data<float>();
+
+            std::string op_type = op_desc->Type();
+            if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
+              int64_t ch = input_tensor->dims()[0];
+              int64_t offset = input_tensor->numel() / ch;
+              CHECK_EQ(scale_list.size(), ch);
+              if (quantize_weight_bits == 8) {
+                const int8_t* int_data = tmp_tensor.data<int8_t>();
+                PROCESS_CONV2D_DATA()
+              } else {
+                const int16_t* int_data = tmp_tensor.data<int16_t>();
+                PROCESS_CONV2D_DATA()
+              }
+            } else if (op_type == "fc" || op_type == "mul") {
+              int64_t chin = input_tensor->dims()[0];
+              int64_t chout = input_tensor->dims()[1];
+              CHECK_EQ(scale_list.size(), chout);
+              if (quantize_weight_bits == 8) {
+                const int8_t* int_data = tmp_tensor.data<int8_t>();
+                PROCESS_FC_DATA()
+              } else {
+                const int16_t* int_data = tmp_tensor.data<int16_t>();
+                PROCESS_FC_DATA()
+              }
+            }
+          }
+        }
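A note on the hunk above: the old PROCESS_FC_DATA macro applied a single scale (scale_list[0]) to the whole fc/mul weight, while the new one treats the weight as [chin, chout] and indexes the scale by output channel, matching channel-wise abs_max quantization; conv weights keep one scale per output channel (row). Below is a minimal standalone sketch of the two dequantization layouts; the helper names and the main() example are illustrative only, not Paddle-Lite code.

#include <cstdint>
#include <iostream>
#include <vector>

// conv2d / depthwise_conv2d: view the weight as [ch, offset]; every row
// (output channel) shares one scale, so scale_list.size() == ch.
std::vector<float> DequantConvWeight(const std::vector<int8_t>& int_data,
                                     const std::vector<float>& scale_list,
                                     int64_t ch, int64_t offset) {
  std::vector<float> fp_data(ch * offset);
  for (int64_t i = 0; i < ch; ++i) {
    for (int64_t j = 0; j < offset; ++j) {
      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];
    }
  }
  return fp_data;
}

// fc / mul: the weight is [chin, chout] and the scale is indexed by the
// output channel (column), so scale_list.size() == chout.
std::vector<float> DequantFcWeight(const std::vector<int8_t>& int_data,
                                   const std::vector<float>& scale_list,
                                   int64_t chin, int64_t chout) {
  std::vector<float> fp_data(chin * chout);
  for (int64_t i = 0; i < chin; ++i) {
    for (int64_t j = 0; j < chout; ++j) {
      fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j];
    }
  }
  return fp_data;
}

int main() {
  // A 2x3 fc weight with one scale per output channel.
  std::vector<int8_t> w = {1, 2, 3, 4, 5, 6};
  std::vector<float> scales = {0.1f, 0.2f, 0.3f};
  for (float v : DequantFcWeight(w, scales, 2, 3)) std::cout << v << " ";
  std::cout << "\n";  // prints: 0.1 0.4 0.9 0.4 1 1.8
  return 0;
}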
lite/core/mir/fusion/conv_bn_fuser.cc

@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   }
   size_t weight_num = conv_weight_t->data_size();
   bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
-  bool is_weight_quantization =
-      conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
+  bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");

   // comupte BN alpha and beta
   Tensor alpha_tensor, beta_tensor;
lite/core/mir/weight_quantization_preprocess_pass.cc

@@ -22,9 +22,29 @@ namespace paddle {
 namespace lite {
 namespace mir {

+bool IsAbsMaxQuantizedOp(const OpInfo& op_info) {
+  bool result = false;
+  if (op_info.HasAttr("quantization_type") &&
+      op_info.GetAttr<std::string>("quantization_type") ==
+          "post_weight_abs_max") {
+    result = true;
+  } else if (!op_info.HasAttr("quantization_type") &&
+             op_info.HasAttr("quantize_weight_bits")) {
+    // Support older model, save this for now
+    result = true;
+  }
+  return result;
+}
+
+/*
+ * For abs_max method in WeightQuantization, this pass obtains the scale value
+ * of conv2d, depthwise_conv2d and mul, expands the scale list, and save the
+ * list in the quantized ops.
+ */
 void WeightQuantizationPreprocessPass::Apply(
     const std::unique_ptr<SSAGraph>& graph) {
-  std::vector<std::string> weight_quantized_op = {"conv2d", "depthwise_conv2d"};
+  std::vector<std::string> weight_quantized_op = {
+      "conv2d", "depthwise_conv2d", "mul"};
   for (auto& node : graph->StmtTopologicalOrder()) {
     if (node->IsStmt() &&
         std::find(weight_quantized_op.begin(),
@@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply(
                   node->AsStmt().op_type()) != weight_quantized_op.end()) {
       auto* scope = node->stmt()->op()->scope();
       auto* op_desc = node->stmt()->mutable_op_info();
-      if (op_desc->HasAttr("quantize_weight_bits")) {
+      if (IsAbsMaxQuantizedOp(*op_desc)) {
         for (auto& input_name : op_desc->input_vars()) {
           std::string scale_name = input_name + "_quant_scale";
           if (op_desc->HasAttr(scale_name)) {
-            VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name;
+            VLOG(0) << " WeightQuantizationPreprocessPass op:"
+                    << op_desc->Type() << " input_name:" << input_name;
             auto input_tensor =
                 scope->FindVar(input_name)->GetMutable<lite::Tensor>();
-            int weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
+            int weight_out_channel;
+            if (op_desc->Type() == "mul") {
+              weight_out_channel = static_cast<int>(input_tensor->dims()[1]);
+            } else {
+              weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
+            }
             auto input_scale = op_desc->GetAttr<std::vector<float>>(scale_name);
             // scale length is equal to weight out channel
             std::vector<float> scale_list(weight_out_channel, input_scale[0]);
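The key addition above is that the pass now also handles mul: because a mul weight is laid out as [in, out], its output-channel count comes from dims()[1] rather than dims()[0]. Below is a standalone sketch of the scale expansion under that convention; the function name and example shapes are hypothetical, not the Paddle-Lite pass API.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Replicate a single abs_max scale so that every output channel of the weight
// gets an entry; later stages can then treat the op as channel-wise quantized.
std::vector<float> ExpandAbsMaxScale(const std::string& op_type,
                                     const std::vector<int64_t>& weight_dims,
                                     float abs_max_scale) {
  // For "mul" the output-channel count is dims[1]; for conv2d and
  // depthwise_conv2d it is dims[0].
  int weight_out_channel = static_cast<int>(
      op_type == "mul" ? weight_dims[1] : weight_dims[0]);
  // scale length is equal to weight out channel
  return std::vector<float>(weight_out_channel, abs_max_scale);
}

int main() {
  auto conv_scales = ExpandAbsMaxScale("conv2d", {32, 3, 3, 3}, 0.05f);
  auto mul_scales = ExpandAbsMaxScale("mul", {1024, 1000}, 0.02f);
  std::cout << conv_scales.size() << " " << mul_scales.size() << "\n";  // 32 1000
  return 0;
}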
lite/core/mir/weight_quantization_preprocess_pass.h

@@ -25,8 +25,9 @@ namespace mir {
  * If the model is quantized by WeightQuantization in PostTrainingQuantization,
  * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is
  * int, and the scale is save in the quantized ops.
- * WeightQuantizationPreprocessPass obtains the scale value, expands the
- * scale value to a list, and save the list in the quantized ops.
+ * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass
+ * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the
+ * scale list, and save the list in the quantized ops.
  */
 class WeightQuantizationPreprocessPass : public ProgramPass {
  public:
lite/tools/benchmark.sh

@@ -2,12 +2,12 @@
 set -e

 # Check input
-if [ $# -lt 2 ];
+if [ $# -lt 3 ];
 then
     echo "Input error"
     echo "Usage:"
-    echo "  sh benchmark.sh benchmark_bin_path benchmark_models_path <result_filename> <input_shape> <power_mode: [0|1|2|3]> <is_run_model_optimize: [true|false]> <is_run_quantized_model: [trur|false]>"
-    echo "\npower_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind."
+    echo "  sh benchmark.sh <benchmark_bin_path> <benchmark_models_path> <result_filename>"
+    echo "  sh benchmark.sh <benchmark_bin_path> <benchmark_models_path> <result_filename> <is_run_model_optimize: [true|false]>"
     exit
 fi
@@ -15,10 +15,8 @@ fi
 ANDROID_DIR=/data/local/tmp
 BENCHMARK_BIN=$1
 MODELS_DIR=$2
-RESULT_FILENAME=result.txt
-
-INPUT_SHAPE=1,3,244,244
-POWER_MODE=3
+RESULT_FILENAME=$3
+
 WARMUP=10
 REPEATS=30
 IS_RUN_MODEL_OPTIMIZE=false
@@ -27,25 +25,9 @@ NUM_THREADS_LIST=(1 2 4)
 MODELS_LIST=$(ls $MODELS_DIR)

 # Check input
-if [ $# -gt 2 ];
-then
-    RESULT_FILENAME=$3
-fi
-if [ $# -gt 3 ];
-then
-    INPUT_SHAPE=$4
-fi
-if [ $# -gt 4 ];
-then
-    POWER_MODE=$5
-fi
-if [ $# -gt 5 ];
-then
-    IS_RUN_MODEL_OPTIMIZE=$6
-fi
-if [ $# -gt 6 ];
-then
-    IS_RUN_QUANTIZED_MODEL=$7
+if [ $# -gt 3 ];
+then
+    IS_RUN_MODEL_OPTIMIZE=$4
 fi

 # Adb push benchmark_bin, models
@@ -54,26 +36,31 @@ adb shell chmod +x $ANDROID_DIR/benchmark_bin
 adb push $MODELS_DIR $ANDROID_DIR

 # Run benchmark
-adb shell "echo 'PaddleLite Benchmark (in ms)\n' > $ANDROID_DIR/$RESULT_FILENAME"
+adb shell "echo 'PaddleLite Benchmark' > $ANDROID_DIR/$RESULT_FILENAME"
 for threads in ${NUM_THREADS_LIST[@]}; do
-    adb shell "echo threads=$threads warmup=$WARMUP repeats=$REPEATS input_shape=$INPUT_SHAPE power_mode=$POWER_MODE >> $ANDROID_DIR/$RESULT_FILENAME"
+    adb shell "echo Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
     for model_name in ${MODELS_LIST[@]}; do
         echo "Model=$model_name Threads=$threads"
-        adb shell "$ANDROID_DIR/benchmark_bin \
-                   --model_dir=$ANDROID_DIR/${MODELS_DIR}/$model_name \
-                   --input_shape=$INPUT_SHAPE \
-                   --warmup=$WARMUP \
-                   --repeats=$REPEATS \
-                   --threads=$threads \
-                   --power_mode=$POWER_MODE \
-                   --result_filename=$ANDROID_DIR/$RESULT_FILENAME \
-                   --run_model_optimize=$IS_RUN_MODEL_OPTIMIZE \
-                   --is_quantized_model=$IS_RUN_QUANTIZED_MODEL"
+        if [ "$IS_RUN_MODEL_OPTIMIZE" = true ];
+        then
+            adb shell "$ANDROID_DIR/benchmark_bin \
+                       --model_dir=$ANDROID_DIR/${MODELS_DIR}/$model_name \
+                       --warmup=$WARMUP \
+                       --repeats=$REPEATS \
+                       --threads=$threads \
+                       --result_filename=$ANDROID_DIR/$RESULT_FILENAME"
+        else
+            adb shell "$ANDROID_DIR/benchmark_bin \
+                       --optimized_model_path=$ANDROID_DIR/${MODELS_DIR}/$model_name \
+                       --warmup=$WARMUP \
+                       --repeats=$REPEATS \
+                       --threads=$threads \
+                       --result_filename=$ANDROID_DIR/$RESULT_FILENAME"
+        fi
     done
     adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
 done
 adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
+adb shell "echo power_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind >> $ANDROID_DIR/$RESULT_FILENAME"

 # Adb pull benchmark result, show result
 adb pull $ANDROID_DIR/$RESULT_FILENAME .
 echo "\n--------------------------------------"