Unverified commit 6da043eb, authored by ceci3, committed by GitHub

support ernie trt-int8 for inference (#32232)

* support ernie trt-int8 for inference

* fix reshape
Parent fabdb43c
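
This commit wires Paddle's quantization metadata through to TensorRT int8 engines for ERNIE: three IR fuse passes start tagging their fused ops with `enable_int8`, six TensorRT op converters consume the recorded scales, the op teller allow-lists the fused ops for int8, and the Python quantization pass learns to record output scales for `layer_norm` and `slice`. The recurring pattern on the converter side, using only calls that appear in the hunks below, is:

```cpp
// Recurring converter-side pattern in this commit: the quantization pass has
// stored an abs-max threshold as the "out_threshold" attribute, and the
// converter registers it with the engine as the tensor's int8 dynamic range.
if (op_desc.HasAttr("out_threshold")) {
  float out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
  engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
}
```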
@@ -299,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
       new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()});
       new_op_desc.SetAttr("epsilon",
                           end_patter_layernorms[k]->Op()->GetAttr("epsilon"));
+
+      if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) {
+        new_op_desc.SetAttr("enable_int8", true);
+      }
+
       auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc);
       for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) {
......
@@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
     multihead_op_desc.SetAttr("alpha", scale_attr);
     multihead_op_desc.SetAttr("head_number", head_number);
+
+    auto* mul0_op_desc = mul0->Op();
+    auto* mul1_op_desc = mul1->Op();
+    auto* mul2_op_desc = mul2->Op();
+    if (mul0_op_desc->HasAttr("enable_int8")) {
+      multihead_op_desc.SetAttr("enable_int8",
+                                mul0_op_desc->GetAttr("enable_int8"));
+      // all mul op has same input.
+      multihead_op_desc.SetAttr("Input_scale",
+                                mul0_op_desc->GetAttr("X_scale"));
+      auto weight_scale0 = BOOST_GET_CONST(
+          std::vector<float>, mul0_op_desc->GetAttr("weight_scale"));
+      auto weight_scale1 = BOOST_GET_CONST(
+          std::vector<float>, mul1_op_desc->GetAttr("weight_scale"));
+      auto weight_scale2 = BOOST_GET_CONST(
+          std::vector<float>, mul2_op_desc->GetAttr("weight_scale"));
+      auto weight_max = std::max(weight_scale0, weight_scale1);
+      weight_max = std::max(weight_max, weight_scale2);
+      multihead_op_desc.SetAttr("weight_scale", weight_max);
+      if (mul0_op_desc->HasAttr("out_threshold")) {
+        auto out_scale0 =
+            BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold"));
+        auto out_scale1 =
+            BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold"));
+        auto out_scale2 =
+            BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold"));
+        auto out_scale_max = std::max(out_scale0, out_scale1);
+        out_scale_max = std::max(out_scale_max, out_scale2);
+        multihead_op_desc.SetAttr("out_threshold", out_scale_max);
+      }
+    }
+
     auto* multihead = graph->CreateOpNode(&multihead_op_desc);
     IR_NODE_LINK_TO(input0, multihead);
......
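Because the three Q/K/V `mul` ops share one input tensor (as the `// all mul op has same input.` comment notes), a single `Input_scale` suffices, but each `mul` carries its own `weight_scale` vector and `out_threshold`, and the fused op can keep only one of each. For the scalar `out_threshold` the `std::max` chain is straightforward; for the `weight_scale` vectors, note that `std::max` on `std::vector<float>` compares lexicographically and returns one whole vector. A per-channel element-wise maximum, if that were the intent, would look like this hypothetical sketch (not what the hunk does):

```cpp
// Hypothetical element-wise variant, assuming the three Q/K/V weight-scale
// vectors have equal length; the hunk above instead selects one whole vector
// by lexicographic comparison.
std::vector<float> weight_max(weight_scale0.size());
for (size_t i = 0; i < weight_max.size(); ++i) {
  weight_max[i] =
      std::max({weight_scale0[i], weight_scale1[i], weight_scale2[i]});
}
```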
@@ -153,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
   new_desc.SetInput("Scale", {layer_norm_scale->Name()});
   new_desc.SetInput("Bias", {layer_norm_bias->Name()});
+
+  if (elementwise->Op()->HasAttr("out_threshold")) {
+    new_desc.SetAttr("enable_int8", true);
+  }
   // outputs
   new_desc.SetOutput("Out", {layer_norm_out->Name()});
......
@@ -31,7 +31,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
 #if IS_TRT_VERSION_GE(6000)
-    VLOG(4) << "convert fluid swish op to tensorrt layer";
+    VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer";
     framework::OpDesc op_desc(op, nullptr);
     auto id_names = op_desc.Input("Ids");
......
@@ -89,10 +89,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     int64_t bias_size = framework::product(bias_dims);
     int64_t scale_size = framework::product(scale_dims);
     nvinfer1::ILayer* layer = nullptr;
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
         int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
+        if (enable_int8) {
+          output_fp16 = 1;
+        }
         PADDLE_ENFORCE_EQ(
             output_fp16, 1,
             platform::errors::InvalidArgument(
......
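In the oss (variable-seqlen) path, `enable_int8` forces `output_fp16 = 1`, so the `PADDLE_ENFORCE_EQ(output_fp16, 1, ...)` check that follows can no longer reject an int8 build: the int8 variant of this plugin always emits half-precision output. Condensed, the flag logic above is:

```cpp
// Condensed form of the hunk above: int8 mode implies fp16 output for the
// oss embedding-eltwise-layernorm plugin.
int output_fp16 = (engine_->WithFp16() == 1 || enable_int8) ? 1 : 0;
```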
@@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter {
     auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output,
                          TensorRTEngine::Weight& weight,
                          TensorRTEngine::Weight& bias) {
-      auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
-                                            n_output, weight.get(), bias.get());
+      nvinfer1::ILayer* fc_layer = nullptr;
+      if (enable_int8) {
+        PADDLE_ENFORCE_EQ(
+            op_desc.HasAttr("out_threshold"), true,
+            platform::errors::InvalidArgument(
+                "must have out threshold in fc layers in int8 mode"));
+        float out_scale =
+            BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+        nvinfer1::DimsHW nv_ksize(1, 1);
+        fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
+                                        nv_ksize, weight.get(), bias.get());
+        engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+      } else {
+        fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs,
+                                        n_output, weight.get(), bias.get());
+      }
       auto output_name = op_desc.Output("Out").front();
       if (activation_type == "relu") {
@@ -229,13 +243,24 @@ class FcOpConverter : public OpConverter {
             "dims equals to 4, the last dim of input must be 1, but got %d",
             input_d[3]));
       }
-      for (int i = 0; i < 3; i++) {
-        if (i < input_dims) {
-          reshape_dim3[i] = input_d[i];
-        } else {
-          reshape_dim3[i] = 1;
+
+      if (enable_int8) {
+        reshape_dim3[0] = 1;
+        for (int i = 0; i < 3; i++) {
+          reshape_dim3[0] *= input_d[i];
+          if (i > 0) {
+            reshape_dim3[i] = 1;
+          }
+        }
+      } else {
+        for (int i = 0; i < 3; i++) {
+          if (i < input_dims) {
+            reshape_dim3[i] = input_d[i];
+          } else {
+            reshape_dim3[i] = 1;
+          }
         }
       }
       nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1],
                                   reshape_dim3[2]);
       auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
@@ -249,11 +274,25 @@ class FcOpConverter : public OpConverter {
           platform::errors::InvalidArgument(
               "Invalid dimensions. When x_num_col_dims equals to "
               "2, input_dims should not be 1"));
-      for (int i = 0; i < 4; i++) {
-        if (i < input_dims) {
-          reshape_dim4[i] = input_d[i];
-        } else {
-          reshape_dim4[i] = 1;
+
+      if (enable_int8) {
+        for (int i = 0; i < 4; i++) {
+          if (i == 0) {
+            reshape_dim4[i] = input_d[i];
+          } else {
+            reshape_dim4[i] = 1;
+            if (i < input_dims) {
+              reshape_dim4[1] *= input_d[i];
+            }
+          }
+        }
+      } else {
+        for (int i = 0; i < 4; i++) {
+          if (i < input_dims) {
+            reshape_dim4[i] = input_d[i];
+          } else {
+            reshape_dim4[i] = 1;
+          }
         }
       }
       nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1],
......
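The reshape fork exists because the int8 branch of `regist_fc` (first hunk in this file) builds the FC as a 1x1 `Convolution` instead of a `FullyConnected` layer, and a 1x1 convolution reduces over the channel axis, so the feature dimensions must be folded into one dimension rather than each padded with 1. A condensed restatement of the 4-dim int8 branch, assuming `input_d` holds `input_dims` valid entries with `input_dims <= 4`:

```cpp
// int8 path: keep the leading dim, fold the remaining input dims into the
// channel dim, and pad with 1s -> {d0, d1*d2*..., 1, 1}. The fp32/fp16 path
// instead keeps each dim and pads the tail -> {d0, d1, d2, 1}.
int reshape_dim4[4] = {input_d[0], 1, 1, 1};
for (int i = 1; i < input_dims; i++) {
  reshape_dim4[1] *= input_d[i];
}
```

The 3-dim case does the same thing one rank lower, folding everything into dim 0.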
@@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter {
     auto* bias_v = scope.FindVar(bias_name);
     auto* bias_t = bias_v->GetMutable<framework::LoDTensor>();
-    float* weight_data =
-        engine_->GetWeightCPUData(weight_name, weight_t, false);
+    float* weight_data = nullptr;
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
+    float in_scale = 0.;
+    if (enable_int8) {
+      PADDLE_ENFORCE_EQ(
+          op_desc.HasAttr("Input_scale"), true,
+          platform::errors::InvalidArgument(
+              "must have input scale in multihead layers in int8 mode"));
+      in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127;
+      auto weight_scale =
+          BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("weight_scale"));
+      weight_data =
+          engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale);
+      engine_->SetTensorDynamicRange(input, in_scale);
+    } else {
+      weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false);
+    }
+
     float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false);
     std::vector<float> weight_data_tmp;
     weight_data_tmp.reserve(weight_t->numel());
......
@@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                  static_cast<void*>(bias_data),
                                  static_cast<int32_t>(bias_t->numel())};
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight, bias);
+        nvinfer1::ILayer* fc_layer = nullptr;
+        float dp_probs = 1.0 / 127.0;
+        if (enable_int8) {
+          nvinfer1::DimsHW nv_ksize(1, 1);
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n,
+                                          nv_ksize, weight, bias);
+        } else {
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
+                                          weight, bias);
+        }
+
+        if (enable_int8) {
+          PADDLE_ENFORCE_EQ(
+              op_desc.HasAttr("out_threshold"), true,
+              platform::errors::InvalidArgument(
+                  "must have out threshold in multihead layers in int8 mode"));
+          float out_scale =
+              BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+          dp_probs = out_scale / 127.0;
+        }
+
         auto mask_tensor = engine_->GetITensor("qkv_plugin_mask");
@@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter {
         int type = static_cast<int>((engine_->WithFp16() == 1)
                                         ? nvinfer1::DataType::kHALF
                                         : nvinfer1::DataType::kFLOAT);
+        if (enable_int8) {
+          type = static_cast<int>(nvinfer1::DataType::kHALF);
+        }
         bool has_mask = true;
         int var_seqlen = 1;
         const std::vector<nvinfer1::PluginField> fields{
@@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
             {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
             {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
             {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1},
-        };
+            {"dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1}};
         nvinfer1::PluginFieldCollection* plugin_collection =
             static_cast<nvinfer1::PluginFieldCollection*>(
                 malloc(sizeof(*plugin_collection) +
......
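Two scale conventions meet in this converter: the recorded `Input_scale` is multiplied by 127 before being registered as the input's dynamic range, while the plugin receives `dq_probs = out_threshold / 127.0` (defaulting to `1.0 / 127.0`; note the plugin field is named "dq_probs" while the local variable is `dp_probs`). Assuming `TensorRTEngine::SetTensorDynamicRange(t, s)` ultimately forwards the threshold to TensorRT as a symmetric range, registering a scale amounts to:

```cpp
// Assumption about what SetTensorDynamicRange(tensor, out_scale) forwards to:
// TensorRT's symmetric per-tensor range, mapping int8 value 127 to out_scale.
tensor->setDynamicRange(-out_scale, out_scale);
```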
@@ -49,6 +49,7 @@ class SkipLayerNormOpConverter : public OpConverter {
     auto* scale = get_persistable_data("Scale", &scale_dims);
     int bias_size = framework::product(bias_dims);
     int scale_size = framework::product(scale_dims);
+    bool enable_int8 = op_desc.HasAttr("enable_int8");
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
@@ -62,6 +63,10 @@ class SkipLayerNormOpConverter : public OpConverter {
         int ld = input1->getDimensions().d[2];  // hidden dimension
         assert(ld > 0);
+        if (enable_int8) {
+          type = static_cast<int>(nvinfer1::DataType::kHALF);
+        }
+
         const std::vector<nvinfer1::PluginField> fields{
             {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
             {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1},
......
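Both this converter and the multihead one above pin the plugin's `type_id` to `kHALF` whenever `enable_int8` is set, consistent with the embedding converter forcing fp16 output: as far as this diff shows, the int8 path drives the oss plugins with half-precision type ids and expresses int8 through the registered dynamic ranges. Condensed, the selection in both places is:

```cpp
// Condensed form of the type_id selection used by the multihead and
// skip_layernorm converters in this commit: enable_int8 wins, and both the
// int8 and fp16 cases map to kHALF.
int type = static_cast<int>((enable_int8 || engine_->WithFp16() == 1)
                                ? nvinfer1::DataType::kHALF
                                : nvinfer1::DataType::kFLOAT);
```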
@@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter {
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("Input")[0]);
+
+    if (op_desc.HasAttr("out_threshold")) {
+      float out_scale =
+          BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      engine_->SetTensorDynamicRange(input, out_scale);
+    }
     std::vector<int> axes =
         BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("axes"));
     std::vector<int> starts =
......
@@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter {
   for (int i = 0; i < input_num; ++i) {
     inputs[i] = engine_->GetITensor(input[i]);
+    if (op_desc.HasAttr("out_threshold")) {
+      float out_scale =
+          BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      engine_->SetTensorDynamicRange(inputs[i], out_scale);
+    }
   }
   int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
......
@@ -45,6 +45,12 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
 #if IS_TRT_VERSION_GE(7130)
     teller_set.insert("group_norm");
+    int8_teller_set.insert("multihead_matmul");
+    int8_teller_set.insert("skip_layernorm");
+    int8_teller_set.insert("fused_embedding_eltwise_layernorm");
+    int8_teller_set.insert("matmul");
+    int8_teller_set.insert("stack");
+    int8_teller_set.insert("slice");
 #endif
   }
......
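`int8_teller_set` is the allow-list consulted when the engine is built with int8 precision, so these six ops only become int8-eligible on TensorRT >= 7.1.3. `IS_TRT_VERSION_GE(7130)` is a compile-time gate assembled from the TensorRT version macros; a sketch of the usual form of such a gate (an assumption about the exact definition, not quoted from this diff):

```cpp
// Sketch (assumed form): encode major/minor/patch/build into one integer and
// compare, so IS_TRT_VERSION_GE(7130) means TensorRT >= 7.1.3.0.
#define IS_TRT_VERSION_GE(version)                       \
  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
```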
@@ -60,6 +60,7 @@ _out_scale_op_list = [
     "swish",
     "softmax",
     "batch_norm",
+    "layer_norm",
     "elementwise_add",
     "pool2d",
     "reshape2",
@@ -67,6 +68,7 @@ _out_scale_op_list = [
     "concat",
     "elementwise_mul",
     "scale",
+    "slice",
     "hard_swish",
     "hard_sigmoid",
     "conv2d_transpose",
@@ -119,6 +121,7 @@ _op_real_in_out_name = {
     "swish": [["X"], ["Out"]],
     "dropout": [["X"], ["Out"]],
     "batch_norm": [["X"], ["Y"]],
+    "layer_norm": [["X"], ["Y"]],
     "sigmoid": [["X"], ["Out"]],
     "elementwise_mul": [["X", "Y"], ["Out"]],
     "scale": [["X"], ["Out"]],
@@ -1749,7 +1752,7 @@ class AddQuantDequantPass(object):
         "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
         "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6",
         "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2",
-        "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm"
+        "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm"
     ]
     # To be compatible with PaddleSlim, not remove _activation_type for now
......