Preln fix (#49802)

* preln_residual 2 fused_bias_residual * skip layernorm fix and ut * code refine * code style refine * fix ut * fix output * add trt layer fall back info * refine op teller and ut * DropoutMaskOut output fix

Preln fix (#49802)
* preln_residual 2 fused_bias_residual * skip layernorm fix and ut * code refine * code style refine * fix ut * fix output * add trt layer fall back info * refine op teller and ut * DropoutMaskOut output fix
e03718f5 · Wang Bojun · GitHub · 9fa2eb38 · e03718f5 · e03718f5
12 changed files
--- a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc
@@ -129,6 +129,24 @@ void PrelnResidualBias::operator()(PDNode *x, PDNode *y) {
 }  // namespace patterns
+void setIntermediateOut(OpDesc *desc,  // op description whose output slot gets assigned
+                        const std::string &out_name,  // output slot name; also embedded in the generated variable name
+                        const std::string &scope_name) {  // prefix of the generated variable name
+  std::string new_name = scope_name + "/at." + out_name + ".new";  // yields "<scope_name>/at.<out_name>.new"
+  desc->SetOutput(out_name, {new_name});  // the slot is set to a list holding only this generated name
+}
+void addIntermediateOut(Node *op_node,  // op node that gets linked to the newly created variable node
+                        const std::string &out_name,  // output slot name embedded in the variable name
+                        const std::string &scope_name,  // prefix of the variable name
+                        Graph *graph) {  // graph in which the variable node is created
+  std::string new_name = scope_name + "/at." + out_name + ".new";  // same format setIntermediateOut uses, so equal arguments give the same name
+  VarDesc out_var(new_name);
+  out_var.SetPersistable(false);  // the new variable is flagged as non-persistable
+  auto *node_var = graph->CreateVarNode(&out_var);  // CreateVarNode result is what gets linked below
+  IR_NODE_LINK_TO(op_node, node_var);  // presumably records node_var as an output of op_node; macro defined elsewhere, TODO confirm
+}
 int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph,
                                            bool with_bias) const {
  PADDLE_ENFORCE_NOT_NULL(
@@ -207,7 +225,7 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph,
    // on each other, so we make below check to ensure only one
    // PrelnResidualBias pattern is dealt with.
    for (auto op : elementwise1_out->inputs) {
-      if (op->Name() == "preln_residual_bias") return;
+      if (op->Name() == "fused_bias_dropout_residual_layer_norm") return;
    }
    if (!IsCompat(subgraph, graph)) {
@@ -218,31 +236,37 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph,
    std::unordered_set<const Node *> del_node_set;
    // Create a PrelnResidualBias op node
    OpDesc new_desc;
-    new_desc.SetType("preln_residual_bias");
+    new_desc.SetType("fused_bias_dropout_residual_layer_norm");
    // inputs
    new_desc.SetInput("X", {subgraph.at(x)->Name()});
-    new_desc.SetInput("Y", {subgraph.at(y)->Name()});
+    new_desc.SetInput("Residual", {subgraph.at(y)->Name()});
-    new_desc.SetInput("Scale", {layer_norm_scale->Name()});
+    new_desc.SetInput("LnScale", {layer_norm_scale->Name()});
-    new_desc.SetInput("Bias", {layer_norm_bias->Name()});
+    new_desc.SetInput("LnBias", {layer_norm_bias->Name()});
    if (with_bias) {
-      new_desc.SetInput("EleBias", {elementwise_bias->Name()});
+      new_desc.SetInput("Bias", {elementwise_bias->Name()});
    }
    // outputs
-    new_desc.SetOutput("Out_0", {layer_norm_out->Name()});
+    new_desc.SetOutput("Y", {layer_norm_out->Name()});
-    new_desc.SetOutput("Out_1", {elementwise1_out->Name()});
+    new_desc.SetOutput("BiasDropoutResidualOut", {elementwise1_out->Name()});
+    new_desc.SetOutput("LnMean", {layer_norm_mean->Name()});
+    new_desc.SetOutput("LnVariance", {layer_norm_variance->Name()});
+    setIntermediateOut(&new_desc, "DropoutMaskOut", "preln_residual_bias_fuse");
    // attrs
-    new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon"));
+    new_desc.SetAttr("ln_epsilon", layer_norm->Op()->GetAttr("epsilon"));
+    new_desc.SetAttr("dropout_rate", 0.0f);
+    new_desc.SetAttr("is_test", true);
    new_desc.SetAttr("begin_norm_axis",
                     layer_norm->Op()->GetAttr("begin_norm_axis"));
    auto fused_node = graph->CreateOpNode(&new_desc);  // OpDesc will be copied.
+    addIntermediateOut(
+        fused_node, "DropoutMaskOut", "preln_residual_bias_fuse", graph);
    if (with_bias) {
      del_node_set.insert(elementwise0);
      del_node_set.insert(elementwise0_out);
    }
    del_node_set.insert(elementwise1);
    del_node_set.insert(layer_norm);
-    del_node_set.insert(layer_norm_mean);
-    del_node_set.insert(layer_norm_variance);
    GraphSafeRemoveNodes(graph, del_node_set);
    IR_NODE_LINK_TO(subgraph.at(x), fused_node);
    IR_NODE_LINK_TO(subgraph.at(y), fused_node);
@@ -253,6 +277,9 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph,
    IR_NODE_LINK_TO(layer_norm_bias, fused_node);
    IR_NODE_LINK_TO(fused_node, layer_norm_out);
    IR_NODE_LINK_TO(fused_node, elementwise1_out);
+    IR_NODE_LINK_TO(fused_node, layer_norm_mean);
+    IR_NODE_LINK_TO(fused_node, layer_norm_variance);
    found_subgraph_count++;
  };
@@ -261,6 +288,8 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph,
 }
 void PrelnResidualBiasFusePass::ApplyImpl(ir::Graph *graph) const {
+  VLOG(1) << "Fuse PrelnResidualBias into "
+             "fused_bias_dropout_residual_layer_norm op with dropout rate = 0";
  PADDLE_ENFORCE_NOT_NULL(
      graph, platform::errors::PreconditionNotMet("graph should not be null."));
  FusePassBase::Init("preln_residual_bias_fuse", graph);

--- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc
@@ -170,7 +170,7 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
    // attrs
    new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon"));
-    if (new_desc.HasAttr("begin_norm_axis")) {
+    if (layer_norm->Op()->HasAttr("begin_norm_axis")) {
      int32_t begin_norm_axis = PADDLE_GET_CONST(
          int32_t, layer_norm->Op()->GetAttr("begin_norm_axis"));
      int32_t input_rank =

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2464,7 +2464,7 @@ USE_TRT_CONVERTER(rsqrt);
 USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm)
 USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
 USE_TRT_CONVERTER(preln_skip_layernorm)
-USE_TRT_CONVERTER(preln_residual_bias)
+USE_TRT_CONVERTER(fused_bias_dropout_residual_layer_norm)
 USE_TRT_CONVERTER(c_allreduce_sum)
 USE_TRT_CONVERTER(roll)
 USE_TRT_CONVERTER(strided_slice)

--- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc
@@ -26,16 +26,12 @@ class PrelnResidualBiasOpConverter : public OpConverter {
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope,
                  bool test_mode) override {
-    VLOG(4) << "convert fused preln_residual_bias op to tensorrt layer";
+    VLOG(4) << "convert fused_bias_dropout_residual_layer_norm op with "
-    if (!engine_->with_dynamic_shape()) {
+               "drop_rate = 0 to preln_residual_bias tensorrt layer";
-      PADDLE_THROW(
-          platform::errors::Fatal("Unsupported static graph mode. Please set "
-                                  "dynamic shape of inputs."));
-    }
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
-    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
+    auto* input2 = engine_->GetITensor(op_desc.Input("Residual")[0]);
    std::vector<nvinfer1::ITensor*> inputs;
    inputs.push_back(input1);
    inputs.push_back(input2);
@@ -50,18 +46,18 @@ class PrelnResidualBiasOpConverter : public OpConverter {
      return temp_data;
    };
    framework::DDim bias_dims, scale_dims, ele_bias_dims;
-    auto* bias = get_persistable_data("Bias", &bias_dims);
+    auto* bias = get_persistable_data("LnBias", &bias_dims);
-    auto* scale = get_persistable_data("Scale", &scale_dims);
+    auto* scale = get_persistable_data("LnScale", &scale_dims);
    auto const& vars = op_desc.Inputs(false);
-    bool has_bias = vars.find("EleBias") != vars.end();
+    bool has_bias = vars.find("Bias") != vars.end();
    float* ele_bias =
-        has_bias ? get_persistable_data("EleBias", &ele_bias_dims) : nullptr;
+        has_bias ? get_persistable_data("Bias", &ele_bias_dims) : nullptr;
    int bias_size = phi::product(bias_dims);
    int scale_size = phi::product(scale_dims);
    int ele_bias_size = has_bias ? phi::product(ele_bias_dims) : 0;
-    float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon"));
+    float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("ln_epsilon"));
    bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
    if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
      with_fp16 = true;
@@ -102,8 +98,8 @@ class PrelnResidualBiasOpConverter : public OpConverter {
    plugin_inputs.emplace_back(input2);
    layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
    std::vector<std::string> output_names;
-    output_names.push_back(op_desc.Output("Out_0")[0]);
+    output_names.push_back(op_desc.Output("Y")[0]);
-    output_names.push_back(op_desc.Output("Out_1")[0]);
+    output_names.push_back(op_desc.Output("BiasDropoutResidualOut")[0]);
    RreplenishLayerAndOutput(
        layer, "preln_residual_bias", output_names, test_mode);
  }
@@ -113,4 +109,5 @@ class PrelnResidualBiasOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
-REGISTER_TRT_OP_CONVERTER(preln_residual_bias, PrelnResidualBiasOpConverter);
+REGISTER_TRT_OP_CONVERTER(fused_bias_dropout_residual_layer_norm,
+                          PrelnResidualBiasOpConverter);
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -1495,7 +1495,21 @@ struct SimpleOpTypeSetTeller : public Teller {
        return false;
      }
    }
+    if (op_type == "fused_bias_dropout_residual_layer_norm") {
+      if (!with_dynamic_shape) {
+        VLOG(3) << "fused_bias_dropout_residual_layer_norm should run on "
+                   "dynamic shape mode.";
+        return false;
+      }
+      float dropout_rate =
+          PADDLE_GET_CONST(float, desc.GetAttr("dropout_rate"));
+      if (dropout_rate != 0.0f) {
+        VLOG(4) << "preln_residual_bias trt layer can not work with "
+                   "fused_bias_dropout_residual_layer_norm op in which the "
+                   "dropout_rate != 0, stop convert";
+        return false;
+      }
+    }
    if (op_type == "fused_preln_embedding_eltwise_layernorm") {
      if (!with_dynamic_shape) {
        VLOG(3) << "fused_preln_embedding_eltwise_layernorm should run on "
@@ -2594,7 +2608,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "slice",
      "strided_slice",
      "fused_preln_embedding_eltwise_layernorm",
-      "preln_residual_bias",
+      "fused_bias_dropout_residual_layer_norm",
      "c_allreduce_sum",
      "c_allreduce_min",
      "c_allreduce_max",
@@ -2744,7 +2758,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "strided_slice",
      "fused_preln_embedding_eltwise_layernorm",
      "preln_skip_layernorm",
-      "preln_residual_bias",
+      "fused_bias_dropout_residual_layer_norm",
      "c_allreduce_sum",
      "c_allreduce_min",
      "c_allreduce_max",

--- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
+++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc
@@ -35,16 +35,17 @@ class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel {
                   "Output",
                   "LnVariance",
                   "FusedBiasDropoutResidualLnOp");
-    OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"),
-                   "Output",
-                   "BiasDropoutResidualOut",
-                   "FusedBiasDropoutResidualLnOp");
    OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"),
                   "Output",
                   "DropoutMaskOut",
                   "FusedBiasDropoutResidualLnOp");
+    OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"),
+                   "Output",
+                   "BiasDropoutResidualOut",
+                   "FusedBiasDropoutResidualLnOp");
    OP_INOUT_CHECK(
        ctx->HasOutput("Y"), "Output", "Y", "FusedBiasDropoutResidualLnOp");
    auto x_dim = ctx->GetInputDim("X");
    int left = 1;
    for (int i = 0; i < x_dim.size() - 1; i++) {

--- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
+++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu
@@ -54,8 +54,12 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel<T> {
    auto *ln_mean_data =
        dev_ctx.Alloc<U>(ln_mean, ln_mean->numel() * sizeof(U));
    auto *ln_var_data = dev_ctx.Alloc<U>(ln_var, ln_var->numel() * sizeof(U));
-    auto *dropout_mask_out_data = dev_ctx.Alloc<uint8_t>(
+    auto *dropout_mask_out_data =
-        dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t));
+        (dropout_mask_out == nullptr)
+            ? nullptr
+            : dev_ctx.Alloc<uint8_t>(
+                  dropout_mask_out,
+                  dropout_mask_out->numel() * sizeof(uint8_t));
    auto *y_data = dev_ctx.Alloc<T>(y, y->numel() * sizeof(T));
    const auto input_x_dims = input_x->dims();

--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
@@ -854,9 +854,10 @@ void LaunchLayernormResidualDropoutBias(
                 residual,
                 rows * cols * sizeof(T),
                 ctx.stream());
+    if (mask_data != nullptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
          mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream()));
+    }
    // call layernorm forward
    switch (GetDesiredBlockDim(cols)) {
      FIXED_BLOCK_DIM_CASE(

--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -18,15 +18,6 @@ string(REPLACE ".py" "" TEST_TRT_CONVERTER "${TEST_TRT_CONVERTER}")
 if(NOT WITH_DISTRIBUTE)
  list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_delete_c_identity_op_pass")
-  list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES
-       "test_trt_convert_preln_residual_bias")
-  list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_preln_residual_bias")
-  list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_preln_residual_bias")
-  list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES
-       "test_trt_convert_preln_residual_no_bias")
-  list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_preln_residual_no_bias")
-  list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_preln_residual_no_bias")
  list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_trt_convert_c_allreduce")
  list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_c_allreduce")
  list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_c_allreduce")

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py
@@ -158,11 +158,24 @@ class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest):
            self.dynamic_shape.opt_input_shape = {}
        def generate_trt_nodes_num(attrs, dynamic_shape):
+            if dynamic_shape:
                return 1, 4
+            else:
+                return 0, 5
        attrs = [
            program_config.ops[i].attrs for i in range(len(program_config.ops))
        ]
+        # for static_shape, fall back to fluid fused op
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-2  # atol=1e-2 while rtol is 1e-8
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-2  # atol=1e-2 while rtol is 1e-8
        # just support dynamic_shape
        generate_dynamic_shape(attrs)

--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py
@@ -146,12 +146,26 @@ class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest):
            self.dynamic_shape.opt_input_shape = {}
        def generate_trt_nodes_num(attrs, dynamic_shape):
+            if dynamic_shape:
                return 1, 4
+            else:
+                return 0, 5
        attrs = [
            program_config.ops[i].attrs for i in range(len(program_config.ops))
        ]
+        # for static_shape, fall back to fluid fused op
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-2  # atol=1e-2 while rtol is 1e-8
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-2  # atol=1e-2 while rtol is 1e-8
        # just support dynamic_shape
        generate_dynamic_shape(attrs)
        self.trt_param.precision = paddle_infer.PrecisionType.Float32

--- a/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py
@@ -38,7 +38,7 @@ class PrelnResidualBiasFusePassTest(PassTest):
        self.fetch_list = [out, elementwise_out]
        self.pass_names = "preln_residual_bias_fuse_pass"
-        self.fused_op_type = "preln_residual_bias"
+        self.fused_op_type = "fused_bias_dropout_residual_layer_norm"
        self.num_fused_ops = 1
        # self.graph_attrs = {
        #     "embedding_eltwise_layernorm_fuse_pass_flag": True,
@@ -72,7 +72,7 @@ class PrelnResidualBiasFusePassNoBiasTest(PassTest):
        self.fetch_list = [out, elementwise_out]
        self.pass_names = "preln_residual_bias_fuse_pass"
-        self.fused_op_type = "preln_residual_bias"
+        self.fused_op_type = "fused_bias_dropout_residual_layer_norm"
        self.num_fused_ops = 1
    def test_check_program(self):