裁剪transformer模型trt支持；修复tensorRT不支持DeletePass的bug (#28517)

* skip_layernorm_op done * add unittest * slice op convertor support trt < 6 * skip_layernorm only work in ernie

裁剪transformer模型trt支持；修复tensorRT不支持DeletePass的bug (#28517)
* skip_layernorm_op done * add unittest * slice op convertor support trt < 6 * skip_layernorm only work in ernie
8699f38d · Shang Zhizhou · GitHub · 08d24131 · 8699f38d · 8699f38d
16 changed file
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -127,7 +127,7 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
 "fused_bn_add_activation_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)

--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -326,6 +326,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
 void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
  FusePassBase::Init(name_scope_, graph);
  int fusion_count = patterns::BuildFusion(graph, name_scope_);
+  if (fusion_count > 0) {
+    graph->Set(kEmbEltwiseLayernormPass, new bool(true));
+  }
  AddStatis(fusion_count);
 }


--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -696,7 +696,11 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
      platform::errors::Fatal(
          "During the multiheadMatmul pass, The scope should not be null."));

-  patterns::BuildFusionV2(graph, name_scope_, scope);
+  int fusion_count = patterns::BuildFusionV2(graph, name_scope_, scope);
+  if (fusion_count > 0) {
+    graph->Set(kMultiheadMatmulPass, new bool(true));
+  }
+  AddStatis(fusion_count);
 }

 }  // namespace ir

--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -36,6 +36,9 @@ struct PassRegistrar;

 typedef std::unordered_set<std::string> PassRecorder;
 constexpr char kPassRecorder[] = "pass_recorder";
+constexpr char kEmbEltwiseLayernormPass[] =
+    "embedding_eltwise_layernorm_fuse_pass_flag";
+constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag";

 class Pass {
 public:

--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -134,6 +134,14 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance,
                              fused_pattern);

+    // check if is in ernie or not
+    if (!graph->Has(kEmbEltwiseLayernormPass) ||
+        !graph->Has(kMultiheadMatmulPass)) {
+      LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in "
+                << "Ernie/Bert model. Just skip this pass.";
+      return;
+    }
+
    std::unordered_set<const Node *> del_node_set;

    // Create an SkipLayerNorm op node

--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
@@ -36,6 +36,8 @@ TEST(SkipLayerNormFusePass, basic) {
  layers.layer_norm(elementwise_out, scale, bias);

  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
+  graph->Set(kEmbEltwiseLayernormPass, new bool(true));
+  graph->Set(kMultiheadMatmulPass, new bool(true));
  auto pass = PassRegistry::Instance().Get("skip_layernorm_fuse_pass");
  int num_nodes_before = graph->Nodes().size();
  VLOG(3) << DebugString(graph);

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -117,20 +117,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
  block_desc.Proto()->set_idx(0);
  LOG(INFO) << "---  detect a sub-graph with " << subgraph.size() << " nodes";

-  bool has_fused_embedding_eltwise_layernorm = false;
-  bool has_multihead_matmul = false;
  for (auto *node : subgraph) {
    auto *new_block_op = new_block->AppendOp();
    auto *op = block_desc.AppendOp();
    *new_block_op->Proto() = *node->Op()->Proto();
    *op->Proto() = *node->Op()->Proto();
-    if (!has_fused_embedding_eltwise_layernorm 
-        && op->Type() == "fused_embedding_eltwise_layernorm") {
-      has_fused_embedding_eltwise_layernorm = true;
-    }
-    if (!has_multihead_matmul && op->Type() == "multihead_matmul") {
-      has_multihead_matmul = true;
-    }
  }

  // Then, we will use the input_names_with_id and output_names_with_id to
@@ -319,7 +310,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
                  disable_trt_plugin_fp16);
  trt_engine->SetUseOSS(Get<bool>("use_oss"));
  trt_engine->SetWithErnie(
-      has_multihead_matmul && has_fused_embedding_eltwise_layernorm);
+      graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
+      graph->Has(framework::ir::kMultiheadMatmulPass));

  bool need_serialize = (use_static_engine && !load_from_memory);
  if (need_serialize) {

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -175,7 +175,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {

 #undef CP_MEMBER

-  Update();
+  // Update();
+  // Update() will reset all the passes, when some tensorRT pass is deleted in
+  // other.pass_builder(), it will set again, so just copy the passes.
+  pass_builder_->ClearPasses();
+  for (const std::string &pass : other.pass_builder()->AllPasses()) {
+    pass_builder_->AppendPass(pass);
+  }
 }

 void AnalysisConfig::EnableCUDNN() {
@@ -281,9 +287,7 @@ void AnalysisConfig::SetTRTDynamicShapeInfo(
  disable_trt_plugin_fp16_ = disable_trt_plugin_fp16;
 }

-void AnalysisConfig::EnableTensorRtOSS() {
-    trt_use_oss_ = true;
-}
+void AnalysisConfig::EnableTensorRtOSS() { trt_use_oss_ = true; }

 // TODO(Superjomn) refactor this, buggy.
 void AnalysisConfig::Update() {

--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -78,6 +78,7 @@ class SliceOpConverter : public OpConverter {

    nvinfer1::ILayer* layer = nullptr;
    if (engine_->with_dynamic_shape()) {
+#if IS_TRT_VERSION_GE(6000)
      if (engine_->use_oss() && engine_->with_ernie()) {
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        // plugin_inputs.emplace_back(trans_layer->getOutput(0));
@@ -92,17 +93,16 @@ class SliceOpConverter : public OpConverter {
        layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(),
                                     plugin);
      } else {
-#if IS_TRT_VERSION_GE(6000)
        bool ban_fp16 = engine_->disable_trt_plugin_fp16();
        plugin::SlicePluginDynamic* plugin =
            new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
        layer = engine_->AddPluginV2(&input, 1, plugin);
+      }
 #else
      PADDLE_THROW(platform::errors::Fatal(
          "You are running the TRT Dynamic Shape mode, need to confirm that "
          "your TRT version is no less than 6.0"));
 #endif
-      }
    } else {
      bool ban_fp16 = engine_->disable_trt_plugin_fp16();
      plugin::SlicePlugin* plugin =

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -529,6 +529,15 @@ if(WITH_GPU AND TENSORRT_FOUND)
            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)

+    set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune")
+    if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz)
+        inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz")
+    endif()
+
+    inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
+            ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune)
+
    set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
    if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz)
        inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")

--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle {
+namespace inference {
+
+void run(const AnalysisConfig& config, std::vector<float>* out_data) {
+  auto predictor = CreatePaddlePredictor(config);
+  auto input_names = predictor->GetInputNames();
+
+  int run_batch = 1;
+  const int run_seq_len = 128;
+
+  std::vector<int64_t> tmp_input;
+  std::vector<float> tmp_four_input;
+  tmp_input.reserve(run_batch * run_seq_len);
+  tmp_four_input.reserve(run_batch * run_seq_len);
+
+  int64_t i0[run_seq_len] = {
+      1,    3558, 4,   75,  491, 89, 340, 313, 93,   4,   255,   10, 75,    321,
+      4095, 1902, 4,   134, 49,  75, 311, 14,  44,   178, 543,   15, 12043, 2,
+      75,   201,  340, 9,   14,  44, 486, 218, 1140, 279, 12043, 2};
+  int64_t i1[run_seq_len] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  int64_t i2[run_seq_len] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+                             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                             20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+                             30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
+  float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+  // first input
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({run_batch, run_seq_len, 1});
+  input_t->copy_from_cpu(i0);
+
+  // second input
+  auto input_t2 = predictor->GetInputTensor(input_names[1]);
+  input_t2->Reshape({run_batch, run_seq_len, 1});
+  input_t2->copy_from_cpu(i1);
+
+  // third input.
+  auto input_t3 = predictor->GetInputTensor(input_names[2]);
+  input_t3->Reshape({run_batch, run_seq_len, 1});
+  input_t3->copy_from_cpu(i2);
+
+  auto input_t4 = predictor->GetInputTensor(input_names[3]);
+  input_t4->Reshape({run_batch, run_seq_len, 1});
+  input_t4->copy_from_cpu(i3);
+
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  out_data->resize(out_num);
+  output_t->copy_to_cpu(out_data->data());
+}
+
+void trt_ernie(bool with_fp16, std::vector<float> result) {
+  AnalysisConfig config;
+  std::string model_dir = FLAGS_infer_model;
+  SetConfig(&config, model_dir, true);
+
+  config.SwitchUseFeedFetchOps(false);
+
+  int batch = 32;
+  int min_seq_len = 1;
+  int max_seq_len = 128;
+  int opt_seq_len = 128;
+
+  std::vector<int> min_shape = {1, min_seq_len, 1};
+  std::vector<int> max_shape = {batch, max_seq_len, 1};
+  std::vector<int> opt_shape = {batch, opt_seq_len, 1};
+  // Set the input's min, max, opt shape
+  std::map<std::string, std::vector<int>> min_input_shape = {
+      {"read_file_0.tmp_0", min_shape},
+      {"read_file_0.tmp_1", min_shape},
+      {"read_file_0.tmp_2", min_shape},
+      {"read_file_0.tmp_3", min_shape}};
+  std::map<std::string, std::vector<int>> max_input_shape = {
+      {"read_file_0.tmp_0", max_shape},
+      {"read_file_0.tmp_1", max_shape},
+      {"read_file_0.tmp_2", max_shape},
+      {"read_file_0.tmp_3", max_shape}};
+  std::map<std::string, std::vector<int>> opt_input_shape = {
+      {"read_file_0.tmp_0", opt_shape},
+      {"read_file_0.tmp_1", opt_shape},
+      {"read_file_0.tmp_2", opt_shape},
+      {"read_file_0.tmp_3", opt_shape}};
+
+  auto precision = AnalysisConfig::Precision::kFloat32;
+  if (with_fp16) {
+    precision = AnalysisConfig::Precision::kHalf;
+  }
+  config.EnableTensorRtEngine(1 << 30, 1, 12, precision, false, false);
+  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+                                opt_input_shape);
+  std::vector<float> out_data;
+  run(config, &out_data);
+
+  for (size_t i = 0; i < out_data.size(); i++) {
+    EXPECT_NEAR(result[i], out_data[i], 1e-5);
+  }
+}
+
+TEST(AnalysisPredictor, no_fp16) {
+  std::vector<float> result = {0.498667, 0.501333};
+  trt_ernie(false, result);
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -6,6 +6,7 @@ register_operators(EXCLUDES
    fusion_conv_inception_op
    fused_fc_elementwise_layernorm_op
    multihead_matmul_op
+    skip_layernorm_op
    fused_embedding_eltwise_layernorm_op
    fusion_group_op
    fusion_gru_op
@@ -40,6 +41,8 @@ if (WITH_GPU)
    # multihead_matmul_op
    op_library(multihead_matmul_op)
    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(multihead_matmul);\n")
+    op_library(skip_layernorm_op)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(skip_layernorm);\n")
    op_library(fused_embedding_eltwise_layernorm_op)
    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_embedding_eltwise_layernorm);\n")
    # fusion_group

--- a/paddle/fluid/operators/fused/skip_layernorm_op.cc
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/errors.h"
+
+namespace paddle {
+namespace operators {
+
+class SkipLayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE_EQ(context->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of MultiHeadMatMul should not be null."));
+    PADDLE_ENFORCE_EQ(context->HasInput("Y"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Y) of MultiHeadMatMul should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasInput("Scale"), true,
+        platform::errors::InvalidArgument(
+            "Input(Scale) of MultiHeadMatMul should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasInput("Bias"), true,
+        platform::errors::InvalidArgument(
+            "Input(Bias) of MultiHeadMatMul should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasOutput("Out"), true,
+        platform::errors::InvalidArgument(
+            "Output(Out) of MultiHeadMatMul should not be null."));
+
+    auto dim_input = context->GetInputDim("X");
+    context->SetOutputDim("Out", dim_input);
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class SkipLayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The X input of SkipLayerNorm op");
+    AddInput("Y", "The Y input of SkipLayerNorm op");
+    AddInput("Scale", "The scale input of SkipLayerNorm op");
+    AddInput("Bias", "The bias input of SkipLayerNorm op");
+    AddOutput("Out", "The output of SkipLayerNorm op");
+    AddAttr<float>("epsilon",
+                   "param epsilon of layer_norm op in "
+                   "skip_layernorm_fuse_pass");
+    AddAttr<int>("begin_norm_axis",
+                 "param begin_norm_axis of "
+                 "layer_norm op in skip_layernorm_fuse_pass");
+    AddComment(R"DOC(
+SkipLayerNorm Operator.
+
+This op is used for skip_layernorm_fuse_pass, which fuse op pattern as followed.
+
+     |           |                            |            |
+ other_op1   other_op2                    other_op1    other_op2
+     |           |              fuse           \          /
+     |------elementwise_add      ->           skip_layernorm
+                 |                                   |
+             layer_norm                          other_op3
+                 |                                   |
+             other_op3
+                 |
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(skip_layernorm, ops::SkipLayerNormOp,
+                             ops::SkipLayerNormOpMaker);
--- a/paddle/fluid/operators/fused/skip_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_runtime.h>
+#include <paddle/fluid/platform/device_context.h>
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math/bert_encoder_functor.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SkipLayerNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    using Tensor = framework::Tensor;
+    auto *X = context.Input<framework::Tensor>("X");
+    auto *Y = context.Input<framework::Tensor>("Y");
+    auto *scale = context.Input<framework::Tensor>("Scale");
+    auto *bias = context.Input<framework::Tensor>("Bias");
+
+    auto *X_d = X->data<T>();
+    auto *Y_d = Y->data<T>();
+    auto *scale_d = scale->data<T>();
+    auto *bias_d = bias->data<T>();
+    float epsilon = context.Attr<float>("epsilon");
+    int begin_norm_axis = context.Attr<int>("begin_norm_axis");
+
+    auto *out = context.Output<framework::Tensor>("Out");
+    out->Resize(X->dims());
+    auto *output_d = out->mutable_data<T>(context.GetPlace());
+
+    size_t num = 1;
+    for (size_t i = 0; i < X->dims().size(); i++) {
+      num *= X->dims()[i];
+    }
+    int hidden = X->dims()[2];
+    auto &device_ctx = context.template device_context<DeviceContext>();
+    operators::math::SkipLayerNormFunctor<T> skip_layer_norm_func;
+
+    skip_layer_norm_func(num, hidden, X_d, Y_d, scale_d, bias_d, output_d,
+                         epsilon, device_ctx.stream());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    skip_layernorm,
+    ops::SkipLayerNormKernel<paddle::platform::CUDADeviceContext, float>);
--- a/python/paddle/fluid/tests/unittests/ir/pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py
@@ -36,6 +36,7 @@ class PassTest(unittest.TestCase):
        self.fetch_list = None
        self.pass_names = None
        self.pass_attrs = {}
+        self.graph_attrs = {}
        self.fused_op_type = None
        self.num_fused_ops = -1

@@ -85,6 +86,8 @@ class PassTest(unittest.TestCase):
    def _apply_ir_passes(self):
        graph = core.Graph(self.main_program.desc)
        graph.set_not_owned("__param_scope__", fluid.global_scope())
+        for attr_name, attr_value in self.graph_attrs.items():
+            graph.set(attr_name, attr_value)

        if not isinstance(self.pass_names, list):
            self.pass_names = [self.pass_names]

--- a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
@@ -16,12 +16,14 @@ import unittest

 import numpy as np
 from pass_test import PassTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core


 class SkipLayerNormFusePassTest(PassTest):
    def setUp(self):
+        paddle.enable_static()
        with fluid.program_guard(self.main_program, self.startup_program):
            x = fluid.data(
                name="x", shape=[128, 768], dtype="float32", lod_level=0)
@@ -34,6 +36,10 @@ class SkipLayerNormFusePassTest(PassTest):
        self.pass_names = "skip_layernorm_fuse_pass"
        self.fused_op_type = "skip_layernorm"
        self.num_fused_ops = 1
+        self.graph_attrs = {
+            "embedding_eltwise_layernorm_fuse_pass_flag": True,
+            "multihead_matmul_fuse_pass_flag": True
+        }

    def test_check_program(self):
        use_gpu_set = [False]