From 8699f38d0841b0026c8a1af2d82cabccf32ecbb0 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou
Date: Thu, 12 Nov 2020 17:00:23 +0800
Subject: [PATCH] Add TRT support for pruned transformer models; fix the bug
 that TensorRT does not support DeletePass (#28517)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* skip_layernorm_op done

* add unittest

* slice op converter supports TRT < 6

* skip_layernorm only works in Ernie
---
 cmake/operators.cmake                         |   2 +-
 .../embedding_eltwise_layernorm_fuse_pass.cc  |   3 +
 .../ir/multihead_matmul_fuse_pass.cc          |   6 +-
 paddle/fluid/framework/ir/pass.h              |   3 +
 .../framework/ir/skip_layernorm_fuse_pass.cc  |   8 +
 .../ir/skip_layernorm_fuse_pass_tester.cc     |   2 +
 .../ir_passes/tensorrt_subgraph_pass.cc       |  12 +-
 paddle/fluid/inference/api/analysis_config.cc |  12 +-
 .../inference/tensorrt/convert/slice_op.cc    |  10 +-
 .../fluid/inference/tests/api/CMakeLists.txt  |   9 ++
 ...rt_dynamic_shape_transformer_prune_test.cc | 139 ++++++++++++++++++
 paddle/fluid/operators/fused/CMakeLists.txt   |   3 +
 .../operators/fused/skip_layernorm_op.cc      |  91 ++++++++++++
 .../operators/fused/skip_layernorm_op.cu      |  66 +++++++++
 .../fluid/tests/unittests/ir/pass_test.py     |   3 +
 .../ir/test_ir_skip_layernorm_pass.py         |   6 +
 16 files changed, 354 insertions(+), 21 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
 create mode 100644 paddle/fluid/operators/fused/skip_layernorm_op.cc
 create mode 100644 paddle/fluid/operators/fused/skip_layernorm_op.cu

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7aa2766763..715d324c35 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -127,7 +127,7 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
 "fused_bn_add_activation_op")
   if ("${TARGET}" STREQUAL "${manual_pybind_op}")
     set(pybind_flag 1)
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 51861b402d..19662a04f5 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -326,6 +326,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
 void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
   FusePassBase::Init(name_scope_, graph);
   int fusion_count = patterns::BuildFusion(graph, name_scope_);
+  if (fusion_count > 0) {
+    graph->Set(kEmbEltwiseLayernormPass, new bool(true));
+  }
   AddStatis(fusion_count);
 }

diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index d1fbc8396b..cd6d1d5703 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -696,7 +696,11 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
                           platform::errors::Fatal(
                               "During the multiheadMatmul pass, The scope should not be "
                               "null."));
null.")); - patterns::BuildFusionV2(graph, name_scope_, scope); + int fusion_count = patterns::BuildFusionV2(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 668dc74eab..a3b1b33d26 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -36,6 +36,9 @@ struct PassRegistrar; typedef std::unordered_set PassRecorder; constexpr char kPassRecorder[] = "pass_recorder"; +constexpr char kEmbEltwiseLayernormPass[] = + "embedding_eltwise_layernorm_fuse_pass_flag"; +constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag"; class Pass { public: diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index e5f348dfeb..b708f2eff1 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -134,6 +134,14 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); + // check if is in ernie or not + if (!graph->Has(kEmbEltwiseLayernormPass) || + !graph->Has(kMultiheadMatmulPass)) { + LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " + << "Ernie/Bert model. Just skip this pass."; + return; + } + std::unordered_set del_node_set; // Create an SkipLayerNorm op node diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc index eff5dcddf5..29be2c3cb0 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc @@ -36,6 +36,8 @@ TEST(SkipLayerNormFusePass, basic) { layers.layer_norm(elementwise_out, scale, bias); std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set(kEmbEltwiseLayernormPass, new bool(true)); + graph->Set(kMultiheadMatmulPass, new bool(true)); auto pass = PassRegistry::Instance().Get("skip_layernorm_fuse_pass"); int num_nodes_before = graph->Nodes().size(); VLOG(3) << DebugString(graph); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7ad8827978..08f3d609fa 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -117,20 +117,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( block_desc.Proto()->set_idx(0); LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; - bool has_fused_embedding_eltwise_layernorm = false; - bool has_multihead_matmul = false; for (auto *node : subgraph) { auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); - if (!has_fused_embedding_eltwise_layernorm - && op->Type() == "fused_embedding_eltwise_layernorm") { - has_fused_embedding_eltwise_layernorm = true; - } - if (!has_multihead_matmul && op->Type() == "multihead_matmul") { - has_multihead_matmul = true; - } } // Then, we will use the input_names_with_id and output_names_with_id to @@ -319,7 +310,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( disable_trt_plugin_fp16); trt_engine->SetUseOSS(Get("use_oss")); 
   trt_engine->SetWithErnie(
-      has_multihead_matmul && has_fused_embedding_eltwise_layernorm);
+      graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
+      graph->Has(framework::ir::kMultiheadMatmulPass));

   bool need_serialize = (use_static_engine && !load_from_memory);
   if (need_serialize) {
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 7e5552a74c..9df3c3e316 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -175,7 +175,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {

 #undef CP_MEMBER

-  Update();
+  // Update();
+  // Update() would reset all the passes; any TensorRT pass deleted from
+  // other.pass_builder() would be added back, so just copy the passes.
+  pass_builder_->ClearPasses();
+  for (const std::string &pass : other.pass_builder()->AllPasses()) {
+    pass_builder_->AppendPass(pass);
+  }
 }

 void AnalysisConfig::EnableCUDNN() {
@@ -281,9 +287,7 @@ void AnalysisConfig::SetTRTDynamicShapeInfo(
   disable_trt_plugin_fp16_ = disable_trt_plugin_fp16;
 }

-void AnalysisConfig::EnableTensorRtOSS() {
-  trt_use_oss_ = true;
-}
+void AnalysisConfig::EnableTensorRtOSS() { trt_use_oss_ = true; }

 // TODO(Superjomn) refactor this, buggy.
 void AnalysisConfig::Update() {
diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
index ee4716bb56..f516d605cc 100644
--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -78,6 +78,7 @@ class SliceOpConverter : public OpConverter {

     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
+#if IS_TRT_VERSION_GE(6000)
       if (engine_->use_oss() && engine_->with_ernie()) {
         std::vector<nvinfer1::ITensor*> plugin_inputs;
         // plugin_inputs.emplace_back(trans_layer->getOutput(0));
@@ -92,17 +93,16 @@ class SliceOpConverter : public OpConverter {
         layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(),
                                      plugin);
       } else {
-#if IS_TRT_VERSION_GE(6000)
         bool ban_fp16 = engine_->disable_trt_plugin_fp16();
         plugin::SlicePluginDynamic* plugin =
             new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
         layer = engine_->AddPluginV2(&input, 1, plugin);
+      }
 #else
-        PADDLE_THROW(platform::errors::Fatal(
-            "You are running the TRT Dynamic Shape mode, need to confirm that "
-            "your TRT version is no less than 6.0"));
+      PADDLE_THROW(platform::errors::Fatal(
+          "You are running the TRT Dynamic Shape mode, need to confirm that "
+          "your TRT version is no less than 6.0"));
 #endif
-      }
     } else {
       bool ban_fp16 = engine_->disable_trt_plugin_fp16();
       plugin::SlicePlugin* plugin =
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index bfc2984dc6..a1e0717062 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -529,6 +529,15 @@ if(WITH_GPU AND TENSORRT_FOUND)
           EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
           ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)

+  set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune")
+  if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz)
+      inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz")
+  endif()
+
+  inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc
+          EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+          ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune)
+
   set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
   if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz)
       inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
new file mode 100644
index 0000000000..fe86a42663
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle {
+namespace inference {
+
+void run(const AnalysisConfig& config, std::vector<float>* out_data) {
+  auto predictor = CreatePaddlePredictor(config);
+  auto input_names = predictor->GetInputNames();
+
+  int run_batch = 1;
+  const int run_seq_len = 128;
+
+  std::vector<int64_t> tmp_input;
+  std::vector<float> tmp_four_input;
+  tmp_input.reserve(run_batch * run_seq_len);
+  tmp_four_input.reserve(run_batch * run_seq_len);
+
+  int64_t i0[run_seq_len] = {
+      1,    3558, 4,   75,  491, 89, 340, 313, 93,   4,   255,   10, 75,  321,
+      4095, 1902, 4,   134, 49,  75, 311, 14,  44,   178, 543,   15, 12043, 2,
+      75,   201,  340, 9,   14,  44, 486, 218, 1140, 279, 12043, 2};
+  int64_t i1[run_seq_len] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  int64_t i2[run_seq_len] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+                             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                             20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+                             30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
+  float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+  // first input
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({run_batch, run_seq_len, 1});
+  input_t->copy_from_cpu(i0);
+
+  // second input
+  auto input_t2 = predictor->GetInputTensor(input_names[1]);
+  input_t2->Reshape({run_batch, run_seq_len, 1});
+  input_t2->copy_from_cpu(i1);
+
+  // third input
+  auto input_t3 = predictor->GetInputTensor(input_names[2]);
+  input_t3->Reshape({run_batch, run_seq_len, 1});
+  input_t3->copy_from_cpu(i2);
+
+  auto input_t4 = predictor->GetInputTensor(input_names[3]);
+  input_t4->Reshape({run_batch, run_seq_len, 1});
+  input_t4->copy_from_cpu(i3);
+
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  out_data->resize(out_num);
+  output_t->copy_to_cpu(out_data->data());
+}
+
+void trt_ernie(bool with_fp16, std::vector<float> result) {
+  AnalysisConfig config;
+  std::string model_dir = FLAGS_infer_model;
+  SetConfig(&config, model_dir, true);
+
+  config.SwitchUseFeedFetchOps(false);
+
+  int batch = 32;
+  int min_seq_len = 1;
+  int max_seq_len = 128;
+  int opt_seq_len = 128;
+
+  std::vector<int> min_shape = {1, min_seq_len, 1};
+  std::vector<int> max_shape = {batch, max_seq_len, 1};
+  std::vector<int> opt_shape = {batch, opt_seq_len, 1};
+  // Set the input's min, max, opt shape
+  std::map<std::string, std::vector<int>> min_input_shape = {
+      {"read_file_0.tmp_0", min_shape},
+      {"read_file_0.tmp_1", min_shape},
+      {"read_file_0.tmp_2", min_shape},
+      {"read_file_0.tmp_3", min_shape}};
+  std::map<std::string, std::vector<int>> max_input_shape = {
+      {"read_file_0.tmp_0", max_shape},
+      {"read_file_0.tmp_1", max_shape},
+      {"read_file_0.tmp_2", max_shape},
+      {"read_file_0.tmp_3", max_shape}};
+  std::map<std::string, std::vector<int>> opt_input_shape = {
+      {"read_file_0.tmp_0", opt_shape},
+      {"read_file_0.tmp_1", opt_shape},
+      {"read_file_0.tmp_2", opt_shape},
+      {"read_file_0.tmp_3", opt_shape}};
+
+  auto precision = AnalysisConfig::Precision::kFloat32;
+  if (with_fp16) {
+    precision = AnalysisConfig::Precision::kHalf;
+  }
+  config.EnableTensorRtEngine(1 << 30, 1, 12, precision, false, false);
+  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+                                opt_input_shape);
+  std::vector<float> out_data;
+  run(config, &out_data);
+
+  for (size_t i = 0; i < out_data.size(); i++) {
+    EXPECT_NEAR(result[i], out_data[i], 1e-5);
+  }
+}
+
+TEST(AnalysisPredictor, no_fp16) {
+  std::vector<float> result = {0.498667, 0.501333};
+  trt_ernie(false, result);
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 477a9162fe..97d6e696b1 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -6,6 +6,7 @@ register_operators(EXCLUDES
 	fusion_conv_inception_op
 	fused_fc_elementwise_layernorm_op
 	multihead_matmul_op
+	skip_layernorm_op
 	fused_embedding_eltwise_layernorm_op
 	fusion_group_op
 	fusion_gru_op
@@ -40,6 +41,8 @@ if (WITH_GPU)
     # multihead_matmul_op
     op_library(multihead_matmul_op)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(multihead_matmul);\n")
+    op_library(skip_layernorm_op)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(skip_layernorm);\n")
     op_library(fused_embedding_eltwise_layernorm_op)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_embedding_eltwise_layernorm);\n")
     # fusion_group
diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cc b/paddle/fluid/operators/fused/skip_layernorm_op.cc
new file mode 100644
index 0000000000..442f359c0d
--- /dev/null
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/errors.h"
+
+namespace paddle {
+namespace operators {
+
+class SkipLayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE_EQ(context->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of SkipLayerNorm should not be null."));
+    PADDLE_ENFORCE_EQ(context->HasInput("Y"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Y) of SkipLayerNorm should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasInput("Scale"), true,
+        platform::errors::InvalidArgument(
+            "Input(Scale) of SkipLayerNorm should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasInput("Bias"), true,
+        platform::errors::InvalidArgument(
+            "Input(Bias) of SkipLayerNorm should not be null."));
+    PADDLE_ENFORCE_EQ(
+        context->HasOutput("Out"), true,
+        platform::errors::InvalidArgument(
+            "Output(Out) of SkipLayerNorm should not be null."));
+
+    auto dim_input = context->GetInputDim("X");
+    context->SetOutputDim("Out", dim_input);
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class SkipLayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The X input of SkipLayerNorm op");
+    AddInput("Y", "The Y input of SkipLayerNorm op");
+    AddInput("Scale", "The scale input of SkipLayerNorm op");
+    AddInput("Bias", "The bias input of SkipLayerNorm op");
+    AddOutput("Out", "The output of SkipLayerNorm op");
+    AddAttr<float>("epsilon",
+                   "param epsilon of layer_norm op in "
+                   "skip_layernorm_fuse_pass");
+    AddAttr<int>("begin_norm_axis",
+                 "param begin_norm_axis of "
+                 "layer_norm op in skip_layernorm_fuse_pass");
+    AddComment(R"DOC(
+SkipLayerNorm Operator.
+
+This op is used by skip_layernorm_fuse_pass, which fuses the op pattern as follows.
+
+      |           |                           |            |
+ other_op1   other_op2                   other_op1    other_op2
+      |           |             fuse           \          /
+      |------elementwise_add      ->      skip_layernorm
+                  |                             |
+             layer_norm                     other_op3
+                  |                             |
+             other_op3
+                  |
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(skip_layernorm, ops::SkipLayerNormOp,
+                             ops::SkipLayerNormOpMaker);
diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu
new file mode 100644
index 0000000000..856d5e694b
--- /dev/null
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu
@@ -0,0 +1,66 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <paddle/fluid/platform/device_context.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math/bert_encoder_functor.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SkipLayerNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    using Tensor = framework::Tensor;
+    auto *X = context.Input<Tensor>("X");
+    auto *Y = context.Input<Tensor>("Y");
+    auto *scale = context.Input<Tensor>("Scale");
+    auto *bias = context.Input<Tensor>("Bias");
+
+    auto *X_d = X->data<T>();
+    auto *Y_d = Y->data<T>();
+    auto *scale_d = scale->data<T>();
+    auto *bias_d = bias->data<T>();
+    float epsilon = context.Attr<float>("epsilon");
+    int begin_norm_axis = context.Attr<int>("begin_norm_axis");
+
+    auto *out = context.Output<Tensor>("Out");
+    out->Resize(X->dims());
+    auto *output_d = out->mutable_data<T>(context.GetPlace());
+
+    size_t num = 1;
+    for (size_t i = 0; i < X->dims().size(); i++) {
+      num *= X->dims()[i];
+    }
+    int hidden = X->dims()[2];
+    auto &device_ctx = context.template device_context<DeviceContext>();
+    operators::math::SkipLayerNormFunctor<T> skip_layer_norm_func;
+
+    skip_layer_norm_func(num, hidden, X_d, Y_d, scale_d, bias_d, output_d,
+                         epsilon, device_ctx.stream());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    skip_layernorm,
+    ops::SkipLayerNormKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py
index c1c05c4335..aae1cc65c9 100644
--- a/python/paddle/fluid/tests/unittests/ir/pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py
@@ -36,6 +36,7 @@ class PassTest(unittest.TestCase):
         self.fetch_list = None
         self.pass_names = None
         self.pass_attrs = {}
+        self.graph_attrs = {}
         self.fused_op_type = None
         self.num_fused_ops = -1

@@ -85,6 +86,8 @@ class PassTest(unittest.TestCase):
     def _apply_ir_passes(self):
         graph = core.Graph(self.main_program.desc)
         graph.set_not_owned("__param_scope__", fluid.global_scope())
+        for attr_name, attr_value in self.graph_attrs.items():
+            graph.set(attr_name, attr_value)

         if not isinstance(self.pass_names, list):
             self.pass_names = [self.pass_names]
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
index 888857e5a7..0aac6650f5 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py
@@ -16,12 +16,14 @@ import unittest
 import numpy as np
 from pass_test import PassTest

+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core


 class SkipLayerNormFusePassTest(PassTest):
     def setUp(self):
+        paddle.enable_static()
         with fluid.program_guard(self.main_program, self.startup_program):
             x = fluid.data(
                 name="x", shape=[128, 768], dtype="float32", lod_level=0)
@@ -34,6 +36,10 @@ class SkipLayerNormFusePassTest(PassTest):
         self.pass_names = "skip_layernorm_fuse_pass"
"skip_layernorm_fuse_pass" self.fused_op_type = "skip_layernorm" self.num_fused_ops = 1 + self.graph_attrs = { + "embedding_eltwise_layernorm_fuse_pass_flag": True, + "multihead_matmul_fuse_pass_flag": True + } def test_check_program(self): use_gpu_set = [False] -- GitLab