Move fused feedforward xpu (#53196)

* add sig file * trans fused feedforward compute function to phi * remove fluid include * delete old register info * fix build error * trans fused feedforward grad xpu to phi

Move fused feedforward xpu (#53196)
* add sig file * trans fused feedforward compute function to phi * remove fluid include * delete old register info * fix build error * trans fused feedforward grad xpu to phi
83c2e682 · Sonder · GitHub · 680460fd · 680460fd · 83c2e682
6 changed file
--- a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc
--- a/paddle/phi/kernels/fused_feedforward_grad_kernel.h
+++ b/paddle/phi/kernels/fused_feedforward_grad_kernel.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+template <typename T, typename Context>
+void FusedFeedForwardGradKernel(
+    const Context& dev_ctx,
+    const DenseTensor& out_grad,
+    const DenseTensor& x,
+    const DenseTensor& linear1_weight,
+    const DenseTensor& linear1_bias,
+    const DenseTensor& linear2_weight,
+    const DenseTensor& dropout1_mask,
+    const DenseTensor& dropout2_mask,
+    const DenseTensor& linear1_out,
+    const DenseTensor& dropout1_out,
+    const DenseTensor& dropout2_out,
+    const paddle::optional<DenseTensor>& ln1_scale,
+    const paddle::optional<DenseTensor>& ln1_bias,
+    const paddle::optional<DenseTensor>& ln1_out,
+    const paddle::optional<DenseTensor>& ln1_mean,
+    const paddle::optional<DenseTensor>& ln1_variance,
+    const paddle::optional<DenseTensor>& ln2_scale,
+    const paddle::optional<DenseTensor>& ln2_bias,
+    const paddle::optional<DenseTensor>& ln2_mean,
+    const paddle::optional<DenseTensor>& ln2_variance,
+    const paddle::optional<DenseTensor>& linear2_bias,
+    bool pre_layer_norm,
+    float ln1_epsilon,
+    float ln2_epsilon,
+    const std::string& act_method,
+    float dropout1_prob,
+    float dropout2_prob,
+    const std::string& dropout1_implementation,
+    const std::string& dropout2_implementation,
+    bool is_test,
+    bool dropout1_fix_seed,
+    bool dropout2_fix_seed,
+    int dropout1_seed_val,
+    int dropout2_seed_val,
+    bool add_residual,
+    int ring_id,
+    DenseTensor* x_grad,
+    DenseTensor* ln1_scale_grad,
+    DenseTensor* ln1_bias_grad,
+    DenseTensor* ln2_scale_grad,
+    DenseTensor* ln2_bias_grad,
+    DenseTensor* linear1_weight_grad,
+    DenseTensor* linear1_bias_grad,
+    DenseTensor* linear2_weight_grad,
+    DenseTensor* linear2_bias_grad);
+}  // namespace phi
--- a/paddle/phi/kernels/fused_feedforward_kernel.h
+++ b/paddle/phi/kernels/fused_feedforward_kernel.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+namespace phi {
+template <typename T, typename Context>
+void FusedFeedForwardKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const paddle::optional<DenseTensor>& dropout1_seed,
+                            const paddle::optional<DenseTensor>& dropout2_seed,
+                            const DenseTensor& linear1_weight,
+                            const paddle::optional<DenseTensor>& linear1_bias,
+                            const DenseTensor& linear2_weight,
+                            const paddle::optional<DenseTensor>& linear2_bias,
+                            const paddle::optional<DenseTensor>& ln1_scale,
+                            const paddle::optional<DenseTensor>& ln1_bias,
+                            const paddle::optional<DenseTensor>& ln2_scale,
+                            const paddle::optional<DenseTensor>& ln2_bias,
+                            bool pre_layer_norm,
+                            float ln1_epsilon,
+                            float ln2_epsilon,
+                            const std::string& act_method,
+                            float dropout1_prob,
+                            float dropout2_prob,
+                            const std::string& dropout1_implementation,
+                            const std::string& dropout2_implementation,
+                            bool is_test,
+                            bool dropout1_fix_seed,
+                            bool dropout2_fix_seed,
+                            int dropout1_seed_val,
+                            int dropout2_seed_val,
+                            bool add_residual,
+                            int ring_id,
+                            DenseTensor* out,
+                            DenseTensor* dropout1_mask,
+                            DenseTensor* dropout2_mask,
+                            DenseTensor* ln1_mean,
+                            DenseTensor* ln1_variance,
+                            DenseTensor* ln2_mean,
+                            DenseTensor* ln2_variance,
+                            DenseTensor* linear1_out,
+                            DenseTensor* ln1_out,
+                            DenseTensor* dropout1_out,
+                            DenseTensor* dropout2_out);
+}  // namespace phi
--- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_xpu_kernel.cc
--- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_xpu_kernel.cc
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h"
+#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
+#include "paddle/phi/kernels/xpu/xpu_fused_common_function.h"
+namespace phi {
+namespace fusion {
+template <typename T, typename Context>
+void FFN(const phi::XPUContext& dev_ctx,
+         const phi::DenseTensor* x,
+         const phi::DenseTensor* linear1_weight,
+         const phi::DenseTensor* linear1_bias,
+         const phi::DenseTensor* linear2_weight,
+         const phi::DenseTensor* linear2_bias,
+         const phi::DenseTensor* ln_scale,
+         const phi::DenseTensor* ln_bias,
+         phi::DenseTensor* out,
+         phi::DenseTensor* dropout1_mask,
+         phi::DenseTensor* dropout2_mask,
+         phi::DenseTensor* ln_mean,
+         phi::DenseTensor* ln_variance,
+         phi::DenseTensor* linear1_out,
+         phi::DenseTensor* ln1_out,
+         phi::DenseTensor* dropout1_out,
+         phi::DenseTensor* dropout2_out,
+         const int bsz_seq,
+         const int d_model,
+         const int dim_feedforward,
+         const std::string& act_method,
+         const bool pre_layer_norm,
+         const float epsilon1,
+         const float epsilon2,
+         const phi::XPUDropoutParam& dropout_param1,
+         const phi::XPUDropoutParam& dropout_param2,
+         int ring_id) {
+  using XPUTypeT = typename XPUTypeTrait<T>::Type;
+  xpu::Context* xpu_ctx = dev_ctx.x_context();
+  xpu::ctx_guard RAII_GUARD(xpu_ctx);
+  int r = xpu::SUCCESS;
+  const XPUTypeT* x_ptr = reinterpret_cast<const XPUTypeT*>(x->data<T>());
+  const XPUTypeT* residual_ptr = x_ptr;
+  const XPUTypeT* linear1_weight_ptr =
+      reinterpret_cast<const XPUTypeT*>(linear1_weight->data<T>());
+  const XPUTypeT* linear1_bias_ptr =
+      reinterpret_cast<const XPUTypeT*>(linear1_bias->data<T>());
+  const XPUTypeT* linear2_weight_ptr =
+      reinterpret_cast<const XPUTypeT*>(linear2_weight->data<T>());
+  const XPUTypeT* linear2_bias_ptr =
+      reinterpret_cast<const XPUTypeT*>(linear2_bias->data<T>());
+  const float* ln_scale_ptr = ln_scale->data<float>();
+  const float* ln_bias_ptr = ln_bias->data<float>();
+  // out
+  XPUTypeT* out_ptr = reinterpret_cast<XPUTypeT*>(out->data<T>());
+  XPUTypeT* linear1_out_ptr =
+      reinterpret_cast<XPUTypeT*>(linear1_out->data<T>());
+  XPUTypeT* dropout1_mask_ptr =
+      reinterpret_cast<XPUTypeT*>(dropout1_mask->data<T>());
+  XPUTypeT* dropout2_mask_ptr =
+      reinterpret_cast<XPUTypeT*>(dropout2_mask->data<T>());
+  float* ln_mean_ptr = ln_mean->data<float>();
+  float* ln_variance_ptr = ln_variance->data<float>();
+  XPUTypeT* dropout1_out_ptr =
+      reinterpret_cast<XPUTypeT*>(dropout1_out->data<T>());
+  XPUTypeT* dropout2_out_ptr =
+      reinterpret_cast<XPUTypeT*>(dropout2_out->data<T>());
+  size_t l3_total_size = xpu_ctx->_l3_mgr.get_size();
+  XPUTypeT* linear2_before_tmp_ptr = NULL;  // dim_feedforward * bsz_seq
+  XPUTypeT* linear2_after_tmp_ptr = NULL;   // d_model * bsz_seq
+  if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) {
+    XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(dim_feedforward * bsz_seq);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr);
+    linear2_before_tmp_ptr = linear2_after_tmp_ptr = l3_ptr;
+  } else if ((l3_total_size < dim_feedforward * bsz_seq * sizeof(T)) &&
+             (l3_total_size >= d_model * bsz_seq * sizeof(T))) {
+    XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3<XPUTypeT>(d_model * bsz_seq);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr);
+    linear2_after_tmp_ptr = l3_ptr;
+    linear2_before_tmp_ptr =
+        RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(linear2_before_tmp_ptr);
+  } else {
+    XPUTypeT* gm_ptr = RAII_GUARD.alloc<XPUTypeT>(dim_feedforward * bsz_seq);
+    PADDLE_ENFORCE_XDNN_NOT_NULL(gm_ptr);
+    linear2_before_tmp_ptr = linear2_after_tmp_ptr = gm_ptr;
+  }
+  // layernorm
+  if (pre_layer_norm) {
+    XPUTypeT* ln1_out_ptr = reinterpret_cast<XPUTypeT*>(ln1_out->data<T>());
+    r = xpu::layer_norm(xpu_ctx,
+                        x_ptr,
+                        ln1_out_ptr,
+                        bsz_seq,
+                        d_model,
+                        epsilon1,
+                        ln_scale_ptr,
+                        ln_bias_ptr,
+                        ln_mean_ptr,
+                        ln_variance_ptr);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm ");
+    x_ptr = ln1_out_ptr;
+  }
+  // fc
+  phi::XpuFcInfo linear1_fc_info;
+  linear1_fc_info.InitFcInfo(0,
+                             bsz_seq,
+                             dim_feedforward,
+                             d_model,
+                             false,
+                             false,
+                             nullptr,
+                             nullptr,
+                             nullptr);
+  phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
+                                   x_ptr,
+                                   linear1_weight_ptr,
+                                   linear2_before_tmp_ptr,
+                                   linear1_fc_info,
+                                   1.0f);
+  // bias
+  r = xpu::broadcast_add(xpu_ctx,
+                         linear2_before_tmp_ptr,
+                         linear1_bias_ptr,
+                         linear1_out_ptr,
+                         {bsz_seq, dim_feedforward},
+                         {dim_feedforward});
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
+  // act
+  if (act_method == "gelu") {
+    r = xpu::gelu(
+        xpu_ctx, linear1_out_ptr, linear2_before_tmp_ptr, linear1_out->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu");
+  } else if (act_method == "relu") {
+    r = xpu::relu(
+        xpu_ctx, linear1_out_ptr, linear2_before_tmp_ptr, linear1_out->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu");
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Currently only supports gelu or relu activation functions!"));
+  }
+  // dropout1
+  phi::Dropout<XPUTypeT>(xpu_ctx,
+                         linear2_before_tmp_ptr,
+                         dropout1_mask_ptr,
+                         dropout1_out_ptr,
+                         dropout_param1,
+                         dropout1_out->numel());
+  // fc
+  phi::XpuFcInfo linear2_fc_info;
+  linear2_fc_info.InitFcInfo(0,
+                             bsz_seq,
+                             d_model,
+                             dim_feedforward,
+                             false,
+                             false,
+                             nullptr,
+                             nullptr,
+                             nullptr);
+  phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx,
+                                   dropout1_out_ptr,
+                                   linear2_weight_ptr,
+                                   dropout2_out_ptr,
+                                   linear2_fc_info,
+                                   1.0f);
+  // bias
+  r = xpu::broadcast_add(xpu_ctx,
+                         dropout2_out_ptr,
+                         linear2_bias_ptr,
+                         dropout2_out_ptr,
+                         {bsz_seq, d_model},
+                         {d_model});
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
+  // dropout2
+  phi::Dropout<XPUTypeT>(xpu_ctx,
+                         dropout2_out_ptr,
+                         dropout2_mask_ptr,
+                         dropout2_out_ptr,
+                         dropout_param2,
+                         dropout2_out->numel());
+  // residual_ptr + dropout_out
+  XPUTypeT* residual_add_out_ptr = out_ptr;
+  if (pre_layer_norm == false) {
+    residual_add_out_ptr = dropout2_out_ptr;
+  }
+  r = xpu::broadcast_add(xpu_ctx,
+                         residual_ptr,
+                         dropout2_out_ptr,
+                         residual_add_out_ptr,
+                         {bsz_seq, d_model},
+                         {bsz_seq, d_model});
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
+  if (pre_layer_norm == false) {
+    r = xpu::layer_norm(xpu_ctx,
+                        residual_add_out_ptr,
+                        out_ptr,
+                        bsz_seq,
+                        d_model,
+                        epsilon2,
+                        ln_scale_ptr,
+                        ln_bias_ptr,
+                        ln_mean_ptr,
+                        ln_variance_ptr);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm");
+  }
+}
+template <typename T, typename Context>
+void FusedFeedForwardKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const paddle::optional<DenseTensor>& dropout1_seed,
+                            const paddle::optional<DenseTensor>& dropout2_seed,
+                            const DenseTensor& linear1_weight,
+                            const paddle::optional<DenseTensor>& linear1_bias,
+                            const DenseTensor& linear2_weight,
+                            const paddle::optional<DenseTensor>& linear2_bias,
+                            const paddle::optional<DenseTensor>& ln1_scale,
+                            const paddle::optional<DenseTensor>& ln1_bias,
+                            const paddle::optional<DenseTensor>& ln2_scale,
+                            const paddle::optional<DenseTensor>& ln2_bias,
+                            bool pre_layer_norm,
+                            float ln1_epsilon,
+                            float ln2_epsilon,
+                            const std::string& act_method,
+                            float dropout1_prob,
+                            float dropout2_prob,
+                            const std::string& dropout1_implementation,
+                            const std::string& dropout2_implementation,
+                            bool is_test,
+                            bool dropout1_fix_seed,
+                            bool dropout2_fix_seed,
+                            int dropout1_seed_val,
+                            int dropout2_seed_val,
+                            bool add_residual,
+                            int ring_id,
+                            DenseTensor* out,
+                            DenseTensor* dropout1_mask,
+                            DenseTensor* dropout2_mask,
+                            DenseTensor* ln1_mean,
+                            DenseTensor* ln1_variance,
+                            DenseTensor* ln2_mean,
+                            DenseTensor* ln2_variance,
+                            DenseTensor* linear1_out,
+                            DenseTensor* ln1_out,
+                            DenseTensor* dropout1_out,
+                            DenseTensor* dropout2_out) {
+  auto* x_ptr = &x;
+  auto* linear1_weight_ptr = &linear1_weight;
+  auto* linear1_bias_ptr = linear1_bias.get_ptr();
+  auto* linear2_weight_ptr = &linear2_weight;
+  auto* linear2_bias_ptr = linear2_bias.get_ptr();
+  const phi::DenseTensor* ln_scale = nullptr;
+  const phi::DenseTensor* ln_bias = nullptr;
+  phi::DenseTensor* ln_mean = nullptr;
+  phi::DenseTensor* ln_variance = nullptr;
+  if (pre_layer_norm) {
+    ln_scale = ln1_scale.get_ptr();
+    ln_bias = ln1_bias.get_ptr();
+    ln_mean = ln1_mean;
+    ln_variance = ln1_variance;
+    dev_ctx.template Alloc<T>(ln1_out);
+  } else {
+    ln_scale = ln2_scale.get_ptr();
+    ln_bias = ln2_bias.get_ptr();
+    ln_mean = ln2_mean;
+    ln_variance = ln2_variance;
+  }
+  const float epsilon1 = ln1_epsilon;
+  const float epsilon2 = ln2_epsilon;
+  bool is_upscale_in_train_1 = dropout1_implementation == "upscale_in_train";
+  bool is_upscale_in_train_2 = dropout2_implementation == "upscale_in_train";
+  auto* dropout1_seed_ptr = dropout1_seed.get_ptr();
+  auto* dropout2_seed_ptr = dropout2_seed.get_ptr();
+  phi::XPUDropoutParam dropout_param1;
+  dropout_param1.initXPUDropoutParam(dropout1_prob,
+                                     is_upscale_in_train_1,
+                                     is_test,
+                                     dropout1_fix_seed,
+                                     dropout1_seed_ptr,
+                                     dropout1_seed_val);
+  phi::XPUDropoutParam dropout_param2;
+  dropout_param2.initXPUDropoutParam(dropout2_prob,
+                                     is_upscale_in_train_2,
+                                     is_test,
+                                     dropout2_fix_seed,
+                                     dropout2_seed_ptr,
+                                     dropout2_seed_val);
+  dev_ctx.template Alloc<float>(ln_mean);
+  dev_ctx.template Alloc<float>(ln_variance);
+  dev_ctx.template Alloc<T>(out);
+  dev_ctx.template Alloc<T>(dropout1_mask);
+  dev_ctx.template Alloc<T>(dropout2_mask);
+  dev_ctx.template Alloc<T>(dropout1_out);
+  dev_ctx.template Alloc<T>(dropout2_out);
+  dev_ctx.template Alloc<T>(linear1_out);
+  auto x_dim = x_ptr->dims();
+  auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
+      phi::RowMatrixFromVector(x_dim), 0, false);
+  auto dim = linear1_weight_ptr->dims();
+  int d_model = dim[0];
+  int dim_feedforward = dim[dim.size() - 1];
+  int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;
+  phi::fusion::FFN<T, Context>(dev_ctx,
+                               x_ptr,
+                               linear1_weight_ptr,
+                               linear1_bias_ptr,
+                               linear2_weight_ptr,
+                               linear2_bias_ptr,
+                               ln_scale,
+                               ln_bias,
+                               out,
+                               dropout1_mask,
+                               dropout2_mask,
+                               ln_mean,
+                               ln_variance,
+                               linear1_out,
+                               ln1_out,
+                               dropout1_out,
+                               dropout2_out,
+                               bsz_seq,
+                               d_model,
+                               dim_feedforward,
+                               act_method,
+                               pre_layer_norm,
+                               epsilon1,
+                               epsilon2,
+                               dropout_param1,
+                               dropout_param2,
+                               ring_id);
+}
+}  // namespace fusion
+}  // namespace phi
+PD_REGISTER_KERNEL(fused_feedward,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FusedFeedForwardKernel,
+                   float,
+                   phi::dtype::float16) {
+  kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32);
+}
--- a/paddle/phi/ops/compat/fused_feedforward_sig.cc
+++ b/paddle/phi/ops/compat/fused_feedforward_sig.cc
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/core/compat/op_utils.h"
+namespace phi {
+KernelSignature FeedForwardFuseOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("fused_feedforward",
+                         {"X",
+                          "Dropout1Seed",
+                          "Dropout2Seed",
+                          "Linear1Weight",
+                          "Linear1Bias",
+                          "Linear2Weight",
+                          "Linear2Bias",
+                          "Ln1Scale",
+                          "Ln1Bias",
+                          "Ln2Scale",
+                          "Ln2Bias"},
+                         {"pre_layer_norm",
+                          "ln1_epsilon",
+                          "ln2_epsilon",
+                          "act_method",
+                          "dropout1_rate",
+                          "dropout2_rate",
+                          "dropout1_implementation",
+                          "dropout2_implementation",
+                          "is_test",
+                          "dropout1_fix_seed",
+                          "dropout2_fix_seed",
+                          "dropout1_seed",
+                          "dropout2_seed",
+                          "add_residual",
+                          "ring_id"},
+                         {"Out",
+                          "Dropout1Mask",
+                          "Dropout2Mask",
+                          "Ln1Mean",
+                          "Ln1Variance",
+                          "Ln2Mean",
+                          "Ln2Variance",
+                          "Linear1Out",
+                          "Ln1Out",
+                          "Dropout1Out",
+                          "Dropout2Out"});
+}
+KernelSignature FeedForwardGradFuseOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("fused_feedforward_grad",
+                         {"Out@GRAD",      "X",
+                          "Linear1Weight", "Linear1Bias",
+                          "Linear2Weight", "Dropout1Mask",
+                          "Dropout2Mask",  "Linear1Out",
+                          "Dropout1Out",   "Dropout2Out",
+                          "Ln1Scale",      "Ln1Bias",
+                          "Ln1Out",        "Ln1Mean",
+                          "Ln1Variance",   "Ln2Scale",
+                          "Ln2Bias",       "Ln2Mean",
+                          "Ln2Variance",   "Linear2Bias"},
+                         {"pre_layer_norm",
+                          "ln1_epsilon",
+                          "ln2_epsilon",
+                          "act_method",
+                          "dropout1_rate",
+                          "dropout2_rate",
+                          "dropout1_implementation",
+                          "dropout2_implementation",
+                          "is_test",
+                          "dropout1_fix_seed",
+                          "dropout2_fix_seed",
+                          "dropout1_seed",
+                          "dropout2_seed",
+                          "add_residual",
+                          "ring_id"},
+                         {"X@GRAD",
+                          "Ln1Scale@GRAD",
+                          "Ln1Bias@GRAD",
+                          "Ln2Scale@GRAD",
+                          "Ln2Bias@GRAD",
+                          "Linear1Weight@GRAD",
+                          "Linear1Bias@GRAD",
+                          "Linear2Weight@GRAD",
+                          "Linear2Bias@GRAD"});
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(fused_feedforward,
+                           phi::FeedForwardFuseOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(fused_feedforward_grad,
+                           phi::FeedForwardGradFuseOpArgumentMapping);