/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

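// In-place SUM all-reduce of `tensor` over the tensor-model-parallel ring
// `ring_id`. Uses the ProcessGroupNCCL path when a process group is
// registered for this ring, otherwise falls back to the legacy
// NCCLCommContext. A no-op when ring_id == -1.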
template <typename T>
static void AllReduce(framework::Tensor& tensor,  // NOLINT
                      const int ring_id,
                      const phi::GPUContext& ctx) {
  if (ring_id == -1) return;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();

  if (map->has(ring_id)) {
    paddle::distributed::ProcessGroup* pg = map->get(ring_id);
    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg);

    std::vector<phi::DenseTensor> in_tensor;
    std::vector<phi::DenseTensor> out_tensor;
    in_tensor.push_back(tensor);
    out_tensor.push_back(tensor);
    paddle::distributed::AllreduceOptions opts;
    opts.reduce_op = distributed::ReduceOp::SUM;
    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
    task->Wait();
  } else {
    auto dtype = platform::ToNCCLDataType(
        framework::TransToProtoVarType(tensor.dtype()));
    int64_t numel = tensor.numel();
    const void* sendbuff = tensor.data<T>();
    auto place = ctx.GetPlace();
    void* recvbuff = ctx.Alloc<T>(&tensor, tensor.numel() * sizeof(T));
    auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
    auto stream = ctx.stream();
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
        sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
  }
#else
  PADDLE_THROW(platform::errors::Unimplemented(
      "PaddlePaddle must be compiled with NCCL or RCCL when using the "
      "tensor model parallel op."));
#endif
}

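// Forward kernel of the fused feed-forward (FFN) block:
//   (pre-)LayerNorm -> Linear1 -> Act + Dropout1 -> Linear2
//   -> Dropout2 + residual (+ post-LayerNorm when pre_layer_norm is false).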
template <typename DeviceContext, typename T>
class FusedFeedForwardKernel : public framework::OpKernel<T> {
 public:
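  // c = a * b, with the leading dimensions of a and b folded into 2-D
  // matrix descriptors.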
  void MatMul(const phi::GPUContext& ctx,
              const framework::Tensor& a,
              const framework::Tensor& b,
              framework::Tensor* c) const {
    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
    auto a_2d = FoldInitDims(a);
    auto b_2d = FoldInitDims(b);
    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, false);
    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, false);
    T alpha = static_cast<T>(1.0);
    blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0));
  }

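  // Runs the full fused FFN forward pass and writes the intermediate
  // buffers (linear1_out, dropout outputs/masks, layer-norm statistics)
  // needed by the backward kernel.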
  void FFN(const phi::GPUContext& ctx,
           const framework::Tensor& x,
           const framework::Tensor& linear1_weight,
           const framework::Tensor* linear1_bias,
           const framework::Tensor& linear2_weight,
           const framework::Tensor* linear2_bias,
           const framework::Tensor* ln1_scale,
           const framework::Tensor* ln1_bias,
           const framework::Tensor* ln2_scale,
           const framework::Tensor* ln2_bias,
           framework::Tensor* out,
           framework::Tensor* dropout1_mask,
           framework::Tensor* dropout2_mask,
           framework::Tensor* ln1_mean,
           framework::Tensor* ln1_variance,
           framework::Tensor* ln2_mean,
           framework::Tensor* ln2_variance,
           framework::Tensor* linear1_out,
           framework::Tensor* ln1_out,
           framework::Tensor* dropout1_out,
           framework::Tensor* dropout2_out,
           const int bsz_seq,
           const int d_model,
           const int dim_feedforward,
           const std::string& act_method,
           const bool pre_layer_norm,
           const float epsilon1,
           const float epsilon2,
           const bool add_residual,
           const int ring_id,
           const DropoutParam& dropout_param1,
           const DropoutParam& dropout_param2) const {
    FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
        bsz_seq, d_model, epsilon1);
    FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
        ctx, bsz_seq, dim_feedforward, dropout_param1);
    FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
        ctx, bsz_seq, d_model, dropout_param2, epsilon2);

    using U = LayerNormParamType<T>;
    const framework::Tensor* in = &x;

    const U* ln1_scale_ptr =
        ln1_scale == nullptr ? nullptr : ln1_scale->data<U>();
    const U* ln1_bias_ptr = ln1_bias == nullptr ? nullptr : ln1_bias->data<U>();
    const U* ln2_scale_ptr =
        ln2_scale == nullptr ? nullptr : ln2_scale->data<U>();
    const U* ln2_bias_ptr = ln2_bias == nullptr ? nullptr : ln2_bias->data<U>();
    const T* linear1_bias_ptr =
        linear1_bias == nullptr ? nullptr : linear1_bias->data<T>();
    const T* linear2_bias_ptr =
        linear2_bias == nullptr ? nullptr : linear2_bias->data<T>();

    if (pre_layer_norm) {
      pre_layernorm_helper.LayerNorm(ctx,
                                     x.data<T>(),
                                     ln1_scale_ptr,
                                     ln1_bias_ptr,
                                     ln1_out->data<T>(),
                                     ln1_mean->data<U>(),
                                     ln1_variance->data<U>());
      in = ln1_out;
    }
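    // Linear1: [bsz_seq, d_model] x [d_model, dim_feedforward]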
    MatMul(ctx, *in, linear1_weight, linear1_out);
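    // Fused bias + activation + dropout on the Linear1 output.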
    fused_act_dropout_helper.DropoutActBias(ctx,
                                            linear1_out->data<T>(),
                                            linear1_bias_ptr,
                                            act_method,
                                            dropout1_out->data<T>(),
                                            dropout1_mask->data<uint8_t>());
    framework::Tensor linear2_out;
    linear2_out.Resize({bsz_seq, d_model});
    ctx.Alloc<T>(&linear2_out, linear2_out.numel() * sizeof(T));
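    // Linear2: [bsz_seq, dim_feedforward] x [dim_feedforward, d_model]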
    MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out);

    // tensor model parallel
    AllReduce<T>(linear2_out, ring_id, ctx);

    const T* residual_ptr = add_residual ? x.data<T>() : nullptr;
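    // Epilogue: bias + dropout + (optional) residual add, with the second
    // LayerNorm applied here only in the post-layer-norm configuration.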
    if (!pre_layer_norm) {
      // TODO(Xreki): support post layer_norm case when add_residual is false.
      PADDLE_ENFORCE_EQ(add_residual,
                        true,
                        platform::errors::InvalidArgument(
                            "Attribute add_residual is expected to be true "
                            "when pre_layer_norm is false."));

      fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
          ctx,
          linear2_out.data<T>(),
          residual_ptr,
          linear2_bias_ptr,
          ln2_scale_ptr,
          ln2_bias_ptr,
          dropout2_out->data<T>(),
          dropout2_mask->data<uint8_t>(),
          out->data<T>(),
          ln2_mean->data<U>(),
          ln2_variance->data<U>());
    } else {
      fused_dropout_layernorm_helper.ResidualDropoutBias(
          ctx,
          linear2_out.data<T>(),
          residual_ptr,
          linear2_bias_ptr,
          out->data<T>(),
          dropout2_mask->data<uint8_t>());
    }
  }

  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<framework::Tensor>("X");
    auto* linear1_weight = context.Input<framework::Tensor>("Linear1Weight");
    auto* linear1_bias = context.Input<framework::Tensor>("Linear1Bias");
    auto* linear2_weight = context.Input<framework::Tensor>("Linear2Weight");
    auto* linear2_bias = context.Input<framework::Tensor>("Linear2Bias");
    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
    auto& dev_ctx = context.template device_context<phi::GPUContext>();

    auto* ln1_scale =
        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Scale") : nullptr;
    auto* ln1_bias =
        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Bias") : nullptr;
    auto* ln2_scale = !pre_layer_norm
                          ? context.Input<framework::Tensor>("Ln2Scale")
                          : nullptr;
    auto* ln2_bias =
        !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Bias") : nullptr;

    auto* ln1_mean =
        pre_layer_norm ? context.Output<framework::Tensor>("Ln1Mean") : nullptr;
    auto* ln1_variance = pre_layer_norm
                             ? context.Output<framework::Tensor>("Ln1Variance")
                             : nullptr;
    auto* ln2_mean = !pre_layer_norm
                         ? context.Output<framework::Tensor>("Ln2Mean")
                         : nullptr;
    auto* ln2_variance = !pre_layer_norm
                             ? context.Output<framework::Tensor>("Ln2Variance")
                             : nullptr;
    auto* out = context.Output<framework::Tensor>("Out");
    auto* dropout1_mask = context.Output<framework::Tensor>("Dropout1Mask");
    auto* dropout2_mask = context.Output<framework::Tensor>("Dropout2Mask");
    auto* linear1_out = context.Output<framework::Tensor>("Linear1Out");
    auto* ln1_out =
        pre_layer_norm ? context.Output<framework::Tensor>("Ln1Out") : nullptr;
    auto* dropout1_out = context.Output<framework::Tensor>("Dropout1Out");
    auto* dropout2_out = context.Output<framework::Tensor>("Dropout2Out");

    const std::string act_method = context.Attr<std::string>("act_method");

    const float epsilon1 = context.Attr<float>("ln1_epsilon");
    const float epsilon2 = context.Attr<float>("ln2_epsilon");
    const int ring_id = context.Attr<int>("ring_id");
    const bool add_residual = context.Attr<bool>("add_residual");

    DropoutParam dropout_param1(context, 1);
    DropoutParam dropout_param2(context, 2);

    using U = LayerNormParamType<T>;
    dev_ctx.Alloc<T>(out, out->numel() * sizeof(T));
    dev_ctx.Alloc<uint8_t>(dropout1_mask,
                           dropout1_mask->numel() * sizeof(uint8_t));
    dev_ctx.Alloc<uint8_t>(dropout2_mask,
                           dropout2_mask->numel() * sizeof(uint8_t));
    if (pre_layer_norm) {
      dev_ctx.Alloc<U>(ln1_mean, ln1_mean->numel() * sizeof(U));
      dev_ctx.Alloc<U>(ln1_variance, ln1_variance->numel() * sizeof(U));
      dev_ctx.Alloc<T>(ln1_out, ln1_out->numel() * sizeof(T));
    } else {
      dev_ctx.Alloc<U>(ln2_mean, ln2_mean->numel() * sizeof(U));
      dev_ctx.Alloc<U>(ln2_variance, ln2_variance->numel() * sizeof(U));
    }

    dev_ctx.Alloc<T>(linear1_out, linear1_out->numel() * sizeof(T));
    dev_ctx.Alloc<T>(dropout1_out, dropout1_out->numel() * sizeof(T));
    dev_ctx.Alloc<T>(dropout2_out, dropout2_out->numel() * sizeof(T));

    auto x_dim = x->dims();
    auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
        RowMatrixFromVector(x_dim), 0, false);

    auto dim = linear1_weight->dims();
    int d_model = dim[0];
    int dim_feedforward = dim[dim.size() - 1];
    int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;

    FFN(context.cuda_device_context(),
        *x,
        *linear1_weight,
        linear1_bias,
        *linear2_weight,
        linear2_bias,
        ln1_scale,
        ln1_bias,
        ln2_scale,
        ln2_bias,
        out,
        dropout1_mask,
        dropout2_mask,
        ln1_mean,
        ln1_variance,
        ln2_mean,
        ln2_variance,
        linear1_out,
        ln1_out,
        dropout1_out,
        dropout2_out,
        bsz_seq,
        d_model,
        dim_feedforward,
        act_method,
        pre_layer_norm,
        epsilon1,
        epsilon2,
        add_residual,
        ring_id,
        dropout_param1,
        dropout_param2);
  }
};

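// Backward kernel of the fused feed-forward block: computes gradients for
// x, the two linear layers, the layer-norm parameters, and the biases.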
template <typename DeviceContext, typename T>
class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
 public:
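  // Given d_out of c = a * b, computes d_a = d_out * b^T and
  // d_b = a^T * d_out.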
  void MatMulGrad(const phi::GPUContext& ctx,
                  const framework::Tensor& d_out,
                  const framework::Tensor& a,
                  const framework::Tensor& b,
                  framework::Tensor* d_a,
                  framework::Tensor* d_b) const {
    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
    auto a_2d = FoldInitDims(a);
    auto b_2d = FoldInitDims(b);
    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, true);
    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, true);
    auto mat_dim_dout =
        phi::funcs::CreateMatrixDescriptor(d_out.dims(), 0, false);
    T alpha = static_cast<T>(1.0);
    blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0));
    blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0));
  }

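  // Backward of FFN(): replays the forward steps in reverse order, reusing
  // the buffers saved by the forward kernel.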
  void FFNGrad(const phi::GPUContext& ctx,
               const framework::Tensor& d_out,
               const framework::Tensor& x,
               const framework::Tensor& dropout1_mask,
               const framework::Tensor& dropout2_mask,
               const framework::Tensor& linear1_out,
               const framework::Tensor* ln1_out,
               const framework::Tensor& dropout1_out,
               const framework::Tensor& dropout2_out,
               const framework::Tensor& linear1_weight,
               const framework::Tensor* linear1_bias,
               const framework::Tensor& linear2_weight,
               const framework::Tensor* ln1_gamma,
               const framework::Tensor* ln1_beta,
               const framework::Tensor* ln1_mean,
               const framework::Tensor* ln1_variance,
               const framework::Tensor* ln2_gamma,
               const framework::Tensor* ln2_beta,
               const framework::Tensor* ln2_mean,
               const framework::Tensor* ln2_variance,
               framework::Tensor* d_x,
               framework::Tensor* d_linear1_weight,
               framework::Tensor* d_linear1_bias,
               framework::Tensor* d_linear2_weight,
               framework::Tensor* d_linear2_bias,
               framework::Tensor* d_ln1_gamma,
               framework::Tensor* d_ln1_beta,
               framework::Tensor* d_ln2_gamma,
               framework::Tensor* d_ln2_beta,
               const int bsz_seq,
               const int d_model,
               const int dim_feedforward,
               const DropoutParam& dropout_param1,
               const DropoutParam& dropout_param2,
               const std::string& act_method,
               const bool pre_layer_norm,
               const float epsilon1,
               const float epsilon2,
               const bool add_residual,
               const int ring_id) const {
    FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
        bsz_seq, d_model, epsilon1);
    FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
        ctx, bsz_seq, dim_feedforward, dropout_param1);
    FusedDropoutLayerNormHelper<T, uint8_t> fused_dropout_layernorm_helper(
        ctx, bsz_seq, d_model, dropout_param2, epsilon2);

    using U = LayerNormParamType<T>;
    const U* ln1_gamma_ptr =
        ln1_gamma == nullptr ? nullptr : ln1_gamma->data<U>();
    const U* ln1_beta_ptr = ln1_beta == nullptr ? nullptr : ln1_beta->data<U>();
    const U* ln2_gamma_ptr =
        ln2_gamma == nullptr ? nullptr : ln2_gamma->data<U>();
    const U* ln2_beta_ptr = ln2_beta == nullptr ? nullptr : ln2_beta->data<U>();
    const T* linear1_bias_ptr =
        linear1_bias == nullptr ? nullptr : linear1_bias->data<T>();
    T* d_linear1_bias_ptr =
        d_linear1_bias == nullptr ? nullptr : d_linear1_bias->data<T>();
    T* d_linear2_bias_ptr =
        d_linear2_bias == nullptr ? nullptr : d_linear2_bias->data<T>();
    U* d_ln1_gamma_ptr =
        d_ln1_gamma == nullptr ? nullptr : d_ln1_gamma->data<U>();
    U* d_ln1_beta_ptr = d_ln1_beta == nullptr ? nullptr : d_ln1_beta->data<U>();
    U* d_ln2_gamma_ptr =
        d_ln2_gamma == nullptr ? nullptr : d_ln2_gamma->data<U>();
    U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? nullptr : d_ln2_beta->data<U>();

    framework::Tensor d_linear2_out, d_dropout2_out, d_residual;
    d_linear2_out.Resize({bsz_seq, d_model});
    ctx.Alloc<T>(&d_linear2_out, d_linear2_out.numel() * sizeof(T));
    d_dropout2_out.Resize({bsz_seq, d_model});
    ctx.Alloc<T>(&d_dropout2_out, d_dropout2_out.numel() * sizeof(T));

    T* d_residual_ptr = nullptr;
    if (add_residual) {
      d_residual.Resize(d_x->dims());
      d_residual_ptr =
          ctx.Alloc<T>(&d_residual, d_residual.numel() * sizeof(T));
    }
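    // Backward of the epilogue (dropout2 / residual / post-LayerNorm).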
    if (pre_layer_norm) {
      fused_dropout_layernorm_helper.ResidualDropoutBiasGrad(
          ctx,
          d_out.data<T>(),
          dropout2_mask.data<uint8_t>(),
          d_linear2_out.data<T>(),
          d_residual_ptr,
          d_linear2_bias_ptr);
    } else {
      fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
          ctx,
          d_out.data<T>(),
          dropout2_out.data<T>(),
          dropout2_mask.data<uint8_t>(),
          ln2_gamma_ptr,
          ln2_mean->data<U>(),
          ln2_variance->data<U>(),
          d_dropout2_out.data<T>(),
          d_ln2_gamma_ptr,
          d_ln2_beta_ptr,
          d_linear2_out.data<T>(),
          d_linear2_bias_ptr,
          d_residual_ptr);
    }

    framework::Tensor d_dropout1_out;
    d_dropout1_out.Resize({bsz_seq, dim_feedforward});
    ctx.Alloc<T>(&d_dropout1_out, d_dropout1_out.numel() * sizeof(T));
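    // Backward of Linear2: gradients w.r.t. dropout1_out and linear2_weight.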
    MatMulGrad(ctx,
               d_linear2_out,
               dropout1_out,
               linear2_weight,
               &d_dropout1_out,
               d_linear2_weight);

    framework::Tensor d_linear1_out;
    d_linear1_out.Resize({bsz_seq, dim_feedforward});
    ctx.Alloc<T>(&d_linear1_out, d_linear1_out.numel() * sizeof(T));
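    // Backward of the fused bias + activation + dropout.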
    fused_act_dropout_helper.DropoutActBiasGrad(ctx,
                                                d_dropout1_out.data<T>(),
                                                linear1_out.data<T>(),
                                                linear1_bias_ptr,
                                                dropout1_mask.data<uint8_t>(),
                                                d_linear1_out.data<T>(),
                                                d_linear1_bias_ptr,
                                                act_method);

    if (pre_layer_norm) {
      framework::Tensor d_ln1_out;
      d_ln1_out.Resize({bsz_seq, d_model});
      ctx.Alloc<T>(&d_ln1_out, d_ln1_out.numel() * sizeof(T));
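      // Backward of Linear1 in the pre-layer-norm path: gradients w.r.t.
      // ln1_out and linear1_weight.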
      MatMulGrad(ctx,
                 d_linear1_out,
                 *ln1_out,
                 linear1_weight,
                 &d_ln1_out,
                 d_linear1_weight);
      // tensor model parallel
      AllReduce<T>(d_ln1_out, ring_id, ctx);
      pre_layernorm_helper.LayerNormGrad(ctx,
                                         d_ln1_out.data<T>(),
                                         x.data<T>(),
                                         ln1_gamma_ptr,
                                         ln1_mean->data<U>(),
                                         ln1_variance->data<U>(),
                                         d_x->data<T>(),
                                         d_ln1_gamma_ptr,
                                         d_ln1_beta_ptr);
    } else {
      MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight);
      // tensor model parallel
      AllReduce<T>(*d_x, ring_id, ctx);
    }

    if (add_residual) {
      // gradient accumulation
      std::vector<const Tensor*> ins = {&d_residual, d_x};
      std::vector<Tensor*> outs = {d_x};
      phi::funcs::ElementwiseKernel<T>(
          ctx, ins, &outs, phi::funcs::AddFunctor<T>());
    }
  }

  void Compute(const framework::ExecutionContext& context) const override {
    using U = LayerNormParamType<T>;
    auto& dev_ctx = context.template device_context<phi::GPUContext>();
    auto d_out =
        *context.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto x = *context.Input<framework::Tensor>("X");
    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
    auto dropout1_mask = *context.Input<framework::Tensor>("Dropout1Mask");
    auto dropout2_mask = *context.Input<framework::Tensor>("Dropout2Mask");
    auto linear1_out = *context.Input<framework::Tensor>("Linear1Out");
    auto* ln1_out =
        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Out") : nullptr;
    auto dropout1_out = *context.Input<framework::Tensor>("Dropout1Out");
    auto dropout2_out = *context.Input<framework::Tensor>("Dropout2Out");
    auto linear1_weight = *context.Input<framework::Tensor>("Linear1Weight");
    auto* linear1_bias = context.Input<framework::Tensor>("Linear1Bias");
    auto linear2_weight = *context.Input<framework::Tensor>("Linear2Weight");
    auto* ln1_mean =
        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Mean") : nullptr;
    auto* ln1_variance = pre_layer_norm
                             ? context.Input<framework::Tensor>("Ln1Variance")
                             : nullptr;
    auto* ln1_scale =
        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Scale") : nullptr;
    auto* ln1_bias =
        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Bias") : nullptr;
    auto* ln2_mean =
        !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Mean") : nullptr;
    auto* ln2_variance = !pre_layer_norm
                             ? context.Input<framework::Tensor>("Ln2Variance")
                             : nullptr;
    auto* ln2_scale = !pre_layer_norm
                          ? context.Input<framework::Tensor>("Ln2Scale")
                          : nullptr;
    auto* ln2_bias =
        !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Bias") : nullptr;

    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
    auto* d_ln1_scale = pre_layer_norm ? context.Output<framework::Tensor>(
                                             framework::GradVarName("Ln1Scale"))
                                       : nullptr;
    auto* d_ln1_bias = pre_layer_norm ? context.Output<framework::Tensor>(
                                            framework::GradVarName("Ln1Bias"))
                                      : nullptr;
    auto* d_ln2_scale = pre_layer_norm
                            ? nullptr
                            : context.Output<framework::Tensor>(
                                  framework::GradVarName("Ln2Scale"));
    auto* d_ln2_bias = pre_layer_norm ? nullptr
                                      : context.Output<framework::Tensor>(
                                            framework::GradVarName("Ln2Bias"));
    auto* d_linear1_weight = context.Output<framework::Tensor>(
        framework::GradVarName("Linear1Weight"));
    auto* d_linear1_bias = context.Output<framework::Tensor>(
        framework::GradVarName("Linear1Bias"));
    auto* d_linear2_weight = context.Output<framework::Tensor>(
        framework::GradVarName("Linear2Weight"));
    auto* d_linear2_bias = context.Output<framework::Tensor>(
        framework::GradVarName("Linear2Bias"));

    const float epsilon1 = context.Attr<float>("ln1_epsilon");
    const float epsilon2 = context.Attr<float>("ln2_epsilon");
    const bool add_residual = context.Attr<bool>("add_residual");
    const int ring_id = context.Attr<int>("ring_id");
    const std::string act_method = context.Attr<std::string>("act_method");
    DropoutParam dropout_param1(context, 1);
    DropoutParam dropout_param2(context, 2);

    dev_ctx.Alloc<T>(d_x, d_x->numel() * sizeof(T));
    if (d_ln1_scale) {
      dev_ctx.Alloc<U>(d_ln1_scale, d_ln1_scale->numel() * sizeof(U));
    }
    if (d_ln1_bias) {
      dev_ctx.Alloc<U>(d_ln1_bias, d_ln1_bias->numel() * sizeof(U));
    }
    if (d_ln2_scale) {
      dev_ctx.Alloc<U>(d_ln2_scale, d_ln2_scale->numel() * sizeof(U));
    }
    if (d_ln2_bias) {
      dev_ctx.Alloc<U>(d_ln2_bias, d_ln2_bias->numel() * sizeof(U));
    }
    if (d_linear1_bias) {
      dev_ctx.Alloc<T>(d_linear1_bias, d_linear1_bias->numel() * sizeof(T));
    }
    if (d_linear2_bias) {
      dev_ctx.Alloc<T>(d_linear2_bias, d_linear2_bias->numel() * sizeof(T));
    }
    dev_ctx.Alloc<T>(d_linear1_weight, d_linear1_weight->numel() * sizeof(T));
    dev_ctx.Alloc<T>(d_linear2_weight, d_linear2_weight->numel() * sizeof(T));

    auto x_dim = x.dims();
    auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(
        RowMatrixFromVector(x_dim), 0, false);

    auto linear1_weight_dim = linear1_weight.dims();
    int d_model = linear1_weight_dim[0];
    int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1];
    int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_;

    FFNGrad(context.cuda_device_context(),
            d_out,
            x,
            dropout1_mask,
            dropout2_mask,
            linear1_out,
            ln1_out,
            dropout1_out,
            dropout2_out,
            linear1_weight,
            linear1_bias,
            linear2_weight,
            ln1_scale,
            ln1_bias,
            ln1_mean,
            ln1_variance,
            ln2_scale,
            ln2_bias,
            ln2_mean,
            ln2_variance,
            d_x,
            d_linear1_weight,
            d_linear1_bias,
            d_linear2_weight,
            d_linear2_bias,
            d_ln1_scale,
            d_ln1_bias,
            d_ln2_scale,
            d_ln2_bias,
            bsz_seq,
            d_model,
            dim_feedforward,
            dropout_param1,
            dropout_param2,
            act_method,
            pre_layer_norm,
            epsilon1,
            epsilon2,
            add_residual,
            ring_id);
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    fused_feedforward,
    ops::FusedFeedForwardKernel<phi::GPUContext, float>,
    ops::FusedFeedForwardKernel<phi::GPUContext, double>,
    ops::FusedFeedForwardKernel<phi::GPUContext, paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
    fused_feedforward_grad,
    ops::FusedFeedForwardGradKernel<phi::GPUContext, float>,
    ops::FusedFeedForwardGradKernel<phi::GPUContext, double>,
    ops::FusedFeedForwardGradKernel<phi::GPUContext,
                                    paddle::platform::float16>);