/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h"
#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
#include "paddle/phi/kernels/funcs/functors.h"

namespace paddle {
namespace operators {

/**
 * Supports a usage scenario with two dropouts.
 * This wrapper can be used in the FFN op.
 * The DropoutParam will be used in fused_dropout_act_bias,
 * fused_residual_dropout_bias (pre_layer_norm=true) or
 * fused_layernorm_residual_dropout_bias (pre_layer_norm=false).
 */
struct DropoutParam {
  uint64_t seed;
  float dropout_prob;
  bool is_upscale_in_train;
  bool is_test;
  bool fix_seed;
  int increment;
  const framework::Tensor* tensor_seed;
  int seed_val;

  DropoutParam() {
    fix_seed = false;
    seed = 0;
    is_test = false;
    is_upscale_in_train = false;
    dropout_prob = 0.5;
    tensor_seed = nullptr;
    seed_val = 0;
  }

  DropoutParam(bool fix_seed_,
               uint64_t seed_,
               bool is_test_,
               bool is_upscale_in_train_,
               float dropout_prob_,
               const framework::Tensor* tensor_seed_,
               int seed_val_) {
    fix_seed = fix_seed_;
    seed = seed_;
    is_test = is_test_;
    is_upscale_in_train = is_upscale_in_train_;
    dropout_prob = dropout_prob_;
    tensor_seed = tensor_seed_;
    seed_val = seed_val_;
  }

  /**
   * dropout_index: can be 0, 1, or 2. 0 means there is only one dropout;
   * 1 and 2 identify the first and second dropout when two are used.
   * For a non-zero index, the attribute names read below are prefixed with
   * "dropout" + dropout_index + "_", e.g. dropout1_rate, dropout1_seed.
   */
  DropoutParam(const framework::ExecutionContext& context,
               const int dropout_index) {
    std::string pre_fix = "dropout";
    std::string str_index = std::to_string(dropout_index);
    if (dropout_index > 0) {
      pre_fix = pre_fix + str_index + "_";
    } else {
      pre_fix = pre_fix + "_";
    }
    dropout_prob = context.Attr<float>(pre_fix + "rate");
    auto& dropout_implementation =
        context.Attr<std::string>(pre_fix + "implementation");
    is_upscale_in_train = (dropout_implementation == "upscale_in_train");
    is_test = context.Attr<bool>("is_test");
    fix_seed = context.Attr<bool>(pre_fix + "fix_seed");

    std::string str_seed = "Dropout";
    if (dropout_index > 0) {
      str_seed = str_seed + str_index + "Seed";
    } else {
      str_seed = str_seed + "Seed";
    }
    tensor_seed =
        context.HasInput(str_seed) ? context.Input<Tensor>(str_seed) : nullptr;
    seed_val = context.Attr<int>(pre_fix + "seed");
  }

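  // Resolve the dropout seed (from tensor_seed, the fixed seed, or the global
  // generator) and compute the generator offset increment for this launch.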
  int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) {
    uint64_t tmp_increment;
    GetSeedDataAndIncrement(
        ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment);
    increment = static_cast<int>(tmp_increment);
    return increment;
  }
};
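
// A minimal construction sketch; the values below are placeholders, not
// defaults mandated by any op:
//
//   DropoutParam param(/*fix_seed_=*/false, /*seed_=*/0, /*is_test_=*/false,
//                      /*is_upscale_in_train_=*/true, /*dropout_prob_=*/0.1f,
//                      /*tensor_seed_=*/nullptr, /*seed_val_=*/0);
//
// Inside an op kernel, DropoutParam(ctx, dropout_index) reads the same fields
// from the op attributes instead.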

template <typename T, typename MaskType>
class FusedDropoutHelper {
 private:
  int GetIncrement(const phi::GPUContext& ctx) {
    const int VecSize = MAX_CACHE_BYTES / sizeof(T);
    const int real_vec_size = cols_ % VecSize == 0 ? VecSize : 1;
    auto config = Get1DBlocksAnd2DGrids(ctx,
                                        static_cast<uint64_t>(rows_),
                                        static_cast<uint64_t>(cols_),
                                        real_vec_size);
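    // Number of random values each thread consumes per launch, rounded up to
    // a multiple of the vector size; this becomes the generator offset step.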
    int increment = ((cols_ - 1) / (config.thread_per_block.x *
                                    config.block_per_grid.x * real_vec_size) +
                     1) *
                    real_vec_size;
    increment = dropout_param_.UpdateSeedAndIncrement(ctx, increment);
    return increment;
  }

 public:
  FusedDropoutHelper() {}
  FusedDropoutHelper(const phi::GPUContext& ctx,
                     const int rows,
                     const int cols,
                     const DropoutParam& dropout_param) {
    rows_ = rows;
    cols_ = cols;
    dropout_param_ = dropout_param;
  }

  // out = residual + dropout( src + bias )
  void ResidualDropoutBias(const phi::GPUContext& ctx,
                           const T* src,
                           const T* residual,
                           const T* bias,
                           T* out,
                           MaskType* mask) {
    auto increment = GetIncrement(ctx);
    LaunchResidualDropoutBias<T, MaskType>(rows_,
                                           cols_,
                                           increment,
                                           dropout_param_.seed,
                                           dropout_param_.dropout_prob,
                                           dropout_param_.is_test,
                                           dropout_param_.is_upscale_in_train,
                                           src,
                                           residual,
                                           bias,
                                           mask,
                                           out,
                                           ctx);
  }

  void ResidualDropoutBiasGrad(const phi::GPUContext& ctx,
                               const T* d_out,
                               const MaskType* mask,
                               T* d_src,
                               T* d_residual,
                               T* d_bias) {
    LaunchResidualDropoutBiasGrad<T, uint8_t>(
        d_out,
        mask,
        dropout_param_.dropout_prob,
        dropout_param_.is_upscale_in_train,
        rows_,
        cols_,
        d_src,
        d_bias,
        ctx);
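    // The residual input enters the sum unchanged, so its gradient is d_out.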
    if (d_residual) {
      memory::Copy(ctx.GetPlace(),
                   d_residual,
                   ctx.GetPlace(),
                   d_out,
                   rows_ * cols_ * sizeof(T),
                   ctx.stream());
    }
  }

  // out = dropout(activation(src + bias))
  void DropoutActBias(const phi::GPUContext& ctx,
                      const T* src,
                      const T* bias,
                      const std::string& act_method,
                      T* out,
                      MaskType* mask) {
    auto increment = GetIncrement(ctx);
    if (act_method == "gelu") {
      GeluFunctor<T> gelu;
      LaunchDropoutActBias<T, MaskType, GeluFunctor<T>>(
          gelu,
          dropout_param_.seed,
          rows_,
          cols_,
          dropout_param_.increment,
          dropout_param_.dropout_prob,
          dropout_param_.is_upscale_in_train,
          dropout_param_.is_test,
          src,
          bias,
          out,
          mask,
          ctx);
    } else if (act_method == "relu") {
      phi::funcs::ReluFunctor<T> relu;
      LaunchDropoutActBias<T, MaskType, phi::funcs::ReluFunctor<T>>(
          relu,
          dropout_param_.seed,
          rows_,
          cols_,
          increment,
          dropout_param_.dropout_prob,
          dropout_param_.is_upscale_in_train,
          dropout_param_.is_test,
          src,
          bias,
          out,
          mask,
          ctx);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Currently only supports gelu or relu activation functions!"));
    }
  }

  void DropoutActBiasGrad(const phi::GPUContext& ctx,
                          const T* dout,
                          const T* src,
                          const T* bias,
                          const MaskType* mask,
                          T* d_src,
                          T* d_bias,
                          const std::string& act_method) {
    if (act_method == "gelu") {
      GeluGradFunctor<T> gelu_grad;
      LaunchDropoutActBiasGrad<T, MaskType, GeluGradFunctor<T>>(
          gelu_grad,
          dout,
          mask,
          src,
          bias,
          dropout_param_.dropout_prob,
          dropout_param_.is_upscale_in_train,
          rows_,
          cols_,
          d_src,
          d_bias,
          ctx);
    } else if (act_method == "relu") {
      phi::funcs::ReluGradFunctor<T> relu_grad;
      LaunchDropoutActBiasGrad<T, MaskType, phi::funcs::ReluGradFunctor<T>>(
          relu_grad,
          dout,
          mask,
          src,
          bias,
          dropout_param_.dropout_prob,
          dropout_param_.is_upscale_in_train,
          rows_,
          cols_,
          d_src,
          d_bias,
          ctx);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Currently only supports gelu or relu activation functions!"));
    }
  }

 protected:
  int rows_;
  int cols_;
  DropoutParam dropout_param_;
};
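
// A minimal usage sketch (placeholder names such as exe_ctx, dev_ctx, rows,
// cols and the raw pointers are assumptions, not part of this header):
//
//   DropoutParam param(exe_ctx, /*dropout_index=*/1);  // from op attributes
//   FusedDropoutHelper<float, uint8_t> helper(dev_ctx, rows, cols, param);
//   helper.ResidualDropoutBias(dev_ctx, src, residual, bias, out, mask);
//   // backward:
//   helper.ResidualDropoutBiasGrad(dev_ctx, d_out, mask, d_src, d_residual,
//                                  d_bias);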

template <typename T, typename MaskType>
class FusedDropoutLayerNormHelper : public FusedDropoutHelper<T, MaskType> {
 public:
  FusedDropoutLayerNormHelper() {}
  FusedDropoutLayerNormHelper(const int rows,
                              const int cols,
                              const float epsilon) {
    using U = LayerNormParamType<T>;
    this->rows_ = rows;
    this->cols_ = cols;
    epsilon_ = epsilon;
  }

  FusedDropoutLayerNormHelper(const phi::GPUContext& ctx,
                              const int rows,
                              const int cols,
                              const DropoutParam& dropout_param,
                              const float epsilon)
      : FusedDropoutHelper<T, MaskType>(ctx, rows, cols, dropout_param) {
    using U = LayerNormParamType<T>;
    epsilon_ = epsilon;
  }

  // call layer_norm
  void LayerNorm(const phi::GPUContext& ctx,
                 const T* src,
                 const LayerNormParamType<T>* gamma,
                 const LayerNormParamType<T>* beta,
                 T* out,
                 LayerNormParamType<T>* mean,
                 LayerNormParamType<T>* variance) {
    using U = LayerNormParamType<T>;
    switch (GetDesiredBlockDim(this->cols_)) {
      FIXED_BLOCK_DIM_CASE(
          LayerNormForward<T, U, kBlockDim>
          <<<this->rows_, kBlockDim, 0, ctx.stream()>>>(
              src, gamma, beta, out, mean, variance, epsilon_, this->cols_));
    }
  }

  void LayerNormGrad(const phi::GPUContext& ctx,
                     const T* dout,
                     const T* src,
                     const LayerNormParamType<T>* gamma,
                     const LayerNormParamType<T>* mean,
                     const LayerNormParamType<T>* variance,
                     T* d_src,
                     LayerNormParamType<T>* d_scale,
                     LayerNormParamType<T>* d_bias) {
    using U = LayerNormParamType<T>;
    LayerNormBackward<T, U>(src,
                            dout,
                            gamma,
                            mean,
                            variance,
                            d_src,
                            d_scale,
                            d_bias,
                            epsilon_,
                            this->rows_,
                            this->cols_,
                            ctx);
  }

  // out = layernorm(residual + dropout(src + bias))
  template <typename P = LayerNormParamType<T>, bool is_same_type = false>
  void LayernormResidualDropoutBias(const phi::GPUContext& ctx,
                                    const T* src,
                                    const T* residual,
                                    const T* bias,
                                    const P* gamma,
                                    const P* beta,
                                    T* dropout_out,
                                    MaskType* mask,
                                    T* out,
                                    LayerNormParamType<T>* mean,
                                    LayerNormParamType<T>* variance) {
    using U = LayerNormParamType<T>;
    int vec_size = MAX_CACHE_BYTES / sizeof(T);
    if (this->cols_ % vec_size != 0) {
      vec_size = 1;
    }
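    // Per-thread offset increment for the fused kernel: cols_ split across
    // `threads` lanes, rounded up to a multiple of the vector size.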
    int threads = GetDesiredBlockDim(this->cols_ / vec_size);
    int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size;
    increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment);
    LaunchLayernormResidualDropoutBias<T, MaskType, U, is_same_type>(
        this->rows_,
        this->cols_,
        increment,
        this->dropout_param_.seed,
        this->dropout_param_.dropout_prob,
        epsilon_,
        this->dropout_param_.is_upscale_in_train,
        this->dropout_param_.is_test,
        src,
        residual,
        bias,
        gamma,
        beta,
        mask,
        dropout_out,
        out,
        mean,
        variance,
        ctx);
  }

  template <typename P = LayerNormParamType<T>, bool is_same_type = false>
  void LayernormResidualDropoutBiasGrad(const phi::GPUContext& ctx,
                                        const T* d_out,
                                        const T* layernorm_src,
                                        const MaskType* mask,
                                        const P* gamma,
                                        const LayerNormParamType<T>* mean,
                                        const LayerNormParamType<T>* variance,
                                        T* d_layernorm_src,
                                        P* d_scale,
                                        P* d_layernorm_bias,
                                        T* d_dropout_src,
                                        T* d_bias,
                                        T* d_residual) {
    using U = LayerNormParamType<T>;
    bool can_call_1024_kernel = false;
    // Fast impl for the case where cols is 1024 and linear_bias is nullptr.
    // The fast impl would also be feasible when linear_bias is not nullptr,
    // but that case is not supported here.
    if (this->cols_ == 1024 && d_bias == nullptr && d_scale != nullptr &&
        d_layernorm_bias != nullptr && sizeof(T) <= 4) {
      can_call_1024_kernel = true;
    }
    VLOG(6) << "LaunchLayernormResidualDropoutGrad = " << can_call_1024_kernel;

    if (can_call_1024_kernel) {
      LaunchLayernormResidualDropoutGrad<T, U, MaskType, is_same_type>(
          ctx,
          this->rows_,
          this->cols_,
          epsilon_,
          this->dropout_param_.dropout_prob,
          this->dropout_param_.is_upscale_in_train,
          d_out,
          layernorm_src,
          gamma,
          mean,
          variance,
          mask,
          d_scale,
          d_layernorm_bias,
          d_residual,
          d_dropout_src);
    } else {
      LayerNormBackward<T, U, is_same_type>(layernorm_src,
                                            d_out,
                                            gamma,
                                            mean,
                                            variance,
                                            d_layernorm_src,
                                            d_scale,
                                            d_layernorm_bias,
                                            epsilon_,
                                            this->rows_,
                                            this->cols_,
                                            ctx);
      this->ResidualDropoutBiasGrad(
          ctx, d_layernorm_src, mask, d_dropout_src, d_residual, d_bias);
    }
  }

 protected:
  float epsilon_;
};
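
// A minimal usage sketch for the pre_layer_norm=false path (placeholder names,
// not part of this header):
//
//   FusedDropoutLayerNormHelper<float, uint8_t> ln_helper(dev_ctx, rows, cols,
//                                                         param, epsilon);
//   ln_helper.LayernormResidualDropoutBias(dev_ctx, src, residual, bias, gamma,
//                                          beta, dropout_out, mask, out, mean,
//                                          variance);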

}  // namespace operators
}  // namespace paddle