add a fusion op: fused_residual_dropout_bias (#34963)

cf8bf032 · zhangkaihuo · GitHub · eb1fbf12 · cf8bf032 · cf8bf032
5 changed file
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -71,4 +71,9 @@ if (WITH_GPU OR WITH_ROCM)
        op_library(fused_bn_add_activation_op)
        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n")
    endif()
+    # fused_dropout
+    # only support CUDA
+    if(NOT WITH_ROCM)
+        nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op device_context generator memory)
+    endif()
 endif()
--- a/paddle/fluid/operators/fused/fused_dropout_common.h
+++ b/paddle/fluid/operators/fused/fused_dropout_common.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cooperative_groups.h>
+#include <cuda.h>
+#include <curand_kernel.h>
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
+namespace paddle {
+namespace operators {
+#define CACHE_LINE 128
+#define MAX_CACHE_BYTES (CACHE_LINE / CHAR_BIT)
+/**
+ * get the threads for fused_residual_dropout_bias:
+ * 1D blocks: blockDim.x = cols
+ * 2D grids: gridDim.y = rows
+ */
+inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
+    const platform::CUDADeviceContext &ctx, const uint32_t rows,
+    const uint32_t cols, const int VecSize) {
+  const uint32_t tmp_cols = cols / VecSize;
+  int threads = std::max(
+      static_cast<uint32_t>(32),
+      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  const auto blocks_x =
+      std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
+  const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
+  platform::GpuLaunchConfig config;
+  config.block_per_grid.x = blocks_x;
+  config.block_per_grid.y = blocks_y;
+  config.thread_per_block.x = threads;
+  return config;
+}
+__forceinline__ __device__ void Rand1(curandStatePhilox4_32_10_t *state,
+                                      float *data) {
+  data[0] = curand_uniform(state);
+}
+__forceinline__ __device__ void Rand2(curandStatePhilox4_32_10_t *state,
+                                      float *data) {
+  data[0] = curand_uniform(state);
+  data[1] = curand_uniform(state);
+}
+__forceinline__ __device__ void Rand4(curandStatePhilox4_32_10_t *state,
+                                      float *data) {
+  float4 rand4 = curand_uniform4(state);
+  data[0] = rand4.x;
+  data[1] = rand4.y;
+  data[2] = rand4.w;
+  data[3] = rand4.z;
+}
+__forceinline__ __device__ void Rand8(curandStatePhilox4_32_10_t *state,
+                                      float *data) {
+  Rand4(state, data);
+  Rand4(state, data + 4);
+}
+__forceinline__ __device__ void RandVec(curandStatePhilox4_32_10_t *state,
+                                        float *data, const int VecSize) {
+  if (VecSize == 1) {
+    Rand1(state, data);
+  } else if (VecSize == 2) {
+    Rand2(state, data);
+  } else if (VecSize == 4) {
+    Rand4(state, data);
+  } else if (VecSize == 8) {
+    Rand8(state, data);
+  } else {
+    return;
+  }
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fused/fused_dropout_test.h
+++ b/paddle/fluid/operators/fused/fused_dropout_test.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <random>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace memory = paddle::memory;
+USE_OP(dropout);
+/**
+ * @brief call paddle dropout op
+ */
+template <typename T>
+void Dropout(const std::vector<T> &x, const framework::DDim &x_dim,
+             std::vector<T> *out, std::vector<uint8_t> *mask,
+             const platform::CUDADeviceContext &ctx, uint64_t seed,
+             float dropout_prob, bool is_upscale_in_train, bool is_test) {
+  framework::Scope scope;
+  auto var_x = scope.Var("X");
+  auto tensor_x = var_x->GetMutable<framework::LoDTensor>();
+  framework::TensorFromVector(x, ctx, tensor_x);
+  tensor_x->Resize(x_dim);
+  auto var_out = scope.Var("Out");
+  auto tensor_out = var_out->GetMutable<framework::LoDTensor>();
+  auto var_mask = scope.Var("Mask");
+  auto tensor_mask = var_mask->GetMutable<framework::LoDTensor>();
+  framework::AttributeMap attrs;
+  attrs.insert({"fix_seed", 1});
+  attrs.insert({"seed", static_cast<int>(seed)});
+  attrs.insert({"dropout_prob", dropout_prob});
+  if (is_upscale_in_train) {
+    attrs.insert({"dropout_implementation", std::string("upscale_in_train")});
+  }
+  if (is_test) {
+    attrs.insert({"is_test", true});
+  }
+  auto op = framework::OpRegistry::CreateOp(
+      "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
+  op->Run(scope, ctx.GetPlace());
+  framework::TensorToVector<T>(*tensor_out, ctx, out);
+  if (!is_test) {
+    framework::TensorToVector<uint8_t>(*tensor_mask, ctx, mask);
+  }
+  ctx.Wait();
+}
+/**
+ * @brief call paddle dropout_grad op
+ */
+template <typename T>
+void DropoutGrad(std::vector<T> *dx, const framework::DDim &x_dim,
+                 const std::vector<T> &dout, const std::vector<uint8_t> &mask,
+                 const platform::CUDADeviceContext &ctx, float dropout_prob,
+                 bool is_upscale_in_train) {
+  framework::Scope scope;
+  const size_t n = x_dim[0] * x_dim[1];
+  auto var_out = scope.Var("DOut");
+  auto tensor_out = var_out->GetMutable<framework::LoDTensor>();
+  framework::TensorFromVector(dout, ctx, tensor_out);
+  tensor_out->Resize(x_dim);
+  auto var_mask = scope.Var("Mask");
+  auto tensor_mask = var_mask->GetMutable<framework::LoDTensor>();
+  framework::TensorFromVector(mask, ctx, tensor_mask);
+  tensor_mask->Resize(x_dim);
+  auto var_dx = scope.Var("DX");
+  auto tensor_dx = var_dx->GetMutable<framework::LoDTensor>();
+  framework::AttributeMap attrs;
+  attrs.insert({"dropout_prob", dropout_prob});
+  attrs.insert({"is_test", false});
+  if (is_upscale_in_train) {
+    attrs.insert({"dropout_implementation", std::string("upscale_in_train")});
+  } else {
+    attrs.insert({"dropout_implementation", std::string("downgrade_in_infer")});
+  }
+  auto op = framework::OpRegistry::CreateOp(
+      "dropout_grad", {{"Out@GRAD", {"DOut"}}, {"Mask", {"Mask"}}},
+      {{"X@GRAD", {"DX"}}}, attrs);
+  op->Run(scope, ctx.GetPlace());
+  framework::TensorToVector(*tensor_dx, ctx, dx);
+  ctx.Wait();
+}
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/operators/fused/fused_dropout_common.h"
+#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
+namespace paddle {
+namespace operators {
+/**
+ * @brief The fused function called by every thread
+ * VecSize can be 1, 2, 4 or 8
+ */
+template <typename T, typename MaskType, int VecSize, bool ComputeLayerNorm>
+__forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
+    const int row_id, const int col_id, const int cols,
+    curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor,
+    const T *src, const T *residual, const T *bias, T *dst, MaskType *mask,
+    const bool is_test, typename details::MPTypeTrait<T>::Type *mean_val,
+    typename details::MPTypeTrait<T>::Type *var_val) {
+  using LoadT = platform::AlignedVector<T, VecSize>;
+  using StoreT = platform::AlignedVector<T, VecSize>;
+  using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
+  using U = typename details::MPTypeTrait<T>::Type;
+  LoadT src_vec;
+  LoadT residual_vec;
+  LoadT bias_vec;
+#pragma unroll
+  for (int ii = 0; ii < VecSize; ii++) {
+    bias_vec[ii] = static_cast<T>(0);
+  }
+  // vectorize load data from global
+  platform::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
+  platform::Load<T, VecSize>(&residual[row_id * cols + col_id], &residual_vec);
+  if (bias) {
+    platform::Load<T, VecSize>(&bias[col_id], &bias_vec);
+  }
+  MaskStoreT mask_vec;
+  if (!is_test) {
+    float rand[VecSize];
+    RandVec(state, rand, VecSize);
+#pragma unroll
+    for (int ii = 0; ii < VecSize; ii++) {
+      mask_vec[ii] = static_cast<MaskType>(rand[ii] >= dropout_prob);
+    }
+  } else {
+#pragma unroll
+    for (int ii = 0; ii < VecSize; ii++) {
+      mask_vec[ii] = static_cast<MaskType>(1);
+    }
+  }
+  StoreT dest_vec;
+#pragma unroll
+  for (int ii = 0; ii < VecSize; ii++) {
+    dest_vec[ii] =
+        (src_vec[ii] + bias_vec[ii]) * static_cast<T>(mask_vec[ii]) * factor +
+        residual_vec[ii];
+    if (ComputeLayerNorm) {
+      U tmp = static_cast<U>(dest_vec[ii]);
+      *mean_val += tmp;
+      *var_val += (tmp * tmp);
+    }
+  }
+  // store result to global
+  platform::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
+  if (!is_test) {
+    platform::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
+  }
+}
+/**
+ * @brief dst = residual + dropout(src + bias);
+ * the src, residual, mask and dst shape is (rows, cols)
+ * the bias shape is (1, cols)
+ * is_test: only used in inference
+ * mask: can be null if is_test=true
+ */
+template <typename T, typename MaskType, int VecSize>
+__global__ void FusedResidualDropoutBias(
+    const size_t rows, const size_t cols, uint64_t seed,
+    const float dropout_prob, const bool is_upscale_in_train, const T *src,
+    const T *residual, const T *bias, MaskType *mask, T *dst,
+    uint64_t increment, const bool is_test) {
+  int col_id = blockDim.x * blockIdx.x + threadIdx.x;
+  int row_id = blockIdx.y;
+  int idx = row_id * cols + col_id;
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx, increment, &state);
+  T factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
+  if (!is_upscale_in_train) {
+    factor = static_cast<T>(1.0f);
+  }
+  if (is_test) {
+    factor = static_cast<T>(1.0f - dropout_prob);
+    if (is_upscale_in_train) {
+      factor = static_cast<T>(1.0f);
+    }
+  }
+  for (int r = row_id; r < rows; r += blockDim.y * gridDim.y) {
+    for (int i = col_id * VecSize; i < cols;
+         i += blockDim.x * gridDim.x * VecSize) {
+      FusedResidualDropoutBiasOneThread<T, MaskType, VecSize, false>(
+          r, i, cols, &state, dropout_prob, factor, src, residual, bias, dst,
+          mask, is_test, nullptr, nullptr);
+    }
+  }
+}
+/**
+ * @brief dst = residual + dropout(src + bias);
+ */
+template <typename T, typename MaskType>
+void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols,
+                               const int increment, uint64_t seed,
+                               const float dropout_prob, const bool is_test,
+                               bool is_upscale_in_train, const T *src,
+                               const T *residual, const T *bias,
+                               MaskType *mask_data, T *dst,
+                               const platform::CUDADeviceContext &ctx) {
+  // dropout_prob == 1.0f
+  if (std::abs(dropout_prob - 1.0f) < 1e-5) {
+    if (residual == dst) return;
+    auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
+    memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T),
+                 ctx.stream());
+    if (!is_test) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(
+          mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream()));
+    }
+    return;
+  }
+  const int VecSize = MAX_CACHE_BYTES / sizeof(T);
+  const int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
+  auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size);
+  if (cols % VecSize == 0) {
+    FusedResidualDropoutBias<T, uint8_t, VecSize><<<
+        config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+        rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual,
+        bias, mask_data, dst, increment, is_test);
+  } else {
+    FusedResidualDropoutBias<
+        T, uint8_t,
+        1><<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+        rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual,
+        bias, mask_data, dst, increment, is_test);
+  }
+}
+/*
+ * @brief calculate the grad of no bias
+ */
+template <typename T, typename MaskType, int VecSize>
+__global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask,
+                                         const T factor, const int64_t size,
+                                         T *dx) {
+  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  using LoadT = platform::AlignedVector<T, VecSize>;
+  using StoreT = platform::AlignedVector<T, VecSize>;
+  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
+    LoadT dout_vec;
+    MaskLoadT mask_vec;
+    platform::Load<T, VecSize>(&dout[i], &dout_vec);
+    platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
+    StoreT dx_vec;
+#pragma unroll
+    for (int ii = 0; ii < VecSize; ii++) {
+      dx_vec[ii] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
+    }
+    platform::Store<T, VecSize>(dx_vec, &dx[i]);
+  }
+}
+/**
+ * blocks(128 * 8)
+ * 1. calculate the dx and reduce total rows to 128 rows
+ * 2. save 128*8 temporary sum in 8*128 shared memory
+ * 3. reduce the sum of 128 rows data by 8*VecSize warps
+ */
+template <typename T, typename MaskType, int BlockSizeX, int BlockSizeY,
+          int VecSize>
+__global__ void FusedResidualDropoutBiasGrad(const T *dout,
+                                             const MaskType *mask,
+                                             const T factor, const int64_t rows,
+                                             const int64_t cols, T *dx,
+                                             T *dbias) {
+  int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
+  using LoadT = platform::AlignedVector<T, VecSize>;
+  using StoreT = platform::AlignedVector<T, VecSize>;
+  using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
+  T tmp_sum[VecSize] = {static_cast<T>(0)};
+  // calculate the dx and temporary sum
+  if (col_id * VecSize < cols) {
+    for (int row_id = threadIdx.y; row_id < rows; row_id += blockDim.y) {
+      int index = row_id * cols + col_id * VecSize;
+      LoadT out_vec;
+      MaskLoadT mask_vec;
+      StoreT dx_vec;
+      platform::Load<T, VecSize>(&dout[index], &out_vec);
+      platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
+#pragma unroll
+      for (int i = 0; i < VecSize; i++) {
+        dx_vec[i] = out_vec[i] * static_cast<T>(mask_vec[i]) * factor;
+        tmp_sum[i] += out_vec[i];
+      }
+      platform::Store<T, VecSize>(dx_vec, &dx[index]);
+    }
+  }
+  // save temporary sum to cache and do transpose
+  __shared__ T cache[BlockSizeX * VecSize][BlockSizeY];
+  for (int i = 0; i < VecSize; i++) {
+    cache[threadIdx.x * VecSize + i][threadIdx.y] = tmp_sum[i];
+  }
+  __syncthreads();
+  // reduce sum
+  T sum = static_cast<T>(0);
+  int tid = threadIdx.y * blockDim.x + threadIdx.x;
+  int x = tid >> 5;  // warp id
+  int y = tid & 31;  // thread id on warp 0~31
+  // need BlockSizeX * VecSize warps
+  if (x < BlockSizeX * VecSize) {
+// reduce 128 to 32
+#pragma unroll
+    for (int i = 0; i < (BlockSizeY >> 5); i++) {
+      sum += cache[x][y + i * 32];
+    }
+  }
+  // reduce 32 to 1
+  sum = WarpReduceSum(sum);
+  // save sum to dbias
+  int bias_id = blockIdx.x * blockDim.x * VecSize + x;
+  if (y == 0 && x < VecSize * BlockSizeX && bias_id < cols) {
+    dbias[bias_id] = sum;
+  }
+}
+/**
+ * @brief to launch kernel FusedResidualDropoutBiasGradVec
+ */
+template <typename T, typename MaskType>
+void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask,
+                                   const float dropout_prob,
+                                   const bool is_upscale_in_train,
+                                   const uint32_t rows, const uint32_t cols,
+                                   T *dx, T *dbias,
+                                   const platform::CUDADeviceContext &ctx) {
+  const T zero = static_cast<T>(0.0f);
+  auto factor = dropout_prob == static_cast<float>(1.0f)
+                    ? zero
+                    : static_cast<T>(1.0f / (1.0f - dropout_prob));
+  if (!is_upscale_in_train) {
+    factor = static_cast<T>(1.0f);
+  }
+  const int VecSize = MAX_CACHE_BYTES / sizeof(T);
+  int real_vec_size = cols % VecSize == 0 ? VecSize : 1;
+  if (dbias != nullptr) {
+    auto threads = std::min(cols / real_vec_size, static_cast<uint32_t>(8));
+    auto blocks =
+        std::max((uint32_t)1, (cols / real_vec_size + threads - 1) / threads);
+    dim3 block_dim(threads, 128, 1);
+    dim3 grid_dim(blocks, 1, 1);
+    if (cols % VecSize == 0) {
+      FusedResidualDropoutBiasGrad<
+          T, MaskType, 8, 128,
+          VecSize><<<grid_dim, block_dim, 0, ctx.stream()>>>(
+          dout, mask, factor, rows, cols, dx, dbias);
+    } else {
+      FusedResidualDropoutBiasGrad<T, MaskType, 8, 128,
+                                   1><<<grid_dim, block_dim, 0, ctx.stream()>>>(
+          dout, mask, factor, rows, cols, dx, dbias);
+    }
+  } else {
+    const uint64_t n = rows * cols;
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size);
+    if (n % VecSize == 0) {
+      FusedResidualDropoutGrad<T, MaskType, VecSize><<<
+          config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+          dout, mask, factor, n, dx);
+    } else {
+      FusedResidualDropoutGrad<T, MaskType, 1><<<
+          config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+          dout, mask, factor, n, dx);
+    }
+  }
+}
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <time.h>
+#include <random>
+#include <vector>
+#include "paddle/fluid/operators/fused/fused_dropout_test.h"
+#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+/**
+ * @brief the unittest of fusedresidualdropoutbias
+ * 1. random input data
+ * 2. add bias, call paddle dropout op, add residual, and get the base result
+ * 3. call FusedResidualDropoutBias function get fused result
+ * 4. compare ther base result and fused result
+ */
+template <typename T>
+struct TestFusedResidualDropoutBias {
+  uint32_t rows;
+  uint32_t cols;
+  uint64_t seed;
+  float dropout_prob;
+  bool is_upscale_in_train;
+  bool is_test;  // default false,  Set to true for inference only
+  bool has_bias = true;
+  framework::Tensor src, residual, bias, out, mask;
+  framework::Tensor dsrc, dbias;
+  std::vector<T> src_vec, residual_vec, bias_vec;
+  std::vector<T> correct_out, correct_dsrc, correct_dbias;
+  std::vector<uint8_t> correct_mask;
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext *ctx;
+  TestFusedResidualDropoutBias() {
+    rows = 32;
+    cols = 32;
+    seed = 0;
+    dropout_prob = 0.0;
+    is_upscale_in_train = false;
+    is_test = false;
+    has_bias = true;
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto device_ctx = pool.Get(place);
+    ctx = reinterpret_cast<platform::CUDADeviceContext *>(device_ctx);
+  }
+  TestFusedResidualDropoutBias(int rows_, int cols_, uint64_t seed_ = 0,
+                               float dropout_prob_ = 0.0,
+                               bool is_upscale_in_train_ = false,
+                               bool is_test_ = false) {
+    rows = rows_;
+    cols = cols_;
+    seed = seed_;
+    dropout_prob = dropout_prob_;
+    is_upscale_in_train = is_upscale_in_train_;
+    is_test = is_test_;
+    has_bias = true;
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto device_ctx = pool.Get(place);
+    ctx = reinterpret_cast<platform::CUDADeviceContext *>(device_ctx);
+  }
+  ~TestFusedResidualDropoutBias() {}
+  void SetUp() {
+    const int n = rows * cols;
+    correct_out.resize(n);
+    correct_mask.resize(n);
+    correct_dsrc.resize(n);
+    correct_dbias.resize(cols);
+    src_vec.resize(n);
+    residual_vec.resize(n);
+    bias_vec.resize(cols);
+    std::default_random_engine random(time(NULL));
+    std::uniform_real_distribution<float> dis(0.0, 1.0);
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < cols; j++) {
+        src_vec[i * cols + j] = static_cast<T>(dis(random));
+        residual_vec[i * cols + j] = static_cast<T>(dis(random));
+        if (i == 0) {
+          bias_vec[j] = dis(random);
+        }
+      }
+    }
+    framework::TensorFromVector<T>(src_vec, *ctx, &src);
+    src.Resize({rows, cols});
+    framework::TensorFromVector<T>(residual_vec, *ctx, &residual);
+    residual.Resize({rows, cols});
+    if (has_bias) {
+      framework::TensorFromVector<T>(bias_vec, *ctx, &bias);
+      bias.Resize({cols});
+    }
+    {
+      out.Resize({rows, cols});
+      out.mutable_data<T>(place);
+      mask.Resize({rows, cols});
+      mask.mutable_data<uint8_t>(place);
+      dsrc.Resize({rows, cols});
+      dsrc.mutable_data<T>(place);
+      if (has_bias) {
+        dbias.Resize({cols});
+        dbias.mutable_data<T>(place);
+      }
+    }
+  }
+  void BaseForward() {
+    std::vector<T> out1(rows * cols), out2(rows * cols);
+    if (has_bias) {
+      // add bias
+      for (int i = 0; i < rows; i++) {
+        for (int j = 0; j < cols; j++) {
+          out1[i * cols + j] = src_vec[i * cols + j] + bias_vec[j];
+        }
+      }
+      // call dropout
+      Dropout<T>(out1, src.dims(), &out2, &correct_mask, *ctx, seed,
+                 dropout_prob, is_upscale_in_train, is_test);
+    } else {
+      Dropout<T>(src_vec, src.dims(), &out2, &correct_mask, *ctx, seed,
+                 dropout_prob, is_upscale_in_train, is_test);
+    }
+    ctx->Wait();
+    // add residual
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < cols; j++) {
+        correct_out[i * cols + j] =
+            residual_vec[i * cols + j] + out2[i * cols + j];
+      }
+    }
+  }
+  void BaseBackward() {
+    DropoutGrad<T>(&correct_dsrc, src.dims(), correct_out, correct_mask, *ctx,
+                   dropout_prob, is_upscale_in_train);
+    // calc dbias
+    memset(&correct_dbias[0], 0, cols * sizeof(T));
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < cols; j++) {
+        correct_dbias[j] += correct_out[i * cols + j];
+      }
+    }
+  }
+  void FusedForward() {
+    const int VecSize = MAX_CACHE_BYTES / sizeof(T);
+    auto config = paddle::operators::Get1DBlocksAnd2DGrids(
+        *ctx, (uint64_t)rows, (uint64_t)cols, VecSize);
+    const int increment = ((cols - 1) / (config.thread_per_block.x *
+                                         config.block_per_grid.x * VecSize) +
+                           1) *
+                          VecSize;
+    T *bias_ptr = nullptr;
+    if (has_bias) {
+      bias_ptr = bias.data<T>();
+    }
+    paddle::operators::LaunchResidualDropoutBias<T, uint8_t>(
+        rows, cols, increment, seed, dropout_prob, is_test, is_upscale_in_train,
+        src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
+        out.data<T>(), *ctx);
+    ctx->Wait();
+  }
+  void FusedBackward() {
+    if (is_test) {
+      return;
+    }
+    T *bias_ptr = nullptr;
+    if (has_bias) {
+      bias_ptr = dbias.data<T>();
+    }
+    paddle::operators::LaunchResidualDropoutBiasGrad<T, uint8_t>(
+        out.data<T>(), mask.data<uint8_t>(), dropout_prob, is_upscale_in_train,
+        rows, cols, dsrc.data<T>(), bias_ptr, *ctx);
+  }
+  void Run() {
+    SetUp();
+    BaseForward();
+    FusedForward();
+    BaseBackward();
+    FusedBackward();
+  }
+  void CheckOut(const T diff) {
+    const int n = rows * cols;
+    std::vector<T> _out(n);
+    std::vector<uint8_t> _mask(n);
+    framework::TensorToVector(out, *ctx, &_out);
+    if (!is_test) {
+      framework::TensorToVector<uint8_t>(mask, *ctx, &_mask);
+    }
+    ctx->Wait();
+    for (int i = 0; i < n; i++) {
+      EXPECT_LT(std::abs(_out[i] - correct_out[i]), diff);
+      if (!is_test) EXPECT_EQ(_mask[i], correct_mask[i]);
+    }
+  }
+  void CheckGrad(const T diff) {
+    if (is_test) {
+      return;
+    }
+    const int n = rows * cols;
+    std::vector<T> _dsrc(n);
+    framework::TensorToVector(dsrc, *ctx, &_dsrc);
+    for (int i = 0; i < n; i++) {
+      EXPECT_LT(std::abs(_dsrc[i] - correct_dsrc[i]), diff);
+    }
+    if (has_bias) {
+      std::vector<T> _dbias(cols);
+      framework::TensorToVector(dbias, *ctx, &_dbias);
+      ctx->Wait();
+      for (int i = 0; i < cols; i++) {
+        EXPECT_LT(std::abs(_dbias[i] - correct_dbias[i]), diff);
+      }
+    }
+  }
+};
+// test the shape and bias
+template <typename T>
+static void BaseTest(const bool is_fp16 = false) {
+  const int rows = 16;
+  std::vector<int> cols_list = {16, 17};
+  bool has_bias[2] = {true, false};
+  T default_diff = static_cast<T>(1e-5);
+  if (is_fp16) {
+    default_diff = static_cast<T>(1e-2);
+  }
+  for (int i = 0; i < cols_list.size(); i++) {
+    for (int j = 0; j < 2; j++) {
+      TestFusedResidualDropoutBias<T> test(rows, cols_list[i]);
+      test.has_bias = has_bias[j];
+      test.Run();
+      test.CheckOut(default_diff);
+      if (!is_fp16) {
+        test.CheckGrad(default_diff);
+      }
+    }
+  }
+}
+TEST(FusedDropout, GPUFusedResidualDropoutBias) { BaseTest<float>(); }
+TEST(FusedDropout, GPUFusedResidualDropoutBiasDouble) { BaseTest<double>(); }
+// test fp16, For inference, check_grad is not required. ref: testdropout_op.py
+TEST(FusedDropout, GPUFusedResidualDropoutBiasFp16) {
+  BaseTest<platform::float16>(true);
+}
+TEST(FusedDropout, GPUFusedResidualDropoutBias2) {
+  const int rows = 16;
+  const int cols = 16;
+  TestFusedResidualDropoutBias<float> test(rows, cols, 0, 1.0, false, false);
+  test.Run();
+  test.CheckOut(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-5));
+}
+TEST(FusedDropout, GPUFusedResidualDropoutBias3) {
+  const int rows = 16;
+  const int cols = 16;
+  TestFusedResidualDropoutBias<float> test(rows, cols, 0, 1.0, true, false);
+  test.Run();
+  test.CheckOut(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-5));
+}
+TEST(FusedDropout, GPUFusedResidualDropoutBias4) {
+  const int rows = 16;
+  const int cols = 16;
+  TestFusedResidualDropoutBias<float> test(rows, cols, 0, 0.35, true, true);
+  test.Run();
+  test.CheckOut(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-5));
+}
+TEST(FusedDropout, GPUFusedResidualDropoutBias5) {
+  const int rows = 16;
+  const int cols = 16;
+  TestFusedResidualDropoutBias<float> test(rows, cols, 125, 0.0, false, false);
+  test.Run();
+  test.CheckOut(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-5));
+}
+// test large shape
+TEST(FusedDropout, GPUFusedResidualDropoutBias6) {
+  const int rows = 256;
+  const int cols = 4096;
+  TestFusedResidualDropoutBias<float> test(rows, cols);
+  test.Run();
+  test.CheckOut(static_cast<float>(1e-5));
+  test.CheckGrad(static_cast<float>(1e-3));
+}