Unverified commit 3b2bc0a0, authored by WangXi, committed by GitHub

[cherry-pick 2.3] fix FusedResidualDropoutBias nan & fix lod_tensor_array gc (#42398)

* fix FusedResidualDropoutBias nan in v100 (#42344)

* fix lod_tensor_array gc (#42377)
Parent 50bfe420
@@ -156,6 +156,9 @@ void DeleteUnusedTensors(const Scope &scope,
       for (auto &t : *lod_tensor_arr) {
         garbages.emplace_back(t.MoveMemoryHolder());
       }
+      // NOTE(wangxi): We must clear the vector, otherwise lod_tensor_arr's
+      // size() is stale; if size() decreases in the next step, an error may occur.
+      lod_tensor_arr->clear();
     } else if (var->IsType<Strings>()) {
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
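Why the added clear() matters: MoveMemoryHolder() only steals each tensor's allocation, leaving the array full of empty shells, so size() keeps reporting the old element count. A minimal sketch of that failure mode, with hypothetical stand-in types rather than the real Paddle GC classes:

#include <memory>
#include <vector>

// Stand-ins for LoDTensor and LoDTensorArray (hypothetical, for illustration).
struct Tensor {
  std::shared_ptr<char> holder;  // owns the underlying buffer
  std::shared_ptr<char> MoveMemoryHolder() { return std::move(holder); }
};
using TensorArray = std::vector<Tensor>;

void CollectGarbage(TensorArray *arr,
                    std::vector<std::shared_ptr<char>> *garbages) {
  for (auto &t : *arr) {
    garbages->emplace_back(t.MoveMemoryHolder());  // buffers handed to GC
  }
  // Without this, arr->size() still counts the emptied elements, and code
  // that compares sizes across steps sees an inconsistent array.
  arr->clear();
}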
@@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const platform::CUDADeviceContext &ctx, const uint32_t rows,
     const uint32_t cols, const int vec_size) {
   const uint32_t tmp_cols = cols / vec_size;
-  int threads = std::max(
-      static_cast<uint32_t>(32),
-      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  // NOTE(wangxi): We cap the block size at 512 because
+  // `FusedResidualDropoutBias` needs a large number of registers. If the
+  // data type is float16, a block_size of 1024 triggers CUDA error 701,
+  // 'cudaErrorLaunchOutOfResources', which indicates the launch did not
+  // occur because the required resources were unavailable.
+  // Of course, this kernel can be optimized later to reduce its register
+  // usage.
+  int threads =
+      std::max(static_cast<uint32_t>(32),
+               std::min(tmp_cols, static_cast<uint32_t>(std::min(
+                                      ctx.GetMaxThreadsPerBlock(), 512))));
   const auto blocks_x =
       std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
   const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
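Restating the launch-config arithmetic above as a standalone helper (hypothetical name and plain integer parameters instead of the CUDADeviceContext): threads is tmp_cols clamped to [32, min(device limit, 512)], blocks_x covers tmp_cols by ceiling division, and blocks_y gives one grid row per tensor row.

#include <algorithm>
#include <cstdint>

struct LaunchConfigSketch {
  uint32_t threads, blocks_x, blocks_y;
};

LaunchConfigSketch Get1DBlocksAnd2DGridsSketch(uint32_t rows, uint32_t cols,
                                               int vec_size,
                                               int max_threads_per_block) {
  const uint32_t tmp_cols = cols / vec_size;  // columns after vectorization
  // Cap at 512 to avoid cudaErrorLaunchOutOfResources for register-heavy
  // kernels such as FusedResidualDropoutBias.
  const uint32_t cap =
      static_cast<uint32_t>(std::min(max_threads_per_block, 512));
  const uint32_t threads =
      std::max(static_cast<uint32_t>(32), std::min(tmp_cols, cap));
  const uint32_t blocks_x =
      std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
  const uint32_t blocks_y = std::max(static_cast<uint32_t>(1), rows);
  return {threads, blocks_x, blocks_y};
}

For the new FP16 test below (rows = 1, cols = 12288) and a vec_size of 4 (an assumption; the actual value depends on the kernel instantiation), this yields threads = 512, blocks_x = 6, blocks_y = 1.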
@@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias {
           dropout_prob, is_upscale_in_train, is_test);
     }
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
     // add residual
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
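The PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()) line added after ctx->Wait() is what actually surfaces launch failures: a launch that never ran (for example error 701) may not be reported by the wait alone, but it does set CUDA's last-error state. A raw-CUDA sketch of the same pattern, using a hypothetical kernel rather than the Paddle test code:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void DummyKernel(float *out) { out[threadIdx.x] = 1.0f; }

int main() {
  float *out = nullptr;
  cudaMalloc(&out, 1024 * sizeof(float));
  DummyKernel<<<1, 1024>>>(out);  // the launch itself may fail silently
  cudaDeviceSynchronize();        // analogous to ctx->Wait()
  // Poll the last-error state; this is where a failed launch such as
  // cudaErrorLaunchOutOfResources (701) becomes visible.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  cudaFree(out);
  return 0;
}

Without the check, a test whose kernel never launched would still compare outputs and could pass or fail for the wrong reason.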
@@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias {
         src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
         out.data<T>(), *ctx);
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
   }

   void FusedBackward() {
@@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) {
   test.CheckOut(static_cast<float>(1e-5));
   test.CheckGrad(static_cast<float>(1e-3));
 }
+
+TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) {
+  // Tests that `cudaErrorLaunchOutOfResources` does not occur.
+  int rows = 1;
+  int cols = 12288;
+  if (std::getenv("_rows") != nullptr) {
+    rows = atoi(std::getenv("_rows"));
+  }
+  if (std::getenv("_cols") != nullptr) {
+    cols = atoi(std::getenv("_cols"));
+  }
+  TestFusedResidualDropoutBias<platform::float16> test(rows, cols, 0, 0.0,
+                                                       true, true);
+  test.Run();
+  test.CheckOut(static_cast<platform::float16>(1e-1));
+  test.CheckGrad(static_cast<platform::float16>(1e-1));
+}
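Because the shape comes from the _rows/_cols environment variables, the out-of-resources scenario can be re-probed at other sizes without recompiling. Assuming the test binary follows the usual name for this file (hypothetical; check the build directory), the override looks like:

_cols=16384 ./test_fused_residual_dropout_bias \
    --gtest_filter=FusedDropout.GPUFusedResidualDropoutBiasLargeShapeFp16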