From 3b2bc0a0133f6db24f9798deccf890939425a0af Mon Sep 17 00:00:00 2001 From: WangXi Date: Fri, 29 Apr 2022 19:29:03 +0800 Subject: [PATCH] [cherry-pick 2.3] fix FusedResidualDropoutBias nan & fix lod_tensor_array gc (#42398) * fix FusedResidualDropoutBias nan in v100 (#42344) * fix lod_tensor_array gc (#42377) --- paddle/fluid/framework/executor_gc_helper.cc | 3 +++ .../operators/fused/fused_dropout_common.h | 14 +++++++++++--- .../fused/fused_residual_dropout_bias_test.cu | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 6dc53c9649e..05215a9e5f1 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -156,6 +156,9 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + // NOTE(wangxi): need clear the vector, otherwise lod_tensor_arr.size() is + // wrong, if size() decrease in next step, an error maybe occur. + lod_tensor_arr->clear(); } else if (var->IsType()) { } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 6bf3a7114f4..0fe76fa23a6 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const platform::CUDADeviceContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { const uint32_t tmp_cols = cols / vec_size; - int threads = std::max( - static_cast(32), - std::min(tmp_cols, static_cast(ctx.GetMaxThreadsPerBlock()))); + // NOTE(wangxi): We set max_block_size to 512, for `FusedResidualDropoutBias` + // needs too many register resources. If data_type is float16, CUDA + // error(701) will occur when block_size is 1024. Which error is + // 'cudaErrorLaunchOutOfResources', this indicates that a launch did not + // occur because it did not have appropriate resources. + // Of course, this kernel can be optimized later to reduce the use + // of registers. + int threads = + std::max(static_cast(32), + std::min(tmp_cols, static_cast(std::min( + ctx.GetMaxThreadsPerBlock(), 512)))); const auto blocks_x = std::max(static_cast(1), (tmp_cols + threads - 1) / threads); const auto blocks_y = std::max(static_cast(1), rows); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 5dff5e2225f..caceac1228e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias { dropout_prob, is_upscale_in_train, is_test); } ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); // add residual for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias { src.data(), residual.data(), bias_ptr, mask.data(), out.data(), *ctx); ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } void FusedBackward() { @@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) { test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-3)); } + +TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) { + // Used to test that `cudaErrorLaunchOutOfResources` will not occur + int rows = 1; + int cols = 12288; + if (std::getenv("_rows") != nullptr) { + rows = atoi(std::getenv("_rows")); + } + if (std::getenv("_cols") != nullptr) { + cols = atoi(std::getenv("_cols")); + } + TestFusedResidualDropoutBias test(rows, cols, 0, 0.0, true, + true); + test.Run(); + test.CheckOut(static_cast(1e-1)); + test.CheckGrad(static_cast(1e-1)); +} -- GitLab