Unverified · Commit 687219fe authored by WangXi, committed by GitHub

fix FusedResidualDropoutBias nan in v100 (#42344)

Parent 8ad38701
@@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const platform::CUDADeviceContext &ctx, const uint32_t rows,
     const uint32_t cols, const int vec_size) {
   const uint32_t tmp_cols = cols / vec_size;
-  int threads = std::max(
-      static_cast<uint32_t>(32),
-      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  // NOTE(wangxi): We cap max_block_size at 512 because
+  // `FusedResidualDropoutBias` needs many register resources. If data_type
+  // is float16 and block_size is 1024, CUDA error 701
+  // ('cudaErrorLaunchOutOfResources') occurs: the launch does not happen
+  // because the kernel does not have the resources it requested.
+  // The kernel can of course be optimized later to reduce its register
+  // usage.
+  int threads =
+      std::max(static_cast<uint32_t>(32),
+               std::min(tmp_cols, static_cast<uint32_t>(std::min(
+                                      ctx.GetMaxThreadsPerBlock(), 512))));
   const auto blocks_x =
       std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
   const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
...
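The hard-coded cap at 512 threads works around register pressure at launch time. For context, a minimal standalone CUDA sketch (not part of the patch; `HeavyKernel` is a placeholder) of how the runtime itself can report the resource-limited maximum block size for a kernel, rather than guessing a safe constant:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Placeholder standing in for a register-heavy kernel.
__global__ void HeavyKernel(float *out) {
  if (out != nullptr) out[threadIdx.x] = 0.0f;
}

int main() {
  // attr.maxThreadsPerBlock already accounts for the kernel's register
  // usage: launching with more threads than this fails with
  // cudaErrorLaunchOutOfResources (error 701).
  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, HeavyKernel);
  std::printf("regs/thread = %d, max threads/block = %d\n", attr.numRegs,
              attr.maxThreadsPerBlock);

  // Or let the runtime suggest an occupancy-friendly block size directly.
  int min_grid = 0, block = 0;
  cudaOccupancyMaxPotentialBlockSize(&min_grid, &block, HeavyKernel);
  std::printf("suggested block size = %d\n", block);
  return 0;
}
```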
@@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias {
           dropout_prob, is_upscale_in_train, is_test);
     }
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
     // add residual
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
@@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias {
         src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
         out.data<T>(), *ctx);
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
  }

  void FusedBackward() {
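Why the added `PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError())` lines matter: a triple-chevron kernel launch returns no status, so a launch that fails (for example with `cudaErrorLaunchOutOfResources`) can pass silently and the test would only see garbage output later. A minimal plain-CUDA sketch of the same synchronize-then-check pattern (kernel name illustrative):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void SomeKernel() {}

int main() {
  SomeKernel<<<1, 1024>>>();
  // Launch failures are only visible through the sticky last-error state,
  // and execution failures only after synchronization, hence the pairing
  // of ctx->Wait() with GpuGetLastError() in the test above.
  cudaDeviceSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  return 0;
}
```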
@@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) {
   test.CheckOut(static_cast<float>(1e-5));
   test.CheckGrad(static_cast<float>(1e-3));
 }
+
+TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) {
+  // Used to test that `cudaErrorLaunchOutOfResources` does not occur
+  int rows = 1;
+  int cols = 12288;
+  if (std::getenv("_rows") != nullptr) {
+    rows = atoi(std::getenv("_rows"));
+  }
+  if (std::getenv("_cols") != nullptr) {
+    cols = atoi(std::getenv("_cols"));
+  }
+  TestFusedResidualDropoutBias<platform::float16> test(rows, cols, 0, 0.0,
+                                                       true, true);
+  test.Run();
+  test.CheckOut(static_cast<platform::float16>(1e-1));
+  test.CheckGrad(static_cast<platform::float16>(1e-1));
+}
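For reference, an alternative to capping the launch configuration at 512 threads would be to bound register usage at compile time with `__launch_bounds__`, at the cost of possible register spills to local memory. This is not what the patch does; the sketch below only illustrates the general technique with a placeholder kernel:

```cpp
#include <cuda_runtime.h>

// Promise the compiler this kernel may be launched with up to 1024 threads
// per block; it will then limit registers per thread (spilling if needed)
// so such a launch cannot fail on register resources.
__global__ void __launch_bounds__(1024)
    CappedKernel(const float *in, float *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * 2.0f;
}
```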