Unverified commit 3b2bc0a0, authored by WangXi, committed by GitHub

[cherry-pick 2.3] fix FusedResidualDropoutBias nan & fix lod_tensor_array gc (#42398)

* fix FusedResidualDropoutBias nan in v100 (#42344)

* fix lod_tensor_array gc (#42377)
Parent 50bfe420
@@ -156,6 +156,9 @@ void DeleteUnusedTensors(const Scope &scope,
       for (auto &t : *lod_tensor_arr) {
         garbages.emplace_back(t.MoveMemoryHolder());
       }
+      // NOTE(wangxi): We must clear the vector, otherwise lod_tensor_arr's
+      // size() is stale; if size() decreases in the next step, an error may occur.
+      lod_tensor_arr->clear();
     } else if (var->IsType<Strings>()) {
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
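Why the added clear() matters: MoveMemoryHolder() only steals each tensor's allocation, leaving the array full of empty shells, so size() keeps reporting the old element count. A minimal sketch of that failure mode, with hypothetical stand-in types rather than the real Paddle GC classes:

#include <memory>
#include <vector>

// Stand-ins for LoDTensor and LoDTensorArray (hypothetical, for illustration).
struct Tensor {
  std::shared_ptr<char> holder;  // owns the underlying buffer
  std::shared_ptr<char> MoveMemoryHolder() { return std::move(holder); }
};
using TensorArray = std::vector<Tensor>;

void CollectGarbage(TensorArray *arr,
                    std::vector<std::shared_ptr<char>> *garbages) {
  for (auto &t : *arr) {
    garbages->emplace_back(t.MoveMemoryHolder());  // buffers handed to GC
  }
  // Without this, arr->size() still counts the emptied elements, and code
  // that compares sizes across steps sees an inconsistent array.
  arr->clear();
}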
@@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const platform::CUDADeviceContext &ctx, const uint32_t rows,
     const uint32_t cols, const int vec_size) {
   const uint32_t tmp_cols = cols / vec_size;
-  int threads = std::max(
-      static_cast<uint32_t>(32),
-      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  // NOTE(wangxi): We cap the block size at 512 because
+  // `FusedResidualDropoutBias` needs a large number of registers. If the
+  // data type is float16, a block_size of 1024 triggers CUDA error 701,
+  // 'cudaErrorLaunchOutOfResources', which indicates the launch did not
+  // occur because the required resources were unavailable.
+  // Of course, this kernel can be optimized later to reduce its register
+  // usage.
+  int threads =
+      std::max(static_cast<uint32_t>(32),
+               std::min(tmp_cols, static_cast<uint32_t>(std::min(
+                                      ctx.GetMaxThreadsPerBlock(), 512))));
   const auto blocks_x =
       std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
   const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
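Restating the launch-config arithmetic above as a standalone helper (hypothetical name and plain integer parameters instead of the CUDADeviceContext): threads is tmp_cols clamped to [32, min(device limit, 512)], blocks_x covers tmp_cols by ceiling division, and blocks_y gives one grid row per tensor row.

#include <algorithm>
#include <cstdint>

struct LaunchConfigSketch {
  uint32_t threads, blocks_x, blocks_y;
};

LaunchConfigSketch Get1DBlocksAnd2DGridsSketch(uint32_t rows, uint32_t cols,
                                               int vec_size,
                                               int max_threads_per_block) {
  const uint32_t tmp_cols = cols / vec_size;  // columns after vectorization
  // Cap at 512 to avoid cudaErrorLaunchOutOfResources for register-heavy
  // kernels such as FusedResidualDropoutBias.
  const uint32_t cap =
      static_cast<uint32_t>(std::min(max_threads_per_block, 512));
  const uint32_t threads =
      std::max(static_cast<uint32_t>(32), std::min(tmp_cols, cap));
  const uint32_t blocks_x =
      std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
  const uint32_t blocks_y = std::max(static_cast<uint32_t>(1), rows);
  return {threads, blocks_x, blocks_y};
}

For the new FP16 test below (rows = 1, cols = 12288) and a vec_size of 4 (an assumption; the actual value depends on the kernel instantiation), this yields threads = 512, blocks_x = 6, blocks_y = 1.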
@@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias {
           dropout_prob, is_upscale_in_train, is_test);
     }
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
     // add residual
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
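The PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()) line added after ctx->Wait() is what actually surfaces launch failures: a launch that never ran (for example error 701) may not be reported by the wait alone, but it does set CUDA's last-error state. A raw-CUDA sketch of the same pattern, using a hypothetical kernel rather than the Paddle test code:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void DummyKernel(float *out) { out[threadIdx.x] = 1.0f; }

int main() {
  float *out = nullptr;
  cudaMalloc(&out, 1024 * sizeof(float));
  DummyKernel<<<1, 1024>>>(out);  // the launch itself may fail silently
  cudaDeviceSynchronize();        // analogous to ctx->Wait()
  // Poll the last-error state; this is where a failed launch such as
  // cudaErrorLaunchOutOfResources (701) becomes visible.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::printf("kernel launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  cudaFree(out);
  return 0;
}

Without the check, a test whose kernel never launched would still compare outputs and could pass or fail for the wrong reason.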
@@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias {
         src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
         out.data<T>(), *ctx);
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
   }

   void FusedBackward() {
@@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) {
   test.CheckOut(static_cast<float>(1e-5));
   test.CheckGrad(static_cast<float>(1e-3));
 }
+
+TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) {
+  // Tests that `cudaErrorLaunchOutOfResources` does not occur.
+  int rows = 1;
+  int cols = 12288;
+  if (std::getenv("_rows") != nullptr) {
+    rows = atoi(std::getenv("_rows"));
+  }
+  if (std::getenv("_cols") != nullptr) {
+    cols = atoi(std::getenv("_cols"));
+  }
+  TestFusedResidualDropoutBias<platform::float16> test(rows, cols, 0, 0.0,
+                                                       true, true);
+  test.Run();
+  test.CheckOut(static_cast<platform::float16>(1e-1));
+  test.CheckGrad(static_cast<platform::float16>(1e-1));
+}
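Because the shape comes from the _rows/_cols environment variables, the out-of-resources scenario can be re-probed at other sizes without recompiling. Assuming the test binary follows the usual name for this file (hypothetical; check the build directory), the override looks like:

_cols=16384 ./test_fused_residual_dropout_bias \
    --gtest_filter=FusedDropout.GPUFusedResidualDropoutBiasLargeShapeFp16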