From 28aa0c61a624111db5beac7699bf4ab391e2accc Mon Sep 17 00:00:00 2001
From: Yuang Liu
Date: Wed, 27 Jul 2022 13:46:35 +0800
Subject: [PATCH] [DCU] Fix NAN problem when training BERT on DCU platform (#44643)

---
 .../operators/optimizers/distributed_fused_lamb_op.cu     | 8 ++++++++
 .../fluid/platform/device/gpu/rocm/rocm_device_function.h | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index 53c8eddd724..f8d55ff9cf7 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -166,7 +166,11 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
 
   constexpr int kNumTensor = MaxTensorNumPerLaunch;
   constexpr int kNumChunk = MaxChunkNumPerLaunch;
+#ifdef PADDLE_WITH_HIP
+  constexpr int kBlockDim = 256;
+#else
   constexpr int kBlockDim = 512;
+#endif
 
   int max_chunk_num = -1;
   int vec_size = 8;
@@ -805,7 +809,11 @@ static void MultiTensorUpdateLambParamAndBetaPows(
         platform::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }
 
+#ifdef PADDLE_WITH_HIP
+  const int block_dim = 256;
+#else
   const int block_dim = 512;
+#endif
 
   int vec_size = 8;
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
index da95fc3c164..a8ce5f1a182 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -134,7 +134,11 @@ __device__ T reduceSum(T val, int tid, int len) {
   // I use Warp-Level Parallelism and assume the Warp size
   // is 32 which may be different for different GPU,
   // but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
--
GitLab
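
The NAN fix above hinges on the warp (wavefront) width: DCU/ROCm hardware executes 64-lane wavefronts, so a shuffle-based reduction written against the CUDA assumption of 32 lanes can combine the wrong partial sums, and the patch additionally caps the HIP launch at 256 threads per block, presumably to stay within DCU per-block resource limits. The sketch below is a minimal, hypothetical illustration of a warp-size-portable block sum reduction under the same PADDLE_WITH_HIP guard; it is not Paddle's actual reduceSum or MultiTensorL2Norm code, and names such as BlockReduceSumSketch, kWarpSize, and kBlockDim are assumptions for illustration only.

// Hypothetical sketch, not Paddle's implementation: a block-level sum
// reduction that stays correct on both NVIDIA (warp size 32) and
// DCU/ROCm (wavefront size 64) hardware.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
constexpr int kWarpSize = 64;   // AMD/DCU wavefront width
constexpr int kBlockDim = 256;  // matches the HIP branch of the patch
#else
#include <cuda_runtime.h>
constexpr int kWarpSize = 32;   // NVIDIA warp width
constexpr int kBlockDim = 512;  // matches the CUDA branch of the patch
#endif

template <typename T>
__device__ T BlockReduceSumSketch(T val) {
  // Stage 1: reduce within each warp/wavefront using shuffles.
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
#ifdef PADDLE_WITH_HIP
    val += __shfl_down(val, offset, kWarpSize);
#else
    val += __shfl_down_sync(0xffffffffu, val, offset, kWarpSize);
#endif
  }

  // Stage 2: lane 0 of each warp stages its partial sum in shared memory.
  // Sizing the buffer as kBlockDim / kWarpSize keeps it valid for both
  // warp widths instead of hard-coding 32.
  __shared__ T partial[kBlockDim / kWarpSize];
  const int lane = threadIdx.x % kWarpSize;
  const int warp = threadIdx.x / kWarpSize;
  if (lane == 0) partial[warp] = val;
  __syncthreads();

  // Stage 3: the first warp reduces the per-warp partial sums.
  val = (threadIdx.x < kBlockDim / kWarpSize) ? partial[threadIdx.x] : T(0);
  if (warp == 0) {
    for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
#ifdef PADDLE_WITH_HIP
      val += __shfl_down(val, offset, kWarpSize);
#else
      val += __shfl_down_sync(0xffffffffu, val, offset, kWarpSize);
#endif
    }
  }
  return val;  // thread 0 of the block holds the final sum
}

Deriving the shared staging buffer from kBlockDim / kWarpSize is what keeps the same code valid on both back-ends; the patch reaches an equivalent state in reduceSum by switching the hard-coded warpSize from 32 to 64 under PADDLE_WITH_HIP.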