[DCU] Fix NaN problem when training BERT on DCU platform (#44643)

28aa0c61 · Yuang Liu · GitHub · e7c7280f · 28aa0c61 · 28aa0c61
2 changed files
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -166,7 +166,11 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,

  constexpr int kNumTensor = MaxTensorNumPerLaunch;
  constexpr int kNumChunk = MaxChunkNumPerLaunch;
+#ifdef PADDLE_WITH_HIP
+  constexpr int kBlockDim = 256;
+#else
  constexpr int kBlockDim = 512;
+#endif

  int max_chunk_num = -1;
  int vec_size = 8;
@@ -805,7 +809,11 @@ static void MultiTensorUpdateLambParamAndBetaPows(
        platform::errors::InvalidArgument("Beta2Pow should be nullptr."));
  }

+#ifdef PADDLE_WITH_HIP
+  const int block_dim = 256;
+#else
  const int block_dim = 512;
+#endif

  int vec_size = 8;
  for (int i = 0; i < n; ++i) {

--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -134,7 +134,11 @@ __device__ T reduceSum(T val, int tid, int len) {
  // I use warp-level parallelism and assume a fixed warp size.
  // Most cards' warp size is 32, but it may differ between GPUs,
  // so it is set to 64 when building with PADDLE_WITH_HIP.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
  const int warpSize = 32;
+#endif
  __shared__ T shm[warpSize];
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, tid < len);