diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index 53c8eddd7246c3fc299e1a68bcd5b4fe1902cb12..f8d55ff9cf72a46000bfe00e88b6163365757d0f 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -166,7 +166,11 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
 
   constexpr int kNumTensor = MaxTensorNumPerLaunch;
   constexpr int kNumChunk = MaxChunkNumPerLaunch;
+#ifdef PADDLE_WITH_HIP
+  constexpr int kBlockDim = 256;
+#else
   constexpr int kBlockDim = 512;
+#endif
 
   int max_chunk_num = -1;
   int vec_size = 8;
@@ -805,7 +809,11 @@ static void MultiTensorUpdateLambParamAndBetaPows(
         platform::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }
 
+#ifdef PADDLE_WITH_HIP
+  const int block_dim = 256;
+#else
   const int block_dim = 512;
+#endif
 
   int vec_size = 8;
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
index da95fc3c164ba888052278272822ce132f867b69..a8ce5f1a1827bba7983bb32a1b463c5a91cdf521 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -134,7 +134,11 @@ __device__ T reduceSum(T val, int tid, int len) {
   // I use Warp-Level Parallelism and assume the Warp size
   // is 32 which may be different for different GPU,
   // but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
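Context for the constants above: AMD GPUs built through HIP execute 64-lane wavefronts rather than 32-lane warps, so reduceSum needs warpSize = 64 under PADDLE_WITH_HIP, and the fused-LAMB kernels drop their block dimension from 512 to 256 threads (presumably to stay within per-block resource limits on ROCm). The code below is a minimal, hypothetical CUDA sketch, not part of this patch and not Paddle code (the names BlockSum, kWarpSize, and kBlockDim are made up), illustrating why a warp-shuffle reduction has to know the warp size: the shared-memory staging array holds exactly one partial per warp.

// Hypothetical standalone CUDA sketch (not Paddle code, not part of this
// patch): a block-level sum reduction in the same style as reduceSum().
// Each warp reduces its lanes with register shuffles, lane 0 writes one
// partial to shared memory, so the shared array needs blockDim / warp-size
// slots. On AMD GPUs a wavefront has 64 lanes, hence the 64 in the patch;
// this sketch is CUDA-only and keeps 32.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kWarpSize = 32;   // 64 on AMD wavefronts under HIP
constexpr int kBlockDim = 512;  // the patch drops this to 256 on HIP

__global__ void BlockSum(const float *x, float *out, int n) {
  __shared__ float shm[kBlockDim / kWarpSize];  // one slot per warp
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float val = tid < n ? x[tid] : 0.0f;

  int lane = threadIdx.x % kWarpSize;
  int warp_id = threadIdx.x / kWarpSize;

  // Intra-warp tree reduction with shuffles.
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xffffffffu, val, offset);
  }
  if (lane == 0) shm[warp_id] = val;
  __syncthreads();

  // The first warp reduces the per-warp partials.
  if (warp_id == 0) {
    val = lane < kBlockDim / kWarpSize ? shm[lane] : 0.0f;
    for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
      val += __shfl_down_sync(0xffffffffu, val, offset);
    }
    if (lane == 0) atomicAdd(out, val);
  }
}

int main() {
  const int n = 1 << 20;
  float *x = nullptr, *out = nullptr;
  cudaMallocManaged(&x, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) x[i] = 1.0f;
  *out = 0.0f;

  int grid = (n + kBlockDim - 1) / kBlockDim;
  BlockSum<<<grid, kBlockDim>>>(x, out, n);
  cudaDeviceSynchronize();

  printf("sum = %.0f (expected %d)\n", *out, n);
  cudaFree(x);
  cudaFree(out);
  return 0;
}

The explicit 32-bit shuffle masks here are CUDA-specific; the Paddle header abstracts that detail behind CREATE_SHFL_MASK (visible in the diff context above), which is one reason the warp-size constant is the only thing this patch has to change in reduceSum.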