From d76794262eb3f3fd0b507576ae8047ee20359630 Mon Sep 17 00:00:00 2001
From: ronnywang
Date: Thu, 31 Aug 2023 10:09:52 +0800
Subject: [PATCH] [ROCM] Remove the constraint with a maximum number of threads per block of 256, P1 (#56699)

---
 paddle/fluid/operators/graph_khop_sampler_op.cu   | 10 +---------
 paddle/fluid/operators/math/unpooling.cu          | 16 ----------------
 .../optimizers/distributed_fused_lamb_op.cu       |  8 --------
 .../platform/device/gpu/gpu_launch_config.h       |  9 ---------
 paddle/phi/backends/gpu/gpu_launch_config.h       |  9 ---------
 paddle/phi/kernels/funcs/cross_entropy.cu         |  4 ----
 paddle/phi/kernels/funcs/elementwise_grad_base.h  |  4 ----
 paddle/phi/kernels/funcs/for_range.h              |  5 +----
 paddle/phi/kernels/gpu/lars_momentum_kernel.cu    |  4 ----
 9 files changed, 2 insertions(+), 67 deletions(-)

diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu
index e533960c8a6..b4e0f511f6d 100644
--- a/paddle/fluid/operators/graph_khop_sampler_op.cu
+++ b/paddle/fluid/operators/graph_khop_sampler_op.cu
@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
                    thrust::device_vector* keys,
                    thrust::device_vector* values,
                    thrust::device_vector* key_index) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
   int block = 1024;
-#endif
   const auto& dev_ctx = ctx.cuda_device_context();
   int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int grid_tmp = (num_input + block - 1) / block;
@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
   subset->resize(unique_items.size());
   thrust::copy(unique_items.begin(), unique_items.end(), subset->begin());

-// Fill outputs with reindex result.
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
+  // Fill outputs with reindex result.
   int block = 1024;
-#endif
   const auto& dev_ctx = ctx.cuda_device_context();
   int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid_tmp = (outputs->size() + block - 1) / block;
diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu
index 0ecac6c5fb0..a386772405a 100644
--- a/paddle/fluid/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor {
     const T* input_data = input.data();
     const int* indices_data = indices.data();
     T* output_data = output->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMax
         <<>>(input.numel(),
@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor {
     const T* output_data = output.data();
     const T* output_grad_data = output_grad.data();
     T* input_grad_data = input_grad->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool2dMaxGrad
         <<>>(input.numel(),
@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor {
     const T* input_data = input.data();
     const int* indices_data = indices.data();
     T* output_data = output->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMax
         <<>>(input.numel(),
@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor {
     const T* output_data = output.data();
     const T* output_grad_data = output_grad.data();
     T* input_grad_data = input_grad->mutable_data(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
     int threads = 1024;
-#endif
     int grid = (input.numel() + threads - 1) / threads;
     KernelUnpool3dMaxGrad
         <<>>(input.numel(),
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index cad7e38ba1c..fdec898edbe 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,

   constexpr int kNumTensor = MaxTensorNumPerLaunch;
   constexpr int kNumChunk = MaxChunkNumPerLaunch;
-#ifdef PADDLE_WITH_HIP
-  constexpr int kBlockDim = 256;
-#else
   constexpr int kBlockDim = 512;
-#endif

   int max_chunk_num = -1;
   int vec_size = 8;
@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
         phi::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }

-#ifdef PADDLE_WITH_HIP
-  const int block_dim = 256;
-#else
   const int block_dim = 512;
-#endif

   int vec_size = 8;
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
index d253a92c986..98c6e379342 100644
--- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h
+++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h
@@ -32,14 +32,9 @@

 #include "paddle/fluid/platform/device_context.h"

-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
 /* CUDA performs better as thread_per_block
    num is between [64, 512] */
 #define PREDEFINED_BLOCK_SIZE 512
-#endif

 namespace paddle {
 namespace platform {
@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
   n |= (n >> 4);
   n |= (n >> 8);
   n |= (n >> 16);
-#ifdef __HIPCC__
-  return std::min(256, std::max(32, (n + 1)));
-#else
   return std::min(1024, std::max(32, (n + 1)));
-#endif
 }

 #ifdef WITH_NV_JETSON
diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h
index a7a7ad03ad6..fd712baf754 100644
--- a/paddle/phi/backends/gpu/gpu_launch_config.h
+++ b/paddle/phi/backends/gpu/gpu_launch_config.h
@@ -34,13 +34,8 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/enforce.h"

-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
 // CUDA performs better when thread_per_block is between [64, 512]
 #define PREDEFINED_BLOCK_SIZE 512
-#endif

 namespace phi {
 namespace backends {
@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
 inline int64_t RoundToPowerOfTwo(int64_t n) {
   constexpr int64_t min_val = 32;
   int64_t num = RoundToNextHighPowOfTwo(n, min_val);
-#ifdef __HIPCC__
-  int64_t max_val = 256;
-#else
   int64_t max_val = 1024;
-#endif
   return std::min(max_val, num);
 }

diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu
index add838106bf..20a15f9e944 100644
--- a/paddle/phi/kernels/funcs/cross_entropy.cu
+++ b/paddle/phi/kernels/funcs/cross_entropy.cu
@@ -124,11 +124,7 @@ void CrossEntropyFunctor::operator()(

   int batch_size = prob->dims()[0];
   int class_num = prob->dims()[1];
-#ifdef __HIPCC__
-  constexpr int kMaxBlockDim = 256;
-#else
   constexpr int kMaxBlockDim = 512;
-#endif

   if (softLabel) {
     const T* label_data = labels->data();
diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index df9e9347444..5ff70c86d5f 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -32,11 +32,7 @@ limitations under the License. */

 #endif

-#ifdef __HIPCC__
-constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
-#else
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
-#endif

 #define BLOCK_X 32
 #define BLOCK_Y 32
diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h
index 9648a7d845f..484fbd21dc7 100644
--- a/paddle/phi/kernels/funcs/for_range.h
+++ b/paddle/phi/kernels/funcs/for_range.h
@@ -65,10 +65,7 @@ struct ForRange {

   template
   inline void operator()(Function func) const {
-#ifdef __HIPCC__
-    // HIP will throw core dump when threads > 256
-    constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
+#if WITH_NV_JETSON
     // JETSON_NANO will throw core dump when threads > 128
     int num_thread = 256;
     backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
diff --git a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
index 14b7f1ca328..be8b1ff3796 100644
--- a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
+++ b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
@@ -25,11 +25,7 @@
 #include
 #endif

-#ifdef __HIPCC__
-#define LARS_BLOCK_SIZE 256
-#else
 #define LARS_BLOCK_SIZE 512
-#endif

 #define LARS_MAX_MERGED_OPS 60

--
GitLab