未验证 提交 d7679426 编写于 作者: R ronnywang 提交者: GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P1 (#56699)

上级 68f8176b
...@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, ...@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
thrust::device_vector<T>* keys, thrust::device_vector<T>* keys,
thrust::device_vector<T>* values, thrust::device_vector<T>* values,
thrust::device_vector<int64_t>* key_index) { thrust::device_vector<int64_t>* key_index) {
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context(); const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block; int grid_tmp = (num_input + block - 1) / block;
...@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx, ...@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
subset->resize(unique_items.size()); subset->resize(unique_items.size());
thrust::copy(unique_items.begin(), unique_items.end(), subset->begin()); thrust::copy(unique_items.begin(), unique_items.end(), subset->begin());
// Fill outputs with reindex result. // Fill outputs with reindex result.
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024; int block = 1024;
#endif
const auto& dev_ctx = ctx.cuda_device_context(); const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block; int64_t grid_tmp = (outputs->size() + block - 1) / block;
......
...@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor<phi::GPUContext, T> { ...@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor<phi::GPUContext, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
int threads = 256;
#else
int threads = 1024; int threads = 1024;
#endif
int grid = (input.numel() + threads - 1) / threads; int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMax<T> KernelUnpool2dMax<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(), <<<grid, threads, 0, context.stream()>>>(input.numel(),
...@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor<phi::GPUContext, T> { ...@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor<phi::GPUContext, T> {
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
int threads = 256;
#else
int threads = 1024; int threads = 1024;
#endif
int grid = (input.numel() + threads - 1) / threads; int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMaxGrad<T> KernelUnpool2dMaxGrad<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(), <<<grid, threads, 0, context.stream()>>>(input.numel(),
...@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor<phi::GPUContext, T> { ...@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor<phi::GPUContext, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
int threads = 256;
#else
int threads = 1024; int threads = 1024;
#endif
int grid = (input.numel() + threads - 1) / threads; int grid = (input.numel() + threads - 1) / threads;
KernelUnpool3dMax<T> KernelUnpool3dMax<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(), <<<grid, threads, 0, context.stream()>>>(input.numel(),
...@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor<phi::GPUContext, T> { ...@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor<phi::GPUContext, T> {
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
int threads = 256;
#else
int threads = 1024; int threads = 1024;
#endif
int grid = (input.numel() + threads - 1) / threads; int grid = (input.numel() + threads - 1) / threads;
KernelUnpool3dMaxGrad<T> KernelUnpool3dMaxGrad<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(), <<<grid, threads, 0, context.stream()>>>(input.numel(),
......
...@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place, ...@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,
constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumTensor = MaxTensorNumPerLaunch;
constexpr int kNumChunk = MaxChunkNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch;
#ifdef PADDLE_WITH_HIP
constexpr int kBlockDim = 256;
#else
constexpr int kBlockDim = 512; constexpr int kBlockDim = 512;
#endif
int max_chunk_num = -1; int max_chunk_num = -1;
int vec_size = 8; int vec_size = 8;
...@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows( ...@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
phi::errors::InvalidArgument("Beta2Pow should be nullptr.")); phi::errors::InvalidArgument("Beta2Pow should be nullptr."));
} }
#ifdef PADDLE_WITH_HIP
const int block_dim = 256;
#else
const int block_dim = 512; const int block_dim = 512;
#endif
int vec_size = 8; int vec_size = 8;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
...@@ -32,14 +32,9 @@ ...@@ -32,14 +32,9 @@
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better as thread_per_block /* CUDA performs better as thread_per_block
num is between [64, 512] */ num is between [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512 #define PREDEFINED_BLOCK_SIZE 512
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) { ...@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
n |= (n >> 4); n |= (n >> 4);
n |= (n >> 8); n |= (n >> 8);
n |= (n >> 16); n |= (n >> 16);
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
#else
return std::min(1024, std::max(32, (n + 1))); return std::min(1024, std::max(32, (n + 1)));
#endif
} }
#ifdef WITH_NV_JETSON #ifdef WITH_NV_JETSON
......
...@@ -34,13 +34,8 @@ ...@@ -34,13 +34,8 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
// CUDA performs better when thread_per_block is between [64, 512] // CUDA performs better when thread_per_block is between [64, 512]
#define PREDEFINED_BLOCK_SIZE 512 #define PREDEFINED_BLOCK_SIZE 512
#endif
namespace phi { namespace phi {
namespace backends { namespace backends {
...@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) { ...@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
inline int64_t RoundToPowerOfTwo(int64_t n) { inline int64_t RoundToPowerOfTwo(int64_t n) {
constexpr int64_t min_val = 32; constexpr int64_t min_val = 32;
int64_t num = RoundToNextHighPowOfTwo(n, min_val); int64_t num = RoundToNextHighPowOfTwo(n, min_val);
#ifdef __HIPCC__
int64_t max_val = 256;
#else
int64_t max_val = 1024; int64_t max_val = 1024;
#endif
return std::min(max_val, num); return std::min(max_val, num);
} }
......
...@@ -124,11 +124,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()( ...@@ -124,11 +124,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
int batch_size = prob->dims()[0]; int batch_size = prob->dims()[0];
int class_num = prob->dims()[1]; int class_num = prob->dims()[1];
#ifdef __HIPCC__
constexpr int kMaxBlockDim = 256;
#else
constexpr int kMaxBlockDim = 512; constexpr int kMaxBlockDim = 512;
#endif
if (softLabel) { if (softLabel) {
const T* label_data = labels->data<T>(); const T* label_data = labels->data<T>();
......
...@@ -32,11 +32,7 @@ limitations under the License. */ ...@@ -32,11 +32,7 @@ limitations under the License. */
#endif #endif
#ifdef __HIPCC__
constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
#else
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif
#define BLOCK_X 32 #define BLOCK_X 32
#define BLOCK_Y 32 #define BLOCK_Y 32
......
...@@ -65,10 +65,7 @@ struct ForRange<phi::GPUContext> { ...@@ -65,10 +65,7 @@ struct ForRange<phi::GPUContext> {
template <typename Function> template <typename Function>
inline void operator()(Function func) const { inline void operator()(Function func) const {
#ifdef __HIPCC__ #if WITH_NV_JETSON
// HIP will throw core dump when threads > 256
constexpr int num_threads = 256;
#elif WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128 // JETSON_NANO will throw core dump when threads > 128
int num_thread = 256; int num_thread = 256;
backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128); backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
......
...@@ -25,11 +25,7 @@ ...@@ -25,11 +25,7 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
#endif #endif
#ifdef __HIPCC__
#define LARS_BLOCK_SIZE 256
#else
#define LARS_BLOCK_SIZE 512 #define LARS_BLOCK_SIZE 512
#endif
#define LARS_MAX_MERGED_OPS 60 #define LARS_MAX_MERGED_OPS 60
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册