Unverified commit d7679426, authored by ronnywang, committed by GitHub

[ROCM] Remove the constraint with a maximum number of threads per block of 256, P1 (#56699)

Parent 68f8176b
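
Every hunk below follows the same pattern: the HIP-only branch that pinned kernel launches to 256 threads per block is deleted, and the value already used for CUDA builds (512 or 1024, depending on the file) now applies to ROCm as well. A minimal before/after sketch using the variable from the first hunk:

```cpp
// Before: ROCm (HIP) builds were capped at 256 threads per block.
#ifdef PADDLE_WITH_HIP
int block = 256;
#else
int block = 1024;
#endif

// After: a single value for both CUDA and ROCm builds.
int block = 1024;
```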
@@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx,
thrust::device_vector<T>* keys,
thrust::device_vector<T>* values,
thrust::device_vector<int64_t>* key_index) {
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
  int block = 1024;
-#endif
const auto& dev_ctx = ctx.cuda_device_context();
int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int grid_tmp = (num_input + block - 1) / block;
@@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx,
subset->resize(unique_items.size());
thrust::copy(unique_items.begin(), unique_items.end(), subset->begin());
// Fill outputs with reindex result.
-#ifdef PADDLE_WITH_HIP
-  int block = 256;
-#else
  // Fill outputs with reindex result.
  int block = 1024;
-#endif
const auto& dev_ctx = ctx.cuda_device_context();
int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0];
int64_t grid_tmp = (outputs->size() + block - 1) / block;
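
For context, the launch math in FillHashTable and ReindexFunc is the usual ceil-division pattern, with the grid presumably clamped to the device's maximum x-dimension. A hedged sketch (the helper name and the std::min clamp are illustrative; the line that applies the clamp is collapsed out of the hunk):

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical helper mirroring the launch math above: ceil-divide the element
// count by the block size (now 1024 on both CUDA and ROCm), then clamp the grid
// to the limit reported by GetCUDAMaxGridDimSize()[0]. The clamp is an assumption.
inline int64_t ComputeGridDim(int64_t num_elements,
                              int block,
                              int64_t max_grid_dimx) {
  int64_t grid_tmp = (num_elements + block - 1) / block;
  return std::min(grid_tmp, max_grid_dimx);
}
```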
......
@@ -126,11 +126,7 @@ class Unpool2dMaxFunctor<phi::GPUContext, T> {
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMax<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
@@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor<phi::GPUContext, T> {
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool2dMaxGrad<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
@@ -206,11 +198,7 @@ class Unpool3dMaxFunctor<phi::GPUContext, T> {
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool3dMax<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
@@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor<phi::GPUContext, T> {
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-#ifdef __HIPCC__
-    int threads = 256;
-#else
    int threads = 1024;
-#endif
int grid = (input.numel() + threads - 1) / threads;
KernelUnpool3dMaxGrad<T>
<<<grid, threads, 0, context.stream()>>>(input.numel(),
......
@@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place,
constexpr int kNumTensor = MaxTensorNumPerLaunch;
constexpr int kNumChunk = MaxChunkNumPerLaunch;
-#ifdef PADDLE_WITH_HIP
-  constexpr int kBlockDim = 256;
-#else
  constexpr int kBlockDim = 512;
-#endif
int max_chunk_num = -1;
int vec_size = 8;
@@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows(
phi::errors::InvalidArgument("Beta2Pow should be nullptr."));
}
-#ifdef PADDLE_WITH_HIP
-  const int block_dim = 256;
-#else
  const int block_dim = 512;
-#endif
int vec_size = 8;
for (int i = 0; i < n; ++i) {
......
@@ -32,14 +32,9 @@
#include "paddle/fluid/platform/device_context.h"
-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
/* CUDA performs better as thread_per_block
   num is between [64, 512] */
#define PREDEFINED_BLOCK_SIZE 512
-#endif
namespace paddle {
namespace platform {
@@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) {
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
-#ifdef __HIPCC__
-  return std::min(256, std::max(32, (n + 1)));
-#else
  return std::min(1024, std::max(32, (n + 1)));
-#endif
}
#ifdef WITH_NV_JETSON
......
@@ -34,13 +34,8 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h"
-#ifdef __HIPCC__
-// HIP results in error or nan if > 256
-#define PREDEFINED_BLOCK_SIZE 256
-#else
// CUDA performs better when thread_per_block is between [64, 512]
#define PREDEFINED_BLOCK_SIZE 512
-#endif
namespace phi {
namespace backends {
@@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
inline int64_t RoundToPowerOfTwo(int64_t n) {
constexpr int64_t min_val = 32;
int64_t num = RoundToNextHighPowOfTwo(n, min_val);
-#ifdef __HIPCC__
-  int64_t max_val = 256;
-#else
  int64_t max_val = 1024;
-#endif
return std::min(max_val, num);
}
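
RoundToPowerOfTwo rounds a requested thread count up to a power of two and clamps it to [32, 1024] now that the HIP-specific 256 ceiling is gone. A standalone sketch of the bit-smearing variant shown in the platform header above (the initial decrement is not visible in the hunk and is an assumption, included only so that exact powers of two map to themselves):

```cpp
#include <algorithm>

// The OR cascade copies the highest set bit into every lower position,
// so n + 1 is the next power of two; the result is clamped to [32, 1024].
// Before this commit, HIP builds clamped to 256 instead.
inline int RoundToPowerOfTwo(int n) {
  n--;             // assumption: lets exact powers of two round to themselves
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));
}
```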
......
@@ -124,11 +124,7 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
int batch_size = prob->dims()[0];
int class_num = prob->dims()[1];
-#ifdef __HIPCC__
-  constexpr int kMaxBlockDim = 256;
-#else
  constexpr int kMaxBlockDim = 512;
-#endif
if (softLabel) {
const T* label_data = labels->data<T>();
......
@@ -32,11 +32,7 @@ limitations under the License. */
#endif
-#ifdef __HIPCC__
-constexpr int ELEMWISE_MAX_BLOCK_DIM = 256;
-#else
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
-#endif
#define BLOCK_X 32
#define BLOCK_Y 32
......
@@ -65,10 +65,7 @@ struct ForRange<phi::GPUContext> {
template <typename Function>
inline void operator()(Function func) const {
-#ifdef __HIPCC__
-  // HIP will throw core dump when threads > 256
-  constexpr int num_threads = 256;
-#elif WITH_NV_JETSON
+#if WITH_NV_JETSON
// JETSON_NANO will throw core dump when threads > 128
int num_thread = 256;
backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128);
......
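
In ForRange<phi::GPUContext> the HIP special case disappears entirely, leaving only the Jetson path with its 128-thread cap. A rough, self-contained sketch of a for-range style launch under that assumption (the kernel body, the 1024 default, and the helper names are illustrative, not Paddle's exact code):

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// One thread per index in [0, limit); the block size is no longer capped at 256 on ROCm.
template <typename Function>
__global__ void ForRangeKernel(size_t limit, Function func) {
  size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < limit) {
    func(idx);
  }
}

template <typename Function>
void LaunchForRange(size_t limit, Function func, cudaStream_t stream) {
  if (limit == 0) return;
#if WITH_NV_JETSON
  size_t num_threads = 128;   // Jetson devices fault above 128 threads
#else
  size_t num_threads = 1024;  // assumed default; HIP builds used to force 256
#endif
  size_t block = limit < num_threads ? limit : num_threads;
  size_t grid = (limit + block - 1) / block;
  ForRangeKernel<<<grid, block, 0, stream>>>(limit, func);
}
```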
@@ -25,11 +25,7 @@
#include <cooperative_groups.h>
#endif
-#ifdef __HIPCC__
-#define LARS_BLOCK_SIZE 256
-#else
#define LARS_BLOCK_SIZE 512
-#endif
#define LARS_MAX_MERGED_OPS 60
......