Unverified commit 909d1e61, authored by niuliling123, committed by GitHub

Modified Reduce for XPU2 (#38918)

1. Set the XPU2 block size to 64 (see the launch-shape sketch below).
2. Fix a bug that appears when reduce_num is too large (illustrated after the ReduceAnyKernel hunk below).
Parent 6bf85eaf
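Editor's note: the launch-shape change behind item 1 shows up in ReduceConfig::SetBlockDim and in every <<<...>>> launch below (128 threads per block drops to 64, with an explicit 0 for shared memory). A minimal sketch of the new XPU2 shape follows; LaunchShape and XpuKpReduceShape are invented names for this sketch and are not part of the patch.

// --- Editor's sketch, not part of the patch ---------------------------------
// Captures the values the patch writes on the PADDLE_WITH_XPU_KP path:
// 64 threads in x, the 8-way split moved from grid.x to grid.y for the
// reduce-last-dim case, and an 8 x 1 grid otherwise.
struct LaunchShape {
  int block_x, block_y;  // threads per block
  int grid_x, grid_y;    // blocks per grid
};

inline LaunchShape XpuKpReduceShape(bool reduce_last_dim, int reduce_num,
                                    int left_num) {
  if (reduce_last_dim) {
    return {64, reduce_num, 1, 8};  // was {128, reduce_num, 8, 1}
  }
  return {64, left_num, 8, 1};      // was {128, left_num, 8, 1}
}
// -----------------------------------------------------------------------------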
@@ -178,6 +178,8 @@ struct IndexCalculator {
       : dim(dim) {
     dims = details::VectorToArray<int, kMaxRank>(cal_dims);
     strides = details::VectorToArray<int, kMaxRank>(full_strides);
+    reduce_strides = details::VectorToArray<int, kMaxRank>(cal_strides);
+#ifndef PADDLE_WITH_XPU_KP
     std::vector<paddle::platform::FastDivMod> cal_divmoders;
     // fast divmod
     for (auto i : cal_strides) {
@@ -185,9 +187,22 @@ struct IndexCalculator {
     }
     divmoders = details::VectorToArray<paddle::platform::FastDivMod, kMaxRank>(
         cal_divmoders);
+#endif
   }
   __device__ inline int operator()(int offset) const {
+#ifdef PADDLE_WITH_XPU_KP
+    int index = 0;
+#pragma unroll
+    for (int i = 0; i < kMaxRank; ++i) {
+      if (i == dim) {
+        break;
+      }
+      index += (offset / reduce_strides[i]) * strides[dims[i]];
+      offset = offset % reduce_strides[i];
+    }
+    return index;
+#else
     int index = 0;
 #pragma unroll
     for (int i = 0; i < kMaxRank; ++i) {
@@ -199,12 +214,16 @@ struct IndexCalculator {
       offset = divmod.val[1];
     }
     return index;
+#endif
   }
   int dim;
   phi::Array<int, kMaxRank> dims;
   phi::Array<int, kMaxRank> strides;
+  phi::Array<int, kMaxRank> reduce_strides;
+#ifndef PADDLE_WITH_XPU2
   phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
+#endif
 };
 template <bool ReduceLastDim = false>
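Editor's note: on the new PADDLE_WITH_XPU_KP branch above, IndexCalculator::operator() drops the precomputed FastDivMod table and maps a reduce-space offset back to a source offset with plain division by reduce_strides. A standalone sketch of that arithmetic follows; the tensor shape, kMaxRank value, and demo numbers are illustrative and not taken from the patch.

// --- Editor's sketch, not part of the patch ---------------------------------
// Same index arithmetic as the PADDLE_WITH_XPU_KP branch: walk the first `dim`
// reduce strides and convert each (offset / reduce_stride) digit back into a
// source offset via the full-tensor stride of the corresponding dimension.
#include <cstdio>

constexpr int kMaxRank = 5;  // capacity chosen for this sketch only

int ReduceOffsetToSrcIndex(int offset, int dim, const int dims[kMaxRank],
                           const int strides[kMaxRank],
                           const int reduce_strides[kMaxRank]) {
  int index = 0;
  for (int i = 0; i < kMaxRank; ++i) {
    if (i == dim) break;
    index += (offset / reduce_strides[i]) * strides[dims[i]];
    offset = offset % reduce_strides[i];
  }
  return index;
}

int main() {
  // Hypothetical reduce over dims {0, 2} of a 2x3x4 tensor: the reduce shape
  // is 2x4, so reduce_strides = {4, 1}; the full strides are {12, 4, 1}.
  const int dims[kMaxRank] = {0, 2};
  const int strides[kMaxRank] = {12, 4, 1};
  const int reduce_strides[kMaxRank] = {4, 1};
  // Reduce-offset 5 -> (1, 1) in the 2x4 reduce shape -> source index 12 + 1.
  std::printf("%d\n", ReduceOffsetToSrcIndex(5, /*dim=*/2, dims, strides,
                                             reduce_strides));  // prints 13
  return 0;
}
// -----------------------------------------------------------------------------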
@@ -247,7 +266,7 @@ struct ReduceIndexMapping {
   __device__ __forceinline__ int BlockDimY() {
 #ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_y;
+    return 1;
 #else
     return blockDim.y;
 #endif
@@ -454,10 +473,14 @@ struct ReduceConfig {
     bool is_last_dim =
         (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
     if (rank == reduce_rank || is_last_dim) {
+#ifdef PADDLE_WITH_XPU_KP
+      reduce_type = static_cast<int>(ReduceType::kReduceAny);
+#else
       reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
+#endif
     } else if (reduce_rank == 1) {
       // ReduceFirstDim and reduceSecondDim
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
       if (reduce_dim[0] == 0) {
         reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
       } else {
@@ -471,6 +494,7 @@ struct ReduceConfig {
       }
     }
   }
+#ifndef PADDLE_WITH_XPU_KP
   void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
     constexpr int min_reduce_num_per_thread = 16;
     constexpr int max_reduce_num_per_thread = 256;
@@ -569,6 +593,7 @@ struct ReduceConfig {
       grid_dim->y = details::AlignUp(reduce_num, blocking_size);
     }
   }
+#endif
   void SetBlockDim() {
     // init
@@ -577,14 +602,14 @@ struct ReduceConfig {
     dim3 block_dim(block_num, 1, 1);
     dim3 grid_dim(left_num, 1, 1);
     blocking_size = reduce_num;
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
     if (reduce_last_dim) {
-      block_dim.x = 128;
+      block_dim.x = 64;
       block_dim.y = reduce_num;
-      grid_dim.x = 8;
-      grid_dim.y = 1;
+      grid_dim.x = 1;
+      grid_dim.y = 8;
     } else {
-      block_dim.x = 128;
+      block_dim.x = 64;
       block_dim.y = left_num;
       grid_dim.x = 8;
       grid_dim.y = 1;
@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x,
     store_offset = block.BlockIdY() * left_num + left_idx;
     loop_left = min(block.GetLoopSize(), left_num - left_idx);
     stride_left = 1;
-    tid = threadIdx.x;
+    tid = THREAD_ID_X;
   } else {
     auto block = ReduceIndexMapping<false>(dim);
     input_idx = block.BlockIdY() * block.BlockDimY();
@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x,
     loop_left = min(block.GetLoopSize(), left_num - left_idx);
     stride_left = block.BlockDimX() * block.GridDimX();
     store_offset = block.BlockIdY() * left_num + left_idx;
-    tid = threadIdx.y;
+    tid = THREAD_ID_Y;
   }
   // calculate the offset, means the addr where each thread really start.
   // 1. reduce for each thread
   MPType input_compute[REDUCE_VEC_SIZE];
   Tx input_reg[REDUCE_VEC_SIZE];
+  int input_idx_tmp = input_idx;
   for (int i = 0; i < loop_left; i += stride_left) {
     int input_offset = left_index_calculator(left_idx + i);
-    const Tx* input = x + input_offset;
+    const _ptr_ Tx* input = x + input_offset;
     MPType reduce_var = init;
     // load REDUCE_VEC_SIZE data once, and then compute
     int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
+    input_idx = input_idx_tmp;
     for (; input_idx + block_size < bound;
          input_idx += REDUCE_VEC_SIZE * stride) {
       kps::ReadDataReduce<Tx,
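Editor's note on item 2 (the reduce_num bug): in ReduceAnyKernel the inner vectorized loop advances input_idx, and before this patch the advanced value leaked into the next iteration of the outer loop over loop_left, which only matters when reduce_num is large enough for the inner loop to run. The patch saves input_idx_tmp once and restores it per column. A minimal sketch of the pattern follows; the function name, layout, and float accumulation are placeholders invented for this sketch.

// --- Editor's sketch, not part of the patch ---------------------------------
// Shape of the fix: the starting reduce index must be re-established for every
// column handled by one thread, because the inner loop mutates it.
float ReduceManyColumns(const float* x, int start_idx, int columns,
                        int reduce_num, int stride) {
  float acc = 0.f;
  const int input_idx_tmp = start_idx;  // saved once, as the patch does
  for (int col = 0; col < columns; ++col) {
    int input_idx = input_idx_tmp;      // restored per column: this is the fix
    for (; input_idx < reduce_num; input_idx += stride) {
      acc += x[col * reduce_num + input_idx];  // stand-in for load + reduce
    }
    // Without the restore, column 1 onward would start from wherever column 0
    // left input_idx, silently skipping data whenever reduce_num is large
    // enough for the inner loop to advance it.
  }
  return acc;
}
// -----------------------------------------------------------------------------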
@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
   int loop_size = min(reduce_num - idy, blocking_size);
   int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
   int block_offset = idy * left_num + idz * reduce_num;
-  const Tx* input = x + block_offset;
+  const _ptr_ Tx* input = x + block_offset;
   Tx reduce_input;
   for (; idx < size; idx += stride) {
     MPType reduce_var = init;
@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data,
                                const ReduceOp& reducer,
                                const TransformOp& transform,
                                MPType init,
-                               gpuStream_t stream,
+                               KPStream stream,
                                ReduceConfig<Ty> config) {
   if (config.reduce_type == kReduceLastDim) {
     int stride_reduce = 1;
@@ -855,23 +882,24 @@ static void LaunchReduceKernel(const Tx* x_data,
                     0);
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    OneDimIndexCal><<<8, 128, stream>>>(x_data,
-                                                        config.output_data,
-                                                        reducer,
-                                                        transform,
-                                                        init,
-                                                        config.reduce_num,
-                                                        config.left_num,
-                                                        config.reduce_last_dim,
-                                                        reduce_index_calculator,
-                                                        left_index_calculator,
-                                                        dim);
+                    OneDimIndexCal><<<8, 64, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        init,
+        config.reduce_num,
+        config.left_num,
+        config.reduce_last_dim,
+        reduce_index_calculator,
+        left_index_calculator,
+        dim);
 #else
     ReduceAnyKernel<Tx,
                     Ty,
@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data,
                     0);
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    IndexCalculator><<<8, 128, stream>>>(
+                    IndexCalculator><<<8, 64, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data,
     kps::DimConfig dim =
         kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
     dim.SetRem(config.left_num % block.x, 0, 0);
-#ifdef PADDLE_WITH_XPU2
-    ReduceHigherDimKernel<Ty,
-                          Ty,
-                          MPType,
-                          ReduceOp,
-                          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
+#ifdef PADDLE_WITH_XPU_KP
+    ReduceHigherDimKernel<
+        Ty,
+        Ty,
+        MPType,
+        ReduceOp,
+        kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
         config.output_data,
         y_data,
         reducer,
@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     const TransformOp& transform,
                     int reduce_num,
                     const paddle::platform::Place& place,
-                    gpuStream_t stream) {
+                    KPStream stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
                                                                   transform);
@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     const TransformOp& transform,
                     int reduce_num,
                     const paddle::platform::Place& place,
-                    gpuStream_t stream) {
+                    KPStream stream) {
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
 }
@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
                       phi::DenseTensor* y,
                       const TransformOp& transform,
                       const std::vector<int>& origin_reduce_dims,
-                      gpuStream_t stream) {
+                      KPStream stream) {
   y->mutable_data<Ty>(x.place());
   auto x_dim = phi::vectorize<int>(x.dims());
@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
   config.SetOutputData(y_data, x.place(), &tmp);
   constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
   bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
+#ifndef PADDLE_WITH_XPU_KP
   if (use_cub_reduce) {
     CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
         x_data, y_data, transform, config.reduce_num, x.place(), stream);
     return;
   }
+#endif
   using MPType = typename kps::details::MPTypeTrait<Ty>::Type;
   auto reducer = ReduceOp<MPType>();
@@ -1124,20 +1155,21 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
                    config.reduce_num % config.blocking_size,
                    0);
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
     ReduceHigherDimKernel<Tx,
                           Ty,
                           MPType,
                           ReduceOp<MPType>,
-                          TransformOp><<<8, 128, stream>>>(x_data,
-                                                           config.output_data,
-                                                           reducer,
-                                                           transform,
-                                                           reducer.initial(),
-                                                           config.reduce_num,
-                                                           config.left_num,
-                                                           config.blocking_size,
-                                                           dim);
+                          TransformOp><<<8, 64, 0, stream>>>(
+        x_data,
+        config.output_data,
+        reducer,
+        transform,
+        reducer.initial(),
+        config.reduce_num,
+        config.left_num,
+        config.blocking_size,
+        dim);
 #else
     ReduceHigherDimKernel<
         Tx,
@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
         kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
     dim2.SetRem(config.left_num % config.block.x, 0, 0);
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
     ReduceHigherDimKernel<
         Ty,
         Ty,
         MPType,
         ReduceOp<MPType>,
-        kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
+        kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
         config.output_data,
         y_data,
         reducer,
@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
 template <typename T,
           template <typename> class ReduceOp,
           template <typename, typename> class TransformOp>
-void Reduce(const GPUContext& dev_ctx,
+void Reduce(const KPDevice& dev_ctx,
             const DenseTensor& x,
             bool reduce_all,
             const std::vector<int64_t>& dims,
@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx,
     reduce_num *= (x.dims())[i];
   }
-  gpuStream_t stream = dev_ctx.stream();
+  KPStream stream = dev_ctx.stream();
   if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) {
     auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype);
......
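Editor's note: the remaining renames in these hunks (gpuStream_t to KPStream, GPUContext to KPDevice) let the same source build for both the CUDA/HIP and XPU kernel-primitive targets. A hedged sketch of what such aliases could look like follows; the real definitions live elsewhere in the Paddle codebase, and the XPU side shown here is an assumption.

// --- Editor's sketch, not part of the patch ---------------------------------
// Assumed shape of the device-neutral aliases used above; the actual typedefs
// are defined elsewhere in the codebase and may differ.
#if defined(PADDLE_WITH_XPU_KP)
using KPStream = XPUStream;        // assumption: XPU runtime stream handle
using KPDevice = phi::XPUContext;  // assumption: XPU device context
#else
using KPStream = gpuStream_t;      // cudaStream_t or hipStream_t
using KPDevice = phi::GPUContext;
#endif
// -----------------------------------------------------------------------------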