Unverified  Commit c142e37d authored by niuliling123, committed by GitHub

Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP (#40560)

* Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP
Parent 81848fff
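Besides renaming the `PADDLE_WITH_XPU2` guards to `PADDLE_WITH_XPU_KP`, most hunks below collapse two complete, duplicated kernel launches into one: the launch configuration is chosen under `#ifdef PADDLE_WITH_XPU_KP` and a single launch site is kept. A minimal sketch of that pattern follows; `ExampleKernel`, `ExampleConfig`, and `LaunchExampleKernel` are placeholders for illustration, not Paddle APIs, and `KPStream` is assumed to be the stream alias used in the file below.

// Sketch only: the "pick the launch config once, keep one launch" pattern.
struct ExampleConfig {   // hypothetical stand-in for the tuned configuration
  dim3 grid;
  dim3 block;
};

template <typename T>
__global__ void ExampleKernel(const T* in, T* out, int n) {
  // CUDA-flavored body for illustration; a KP build would use its own
  // index helpers instead of blockIdx/blockDim.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i];
}

template <typename T>
void LaunchExampleKernel(const T* in, T* out, int n,
                         const ExampleConfig& config, KPStream stream) {
#ifdef PADDLE_WITH_XPU_KP
  // XPU KP builds launch with a fixed cluster/core layout.
  auto grid_num = 8;
  auto block_num = 64;
#else
  // CUDA / HIP builds keep the tuned dim3 configuration.
  auto grid_num = config.grid;
  auto block_num = config.block;
#endif
  // One launch site instead of two full copies of the call.
  ExampleKernel<T><<<grid_num, block_num, 0, stream>>>(in, out, n);
}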
@@ -14,8 +14,8 @@
 #pragma once
-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// CUDA, XPU and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__)
 #include <algorithm>
 #include <cmath>
@@ -220,7 +220,7 @@ struct IndexCalculator {
   phi::Array<int, kMaxRank> dims;
   phi::Array<int, kMaxRank> strides;
   phi::Array<int, kMaxRank> reduce_strides;
-#ifndef PADDLE_WITH_XPU2
+#ifndef PADDLE_WITH_XPU_KP
   phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
 #endif
 };
@@ -231,81 +231,65 @@ struct ReduceIndexMapping {
   HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
       : dim(dims) {}
+#ifdef PADDLE_WITH_XPU_KP
   __device__ __forceinline__ int BlockIdX() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return (cluster_id() / dim.split_num_x % dim.split_num_y);
     } else {
       return cluster_id() % dim.split_num_x;
     }
-#else
-    return blockIdx.x;
-#endif
   }
   __device__ __forceinline__ int BlockIdY() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return (cluster_id() % dim.split_num_x);
     } else {
       return (cluster_id() / dim.split_num_x % dim.split_num_y);
     }
-#else
-    return blockIdx.y;
-#endif
   }
-  __device__ __forceinline__ int BlockDimX() {
-#ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_x;
-#else
-    return blockDim.x;
-#endif
-  }
+  __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; }
-  __device__ __forceinline__ int BlockDimY() {
-#ifdef PADDLE_WITH_XPU2
-    return 1;
-#else
-    return blockDim.y;
-#endif
-  }
+  __device__ __forceinline__ int BlockDimY() { return 1; }
   __device__ __forceinline__ int GridDimX() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.split_num_y;
     } else {
       return dim.split_num_x;
     }
-#else
-    return gridDim.x;
-#endif
   }
   __device__ __forceinline__ int GridDimY() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.split_num_x;
    } else {
       return dim.split_num_y;
     }
-#else
-    return gridDim.y;
-#endif
   }
   __device__ __forceinline__ int GetLoopSize() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.deal_size_y;
     } else {
       return dim.deal_size_x;
     }
+  }
 #else
-    return 1;
+  __device__ __forceinline__ int BlockIdX() { return blockIdx.x; }
+  __device__ __forceinline__ int BlockIdY() { return blockIdx.y; }
+  __device__ __forceinline__ int BlockDimX() { return blockDim.x; }
+  __device__ __forceinline__ int BlockDimY() { return blockDim.y; }
+  __device__ __forceinline__ int GridDimX() { return gridDim.x; }
+  __device__ __forceinline__ int GridDimY() { return gridDim.y; }
+  __device__ int GetLoopSize() { return 1; }
 #endif
-  }
 };
 // when reduce_type == kReduceLastDim this struct will be used
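Folding the per-method `#ifdef PADDLE_WITH_XPU2` blocks into a single `#ifdef PADDLE_WITH_XPU_KP` / `#else` pair keeps each backend's implementation readable in one piece while leaving the interface unchanged, so kernel bodies never touch `blockIdx`/`gridDim` directly. A schematic and hypothetical use, assuming `ReduceIndexMapping` is the bool-templated helper whose methods appear above:

// Schematic only: a device helper that walks columns through the
// backend-neutral geometry exposed by ReduceIndexMapping. Not part of this
// commit; VisitColumns and its loop body are illustrative.
template <bool ReduceLastDim>
__device__ void VisitColumns(const kps::DimConfig& dims, int left_num) {
  ReduceIndexMapping<ReduceLastDim> block(dims);
  // On CUDA/HIP these calls return blockIdx/blockDim/gridDim; on XPU KP they
  // are derived from cluster_id() and the DimConfig split sizes.
  int input_idx = block.BlockIdY() * block.BlockDimY();
  int stride = block.GridDimY() * block.BlockDimY() * block.GetLoopSize();
  for (int idx = input_idx; idx < left_num; idx += stride) {
    // ... load one column, transform it, and feed it to the reducer ...
  }
}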
@@ -341,7 +325,7 @@ struct ReduceConfig {
   // when should_reduce_again is true, we need malloc temp space for temp data
   void SetOutputData(Ty* y_data,
-                     const phi::GPUContext& dev_ctx,
+                     const KPDevice& dev_ctx,
                      phi::DenseTensor* tmp) {
     if (should_reduce_again) {
       tmp->Resize(phi::make_ddim(
@@ -640,9 +624,7 @@ struct ReduceConfig {
   int blocking_size;
   bool should_reduce_again;
   bool reduce_last_dim;
   Ty* output_data;
   dim3 block;
   dim3 grid;
 };
@@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x,
     kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
         &reduce_var, &reduce_var, reducer, reduce_last_dim);
-    if (need_store) {
-      y[store_offset + i] = static_cast<Ty>(reduce_var);
-    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::details::WriteData<Ty>(
+        y + store_offset + i, &result, static_cast<int>(need_store));
   }
 }
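This hunk turns the `if (need_store)` branch into a predicated write: the element count passed to `kps::details::WriteData` is 0 or 1, so the store either happens or becomes a no-op, and the same epilogue works with the global-memory write path added for XPU at the end of this diff. A condensed sketch of the equivalence; the helper names here are illustrative, not from the commit.

// Sketch: WriteDataLike mirrors the GPU details::WriteData added later in
// this diff; StoreIfNeeded shows how a 0/1 count replaces the old branch.
template <typename T>
__device__ __forceinline__ void WriteDataLike(T* dst, T* src, int num) {
  for (int i = 0; i < num; i++) {
    dst[i] = src[i];
  }
}

template <typename Ty>
__device__ void StoreIfNeeded(Ty* y, int offset, Ty value, bool need_store) {
  // num == 1 performs the store, num == 0 does nothing, so
  // `if (need_store) y[offset] = value;` folds into the element count.
  WriteDataLike<Ty>(y + offset, &value, static_cast<int>(need_store));
}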
@@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data,
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    OneDimIndexCal><<<8, 64, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
+    auto grid_num = 8;
+    auto block_num = 64;
 #else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+                    OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         reduce_index_calculator,
         left_index_calculator,
         dim);
-#endif
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data,
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    IndexCalculator><<<8, 64, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
+    auto grid_num = 8;
+    auto block_num = 64;
 #else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    IndexCalculator><<<config.grid, config.block, 0, stream>>>(
+                    IndexCalculator><<<grid_num, block_num, 0, stream>>>(
         x_data,
        config.output_data,
         reducer,
@@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         reduce_index_calculator,
         left_index_calculator,
         dim);
-#endif
   }
   if (config.should_reduce_again) {
@@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data,
         kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
     dim.SetRem(config.left_num % block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceHigherDimKernel<
-        Ty,
-        Ty,
-        MPType,
-        ReduceOp,
-        kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(),
-        init,
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim);
-#else
+    grid = 8;
+    block = 64;
+#endif
     ReduceHigherDimKernel<
         Ty,
         Ty,
@@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         config.left_num,
         config.grid.y,
         dim);
-#endif
   }
 }
@@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     Ty* y_data,
                     const TransformOp& transform,
                     int reduce_num,
-                    const phi::GPUContext& dev_ctx,
+                    const KPDevice& dev_ctx,
                     KPStream stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
@@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     Ty* y_data,
                     const TransformOp& transform,
                     int reduce_num,
-                    const phi::GPUContext& dev_ctx,
+                    const KPDevice& dev_ctx,
                     KPStream stream) {
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
@@ -1087,12 +1030,16 @@ template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp,
           typename TransformOp>
-void ReduceKernel(const phi::GPUContext& dev_ctx,
+void ReduceKernel(const KPDevice& dev_ctx,
                   const phi::DenseTensor& x,
                   phi::DenseTensor* y,
                   const TransformOp& transform,
                   const std::vector<int>& origin_reduce_dims) {
+#ifdef PADDLE_WITH_XPU_KP
+  auto stream = dev_ctx.x_context()->xpu_stream;
+#else
   auto stream = dev_ctx.stream();
+#endif
   dev_ctx.Alloc<Ty>(y);
   auto x_dim = phi::vectorize<int>(x.dims());
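Switching the context parameter from `phi::GPUContext` to `KPDevice` lets the same template compile against whichever device context the KP build targets; only the stream lookup differs, which is why it is wrapped in `#ifdef PADDLE_WITH_XPU_KP` here. The sketch below shows one plausible way such aliases could be arranged; it is an assumption for illustration, not the actual Paddle backend headers.

// Assumed aliases, for illustration only. The real definitions live in
// Paddle's backend headers, not in this diff.
#ifdef PADDLE_WITH_XPU_KP
using KPDevice = phi::XPUContext;  // stream comes from x_context()->xpu_stream
using KPStream = XPUStream;
#else
using KPDevice = phi::GPUContext;  // stream comes from dev_ctx.stream()
using KPStream = gpuStream_t;
#endif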
@@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
                   0);
 #ifdef PADDLE_WITH_XPU_KP
+  auto grid_num = 8;
+  auto block_num = 64;
+#else
+  auto grid_num = config.grid;
+  auto block_num = config.block;
+#endif
   ReduceHigherDimKernel<Tx,
                         Ty,
                         MPType,
                         ReduceOp<MPType>,
-                        TransformOp><<<8, 64, 0, stream>>>(
+                        TransformOp><<<grid_num, block_num, 0, stream>>>(
       x_data,
       config.output_data,
       reducer,
@@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
       config.left_num,
       config.blocking_size,
       dim);
-#else
-  ReduceHigherDimKernel<
-      Tx,
-      Ty,
-      MPType,
-      ReduceOp<MPType>,
-      TransformOp><<<config.grid, config.block, 0, stream>>>(
-      x_data,
-      config.output_data,
-      reducer,
-      transform,
-      reducer.initial(),
-      config.reduce_num,
-      config.left_num,
-      config.blocking_size,
-      dim);
-#endif
   if (config.should_reduce_again) {
     dim3 block = dim3(config.block.x, 1, 1);
@@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
     dim2.SetRem(config.left_num % config.block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceHigherDimKernel<
-        Ty,
-        Ty,
-        MPType,
-        ReduceOp<MPType>,
-        kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-        reducer.initial(),
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim2);
-#else
+    grid = 8;
+    block = 64;
+#endif
     ReduceHigherDimKernel<
         Ty,
         Ty,
@@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
         config.left_num,
         config.grid.y,
         dim2);
-#endif
   }
   return;
 }
...
@@ -115,6 +115,14 @@ struct BroadcastConfig {
   }
 };
+template <typename T>
+__device__ __forceinline__ void WriteData(T* dst,
+                                          T* __restrict__ src,
+                                          int num) {
+  for (int i = 0; i < num; i++) {
+    dst[i] = src[i];
+  }
+}
 #undef INT_BITS
 } // namespace details
...
@@ -76,6 +76,16 @@ struct BroadcastConfig {
 };
 #pragma pack()
+template <typename T>
+__device__ __forceinline__ void WriteData(T* _global_ptr_ dst,
+                                          T* src,
+                                          int num) {
+  if (num > 0) {
+    LM2GM(src, dst, num * sizeof(T));
+  }
+}
+#undef INT_BITS
 } // namespace details
 /**
...
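These two definitions give `kps::details::WriteData` the same signature on both backends with different mechanics: the CUDA/HIP version is a plain element-wise copy, while the XPU version writes through a `_global_ptr_` destination with a single bounds-checked `LM2GM` transfer out of the thread's local memory. A caller can therefore stay backend-neutral, as in this hypothetical epilogue helper:

// Sketch only: FlushResult is illustrative, not part of this commit.
// On CUDA/HIP the call compiles to the element loop above; on XPU KP it
// compiles to an LM2GM copy from local memory into global memory.
template <typename Ty>
__device__ __forceinline__ void FlushResult(Ty* dst,    // global output
                                            Ty* local,  // thread-local buffer
                                            int num) {  // 0 skips the write
  kps::details::WriteData<Ty>(dst, local, num);
}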