Unverified · Commit 9c5d5665 · authored by niuliling123, committed by GitHub

Modify reduce_op.op.h for xpu2 with kernel primitive api (#36904)

* Modify reduce_op.op.h for xpu2 with kernel primitive api
Parent: d08753df
@@ -360,12 +360,12 @@ __device__ __forceinline__ void ReadDataBc(
  * reduce_last_dim: Used to indicate whether the dimension of reduce contains
  * the lowest dimension.
  */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          typename IndexCal, bool IsBoundary = false>
+template <typename Tx, typename Ty, int NX, int NY, int BlockSize, int Rank,
+          typename IndexCal, typename Functor, bool IsBoundary = false>
 __device__ __forceinline__ void ReadDataReduce(
-    T* dst, const T* __restrict__ src, int block_offset,
+    Ty* dst, const Tx* __restrict__ src, int block_offset,
     const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
-    int stride_ny, bool reduce_last_dim) {
+    int stride_ny, Functor func, bool reduce_last_dim) {
   int thread_offset = 0;
   int left_idx = 0;
   if (reduce_last_dim) {
@@ -385,7 +385,7 @@ __device__ __forceinline__ void ReadDataReduce(
       }
     }
     uint32_t index_src = index_cal(thread_offset + block_offset);
-    dst[ny] = src[index_src];
+    dst[ny] = static_cast<Ty>(func(src[index_src]));
     thread_offset += stride_ny;
   }
 } else {
@@ -400,7 +400,7 @@ __device__ __forceinline__ void ReadDataReduce(
       }
     }
     uint32_t index_src = index_cal(thread_offset + block_offset);
-    dst[nx + ny * NX] = src[index_src];
+    dst[nx + ny * NX] = static_cast<Ty>(func(src[index_src]));
     thread_offset += stride_ny;
   }
 }
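Note (not part of the patch): the point of this signature change is that ReadDataReduce now applies a transform functor to each gathered element instead of copying raw values, so callers no longer need a separate transform pass. A host-side sketch of that contract, with a hypothetical Identity/Square functor pair and a lambda standing in for the index calculator (names and sizes are made up for the demo):

#include <cstdio>

// Hypothetical stand-ins for the transformer functors used by the reduce ops.
struct Identity {
  float operator()(float x) const { return x; }
};
struct Square {
  float operator()(float x) const { return x * x; }
};

// CPU model of the new ReadDataReduce contract: the functor is applied to each
// gathered element before it is written into the per-thread register buffer.
template <typename Tx, typename Ty, typename IndexCal, typename Functor>
void ReadDataReduceModel(Ty* dst, const Tx* src, int block_offset,
                         const IndexCal& index_cal, int ny_count, int stride_ny,
                         Functor func) {
  int thread_offset = 0;
  for (int ny = 0; ny < ny_count; ++ny) {
    int index_src = index_cal(thread_offset + block_offset);
    dst[ny] = static_cast<Ty>(func(src[index_src]));  // transform fused into the load
    thread_offset += stride_ny;
  }
}

int main() {
  float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float dst[4];
  auto stride_cal = [](int i) { return i * 2; };  // like the OneDimIndexCal(stride) used later in this diff
  ReadDataReduceModel<float, float>(dst, src, 0, stride_cal, 4, 1, Square());
  for (float v : dst) printf("%g ", v);  // 1 9 25 49
  printf("\n");
}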
...
@@ -17,64 +17,49 @@
 namespace paddle {
 namespace operators {
 namespace kernel_primitives {
-namespace details {
-static __device__ __forceinline__ platform::float16 ExpFunctor(
-    platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
-static __device__ __forceinline__ float ExpFunctor(float x) { return expf(x); }
-static __device__ __forceinline__ double ExpFunctor(double x) { return exp(x); }
-static __device__ __forceinline__ platform::float16 LogFunctor(
-    platform::float16 x) {
-  return ::Eigen::numext::log(x);
-}
-static __device__ __forceinline__ float LogFunctor(float x) { return logf(x); }
-static __device__ __forceinline__ double LogFunctor(double x) { return log(x); }
-
-/*************************** Compute Functor****************************/
-// for margin_cross_entropy
-template <typename Tx, typename Ty = Tx>
-struct ExpLogitTransformer {
-  HOSTDEVICE explicit inline ExpLogitTransformer(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx* x) const {
-    return static_cast<Ty>(details::ExpFunctor(x[0]));
-  }
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(details::ExpFunctor(x));
-  }
-};
-
-// Post processing function for sum, max, min, prod, any
-template <typename Tx, typename Ty = Tx>
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
-  HOSTDEVICE inline Ty operator()(const Tx* x) const {
-    return static_cast<Ty>(x[0]);
-  }
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(x);
-  }
-};
-
-// Post processing function for mean
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-  HOSTDEVICE inline T operator()(const T* x) const { return x[0] * n_inv; }
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
- private:
-  T n_inv;
-};
-
-}  // namespace details
+#ifdef PADDLE_WITH_XPU2
+struct dim3 {
+  int x;
+  int y;
+  int z;
+
+  explicit inline dim3(int split_x, int split_y = 1, int split_z = 1) {
+    x = split_x;
+    y = split_y;
+    z = split_z;
+  }
+};
+#endif
+
+struct DimConfig {
+  int split_num_x;
+  int split_num_y;
+  int split_num_z;
+  int deal_size_x;
+  int deal_size_y;
+  int deal_size_z;
+  int rem_x;
+  int rem_y;
+  int rem_z;
+
+  HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z,
+                                       int size_x, int size_y, int size_z) {
+    split_num_x = split_x;
+    split_num_y = split_y;
+    split_num_z = split_z;
+    deal_size_x = size_x;
+    deal_size_y = size_y;
+    deal_size_z = size_z;
+  }
+
+  HOSTDEVICE void SetRem(int rem_nx, int rem_ny, int rem_nz) {
+    rem_x = rem_nx;
+    rem_y = rem_ny;
+    rem_z = rem_nz;
+  }
+};
 }  // namespace kernel_primitives
 }  // namespace operators
 }  // namespace paddle
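Note (not part of the patch): DimConfig is how the host launcher now hands the launch geometry (split counts, per-block deal sizes, remainders) to kernels that cannot read blockDim/gridDim on XPU2. A standalone host-only sketch of how it is filled, mirroring the DimConfig/SetRem calls in LaunchReduceKernel further down in this diff; the struct is a trimmed copy so the snippet compiles on its own:

#include <cstdio>

#define HOSTDEVICE  // host-only stand-in for the Paddle macro in this sketch

// Trimmed copy of the DimConfig added above, for a self-contained demo.
struct DimConfig {
  int split_num_x, split_num_y, split_num_z;
  int deal_size_x, deal_size_y, deal_size_z;
  int rem_x, rem_y, rem_z;
  HOSTDEVICE explicit DimConfig(int sx, int sy, int sz, int dx, int dy, int dz)
      : split_num_x(sx), split_num_y(sy), split_num_z(sz),
        deal_size_x(dx), deal_size_y(dy), deal_size_z(dz),
        rem_x(0), rem_y(0), rem_z(0) {}
  HOSTDEVICE void SetRem(int rx, int ry, int rz) { rem_x = rx; rem_y = ry; rem_z = rz; }
};

int main() {
  // Mirrors how LaunchReduceKernel builds its DimConfig below:
  // splits = grid dims, deal sizes = block dims, rem_x = reduce_num % block.x.
  int grid_x = 64, grid_y = 4, grid_z = 1, block_x = 256, block_y = 1;
  int reduce_num = 1000;
  DimConfig dim(grid_x, grid_y, grid_z, block_x, block_y, 0);
  dim.SetRem(reduce_num % block_x, 0, 0);
  printf("rem_x = %d\n", dim.rem_x);  // 1000 % 256 = 232
}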
@@ -13,11 +13,45 @@
 // limitations under the License.
 #pragma once
+#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
+#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
+#ifdef PADDLE_WITH_XPU2
+#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
+#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
+#define THREAD_ID_X core_id()
+#define THREAD_ID_Y 0
+#define THREAD_ID_Z 0
+#define BLOCK_NUM_X core_num()
+#define BLOCK_NUM_Y 0
+#define BLOCK_NUM_Z 0
+#define BLOCK_ID_X cluster_id()
+#define BLOCK_ID_Y 0
+#define BLOCK_ID_Z 0
+#define GRID_NUM_X cluster_num()
+#define GRID_NUM_Y 0
+#define GRID_NUM_Z 0
+#else
 #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
 #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
+#define THREAD_ID_X threadIdx.x
+#define THREAD_ID_Y threadIdx.y
+#define THREAD_ID_Z threadIdx.z
+#define BLOCK_NUM_X blockDim.x
+#define BLOCK_NUM_Y blockDim.y
+#define BLOCK_NUM_Z blockDim.z
+#define BLOCK_ID_X blockIdx.x
+#define BLOCK_ID_Y blockIdx.y
+#define BLOCK_ID_Z blockIdx.z
+#define GRID_NUM_X gridDim.x
+#define GRID_NUM_Y gridDim.y
+#define GRID_NUM_Z gridDim.z
+#endif
 namespace paddle {
 namespace operators {
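Note (not part of the patch): these macros let one kernel body compute thread/block indices on both backends; on CUDA they expand to threadIdx/blockIdx/blockDim/gridDim, on XPU2 to core_id()/cluster_id()/core_num()/cluster_num(). A host-only sketch with the macros pinned to made-up constants, just to show the indexing pattern kernels write against them:

#include <cstdio>

// Host-only demo values; on device these would be the backend intrinsics above.
#define THREAD_ID_X 3
#define BLOCK_ID_X 2
#define BLOCK_NUM_X 128
#define GRID_NUM_X 8

int main() {
  // The global linear id and grid-stride step written against the macros work
  // unchanged for either backend; only the macro definitions differ.
  int global_id = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
  int stride = BLOCK_NUM_X * GRID_NUM_X;
  printf("global_id = %d, grid stride = %d\n", global_id, stride);  // 259, 1024
}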
...
@@ -130,7 +130,7 @@ __global__ void AddMarginToPositiveLogitsKernel(
 template <typename Tx, typename Ty = Tx>
 struct ExpAndSum {
-  using Transformer = kpds::ExpLogitTransformer<Tx>;
+  using Transformer = kps::ExpFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(0.0f); }
@@ -159,7 +159,7 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
                                         const int64_t N, const int64_t D) {
   CUDA_KERNEL_LOOP(i, N * D) {
     auto row = i / D;
-    logits[i] -= kpds::LogFunctor(logits_sum_per_row[row]);
+    logits[i] -= kps::details::Log(logits_sum_per_row[row]);
   }
 }
@@ -174,9 +174,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel(
     if ((col + start_index) == labels[row]) {
       auto softmax = log_softmax[i];
       loss[row] = -softmax;
-      log_softmax[i] = kpds::ExpFunctor(softmax);
+      log_softmax[i] = kps::details::Exp(softmax);
     } else {
-      log_softmax[i] = kpds::ExpFunctor(log_softmax[i]);
+      log_softmax[i] = kps::details::Exp(log_softmax[i]);
     }
   }
 }
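Note (not part of the patch): these call sites now use the type-overloaded Exp/Log helpers under kps::details and the kps::ExpFunctor transformer; their definitions live in functor_primitives.h, which this diff does not show. An illustrative host-side stand-in modeled on the removed kpds overloads above, not the actual Paddle implementation:

#include <cmath>
#include <cstdio>

// Sketch of type-dispatched helpers in the style of the removed kpds::ExpFunctor
// and kpds::LogFunctor overloads; treat these as assumptions, not the real API.
static inline float Exp(float x) { return expf(x); }
static inline double Exp(double x) { return exp(x); }
static inline float Log(float x) { return logf(x); }
static inline double Log(double x) { return log(x); }

int main() {
  printf("%f %f\n", Exp(1.0f), Log(2.718281828459045));  // ~2.718282 ~1.000000
}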
...
@@ -24,11 +24,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-namespace kpds = paddle::operators::kernel_primitives::details;
+namespace kps = paddle::operators::kernel_primitives;

 template <typename Tx, typename Ty = Tx>
 struct CustomMin {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::max());
@@ -41,7 +41,7 @@ struct CustomMin {
 template <typename Tx, typename Ty = Tx>
 struct CustomMax {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
@@ -55,7 +55,7 @@ struct CustomMax {
 // for cub::Reduce
 template <typename Tx, typename Ty = Tx>
 struct CustomSum {
-  using Transformer = kpds::IdentityFunctor<Tx, Ty>;
+  using Transformer = kps::IdentityFunctor<Tx, Ty>;

   inline Ty initial() { return static_cast<Ty>(0.0f); }
@@ -66,7 +66,7 @@ struct CustomSum {
 template <typename Tx, typename Ty = Tx>
 struct CustomMean {
-  using Transformer = kpds::DivideFunctor<Tx>;
+  using Transformer = kps::DivideFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(0.0f); }
@@ -77,7 +77,7 @@ struct CustomMean {
 template <typename Tx, typename Ty = Tx>
 struct CustomMul {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(1.0f); }
@@ -88,7 +88,7 @@ struct CustomMul {
 template <typename Tx, typename Ty = Tx>
 struct CustomLogicalOr {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(false); }
@@ -99,7 +99,7 @@ struct CustomLogicalOr {
 template <typename Tx, typename Ty = Tx>
 struct CustomLogicalAnd {
-  using Transformer = kpds::IdentityFunctor<Tx>;
+  using Transformer = kps::IdentityFunctor<Tx>;

   inline Ty initial() { return static_cast<Ty>(true); }
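Note (not part of the patch): each CustomXxx op pairs an initial value with a Transformer that is applied per element before the binary reduction; mean, for example, is a sum whose Transformer divides by the reduce length up front. A host-side sketch of that composition, with DivideFunctor as a local stand-in for kps::DivideFunctor:

#include <cstdio>

// Local stand-in for kps::DivideFunctor: pre-scales each element by 1/n.
template <typename T>
struct DivideFunctor {
  explicit DivideFunctor(int n) : n_inv(static_cast<T>(1.0 / n)) {}
  T operator()(const T& x) const { return x * n_inv; }
  T n_inv;
};

int main() {
  float data[4] = {2.f, 4.f, 6.f, 8.f};
  int reduce_num = 4;
  DivideFunctor<float> transformer(reduce_num);  // CustomMean's Transformer
  float acc = 0.0f;                              // CustomMean::initial()
  for (float x : data) acc += transformer(x);    // reducer(acc, transformer(x))
  printf("mean = %g\n", acc);                    // 5
}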
...
@@ -165,10 +165,93 @@ struct IndexCalculator {
   framework::Array<platform::FastDivMod, kMaxRank> divmoders;
 };

+template <bool ReduceLastDim = false>
+struct ReduceIndexMapping {
+  const kps::DimConfig dim;
+  HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
+      : dim(dims) {}
+  __device__ __forceinline__ int BlockIdX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    } else {
+      return cluster_id() % dim.split_num_x;
+    }
+#else
+    return blockIdx.x;
+#endif
+  }
+  __device__ __forceinline__ int BlockIdY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() % dim.split_num_x);
+    } else {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    }
+#else
+    return blockIdx.y;
+#endif
+  }
+  __device__ __forceinline__ int BlockDimX() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_x;
+#else
+    return blockDim.x;
+#endif
+  }
+  __device__ __forceinline__ int BlockDimY() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_y;
+#else
+    return blockDim.y;
+#endif
+  }
+  __device__ __forceinline__ int GridDimX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_y;
+    } else {
+      return dim.split_num_x;
+    }
+#else
+    return gridDim.x;
+#endif
+  }
+  __device__ __forceinline__ int GridDimY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_x;
+    } else {
+      return dim.split_num_y;
+    }
+#else
+    return gridDim.y;
+#endif
+  }
+  __device__ __forceinline__ int GetLoopSize() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.deal_size_y;
+    } else {
+      return dim.deal_size_x;
+    }
+#else
+    return 1;
+#endif
+  }
+};
+
 // when reduce_type == kReduceLastDim this struct will be used
 // for higher performance
-struct LastDimIndexCal {
-  explicit LastDimIndexCal(int num) : stride(num) {}
+struct OneDimIndexCal {
+  explicit OneDimIndexCal(int num) : stride(num) {}

   __device__ inline int operator()(int index) const { return index * stride; }

   int stride;
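Note (not part of the patch): on XPU2 there is only a linear cluster_id(), so ReduceIndexMapping rebuilds the logical (x, y) block coordinates from it; on CUDA the methods just forward to blockIdx/blockDim/gridDim. A CPU model of the BlockIdX/BlockIdY arithmetic above, with illustrative split counts:

#include <cstdio>

int main() {
  int split_num_x = 4, split_num_y = 2;   // dim.split_num_x / dim.split_num_y
  bool reduce_last_dim = false;
  for (int cluster_id = 0; cluster_id < split_num_x * split_num_y; ++cluster_id) {
    int bx, by;
    if (reduce_last_dim) {
      bx = cluster_id / split_num_x % split_num_y;
      by = cluster_id % split_num_x;
    } else {
      bx = cluster_id % split_num_x;
      by = cluster_id / split_num_x % split_num_y;
    }
    printf("cluster %d -> block (%d, %d)\n", cluster_id, bx, by);
  }
}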
@@ -331,8 +414,16 @@ struct ReduceConfig {
     if (rank == reduce_rank || is_last_dim) {
       reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
     } else if (reduce_rank == 1) {
       // ReduceFirstDim and reduceSecondDim
+#ifdef PADDLE_WITH_XPU2
+      if (reduce_dim[0] == 0) {
+        reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+      } else {
+        reduce_type = static_cast<int>(ReduceType::kReduceAny);
+      }
+#else
       reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
+#endif
     } else {
       reduce_type = static_cast<int>(ReduceType::kReduceAny);
     }
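Note (not part of the patch): a worked example of the branch above as a small host function; SelectTypeXpu2 is a simplified stand-in for ReduceConfig::SetReduceType, covering only the cases visible in this hunk:

#include <cstdio>
#include <vector>

enum ReduceType { kReduceLastDim = 1, kReduceHigherDim = 2, kReduceAny = 3 };

// With a single reduce axis, XPU2 only takes the kReduceHigherDim path when
// that axis is the outermost one; any other single axis falls back to kReduceAny.
int SelectTypeXpu2(const std::vector<int>& x_dim, const std::vector<int>& reduce_dim) {
  bool is_last_dim = reduce_dim.size() == 1 &&
                     reduce_dim[0] == static_cast<int>(x_dim.size()) - 1;
  if (reduce_dim.size() == x_dim.size() || is_last_dim) return kReduceLastDim;
  if (reduce_dim.size() == 1)
    return reduce_dim[0] == 0 ? kReduceHigherDim : kReduceAny;
  return kReduceAny;
}

int main() {
  printf("%d\n", SelectTypeXpu2({8, 16, 32}, {0}));  // 2: outermost dim -> kReduceHigherDim
  printf("%d\n", SelectTypeXpu2({8, 16, 32}, {1}));  // 3: middle dim -> kReduceAny
  printf("%d\n", SelectTypeXpu2({8, 16, 32}, {2}));  // 1: last dim -> kReduceLastDim
}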
@@ -408,59 +499,61 @@ struct ReduceConfig {
   // for ReduceHigherDim: if block is enough -> splite reduce_num
   //                      else init block(32, 1) grid(block_num, 1)
   // for others: block(block_num, 1) , grid(left_num, 1)
-  void SetBlockDim() {
-    // init
-    int block_num = details::GetBlockDim(reduce_num);
-    should_reduce_again = false;
-    dim3 block_dim(block_num, 1);
-    dim3 grid_dim(left_num, 1);
-    blocking_size = reduce_num;
-    if (reduce_type == ReduceType::kReduceHigherDim) {
+  void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) {
     int last_dim_num = x_dim.back();
     // update left_num
     int grid_z = left_num / last_dim_num;
     left_num = last_dim_num;
-      block_dim.z = 1;
-      grid_dim.z = grid_z;
+    grid_dim->z = grid_z;
     int device_id = platform::GetCurrentDeviceId();
     int max_mp = platform::GetCUDAMultiProcessors(device_id);
     int max_threads_per_mp =
         platform::GetCUDAMaxThreadsPerMultiProcessor(device_id);
     int max_threads = max_threads_per_mp * max_mp;
     // init
     int num_block = (max_threads / left_num);
+    block_dim->x = details::GetBlockDim(left_num);
+    grid_dim->x = details::AlignUp(left_num, block_dim->x);
+    blocking_size = reduce_num;
     if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
       blocking_size = details::GetLastPow2(reduce_num / num_block);
       if (blocking_size <= 1) {
         blocking_size = details::GetLastPow2(sqrt(reduce_num));
       } else if (blocking_size * 2 < reduce_num) {
         blocking_size *= 2;
       }
       should_reduce_again = true;
+      grid_dim->y = details::AlignUp(reduce_num, blocking_size);
+    }
+  }
-      block_dim.x = details::GetBlockDim(left_num);
-      block_dim.y = 1;
-      grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x;
-      grid_dim.y = (reduce_num + blocking_size - 1) / blocking_size;
+  void SetBlockDim() {
+    // init
+    int block_num = details::GetBlockDim(reduce_num);
+    should_reduce_again = false;
+    dim3 block_dim(block_num, 1, 1);
-    } else {
-      block_dim.x = details::GetBlockDim(left_num);
-      block_dim.y = 1;
+    dim3 grid_dim(left_num, 1, 1);
     blocking_size = reduce_num;
-      grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x;
+#ifdef PADDLE_WITH_XPU2
+    if (reduce_last_dim) {
+      block_dim.x = 128;
+      block_dim.y = reduce_num;
+      grid_dim.x = 8;
+      grid_dim.y = 1;
+    } else {
+      block_dim.x = 128;
+      block_dim.y = left_num;
+      grid_dim.x = 8;
       grid_dim.y = 1;
     }
+#else
+    if (reduce_type == ReduceType::kReduceHigherDim) {
+      SetBlockDimForHigher(&block_dim, &grid_dim);
     } else {
       SetBlockDimForReduceAny(&block_dim, &grid_dim);
     }
+#endif
     block = block_dim;
     grid = grid_dim;
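Note (not part of the patch): a worked example of the SetBlockDimForHigher sizing with made-up device limits; GetLastPow2 and AlignUp are re-implemented locally and assumed to match the details:: helpers used above:

#include <algorithm>
#include <cstdio>

static int GetLastPow2(int n) {
  n |= (n >> 1); n |= (n >> 2); n |= (n >> 4); n |= (n >> 8); n |= (n >> 16);
  return std::max(1, n - (n >> 1));
}
static int AlignUp(int a, int b) { return (a + b - 1) / b; }  // ceil division

int main() {
  int left_num = 1024, reduce_num = 8192;
  int max_threads = 2048 * 80;                // max_threads_per_mp * max_mp (illustrative)
  int num_block = max_threads / left_num;     // 160 blocks fit, so split reduce_num
  int blocking_size = GetLastPow2(reduce_num / num_block);   // GetLastPow2(51) = 32
  if (blocking_size * 2 < reduce_num) blocking_size *= 2;    // 64 rows handled per block
  int grid_y = AlignUp(reduce_num, blocking_size);           // 128 partial results per column
  printf("blocking_size=%d grid.y=%d\n", blocking_size, grid_y);
}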
@@ -487,72 +580,6 @@
   dim3 block;
   dim3 grid;
 };
-/* size : how many colonms left have to be reduced
- * loop : how many rows data have to be reduced
- * block_size: max rows this block to reduce
- */
-template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
-          typename TransformOp, bool IsBoundary = false>
-__device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer,
-                                     TransformOp transformer, MPType init,
-                                     int reduce_num, int left_num,
-                                     int block_size) {
-  const int NY = 1;
-  int idx = blockIdx.x * blockDim.x;
-  int idy = blockIdx.y * block_size;
-  // block_offset of rows
-  Tx reduce_input[NY];
-  MPType reduce_compute[NY];
-  MPType result = init;
-  // the offset of this block
-  int block_offset = idy * left_num + idx + blockIdx.z * reduce_num * left_num;
-  const Tx* input = x + block_offset;
-  int store_offset =
-      blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num + idx;
-  // how many columns left
-  int size = left_num - idx;
-  // how many rows have to be reduced
-  int loop = reduce_num - idy;
-  loop = loop > block_size ? block_size : loop;
-  for (int loop_index = 0; loop_index < loop; loop_index += NY) {
-    kps::ReadData<Tx, Tx, 1, NY, 1, IsBoundary>(
-        &reduce_input[0], input + loop_index * left_num, size, NY, 1, left_num);
-    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-        &reduce_compute[0], &reduce_input[0], transformer);
-    kps::Reduce<MPType, NY, 1, 1, ReduceOp,
-                kps::details::ReduceMode::kLocalMode>(
-        &result, &reduce_compute[0], reducer, false);
-  }
-  Ty temp_data = static_cast<Ty>(result);
-  kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size);
-}
-
-template <typename Tx, typename MPType, typename ReduceOp, typename TransformOp,
-          typename Calculator, bool IsBoundary>
-__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var,
-                                    ReduceOp reducer, TransformOp transformer,
-                                    MPType init, int reduce_num, int input_idx,
-                                    bool reduce_last_dim,
-                                    const Calculator& reduce_index_calculator,
-                                    int stride, int num) {
-  Tx input_reg[REDUCE_VEC_SIZE];
-  MPType input_compute[REDUCE_VEC_SIZE];
-  MPType input_transform[REDUCE_VEC_SIZE];
-  kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
-  kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, IsBoundary>(
-      &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
-      1, stride, reduce_last_dim);
-  kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
-      &input_transform[0], &input_reg[0], transformer);
-  kps::Init<MPType, REDUCE_VEC_SIZE, IsBoundary>(input_compute, input_transform,
-                                                 num);
-  kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
-              kps::details::ReduceMode::kLocalMode>(
-      reduce_var, &input_compute[0], reducer, reduce_last_dim);
-}
-
 // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
 // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
@@ -564,54 +591,76 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer,
                                 int reduce_num, int left_num,
                                 bool reduce_last_dim,
                                 const Calculator reduce_index_calculator,
-                                const Calculator left_index_calculator) {
+                                const Calculator left_index_calculator,
+                                const kps::DimConfig dim) {
   int input_idx, left_idx, stride;
   int block_size = 0;
   bool need_store = true;
+  int loop_left = 0;
   int tid = 0;
   // the last dim gets involved in reduction
+  int store_offset = 0;
+  int stride_left = 0;
   if (reduce_last_dim) {
-    input_idx = blockIdx.y * blockDim.x;
-    left_idx = blockIdx.x * blockDim.y + threadIdx.y;
-    stride = gridDim.y * blockDim.x;
-    block_size = blockDim.x;
-    need_store = (threadIdx.x == 0) && (left_idx < left_num);
+    auto block = ReduceIndexMapping<true>(dim);
+    input_idx = block.BlockIdY() * block.BlockDimX();
+    left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y;
+    stride = block.GridDimY() * block.BlockDimX();
+    block_size = block.BlockDimX();
+    need_store = (THREAD_ID_X == 0) && (left_idx < left_num);
+    store_offset = block.BlockIdY() * left_num + left_idx;
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = 1;
     tid = threadIdx.x;
   } else {
-    input_idx = blockIdx.y * blockDim.y;
-    left_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    stride = gridDim.y * blockDim.y;
-    block_size = blockDim.y;
-    need_store = (threadIdx.y == 0) && (left_idx < left_num);
+    auto block = ReduceIndexMapping<false>(dim);
+    input_idx = block.BlockIdY() * block.BlockDimY();
+    left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X;
+    stride = block.GridDimY() * block.BlockDimY();
+    block_size = block.BlockDimY();
+    need_store = (THREAD_ID_Y == 0) && (left_idx < left_num);
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = block.BlockDimX() * block.GridDimX();
+    store_offset = block.BlockIdY() * left_num + left_idx;
     tid = threadIdx.y;
   }
-  int store_offset = blockIdx.y * left_num + left_idx;
   // calculate the offset, means the addr where each thread really start.
-  int input_offset = left_index_calculator(left_idx);
+  // 1. reduce for each thread
+  MPType input_compute[REDUCE_VEC_SIZE];
+  Tx input_reg[REDUCE_VEC_SIZE];
+  for (int i = 0; i < loop_left; i += stride_left) {
+    int input_offset = left_index_calculator(left_idx + i);
     const Tx* input = x + input_offset;
     MPType reduce_var = init;
-  Ty store_data;
-  // 1. reduce for each thread
-  if (left_idx < left_num) {
     // load REDUCE_VEC_SIZE data once, and then compute
     int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
     for (; input_idx + block_size < bound;
          input_idx += REDUCE_VEC_SIZE * stride) {
-      ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, false>(
-          input, &reduce_var, reducer, transformer, init, reduce_num, input_idx,
-          reduce_last_dim, reduce_index_calculator, stride, reduce_num);
+      kps::ReadDataReduce<Tx, Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator,
+                          kps::IdentityFunctor<Tx>, false>(
+          &input_reg[0], input, input_idx, reduce_index_calculator, 1,
+          reduce_num, 1, stride, kps::IdentityFunctor<Tx>(), reduce_last_dim);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &input_compute[0], &input_reg[0], transformer);
+      kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &input_compute[0], reducer, reduce_last_dim);
     }
-    int num = (reduce_num - input_idx - tid + stride - 1) / stride;
-    ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, true>(
-        input, &reduce_var, reducer, transformer, init, reduce_num - input_idx,
-        input_idx, reduce_last_dim, reduce_index_calculator, stride, num);
-  }
+    kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
+    kps::ReadDataReduce<Tx, MPType, 1, REDUCE_VEC_SIZE, 1, 1, Calculator,
+                        TransformOp, true>(
+        &input_compute[0], input, input_idx, reduce_index_calculator, 1,
+        reduce_num - input_idx, 1, stride, transformer, reduce_last_dim);
+    kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
+                kps::details::ReduceMode::kLocalMode>(
+        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
     kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
         &reduce_var, &reduce_var, reducer, reduce_last_dim);
     if (need_store) {
-      y[store_offset] = static_cast<Ty>(reduce_var);
+      y[store_offset + i] = static_cast<Ty>(reduce_var);
+    }
   }
 }
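Note (not part of the patch): a CPU model of the per-thread loop structure in ReduceAnyKernel above — full REDUCE_VEC_SIZE chunks are accumulated first, then a bounded tail (the IsBoundary=true ReadDataReduce call) finishes the column. Strides are collapsed to 1 so the demo stays a plain serial sum:

#include <cstdio>

constexpr int REDUCE_VEC_SIZE = 4;  // vector width used by the kernel (value assumed here)

int main() {
  float col[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int reduce_num = 10;
  float reduce_var = 0.f;  // init
  int idx = 0;
  // full vectors
  for (; idx + REDUCE_VEC_SIZE <= reduce_num; idx += REDUCE_VEC_SIZE) {
    for (int k = 0; k < REDUCE_VEC_SIZE; ++k) reduce_var += col[idx + k];
  }
  // boundary tail
  for (int k = idx; k < reduce_num; ++k) reduce_var += col[k];
  printf("sum = %g\n", reduce_var);  // 55
}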
@@ -620,21 +669,55 @@ template <typename Tx, typename Ty, typename MPType, typename ReduceOp,
 __global__ void ReduceHigherDimKernel(const Tx* x, Ty* y, ReduceOp reducer,
                                       TransformOp transformer, MPType init,
                                       int reduce_num, int left_num,
-                                      int blocking_size) {
+                                      int blocking_size,
+                                      const kps::DimConfig dim) {
   // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this
   // function will be used
-  // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1
-  // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx /
-  // 32
-  // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
-  int idx = blockIdx.x * blockDim.x;
-  int size = left_num - idx;
-  if (size >= blockDim.x) {  // complete segment
-    HigherDimDealSegment<Tx, Ty, MPType, ReduceOp, TransformOp>(
-        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
-  } else {
-    HigherDimDealSegment<Tx, Ty, MPType, ReduceOp, TransformOp, true>(
-        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
+  auto block = ReduceIndexMapping<false>(dim);
+  int idy = block.BlockIdY() * blocking_size;
+  int idx = block.BlockIdX() * block.BlockDimX();
+  int idz = BLOCK_ID_Z * left_num;
+  int stride = dim.split_num_x * dim.deal_size_x;
+  int size = left_num - dim.rem_x;
+  int loop_size = min(reduce_num - idy, blocking_size);
+  int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
+  int block_offset = idy * left_num + idz * reduce_num;
+  const Tx* input = x + block_offset;
+  Tx reduce_input;
+  for (; idx < size; idx += stride) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
+                                            input + loop_idx * left_num + idx,
+                                            block.BlockDimX(), 1, 1, left_num);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce<MPType, 1, 1, 1, ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData<Ty, 1, 1, 1, false>(y + store_offset + idx, &result,
+                                       block.BlockDimX());
+  }
+  if (idx < left_num) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
+                                           input + loop_idx * left_num + idx,
+                                           dim.rem_x, 1, 1, left_num);
+      kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce<MPType, 1, 1, 1, ReduceOp,
+                  kps::details::ReduceMode::kLocalMode>(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData<Ty, 1, 1, 1, true>(y + store_offset + idx, &result,
+                                      dim.rem_x);
   }
 }
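Note (not part of the patch): a CPU rendering of the addressing used by ReduceHigherDimKernel for an input laid out as {nz, ny, nx} and reduced over ny; each (z, x) column accumulates reduce_num = ny elements spaced left_num = nx apart. Shape values are illustrative:

#include <cstdio>

int main() {
  const int nz = 2, ny = 3, nx = 4;          // reduce_num = ny, left_num = nx
  float x[nz * ny * nx], y[nz * nx];
  for (int i = 0; i < nz * ny * nx; ++i) x[i] = 1.0f * i;
  for (int idz = 0; idz < nz; ++idz) {
    for (int idx = 0; idx < nx; ++idx) {
      int block_offset = idz * ny * nx;      // z slice start, as in idz * reduce_num * left_num
      float acc = 0.f;
      for (int idy = 0; idy < ny; ++idy)
        acc += x[block_offset + idy * nx + idx];  // stride between reduced rows = left_num
      y[idz * nx + idx] = acc;
    }
  }
  printf("y[0]=%g y[nx]=%g\n", y[0], y[nx]);  // 12 and 48
}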
@@ -648,14 +731,27 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
     int stride_reduce = 1;
     int stride_left = config.reduce_num;
     // for higher performance
-    auto reduce_index_calculator = LastDimIndexCal(stride_reduce);
-    auto left_index_calculator = LastDimIndexCal(stride_left);
+    auto reduce_index_calculator = OneDimIndexCal(stride_reduce);
+    auto left_index_calculator = OneDimIndexCal(stride_left);
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.block.y, 0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
+                    OneDimIndexCal><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        init, config.reduce_num, config.left_num, config.reduce_last_dim,
+        reduce_index_calculator, left_index_calculator, dim);
+#else
     ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
-                    LastDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         init, config.reduce_num, config.left_num, config.reduce_last_dim,
-        reduce_index_calculator, left_index_calculator);
+        reduce_index_calculator, left_index_calculator, dim);
+#endif
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -665,11 +761,25 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
                               config.x_strides);
     auto left_index_calculator = IndexCalculator(
         left_rank, config.left_dim, config.left_strides, config.x_strides);
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.block.y, 0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
+                    IndexCalculator><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        init, config.reduce_num, config.left_num, config.reduce_last_dim,
+        reduce_index_calculator, left_index_calculator, dim);
+#else
     ReduceAnyKernel<Tx, Ty, MPType, ReduceOp, TransformOp,
                     IndexCalculator><<<config.grid, config.block, 0, stream>>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         init, config.reduce_num, config.left_num, config.reduce_last_dim,
-        reduce_index_calculator, left_index_calculator);
+        reduce_index_calculator, left_index_calculator, dim);
+#endif
   }

   if (config.should_reduce_again) {
@@ -683,12 +793,25 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
       grid = dim3(config.grid.x, 1, config.grid.z);
     }
+    auto last_index = OneDimIndexCal(1);
+    auto first_index = OneDimIndexCal(config.left_num);
+    kps::DimConfig dim =
+        kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+    dim.SetRem(config.left_num % block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceHigherDimKernel<Ty, Ty, MPType, ReduceOp,
+                          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
+        config.output_data, y_data, reducer,
+        kps::IdentityFunctor<Ty, MPType>(config.grid.y), init, config.grid.y,
+        config.left_num, config.grid.y, dim);
+#else
     ReduceHigherDimKernel<
         Ty, Ty, MPType, ReduceOp,
-        kps::details::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+        kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
         config.output_data, y_data, reducer,
-        kps::details::IdentityFunctor<Ty, MPType>(config.grid.y), init,
-        config.grid.y, config.left_num, config.grid.y);
+        kps::IdentityFunctor<Ty, MPType>(config.grid.y), init, config.grid.y,
+        config.left_num, config.grid.y, dim);
+#endif
   }
 }
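Note (not part of the patch): a CPU model of the should_reduce_again path above — pass one writes config.grid.y partial rows per output element, pass two folds those partials with an identity transformer. Sizes are illustrative:

#include <cstdio>

int main() {
  const int left_num = 3, reduce_num = 8, grid_y = 2, blocking_size = 4;
  float x[reduce_num * left_num], partial[grid_y * left_num], y[left_num];
  for (int i = 0; i < reduce_num * left_num; ++i) x[i] = 1.0f;
  // pass 1: each "block row" reduces blocking_size rows into one partial row
  for (int by = 0; by < grid_y; ++by)
    for (int j = 0; j < left_num; ++j) {
      float acc = 0.f;
      for (int r = 0; r < blocking_size; ++r)
        acc += x[(by * blocking_size + r) * left_num + j];
      partial[by * left_num + j] = acc;
    }
  // pass 2: reduce the grid_y partials per column (identity transform)
  for (int j = 0; j < left_num; ++j) {
    float acc = 0.f;
    for (int by = 0; by < grid_y; ++by) acc += partial[by * left_num + j];
    y[j] = acc;
  }
  printf("y = {%g, %g, %g}\n", y[0], y[1], y[2]);  // 8 8 8
}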
@@ -699,7 +822,7 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
                              gpuStream_t stream) {
   auto x_dim = framework::vectorize<int>(x.dims());
   auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
-  config.Run();  // get the parameters of LaunchReduceKernel
+  config.Run();
   int numel = x.numel();
   // after config.run()
   // SetOutputData for ReduceHigherDim when should_reduce_again is true,
@@ -759,23 +882,49 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
   if (config.reduce_type == ReduceType::kReduceHigherDim) {
     using TransformOp = typename ReduceOp<Tx, MPType>::Transformer;
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.blocking_size, 0);
+    dim.SetRem(config.left_num % config.block.x,
+               config.reduce_num % config.blocking_size, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceHigherDimKernel<Tx, Ty, MPType, ReduceOp<Tx, MPType>,
+                          TransformOp><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        reducer.initial(), config.reduce_num, config.left_num,
+        config.blocking_size, dim);
+#else
     ReduceHigherDimKernel<
         Tx, Ty, MPType, ReduceOp<Tx, MPType>,
         TransformOp><<<config.grid, config.block, 0, stream>>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         reducer.initial(), config.reduce_num, config.left_num,
-        config.blocking_size);
+        config.blocking_size, dim);
+#endif
     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
       dim3 grid = dim3(config.grid.x, 1, config.grid.z);
-      ReduceHigherDimKernel<Ty, Ty, MPType, ReduceOp<Tx, MPType>,
-                            kps::details::IdentityFunctor<
-                                Ty, MPType>><<<grid, block, 0, stream>>>(
-          config.output_data, y_data, reducer,
-          kps::details::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(), config.grid.y, config.left_num, config.grid.y);
+      kps::DimConfig dim2 =
+          kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+      dim2.SetRem(config.left_num % config.block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+      ReduceHigherDimKernel<
+          Ty, Ty, MPType, ReduceOp<Tx, MPType>,
+          kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
+          config.output_data, y_data, reducer,
+          kps::IdentityFunctor<Ty, MPType>(config.grid.y), reducer.initial(),
+          config.grid.y, config.left_num, config.grid.y, dim2);
+#else
+      ReduceHigherDimKernel<
+          Ty, Ty, MPType, ReduceOp<Tx, MPType>,
+          kps::IdentityFunctor<Ty, MPType>><<<grid, block, 0, stream>>>(
+          config.output_data, y_data, reducer,
+          kps::IdentityFunctor<Ty, MPType>(config.grid.y), reducer.initial(),
+          config.grid.y, config.left_num, config.grid.y, dim2);
+#endif
     }
     return;
   }
...