diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
index 860072bd0c52ec5c04f52525a7d154d6576a2e43..19355434955a24f0640439c1c4e82e612cca580a 100644
--- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
@@ -360,12 +360,12 @@ __device__ __forceinline__ void ReadDataBc(
  * reduce_last_dim: Used to indicate whether the dimension of reduce contains
  * the lowest dimension.
  */
-template
+template
 __device__ __forceinline__ void ReadDataReduce(
-    T* dst, const T* __restrict__ src, int block_offset,
+    Ty* dst, const Tx* __restrict__ src, int block_offset,
     const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
-    int stride_ny, bool reduce_last_dim) {
+    int stride_ny, Functor func, bool reduce_last_dim) {
   int thread_offset = 0;
   int left_idx = 0;
   if (reduce_last_dim) {
@@ -385,7 +385,7 @@ __device__ __forceinline__ void ReadDataReduce(
         }
       }
       uint32_t index_src = index_cal(thread_offset + block_offset);
-      dst[ny] = src[index_src];
+      dst[ny] = static_cast<Ty>(func(src[index_src]));
       thread_offset += stride_ny;
     }
   } else {
@@ -400,7 +400,7 @@ __device__ __forceinline__ void ReadDataReduce(
         }
       }
       uint32_t index_src = index_cal(thread_offset + block_offset);
-      dst[nx + ny * NX] = src[index_src];
+      dst[nx + ny * NX] = static_cast<Ty>(func(src[index_src]));
       thread_offset += stride_ny;
     }
   }
diff --git a/paddle/fluid/operators/kernel_primitives/helper_primitives.h b/paddle/fluid/operators/kernel_primitives/helper_primitives.h
index 28c226d77ee14384b123c926cdafb4fb071d11e2..48ac1509d1f6e8cd3c6ecf06ac0f3445dac39a51 100644
--- a/paddle/fluid/operators/kernel_primitives/helper_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/helper_primitives.h
@@ -17,64 +17,49 @@ namespace paddle {
 namespace operators {
 namespace kernel_primitives {
-namespace details {
-static __device__ __forceinline__ platform::float16 ExpFunctor(
-    platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
-static __device__ __forceinline__ float ExpFunctor(float x) { return expf(x); }
-static __device__ __forceinline__ double ExpFunctor(double x) { return exp(x); }
-static __device__ __forceinline__ platform::float16 LogFunctor(
-    platform::float16 x) {
-  return ::Eigen::numext::log(x);
-}
-static __device__ __forceinline__ float LogFunctor(float x) { return logf(x); }
-static __device__ __forceinline__ double LogFunctor(double x) { return log(x); }
+#ifdef PADDLE_WITH_XPU2
+struct dim3 {
+  int x;
+  int y;
+  int z;
-/*************************** Compute Functor****************************/
-// for margin_cross_entropy
-template
-struct ExpLogitTransformer {
-  HOSTDEVICE explicit inline ExpLogitTransformer(int n) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx* x) const {
-    return static_cast<Ty>(details::ExpFunctor(x[0]));
-  }
-
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(details::ExpFunctor(x));
+  explicit inline dim3(int split_x, int split_y = 1, int split_z = 1) {
+    x = split_x;
+    y = split_y;
+    z = split_z;
   }
 };
+#endif
-// Post processing function for sum, max, min, prod, any
-template
-struct IdentityFunctor {
-  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
+struct DimConfig {
+  int split_num_x;
+  int split_num_y;
+  int split_num_z;
+  int deal_size_x;
+  int deal_size_y;
+  int deal_size_z;
+  int rem_x;
+  int rem_y;
+  int rem_z;
-  HOSTDEVICE inline Ty operator()(const Tx* x) const {
-    return static_cast<Ty>(x[0]);
+  HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z,
+                                       int size_x, int size_y, int size_z) {
+    split_num_x = split_x;
+    split_num_y = split_y;
+    split_num_z = split_z;
+    deal_size_x = size_x;
+    deal_size_y = size_y;
+    deal_size_z = size_z;
   }
-  HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(x);
+  HOSTDEVICE void SetRem(int rem_nx, int rem_ny, int rem_nz) {
+    rem_x = rem_nx;
+    rem_y = rem_ny;
+    rem_z = rem_nz;
   }
 };
-// Post processing function for mean
-template
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T* x) const { return x[0] * n_inv; }
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
-}  // namespace details
 }  // namespace kernel_primitives
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
index 9a4f8bb026b9da6d6d3d04d8ed020d0a98c01548..e20e77ae26a711a89364868ea53f18c2f1a0b37f 100644
--- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
@@ -13,11 +13,45 @@
 // limitations under the License.
 
 #pragma once
+#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
+#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
+#ifdef PADDLE_WITH_XPU2
+#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
+#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
+#define THREAD_ID_X core_id()
+#define THREAD_ID_Y 0
+#define THREAD_ID_Z 0
+
+#define BLOCK_NUM_X core_num()
+#define BLOCK_NUM_Y 0
+#define BLOCK_NUM_Z 0
+#define BLOCK_ID_X cluster_id()
+#define BLOCK_ID_Y 0
+#define BLOCK_ID_Z 0
+
+#define GRID_NUM_X cluster_num()
+#define GRID_NUM_Y 0
+#define GRID_NUM_Z 0
+#else
 #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
 #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
-#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
+#define THREAD_ID_X threadIdx.x
+#define THREAD_ID_Y threadIdx.y
+#define THREAD_ID_Z threadIdx.z
+
+#define BLOCK_NUM_X blockDim.x
+#define BLOCK_NUM_Y blockDim.y
+#define BLOCK_NUM_Z blockDim.z
+
+#define BLOCK_ID_X blockIdx.x
+#define BLOCK_ID_Y blockIdx.y
+#define BLOCK_ID_Z blockIdx.z
+
+#define GRID_NUM_X gridDim.x
+#define GRID_NUM_Y gridDim.y
+#define GRID_NUM_Z gridDim.z
+#endif
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu
index d617a384cf30119037db05a797739e272574fa94..7c5e64d2afa46a9a57fb8411c0592cd567d7cd8d 100644
--- a/paddle/fluid/operators/margin_cross_entropy_op.cu
+++ b/paddle/fluid/operators/margin_cross_entropy_op.cu
@@ -130,7 +130,7 @@ __global__ void AddMarginToPositiveLogitsKernel(
 template
 struct ExpAndSum {
-  using Transformer = kpds::ExpLogitTransformer;
+  using Transformer = kps::ExpFunctor;
 
   inline Ty initial() { return static_cast<Ty>(0.0f); }
 
@@ -159,7 +159,7 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
                                         const int64_t N, const int64_t D) {
   CUDA_KERNEL_LOOP(i, N * D) {
     auto row = i / D;
-    logits[i] -= kpds::LogFunctor(logits_sum_per_row[row]);
+    logits[i] -= kps::details::Log(logits_sum_per_row[row]);
   }
 }
 
@@ -174,9 +174,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel(
     if ((col + start_index) == labels[row]) {
       auto softmax = log_softmax[i];
       loss[row] = -softmax;
-      log_softmax[i] = kpds::ExpFunctor(softmax);
+      log_softmax[i] = kps::details::Exp(softmax);
     } else {
-      log_softmax[i] = kpds::ExpFunctor(log_softmax[i]);
+      log_softmax[i] = kps::details::Exp(log_softmax[i]);
     }
   }
 }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index bdd84ca153a23ba9530f44bcabeaa75d5bcda6aa..90adea60927c0fd2bd0e2feae82ea4cc7a9bcf51 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -24,11 +24,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-namespace kpds = paddle::operators::kernel_primitives::details;
+namespace kps = paddle::operators::kernel_primitives;
 
 template
 struct CustomMin {
-  using Transformer = kpds::IdentityFunctor;
+  using Transformer = kps::IdentityFunctor;
 
   inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::max());
@@ -41,7 +41,7 @@ struct CustomMax {
-  using Transformer = kpds::IdentityFunctor;
+  using Transformer = kps::IdentityFunctor;
 
   inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
@@ -55,7 +55,7 @@
 // for cub::Reduce
 template
 struct CustomSum {
-  using Transformer = kpds::IdentityFunctor;
+  using Transformer = kps::IdentityFunctor;
 
   inline Ty initial() { return static_cast<Ty>(0.0f); }
 
@@ -66,7 +66,7 @@
 template
 struct CustomMean {
-  using Transformer = kpds::DivideFunctor;
+  using Transformer = kps::DivideFunctor;
 
   inline Ty initial() { return static_cast<Ty>(0.0f); }
 
@@ -77,7 +77,7 @@
 template
 struct CustomMul {
-  using Transformer = kpds::IdentityFunctor;
+  using Transformer = kps::IdentityFunctor;
 
   inline Ty initial() { return static_cast<Ty>(1.0f); }
 
@@ -88,7 +88,7 @@
 template
 struct CustomLogicalOr {
-  using Transformer = kpds::IdentityFunctor;
+  using Transformer = kps::IdentityFunctor;
 
   inline Ty initial() { return static_cast<Ty>(false); }
 
@@ -99,7 +99,7 @@
 template
 struct CustomLogicalAnd {
-  using Transformer = kpds::IdentityFunctor;
+  using Transformer = kps::IdentityFunctor;
 
   inline Ty initial() { return static_cast<Ty>(true); }
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index bf451272a47b0a9a43b2f9a04dfc5e19877d9299..6b3b4843200188f81f38826bb5991c5c62de872d 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -165,10 +165,93 @@ struct IndexCalculator {
   framework::Array divmoders;
 };
 
+template
+struct ReduceIndexMapping {
+  const kps::DimConfig dim;
+  HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
+      : dim(dims) {}
+
+  __device__ __forceinline__ int BlockIdX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    } else {
+      return cluster_id() % dim.split_num_x;
+    }
+#else
+    return blockIdx.x;
+#endif
+  }
+
+  __device__ __forceinline__ int BlockIdY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return (cluster_id() % dim.split_num_x);
+    } else {
+      return (cluster_id() / dim.split_num_x % dim.split_num_y);
+    }
+#else
+    return blockIdx.y;
+#endif
+  }
+
+  __device__ __forceinline__ int BlockDimX() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_x;
+#else
+    return blockDim.x;
+#endif
+  }
+
+  __device__ __forceinline__ int BlockDimY() {
+#ifdef PADDLE_WITH_XPU2
+    return dim.deal_size_y;
+#else
+    return blockDim.y;
+#endif
+  }
+
+  __device__ __forceinline__ int GridDimX() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_y;
+    } else {
+      return dim.split_num_x;
+    }
+#else
+    return gridDim.x;
+#endif
+  }
+
+  __device__ __forceinline__ int GridDimY() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.split_num_x;
+    } else {
+      return dim.split_num_y;
+    }
+#else
+    return gridDim.y;
+#endif
+  }
+
+  __device__ __forceinline__ int GetLoopSize() {
+#ifdef PADDLE_WITH_XPU2
+    if (ReduceLastDim) {
+      return dim.deal_size_y;
+    } else {
+      return dim.deal_size_x;
+    }
+#else
+    return 1;
+#endif
+  }
+};
+
 // when reduce_type == kReduceLastDim this struct will be used
 // for higher performance
-struct LastDimIndexCal {
-  explicit LastDimIndexCal(int num) : stride(num) {}
+struct OneDimIndexCal {
+  explicit OneDimIndexCal(int num) : stride(num) {}
 
   __device__ inline int operator()(int index) const { return index * stride; }
 
   int stride;
@@ -331,8 +414,16 @@ struct ReduceConfig {
     if (rank == reduce_rank || is_last_dim) {
       reduce_type = static_cast(ReduceType::kReduceLastDim);
     } else if (reduce_rank == 1) {
-      // ReduceFirstDim and reduceSecondDim
+// ReduceFirstDim and reduceSecondDim
+#ifdef PADDLE_WITH_XPU2
+      if (reduce_dim[0] == 0) {
+        reduce_type = static_cast(ReduceType::kReduceHigherDim);
+      } else {
+        reduce_type = static_cast(ReduceType::kReduceAny);
+      }
+#else
       reduce_type = static_cast(ReduceType::kReduceHigherDim);
+#endif
     } else {
       reduce_type = static_cast(ReduceType::kReduceAny);
     }
@@ -408,59 +499,61 @@ struct ReduceConfig {
   // for ReduceHigherDim: if block is enough -> splite reduce_num
   //                      else init block(32, 1) grid(block_num, 1)
   // for others: block(block_num, 1) , grid(left_num, 1)
+  void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) {
+    int last_dim_num = x_dim.back();
+    // update left_num
+    int grid_z = left_num / last_dim_num;
+    left_num = last_dim_num;
+    grid_dim->z = grid_z;
+    int device_id = platform::GetCurrentDeviceId();
+    int max_mp = platform::GetCUDAMultiProcessors(device_id);
+    int max_threads_per_mp =
+        platform::GetCUDAMaxThreadsPerMultiProcessor(device_id);
+    int max_threads = max_threads_per_mp * max_mp;
+    // init
+    int num_block = (max_threads / left_num);
+    block_dim->x = details::GetBlockDim(left_num);
+    grid_dim->x = details::AlignUp(left_num, block_dim->x);
+    blocking_size = reduce_num;
+
+    if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
+      blocking_size = details::GetLastPow2(reduce_num / num_block);
+      if (blocking_size <= 1) {
+        blocking_size = details::GetLastPow2(sqrt(reduce_num));
+      } else if (blocking_size * 2 < reduce_num) {
+        blocking_size *= 2;
+      }
+      should_reduce_again = true;
+      grid_dim->y = details::AlignUp(reduce_num, blocking_size);
+    }
+  }
+
   void SetBlockDim() {
     // init
     int block_num = details::GetBlockDim(reduce_num);
     should_reduce_again = false;
-
-    dim3 block_dim(block_num, 1);
-    dim3 grid_dim(left_num, 1);
+    dim3 block_dim(block_num, 1, 1);
+    dim3 grid_dim(left_num, 1, 1);
     blocking_size = reduce_num;
-
+#ifdef PADDLE_WITH_XPU2
+    if (reduce_last_dim) {
+      block_dim.x = 128;
+      block_dim.y = reduce_num;
+      grid_dim.x = 8;
+      grid_dim.y = 1;
+    } else {
+      block_dim.x = 128;
+      block_dim.y = left_num;
+      grid_dim.x = 8;
+      grid_dim.y = 1;
+    }
+#else
     if (reduce_type == ReduceType::kReduceHigherDim) {
-      int last_dim_num = x_dim.back();
-      // update left_num
-      int grid_z = left_num / last_dim_num;
-      left_num = last_dim_num;
-
-      block_dim.z = 1;
-      grid_dim.z = grid_z;
-
-      int device_id = platform::GetCurrentDeviceId();
-      int max_mp = platform::GetCUDAMultiProcessors(device_id);
-      int max_threads_per_mp =
-          platform::GetCUDAMaxThreadsPerMultiProcessor(device_id);
-      int max_threads = max_threads_per_mp * max_mp;
-
-      // init
-      int num_block = (max_threads / left_num);
-
-      if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
-        blocking_size = details::GetLastPow2(reduce_num / num_block);
-
-        if (blocking_size <= 1) {
-          blocking_size = details::GetLastPow2(sqrt(reduce_num));
-        } else if (blocking_size * 2 < reduce_num) {
-          blocking_size *= 2;
-        }
-
-        should_reduce_again = true;
-
-        block_dim.x = details::GetBlockDim(left_num);
-        block_dim.y = 1;
-        grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x;
-        grid_dim.y = (reduce_num + blocking_size - 1) / blocking_size;
-
-      } else {
-        block_dim.x = details::GetBlockDim(left_num);
-        block_dim.y = 1;
-        blocking_size = reduce_num;
-        grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x;
-        grid_dim.y = 1;
-      }
+      SetBlockDimForHigher(&block_dim, &grid_dim);
     } else {
       SetBlockDimForReduceAny(&block_dim, &grid_dim);
     }
+#endif
 
     block = block_dim;
     grid = grid_dim;
@@ -487,72 +580,6 @@ struct ReduceConfig {
   dim3 block;
   dim3 grid;
 };
-/* size : how many colonms left have to be reduced
- * loop : how many rows data have to be reduced
- * block_size: max rows this block to reduce
- */
-template
-__device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer,
-                                     TransformOp transformer, MPType init,
-                                     int reduce_num, int left_num,
-                                     int block_size) {
-  const int NY = 1;
-  int idx = blockIdx.x * blockDim.x;
-  int idy = blockIdx.y * block_size;
-  // block_offset of rows
-  Tx reduce_input[NY];
-  MPType reduce_compute[NY];
-  MPType result = init;
-  // the offset of this block
-  int block_offset = idy * left_num + idx + blockIdx.z * reduce_num * left_num;
-  const Tx* input = x + block_offset;
-  int store_offset =
-      blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num + idx;
-  // how many columns left
-  int size = left_num - idx;
-  // how many rows have to be reduced
-  int loop = reduce_num - idy;
-  loop = loop > block_size ? block_size : loop;
-
-  for (int loop_index = 0; loop_index < loop; loop_index += NY) {
-    kps::ReadData(
-        &reduce_input[0], input + loop_index * left_num, size, NY, 1, left_num);
-    kps::ElementwiseUnary(
-        &reduce_compute[0], &reduce_input[0], transformer);
-    kps::Reduce(
-        &result, &reduce_compute[0], reducer, false);
-  }
-
-  Ty temp_data = static_cast<Ty>(result);
-  kps::WriteData(y + store_offset, &temp_data, size);
-}
-
-template
-__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var,
-                                    ReduceOp reducer, TransformOp transformer,
-                                    MPType init, int reduce_num, int input_idx,
-                                    bool reduce_last_dim,
-                                    const Calculator& reduce_index_calculator,
-                                    int stride, int num) {
-  Tx input_reg[REDUCE_VEC_SIZE];
-  MPType input_compute[REDUCE_VEC_SIZE];
-  MPType input_transform[REDUCE_VEC_SIZE];
-
-  kps::Init(&input_compute[0], init);
-  kps::ReadDataReduce(
-      &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
-      1, stride, reduce_last_dim);
-  kps::ElementwiseUnary(
-      &input_transform[0], &input_reg[0], transformer);
-  kps::Init(input_compute, input_transform,
-            num);
-  kps::Reduce(
-      reduce_var, &input_compute[0], reducer, reduce_last_dim);
-}
 
 // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
 // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
@@ -564,54 +591,76 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer,
                                 int reduce_num, int left_num,
                                 bool reduce_last_dim,
                                 const Calculator reduce_index_calculator,
-                                const Calculator left_index_calculator) {
+                                const Calculator left_index_calculator,
+                                const kps::DimConfig dim) {
   int input_idx, left_idx, stride;
   int block_size = 0;
   bool need_store = true;
+  int loop_left = 0;
   int tid = 0;
   // the last dim gets involved in reduction
+  int store_offset = 0;
+  int stride_left = 0;
   if (reduce_last_dim) {
-    input_idx = blockIdx.y * blockDim.x;
-    left_idx = blockIdx.x * blockDim.y + threadIdx.y;
-    stride = gridDim.y * blockDim.x;
-    block_size = blockDim.x;
-    need_store = (threadIdx.x == 0) && (left_idx < left_num);
+    auto block = ReduceIndexMapping(dim);
+    input_idx = block.BlockIdY() * block.BlockDimX();
+    left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y;
+    stride = block.GridDimY() * block.BlockDimX();
+    block_size = block.BlockDimX();
+    need_store = (THREAD_ID_X == 0) && (left_idx < left_num);
+    store_offset = block.BlockIdY() * left_num + left_idx;
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = 1;
     tid = threadIdx.x;
   } else {
-    input_idx = blockIdx.y * blockDim.y;
-    left_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    stride = gridDim.y * blockDim.y;
-    block_size = blockDim.y;
-    need_store = (threadIdx.y == 0) && (left_idx < left_num);
+    auto block = ReduceIndexMapping(dim);
+    input_idx = block.BlockIdY() * block.BlockDimY();
+    left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X;
+    stride = block.GridDimY() * block.BlockDimY();
+    block_size = block.BlockDimY();
+    need_store = (THREAD_ID_Y == 0) && (left_idx < left_num);
+    loop_left = min(block.GetLoopSize(), left_num - left_idx);
+    stride_left = block.BlockDimX() * block.GridDimX();
+    store_offset = block.BlockIdY() * left_num + left_idx;
     tid = threadIdx.y;
   }
-  int store_offset = blockIdx.y * left_num + left_idx;
   // calculate the offset, means the addr where each thread really start.
-  int input_offset = left_index_calculator(left_idx);
-  const Tx* input = x + input_offset;
-  MPType reduce_var = init;
-  Ty store_data;
-  // 1. reduce for each thread
-  if (left_idx < left_num) {
+  MPType input_compute[REDUCE_VEC_SIZE];
+  Tx input_reg[REDUCE_VEC_SIZE];
+  for (int i = 0; i < loop_left; i += stride_left) {
+    int input_offset = left_index_calculator(left_idx + i);
+    const Tx* input = x + input_offset;
+    MPType reduce_var = init;
     // load REDUCE_VEC_SIZE data once, and then compute
     int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
     for (; input_idx + block_size < bound;
          input_idx += REDUCE_VEC_SIZE * stride) {
-      ReduceAnyKernelImpl(
-          input, &reduce_var, reducer, transformer, init, reduce_num, input_idx,
-          reduce_last_dim, reduce_index_calculator, stride, reduce_num);
+      kps::ReadDataReduce, false>(
+          &input_reg[0], input, input_idx, reduce_index_calculator, 1,
+          reduce_num, 1, stride, kps::IdentityFunctor(), reduce_last_dim);
+      kps::ElementwiseUnary(
+          &input_compute[0], &input_reg[0], transformer);
+      kps::Reduce(
+          &reduce_var, &input_compute[0], reducer, reduce_last_dim);
     }
-    int num = (reduce_num - input_idx - tid + stride - 1) / stride;
-    ReduceAnyKernelImpl(
-        input, &reduce_var, reducer, transformer, init, reduce_num - input_idx,
-        input_idx, reduce_last_dim, reduce_index_calculator, stride, num);
-  }
-  kps::Reduce(
-      &reduce_var, &reduce_var, reducer, reduce_last_dim);
-  if (need_store) {
-    y[store_offset] = static_cast<Ty>(reduce_var);
+    kps::Init(&input_compute[0], init);
+    kps::ReadDataReduce(
+        &input_compute[0], input, input_idx, reduce_index_calculator, 1,
+        reduce_num - input_idx, 1, stride, transformer, reduce_last_dim);
+    kps::Reduce(
+        &reduce_var, &input_compute[0], reducer, reduce_last_dim);
+
+    kps::Reduce(
+        &reduce_var, &reduce_var, reducer, reduce_last_dim);
+    if (need_store) {
+      y[store_offset + i] = static_cast<Ty>(reduce_var);
+    }
   }
 }
 
@@ -620,21 +669,55 @@ template = blockDim.x) {  // complete segment
-    HigherDimDealSegment(
-        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
-  } else {
-    HigherDimDealSegment(
-        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
+  auto block = ReduceIndexMapping(dim);
+  int idy = block.BlockIdY() * blocking_size;
+  int idx = block.BlockIdX() * block.BlockDimX();
+  int idz = BLOCK_ID_Z * left_num;
+  int stride = dim.split_num_x * dim.deal_size_x;
+  int size = left_num - dim.rem_x;
+  int loop_size = min(reduce_num - idy, blocking_size);
+  int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
+  int block_offset = idy * left_num + idz * reduce_num;
+  const Tx* input = x + block_offset;
+  Tx reduce_input;
+  for (; idx < size; idx += stride) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData(&reduce_input,
+                    input + loop_idx * left_num + idx,
+                    block.BlockDimX(), 1, 1, left_num);
+      kps::ElementwiseUnary(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData(y + store_offset + idx, &result,
+                   block.BlockDimX());
+  }
+
+  if (idx < left_num) {
+    MPType reduce_var = init;
+    MPType reduce_compute = init;
+    for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
+      kps::ReadData(&reduce_input,
+                    input + loop_idx * left_num + idx,
+                    dim.rem_x, 1, 1, left_num);
+      kps::ElementwiseUnary(
+          &reduce_compute, &reduce_input, transformer);
+      kps::Reduce(
+          &reduce_var, &reduce_compute, reducer, false);
+    }
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::WriteData(y + store_offset + idx, &result,
+                   dim.rem_x);
   }
 }
@@ -648,14 +731,27 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
   int stride_reduce = 1;
   int stride_left = config.reduce_num;
   // for higher performance
-  auto reduce_index_calculator = LastDimIndexCal(stride_reduce);
-  auto left_index_calculator = LastDimIndexCal(stride_left);
+  auto reduce_index_calculator = OneDimIndexCal(stride_reduce);
+  auto left_index_calculator = OneDimIndexCal(stride_left);
+  kps::DimConfig dim =
+      kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                     config.block.x, config.block.y, 0);
+  dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+
+#ifdef PADDLE_WITH_XPU2
+  ReduceAnyKernel<<<8, 128, stream>>>(
+      x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+      init, config.reduce_num, config.left_num, config.reduce_last_dim,
+      reduce_index_calculator, left_index_calculator, dim);
+#else
   ReduceAnyKernel<<>>(
+                  OneDimIndexCal><<>>(
       x_data, config.output_data, reducer, TransformOp(config.reduce_num),
       init, config.reduce_num, config.left_num, config.reduce_last_dim,
-      reduce_index_calculator, left_index_calculator);
+      reduce_index_calculator, left_index_calculator, dim);
+#endif
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -665,11 +761,25 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
         config.x_strides);
     auto left_index_calculator = IndexCalculator(
         left_rank, config.left_dim, config.left_strides, config.x_strides);
+
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.block.y, 0);
+    dim.SetRem(config.reduce_num % config.block.x, 0, 0);
+
+#ifdef PADDLE_WITH_XPU2
+    ReduceAnyKernel<<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        init, config.reduce_num, config.left_num, config.reduce_last_dim,
+        reduce_index_calculator, left_index_calculator, dim);
+#else
     ReduceAnyKernel<<>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         init, config.reduce_num, config.left_num, config.reduce_last_dim,
-        reduce_index_calculator, left_index_calculator);
+        reduce_index_calculator, left_index_calculator, dim);
+#endif
   }
 
   if (config.should_reduce_again) {
@@ -683,12 +793,25 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
       grid = dim3(config.grid.x, 1, config.grid.z);
     }
 
+    auto last_index = OneDimIndexCal(1);
+    auto first_index = OneDimIndexCal(config.left_num);
+    kps::DimConfig dim =
+        kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+    dim.SetRem(config.left_num % block.x, 0, 0);
+#ifdef PADDLE_WITH_XPU2
+    ReduceHigherDimKernel><<<8, 128, stream>>>(
+        config.output_data, y_data, reducer,
+        kps::IdentityFunctor(config.grid.y), init, config.grid.y,
+        config.left_num, config.grid.y, dim);
+#else
     ReduceHigherDimKernel<
         Ty, Ty, MPType, ReduceOp,
-        kps::details::IdentityFunctor><<>>(
+        kps::IdentityFunctor><<>>(
        config.output_data, y_data, reducer,
-        kps::details::IdentityFunctor(config.grid.y), init,
-        config.grid.y, config.left_num, config.grid.y);
+        kps::IdentityFunctor(config.grid.y), init, config.grid.y,
+        config.left_num, config.grid.y, dim);
+#endif
   }
 }
 
@@ -699,7 +822,7 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
                              gpuStream_t stream) {
   auto x_dim = framework::vectorize(x.dims());
   auto config = ReduceConfig(origin_reduce_dims, x_dim);
-  config.Run();  // get the parameters of LaunchReduceKernel
+  config.Run();
   int numel = x.numel();
   // after config.run()
   // SetOutputData for ReduceHigherDim when should_reduce_again is true,
@@ -759,23 +882,49 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32
   if (config.reduce_type == ReduceType::kReduceHigherDim) {
     using TransformOp = typename ReduceOp::Transformer;
-
+    kps::DimConfig dim =
+        kps::DimConfig(config.grid.x, config.grid.y, config.grid.z,
+                       config.block.x, config.blocking_size, 0);
+    dim.SetRem(config.left_num % config.block.x,
+               config.reduce_num % config.blocking_size, 0);
+
+#ifdef PADDLE_WITH_XPU2
+    ReduceHigherDimKernel,
+        TransformOp><<<8, 128, stream>>>(
+        x_data, config.output_data, reducer, TransformOp(config.reduce_num),
+        reducer.initial(), config.reduce_num, config.left_num,
+        config.blocking_size, dim);
+#else
     ReduceHigherDimKernel<
         Tx, Ty, MPType, ReduceOp,
         TransformOp><<>>(
         x_data, config.output_data, reducer, TransformOp(config.reduce_num),
         reducer.initial(), config.reduce_num, config.left_num,
-        config.blocking_size);
+        config.blocking_size, dim);
+#endif
 
     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
       dim3 grid = dim3(config.grid.x, 1, config.grid.z);
-      ReduceHigherDimKernel,
-                            kps::details::IdentityFunctor<
-                                Ty, MPType>><<>>(
+      kps::DimConfig dim2 =
+          kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
+      dim2.SetRem(config.left_num % config.block.x, 0, 0);
+
+#ifdef PADDLE_WITH_XPU2
+      ReduceHigherDimKernel<
+          Ty, Ty, MPType, ReduceOp,
+          kps::IdentityFunctor><<<8, 128, stream>>>(
+          config.output_data, y_data, reducer,
+          kps::IdentityFunctor(config.grid.y), reducer.initial(),
+          config.grid.y, config.left_num, config.grid.y, dim2);
+#else
+      ReduceHigherDimKernel<
+          Ty, Ty, MPType, ReduceOp,
+          kps::IdentityFunctor><<>>(
          config.output_data, y_data, reducer,
-          kps::details::IdentityFunctor(config.grid.y),
-          reducer.initial(), config.grid.y, config.left_num, config.grid.y);
+          kps::IdentityFunctor(config.grid.y), reducer.initial(),
+          config.grid.y, config.left_num, config.grid.y, dim2);
+#endif
     }
     return;
   }
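The hunks above thread a kps::DimConfig through each reduce kernel so that, on the PADDLE_WITH_XPU2 path, ReduceIndexMapping can recover grid/block geometry from split_num_*/deal_size_* instead of CUDA's built-in blockIdx/blockDim/gridDim, with rem_* recording the tail that does not fill a whole block. The host-side sketch below shows how such a configuration is populated; the DimConfig here is a local re-declaration that only mirrors the struct added in helper_primitives.h, and the shape and block numbers are hypothetical, so treat it as an illustration under those assumptions rather than Paddle's actual launch code.

// Standalone sketch: mirrors the DimConfig added in helper_primitives.h
// (split_num_* = grid dims, deal_size_* = block dims, rem_* = tail elements
// that do not fill a whole tile). Not part of the patch itself.
#include <cstdio>

struct DimConfig {
  int split_num_x, split_num_y, split_num_z;
  int deal_size_x, deal_size_y, deal_size_z;
  int rem_x, rem_y, rem_z;
  DimConfig(int sx, int sy, int sz, int dx, int dy, int dz)
      : split_num_x(sx), split_num_y(sy), split_num_z(sz),
        deal_size_x(dx), deal_size_y(dy), deal_size_z(dz),
        rem_x(0), rem_y(0), rem_z(0) {}
  void SetRem(int rx, int ry, int rz) { rem_x = rx; rem_y = ry; rem_z = rz; }
};

int main() {
  // Hypothetical reduce shape: 1000 elements reduced per row, 300 rows kept.
  int reduce_num = 1000, left_num = 300;
  int block_x = 256, block_y = 1;  // threads per block (made-up values)
  int grid_x = (left_num + block_x - 1) / block_x, grid_y = 1, grid_z = 1;

  // Same pattern as LaunchReduceKernel in reduce_op.cu.h above: record the
  // launch geometry, then note how many inputs along the reduce axis are
  // left over after tiling by block_x.
  DimConfig dim(grid_x, grid_y, grid_z, block_x, block_y, 0);
  dim.SetRem(reduce_num % block_x, 0, 0);

  std::printf("grid=(%d,%d,%d) block=(%d,%d) rem_x=%d\n", dim.split_num_x,
              dim.split_num_y, dim.split_num_z, dim.deal_size_x,
              dim.deal_size_y, dim.rem_x);
  return 0;
}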