diff --git a/paddle/phi/kernels/cpu/dirichlet_kernel.cc b/paddle/phi/kernels/cpu/dirichlet_kernel.cc
index 76ef231344199e542afcc85d2aaebffa8b017c76..c124920dfa0db8af9cb85e5b4b5889b664dfe989 100644
--- a/paddle/phi/kernels/cpu/dirichlet_kernel.cc
+++ b/paddle/phi/kernels/cpu/dirichlet_kernel.cc
@@ -16,9 +16,9 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/elementwise.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/dirichlet_kernel_impl.h"
 
@@ -83,7 +83,7 @@ struct DirichletSampler {
     gamma_sum.Resize(new_shape);
     dev_ctx.template Alloc<T>(&gamma_sum);
 
-    ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
+    funcs::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
         dev_ctx,
         gamma_samples,
         &gamma_sum,
diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h
index dad288cff2c1a4565a7b67d688936d231a6947c9..e5f610b9554097676afe26f94207b7fcd8cb06a2 100644
--- a/paddle/phi/kernels/cpu/reduce.h
+++ b/paddle/phi/kernels/cpu/reduce.h
@@ -16,181 +16,11 @@
 #include <set>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/cast_kernel.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-namespace phi {
-
-template <typename DeviceContext,
-          typename T,
-          size_t D,
-          size_t R_D,
-          typename Functor>
-void ReduceFunctor(const DeviceContext& context,
-                   const phi::DenseTensor& input,
-                   phi::DenseTensor* output,
-                   const std::vector<int64_t>& dims,
-                   bool keep_dim) {
-  auto x = EigenTensor<T, D>::From(input);
-  auto x_rank = static_cast<int>(x.dimensions().size());
-  auto reduce_dim = Eigen::array<int, R_D>();
-  std::vector<int64_t> dims_ref = dims;
-  for (size_t i = 0; i < dims_ref.size(); ++i) {
-    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
-    reduce_dim[i] = dims_ref[i];
-  }
-  // construct the squeezed output tensor
-  DDim out_dims = output->dims();
-  if (keep_dim && x_rank > 1) {
-    const int kDelFlag = -2;
-    auto dims_vector = phi::vectorize(out_dims);
-    for (size_t i = 0; i < dims_ref.size(); ++i) {
-      dims_vector[dims_ref[i]] = kDelFlag;
-    }
-    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
-                      dims_vector.end());
-    out_dims = phi::make_ddim(dims_vector);
-  }
-  auto& place = *context.eigen_device();
-  Functor functor;
-
-  if (D == 1) {
-    auto out = EigenScalar<T>::From(*output);
-    functor(place, &x, &out, reduce_dim);
-  } else {
-    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
-    functor(place, &x, &out, reduce_dim);
-  }
-}
-
-#define HANDLE_REDUCE_DIM(NDIM, RDIM)                        \
-  if (ndim == NDIM && rdim == RDIM) {                        \
-    ReduceFunctor<DeviceContext, OutT, NDIM, RDIM, Functor>( \
-        dev_ctx, input, output, dims, keep_dim);             \
-  }
-//////////////// HandleLargeDim
-
-inline void GetShuffledDim(const DDim& src_dims,
-                           DDim* dst_dims,
-                           const std::vector<int64_t>& reduced_dims,
-                           std::vector<int>* perm_axis) {
-  // check if it's a reduced dim
-  std::vector<bool> src_dims_check(src_dims.size(), false);
-  size_t src_size = src_dims.size();
-  size_t reduce_size = reduced_dims.size();
-  std::vector<int64_t> regular_reduced_dims = reduced_dims;
-  for (size_t i = 0; i < regular_reduced_dims.size(); i++) {
-    if (regular_reduced_dims[i] < 0) {
-      regular_reduced_dims[i] = src_size + regular_reduced_dims[i];
-    }
-  }
-
-  for (size_t i = 0; i < reduce_size; ++i) {
-    dst_dims->at(src_size - reduce_size + i) =
-        src_dims[regular_reduced_dims[i]];
-    (*perm_axis)[src_size - reduce_size + i] = regular_reduced_dims[i];
-    src_dims_check[regular_reduced_dims[i]] = true;
-  }
-
-  size_t offset = 0;
-  for (size_t i = 0; i < src_dims_check.size(); ++i) {
-    bool is_reduced = src_dims_check[i];
-    if (!is_reduced) {
-      (*perm_axis)[offset] = i;
-      dst_dims->at(offset++) = src_dims[i];
-    }
-  }
-}
-
-template <typename DeviceContext, typename OutT>
-void GetShuffledInput(const DeviceContext& dev_ctx,
-                      const phi::DenseTensor& input,
-                      phi::DenseTensor* shuffled_input,
-                      const std::vector<int64_t>& dims) {
-  DDim shuffled_dims(input.dims());
-  std::vector<int> perm_axis(input.dims().size());
-  GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis);
-
-  shuffled_input->Resize(shuffled_dims);
-  dev_ctx.template Alloc<OutT>(shuffled_input);
-
-  phi::funcs::TransposeNormal<DeviceContext, OutT> trans;
-  trans(dev_ctx, input, shuffled_input, perm_axis);
-}
-
-template <typename DeviceContext, typename OutT, typename Functor>
-void HandleLargeDim(const DeviceContext& dev_ctx,
-                    const phi::DenseTensor& input,
-                    phi::DenseTensor* output,
-                    const std::vector<int64_t>& dims,
-                    bool keep_dim) {
-  // shuffle the reduced dim to the end
-  phi::DenseTensor shuffled_input;
-  GetShuffledInput<DeviceContext, OutT>(dev_ctx, input, &shuffled_input, dims);
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 
-  // transpose to 2D tensor whose shape is {unreduced, reduced}.
-  const int64_t unreduced = output->numel();
-  const int64_t reduced = shuffled_input.numel() / unreduced;
-  shuffled_input.ResizeAndAllocate({unreduced, reduced});
-  DDim output_dim = output->dims();
-  output->ResizeAndAllocate({unreduced});
-  ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
-      dev_ctx, shuffled_input, output, {1}, keep_dim);
-  output->ResizeAndAllocate(output_dim);
-}
-
-////////////// ReduceKernel
-
-template <typename DeviceContext, typename T, typename OutT, typename Functor>
-void ReduceKernelImpl(const DeviceContext& dev_ctx,
-                      const phi::DenseTensor& input,
-                      phi::DenseTensor* output,
-                      const std::vector<int64_t>& dims,
-                      bool keep_dim,
-                      bool reduce_all) {
-  dev_ctx.template Alloc<OutT>(output);
-
-  if (reduce_all) {
-    // Flatten and reduce 1-D tensor
-    auto x = EigenVector<OutT>::Flatten(input);
-    auto out = EigenScalar<OutT>::From(*output);
-    auto& dev = *dev_ctx.eigen_device();
-    auto reduce_dim = Eigen::array<int, 1>({{0}});
-
-    Functor functor;
-    functor(dev, &x, &out, reduce_dim);
-  } else {
-    int ndim = input.dims().size();
-    int rdim = dims.size();
-    if (ndim > 6) {
-      HandleLargeDim<DeviceContext, OutT, Functor>(
-          dev_ctx, input, output, dims, keep_dim);
-
-    } else {
-      HANDLE_REDUCE_DIM(6, 5);
-      HANDLE_REDUCE_DIM(6, 4);
-      HANDLE_REDUCE_DIM(6, 3);
-      HANDLE_REDUCE_DIM(6, 2);
-      HANDLE_REDUCE_DIM(6, 1);
-      HANDLE_REDUCE_DIM(5, 4);
-      HANDLE_REDUCE_DIM(5, 3);
-      HANDLE_REDUCE_DIM(5, 2);
-      HANDLE_REDUCE_DIM(5, 1);
-      HANDLE_REDUCE_DIM(4, 3);
-      HANDLE_REDUCE_DIM(4, 2);
-      HANDLE_REDUCE_DIM(4, 1);
-      HANDLE_REDUCE_DIM(3, 2);
-      HANDLE_REDUCE_DIM(3, 1);
-      HANDLE_REDUCE_DIM(2, 1);
-      HANDLE_REDUCE_DIM(1, 1);
-    }
-  }
-}
+namespace phi {
 
 template <typename DeviceContext, typename T, typename Functor>
 void Reduce(const DeviceContext& dev_ctx,
@@ -218,7 +48,7 @@ void Reduce(const DeviceContext& dev_ctx,
     // do reduce sum
     PD_VISIT_ALL_TYPES(
         x.dtype(), "ReduceKernelImpl", ([&] {
-          phi::ReduceKernelImpl<DeviceContext, T, data_t, Functor>(
+          phi::funcs::ReduceKernelImpl<DeviceContext, T, data_t, Functor>(
              dev_ctx, x, out, dims, keep_dim, reduce_all);
        }));
  } else {
@@ -228,7 +58,7 @@ void Reduce(const DeviceContext& dev_ctx,
     // do reduce sum
     PD_VISIT_ALL_TYPES(
         out_dtype, "ReduceKernelImpl", ([&] {
-          phi::ReduceKernelImpl<DeviceContext, T, data_t, Functor>(
+          phi::funcs::ReduceKernelImpl<DeviceContext, T, data_t, Functor>(
              dev_ctx, tmp_tensor, out, dims, keep_dim, reduce_all);
        }));
  }
@@ -255,7 +85,7 @@ void BoolReduceKernel(const DeviceContext& dev_ctx,
   }
   reduce_all = (reduce_all || full_dim);
 
-  ReduceKernelImpl<DeviceContext, bool, OutT, Functor>(
+  funcs::ReduceKernelImpl<DeviceContext, bool, OutT, Functor>(
      dev_ctx, input, output, dims, keep_dim, reduce_all);
 }
diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cc b/paddle/phi/kernels/funcs/matrix_reduce.cc
index 849fd7a0075a89cedeab4d87c779931f2a14f115..82396185e94521c0447d4c9ca9d82f9b19e933e1 100644
--- a/paddle/phi/kernels/funcs/matrix_reduce.cc
+++ b/paddle/phi/kernels/funcs/matrix_reduce.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/phi/kernels/funcs/matrix_reduce.h"
 
-#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 
 namespace phi {
@@ -47,7 +47,7 @@ class MatrixReduceSumFunctor {
         out_reduce_dims.push_back(idx);
       }
     }
-    phi::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
+    ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
        dev_ctx, in, out, out_reduce_dims, true, false);
  }
};
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
index 446dfc73d5bd692ba15eabc5ffc3a61d55a3809b..92fe3885b42f0a5a6c23bf6f0ea9445658b09a3d 100644
--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -15,8 +15,7 @@
 #pragma once
 
 // CUDA, XPU and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_XPU_KP)
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 
 #include <algorithm>
 #include <cmath>
@@ -40,10 +39,6 @@ namespace cub = hipcub;
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #endif
 
-#include "paddle/phi/api/ext/dispatch.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/utils/array.h"
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
@@ -58,9 +53,19 @@ namespace kps = phi::kps;
 #ifdef PADDLE_WITH_XPU_KP
 using dim3 = phi::kps::dim3;
 #endif
+
+#endif
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/utils/array.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace phi {
 namespace funcs {
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 namespace details {
 
 static inline int GetLastPow2(int n) {
@@ -1190,8 +1195,174 @@ void ReduceKernel(const KPDevice& dev_ctx,
       is_mean);
 }
 
+#endif
+
+template <typename DeviceContext,
+          typename T,
+          size_t D,
+          size_t R_D,
+          typename Functor>
+void ReduceFunctor(const DeviceContext& context,
+                   const phi::DenseTensor& input,
+                   phi::DenseTensor* output,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim) {
+  auto x = EigenTensor<T, D>::From(input);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto reduce_dim = Eigen::array<int, R_D>();
+  std::vector<int64_t> dims_ref = dims;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    reduce_dim[i] = dims_ref[i];
+  }
+  // construct the squeezed output tensor
+  DDim out_dims = output->dims();
+  if (keep_dim && x_rank > 1) {
+    const int kDelFlag = -2;
+    auto dims_vector = phi::vectorize(out_dims);
+    for (size_t i = 0; i < dims_ref.size(); ++i) {
+      dims_vector[dims_ref[i]] = kDelFlag;
+    }
+    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+                      dims_vector.end());
+    out_dims = phi::make_ddim(dims_vector);
+  }
+  auto& place = *context.eigen_device();
+  Functor functor;
+
+  if (D == 1) {
+    auto out = EigenScalar<T>::From(*output);
+    functor(place, &x, &out, reduce_dim);
+  } else {
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    functor(place, &x, &out, reduce_dim);
+  }
+}
+
+#define HANDLE_REDUCE_DIM(NDIM, RDIM)                        \
+  if (ndim == NDIM && rdim == RDIM) {                        \
+    ReduceFunctor<DeviceContext, OutT, NDIM, RDIM, Functor>( \
+        dev_ctx, input, output, dims, keep_dim);             \
+  }
+//////////////// HandleLargeDim
+
+inline void GetShuffledDim(const DDim& src_dims,
+                           DDim* dst_dims,
+                           const std::vector<int64_t>& reduced_dims,
+                           std::vector<int>* perm_axis) {
+  // check if it's a reduced dim
+  std::vector<bool> src_dims_check(src_dims.size(), false);
+  size_t src_size = src_dims.size();
+  size_t reduce_size = reduced_dims.size();
+  std::vector<int64_t> regular_reduced_dims = reduced_dims;
+  for (size_t i = 0; i < regular_reduced_dims.size(); i++) {
+    if (regular_reduced_dims[i] < 0) {
+      regular_reduced_dims[i] = src_size + regular_reduced_dims[i];
+    }
+  }
+
+  for (size_t i = 0; i < reduce_size; ++i) {
+    dst_dims->at(src_size - reduce_size + i) =
+        src_dims[regular_reduced_dims[i]];
+    (*perm_axis)[src_size - reduce_size + i] = regular_reduced_dims[i];
+    src_dims_check[regular_reduced_dims[i]] = true;
+  }
+
+  size_t offset = 0;
+  for (size_t i = 0; i < src_dims_check.size(); ++i) {
+    bool is_reduced = src_dims_check[i];
+    if (!is_reduced) {
+      (*perm_axis)[offset] = i;
+      dst_dims->at(offset++) = src_dims[i];
+    }
+  }
+}
+
+template <typename DeviceContext, typename OutT>
+void GetShuffledInput(const DeviceContext& dev_ctx,
+                      const phi::DenseTensor& input,
+                      phi::DenseTensor* shuffled_input,
+                      const std::vector<int64_t>& dims) {
+  DDim shuffled_dims(input.dims());
+  std::vector<int> perm_axis(input.dims().size());
+  GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis);
+
+  shuffled_input->Resize(shuffled_dims);
+  dev_ctx.template Alloc<OutT>(shuffled_input);
+
+  phi::funcs::TransposeNormal<DeviceContext, OutT> trans;
+  trans(dev_ctx, input, shuffled_input, perm_axis);
+}
+
+template <typename DeviceContext, typename OutT, typename Functor>
+void HandleLargeDim(const DeviceContext& dev_ctx,
+                    const phi::DenseTensor& input,
+                    phi::DenseTensor* output,
+                    const std::vector<int64_t>& dims,
+                    bool keep_dim) {
+  // shuffle the reduced dim to the end
+  phi::DenseTensor shuffled_input;
+  GetShuffledInput<DeviceContext, OutT>(dev_ctx, input, &shuffled_input, dims);
+
+  // transpose to 2D tensor whose shape is {unreduced, reduced}.
+  const int64_t unreduced = output->numel();
+  const int64_t reduced = shuffled_input.numel() / unreduced;
+  shuffled_input.ResizeAndAllocate({unreduced, reduced});
+  DDim output_dim = output->dims();
+  output->ResizeAndAllocate({unreduced});
+  ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
+      dev_ctx, shuffled_input, output, {1}, keep_dim);
+  output->ResizeAndAllocate(output_dim);
+}
+
+////////////// ReduceKernel
+
+template <typename DeviceContext, typename T, typename OutT, typename Functor>
+void ReduceKernelImpl(const DeviceContext& dev_ctx,
+                      const phi::DenseTensor& input,
+                      phi::DenseTensor* output,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all) {
+  dev_ctx.template Alloc<OutT>(output);
+
+  if (reduce_all) {
+    // Flatten and reduce 1-D tensor
+    auto x = EigenVector<OutT>::Flatten(input);
+    auto out = EigenScalar<OutT>::From(*output);
+    auto& dev = *dev_ctx.eigen_device();
+    auto reduce_dim = Eigen::array<int, 1>({{0}});
+
+    Functor functor;
+    functor(dev, &x, &out, reduce_dim);
+  } else {
+    int ndim = input.dims().size();
+    int rdim = dims.size();
+    if (ndim > 6) {
+      HandleLargeDim<DeviceContext, OutT, Functor>(
+          dev_ctx, input, output, dims, keep_dim);
+
+    } else {
+      HANDLE_REDUCE_DIM(6, 5);
+      HANDLE_REDUCE_DIM(6, 4);
+      HANDLE_REDUCE_DIM(6, 3);
+      HANDLE_REDUCE_DIM(6, 2);
+      HANDLE_REDUCE_DIM(6, 1);
+      HANDLE_REDUCE_DIM(5, 4);
+      HANDLE_REDUCE_DIM(5, 3);
+      HANDLE_REDUCE_DIM(5, 2);
+      HANDLE_REDUCE_DIM(5, 1);
+      HANDLE_REDUCE_DIM(4, 3);
+      HANDLE_REDUCE_DIM(4, 2);
+      HANDLE_REDUCE_DIM(4, 1);
+      HANDLE_REDUCE_DIM(3, 2);
+      HANDLE_REDUCE_DIM(3, 1);
+      HANDLE_REDUCE_DIM(2, 1);
+      HANDLE_REDUCE_DIM(1, 1);
+    }
+  }
+}
+
 }  // namespace funcs
 }  // namespace phi
-
-#endif
diff --git a/paddle/phi/kernels/gpu/dirichlet_kernel.cu b/paddle/phi/kernels/gpu/dirichlet_kernel.cu
index eb34df90f0864952ec1177a2d91d41411090587d..5a2077c8a592e058f27aeb9e3ac2de8c521c45e7 100644
--- a/paddle/phi/kernels/gpu/dirichlet_kernel.cu
+++ b/paddle/phi/kernels/gpu/dirichlet_kernel.cu
@@ -16,10 +16,10 @@
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/dirichlet_kernel_impl.h"
 
@@ -99,7 +99,7 @@ struct DirichletSampler {
     gamma_sum.Resize(new_shape);
     dev_ctx.template Alloc<T>(&gamma_sum);
 
-    ReduceKernelImpl<GPUContext, T, T, funcs::SumFunctor>(
+    funcs::ReduceKernelImpl<GPUContext, T, T, funcs::SumFunctor>(
         dev_ctx,
         gamma_samples,
        &gamma_sum,
diff --git a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h
index 7a9573ff522b0a2f3c9cc62e39054c434b55282d..7c2eadcb7df7825e52b411d4e1eb9154b204641a 100644
--- a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h
+++ b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h
@@ -16,17 +16,17 @@
 #include <type_traits>
 #include <vector>
 
-#include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/logsumexp_kernel.h"
 
 namespace phi {
 
-#define HANDLE_DIM(NDIM, RDIM)                               \
-  if (ndim == NDIM && rdim == RDIM) {                        \
-    ReduceFunctor<Context, T, NDIM, RDIM, LogsumexpFunctor>( \
-        dev_ctx, x, out, axis, keepdim);                     \
+#define HANDLE_DIM(NDIM, RDIM)                                      \
+  if (ndim == NDIM && rdim == RDIM) {                               \
+    funcs::ReduceFunctor<Context, T, NDIM, RDIM, LogsumexpFunctor>( \
+        dev_ctx, x, out, axis, keepdim);                            \
   }
 
 struct LogsumexpFunctor {
diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
index be32f85fe99a431d837fa9cb467ff5efd6a18c4d..f499e59c307291f91de5b25b72fd4a12b20d68a6 100644
--- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h"
 #include "paddle/phi/kernels/impl/matmul_kernel_impl.h"
@@ -45,7 +45,7 @@ struct ReduceSumForMatmulGrad {
                   const std::vector<int>& reduce_dims) {
     std::vector<int64_t> reduce_dims_tmp(reduce_dims.begin(),
                                          reduce_dims.end());
-    ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
+    funcs::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
        dev_ctx, input, output, reduce_dims_tmp, true, false);
  }
};
diff --git a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
index 214db79383800a6e463a160ef360b3e13a9c2927..a661035ab5b749b81369d9b1dfc6454a14145f98 100644
--- a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
@@ -16,11 +16,11 @@ limitations under the License. */
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/expand_as_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/matrix_solve.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/solve_kernel_impl.h"
 #include "paddle/phi/kernels/squeeze_kernel.h"
@@ -50,7 +50,7 @@ struct ReduceSumForSolvelGrad {
                   bool keep_dims) {
     std::vector<int64_t> reduce_dims_tmp(reduce_dims.begin(),
                                          reduce_dims.end());
-    phi::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
+    funcs::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
        dev_ctx, input, output, reduce_dims_tmp, keep_dims, false);
  }
};
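
For reference, a minimal sketch of what a call site looks like after this patch. The wrapper below is hypothetical (the name SumReduceOverDims and the float/CPUContext instantiation are illustrative only); the include path, the funcs:: qualification, and the <DeviceContext, T, OutT, Functor> template argument order follow the diff above.

// Hypothetical call-site sketch -- not part of the patch. It shows the two
// changes every caller in this diff makes: include funcs/reduce_function.h
// instead of cpu/reduce.h, and qualify ReduceKernelImpl with funcs::.
#include <vector>

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"  // was: cpu/reduce.h
#include "paddle/phi/kernels/funcs/reduce_functor.h"   // for funcs::SumFunctor

namespace example {

// Sum-reduce `in` over `dims` into `out`, keeping the reduced dims as size 1.
// Template arguments follow the four-parameter form used throughout the
// patch: <DeviceContext, input type, output type, reduction functor>.
inline void SumReduceOverDims(const phi::CPUContext& dev_ctx,
                              const phi::DenseTensor& in,
                              const std::vector<int64_t>& dims,
                              phi::DenseTensor* out) {
  phi::funcs::ReduceKernelImpl<phi::CPUContext, float, float,
                               phi::funcs::SumFunctor>(
      dev_ctx, in, out, dims, /*keep_dim=*/true, /*reduce_all=*/false);
}

}  // namespace example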