From c4604025adb84dab35b83a4c5b2bc22e27750ebb Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 3 Jan 2023 14:19:24 +0800
Subject: [PATCH] Use BroadcastKernel and ReduceKernel to optimize expand
 and expand_grad. (#49419)

* Use BroadcastKernel and ReduceKernel to optimize expand and expand_grad.

* Correct the axis when there is only 1 input in BroadcastKernel.

* Add the calculation of the output's shape.
---
 paddle/phi/infermeta/unary.cc                 |  3 -
 paddle/phi/kernels/funcs/broadcast_function.h | 19 +++---
 paddle/phi/kernels/funcs/dims_simplifier.h    | 56 +++++++++--------
 paddle/phi/kernels/gpu/expand_grad_kernel.cu  | 24 ++++++-
 paddle/phi/kernels/gpu/expand_kernel.cu       | 62 ++++++++++++++++++-
 5 files changed, 125 insertions(+), 39 deletions(-)

diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index c3b96b813b8..131d504795f 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -909,9 +909,6 @@ void ExpandInferMeta(const MetaTensor& x,
   auto out_rank =
       std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
   std::vector<int64_t> out_shape(out_rank);
-  auto x_dim_vec = phi::vectorize(x_dims);
-  auto diff = expand_shape.size() - x_dim_vec.size();
-  x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
   for (size_t i = 0; i < expand_shape.size(); ++i) {
     if (x_dims[i] == -1) {
       out_shape[i] = -1;
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 49020337e08..cf974bdbe33 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -1023,15 +1023,20 @@ void BroadcastKernel(const KPDevice &ctx,
                      std::vector<DenseTensor *> *outs,
                      int axis,
                      Functor func) {
-  std::vector<int> dims_size;
-  dims_size.reserve(ins.size());
+  // When there are multiple inputs, the output's rank should be equal to
+  // the maximum rank of all inputs.
+  int max_rank = 0;
+  int min_rank = phi::DDim::kMaxRank;
   for (auto *in : ins) {
-    dims_size.emplace_back(in->dims().size());
+    max_rank = std::max(max_rank, in->dims().size());
+    min_rank = std::min(min_rank, in->dims().size());
   }
-
-  axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) -
-                          *std::min_element(dims_size.begin(), dims_size.end())
-                    : axis;
+  if (ins.size() == 1) {
+    // When there is only 1 input, the input's rank may be less than the
+    // output's rank.
+    max_rank = std::max(max_rank, (*outs)[0]->dims().size());
+  }
+  axis = axis == -1 ? max_rank - min_rank : axis;
   BroadcastKernelForDifferentVecSize<ET, InT, OutT, Functor, NumOuts>(
       ctx, ins, outs, axis, func);
 }
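Note on the BroadcastKernel change above: expand calls BroadcastKernel with a single input whose rank can be lower than the output's, so deriving the implicit axis from input ranks alone (max minus min over ins) would yield 0 and misalign the broadcast. The fix also considers the output's rank in the single-input case. Below is a minimal standalone sketch of the corrected rule; ComputeBroadcastAxis is a hypothetical name used only for illustration, not part of the patch:

    #include <algorithm>
    #include <vector>

    // Sketch of the axis rule in the patched BroadcastKernel: with several
    // inputs, axis spans the rank gap between the largest and smallest input;
    // with a single input, the output may out-rank the input, so the output's
    // rank is taken into account as well.
    int ComputeBroadcastAxis(const std::vector<int>& input_ranks, int out_rank) {
      int max_rank = 0;
      int min_rank = out_rank;  // out_rank bounds every input's rank from above
      for (int r : input_ranks) {
        max_rank = std::max(max_rank, r);
        min_rank = std::min(min_rank, r);
      }
      if (input_ranks.size() == 1) {
        // Single-input case, e.g. expand: a rank-2 input broadcast to a
        // rank-4 output gets axis = 2 (two leading dimensions are padded).
        max_rank = std::max(max_rank, out_rank);
      }
      return max_rank - min_rank;
    }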
diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h
index 39123575469..a52373c117e 100644
--- a/paddle/phi/kernels/funcs/dims_simplifier.h
+++ b/paddle/phi/kernels/funcs/dims_simplifier.h
@@ -25,8 +25,8 @@ struct BroadcastDimsSimplifier {
   typedef void (*MergeFunctor)(
       bool &, std::vector<DimVector> &, DimVector &, int, int);
 
-  int64_t N;
-  int64_t rank;
+  int N;
+  int rank;
   DimVector out_dims;
   std::vector<DimVector> in_dims;
@@ -103,41 +103,43 @@ struct BroadcastDimsSimplifier {
   // To compensate for the lack of input tensors' dimensions with axis.
   void ExtendInputDimensions(int N, int axis) {
     for (auto &in_dim : in_dims) {
-      int64_t in_idx = 0;
       if (in_dim.size() < rank) {
-        DimVector tmp_dim(rank, 1);
-        for (; in_idx < in_dim.size();) {
-          if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) {
-            tmp_dim[axis] = in_dim[in_idx];
-            in_idx++;
-            axis++;
+        DimVector extended_in_dim(rank, 1);
+        int out_idx = axis;
+        for (int in_idx = 0; in_idx < in_dim.size(); in_idx++) {
+          if (in_dim[in_idx] == out_dims[out_idx] || in_dim[in_idx] == 1) {
+            extended_in_dim[out_idx] = in_dim[in_idx];
+            out_idx++;
           } else {
             PADDLE_THROW(phi::errors::InvalidArgument(
                 "The %d-th dimension of input tensor is expected to be equal "
                 "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                axis + 1,
+                "received %d. The input's shape is {%s}, the output's shape is "
+                "{%s}.",
+                in_idx,
+                out_idx,
                 out_dims[axis],
-                in_dim[in_idx]));
+                in_dim[in_idx],
+                phi::make_ddim(in_dim),
+                phi::make_ddim(out_dims)));
           }
         }
         in_dim.resize(rank);
-        std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin());
+        std::copy(
+            extended_in_dim.begin(), extended_in_dim.end(), in_dim.begin());
       } else {
-        for (; in_idx < rank;) {
-          if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) {
-            in_idx++;
-          } else {
-            PADDLE_THROW(phi::errors::InvalidArgument(
-                "The %d-th dimension of input tensor is expected to be equal "
-                "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                in_idx + 1,
-                out_dims[in_idx],
-                in_dim[in_idx]));
-          }
+        for (int in_idx = 0; in_idx < rank; in_idx++) {
+          PADDLE_ENFORCE_EQ(
+              in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1,
+              true,
+              phi::errors::InvalidArgument(
+                  "The %d-th dimension of input tensor is expected to be "
+                  "equal with the %d-th dimension of output tensor %d or 1, "
+                  "but received %d.",
+                  in_idx,
+                  in_idx,
+                  out_dims[in_idx],
+                  in_dim[in_idx]));
         }
       }
       std::reverse(in_dim.begin(), in_dim.end());
diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu
index e8729942b6e..35a6681b7af 100644
--- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu
@@ -17,7 +17,28 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExpandGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const IntArray& shape,
+                      DenseTensor* x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  if (x_grad->dims() == out_grad.dims()) {
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+  } else {
+    std::vector<int> reduce_dims =
+        funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1);
+    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        ctx, out_grad, x_grad, kps::IdentityFunctor<T>(), reduce_dims);
+  }
+}
+
+}  // namespace phi
 
 PD_REGISTER_KERNEL(expand_grad,
                    GPU,
@@ -26,5 +47,6 @@ PD_REGISTER_KERNEL(expand_grad,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t) {}
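Note on the new ExpandGradKernel above: when x_grad and out_grad have the same shape the gradient is a plain copy; otherwise it is a sum-reduction of out_grad over every axis the forward expand broadcast, with the axes chosen by funcs::GetReduceDim. The sketch below is a hypothetical standalone model of that axis selection, shown only to illustrate the rule; it is not the library implementation:

    #include <cstdint>
    #include <vector>

    // Align x_grad's shape to out_grad's from the right; every output axis
    // that is missing in x_grad, or has extent 1 there while the output's
    // extent is larger, must be summed over.
    std::vector<int> ReduceDimsForExpandGrad(
        const std::vector<int64_t>& x_dims, const std::vector<int64_t>& out_dims) {
      std::vector<int> reduce_dims;
      int offset = static_cast<int>(out_dims.size() - x_dims.size());
      for (int i = 0; i < static_cast<int>(out_dims.size()); ++i) {
        int j = i - offset;  // matching axis in x_dims, if any
        if (j < 0 || (x_dims[j] == 1 && out_dims[i] != 1)) {
          reduce_dims.push_back(i);
        }
      }
      return reduce_dims;
    }
    // e.g. x_grad {3, 1, 5} vs. out_grad {2, 3, 4, 5} -> reduce over axes {0, 2}.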
"paddle/phi/kernels/impl/expand_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" + +namespace phi { + +template +void ExpandKernel(const Context& ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out) { + auto expand_shape = shape.GetData(); + auto diff = expand_shape.size() - x.dims().size(); + auto out_shape = phi::vectorize(x.dims()); + out_shape.insert(out_shape.begin(), diff, 1); + for (size_t i = 0; i < out_shape.size(); ++i) { + PADDLE_ENFORCE_NE( + expand_shape[i], + 0, + phi::errors::InvalidArgument("The expanded size cannot be zero.")); + if (i < diff) { + PADDLE_ENFORCE_GT( + expand_shape[i], + 0, + phi::errors::InvalidArgument( + "The expanded size (%d) for non-existing dimensions must be " + "positive for expand kernel.", + expand_shape[i])); + out_shape[i] = expand_shape[i]; + } else if (expand_shape[i] > 0) { + if (out_shape[i] != 1) { + PADDLE_ENFORCE_EQ( + out_shape[i], + expand_shape[i], + phi::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in shape for expand kernel.", + out_shape[i], + expand_shape[i])); + } else { + out_shape[i] = expand_shape[i]; + } + } else { + PADDLE_ENFORCE_EQ( + expand_shape[i], + -1, + phi::errors::InvalidArgument( + "When the value in shape is negative for expand_v2 op, " + "only -1 is supported, but the value received is %d.", + expand_shape[i])); + } + } + + out->Resize(phi::make_ddim(out_shape)); + ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + phi::funcs::BroadcastKernel( + ctx, ins, &outs, -1, kps::IdentityFunctor()); +} + +} // namespace phi PD_REGISTER_KERNEL(expand, GPU, @@ -27,6 +86,7 @@ PD_REGISTER_KERNEL(expand, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t, bool) {} -- GitLab