Unverified commit c4604025, authored by Yiqun Liu, committed by GitHub

Use BroadcastKernel and ReduceKernel to optimize expand and expand_grad. (#49419)

* Use BroadcastKernel and ReduceKernel to optimize expand and expand_grad.

* Correct the axis when there is only 1 input in BroadcastKernel.

* Add the calculation of the output's shape.
Parent 347d2123
......
@@ -909,9 +909,6 @@ void ExpandInferMeta(const MetaTensor& x,
   auto out_rank =
       std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
   std::vector<int64_t> out_shape(out_rank);
-  auto x_dim_vec = phi::vectorize<int>(x_dims);
-  auto diff = expand_shape.size() - x_dim_vec.size();
-  x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
   for (size_t i = 0; i < expand_shape.size(); ++i) {
     if (x_dims[i] == -1) {
       out_shape[i] = -1;
......
......
@@ -1023,15 +1023,20 @@ void BroadcastKernel(const KPDevice &ctx,
                      std::vector<DenseTensor *> *outs,
                      int axis,
                      Functor func) {
-  std::vector<int> dims_size;
-  dims_size.reserve(ins.size());
+  // When there are multiple inputs, the output's rank should be equal to
+  // the maximum rank of all inputs.
+  int max_rank = 0;
+  int min_rank = phi::DDim::kMaxRank;
   for (auto *in : ins) {
-    dims_size.emplace_back(in->dims().size());
+    max_rank = std::max(max_rank, in->dims().size());
+    min_rank = std::min(min_rank, in->dims().size());
   }
-  axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) -
-                          *std::min_element(dims_size.begin(), dims_size.end())
-                    : axis;
+  if (ins.size() == 1) {
+    // When there is only 1 input, the input's rank may be less than the
+    // output's rank.
+    max_rank = std::max(max_rank, (*outs)[0]->dims().size());
+  }
+  axis = axis == -1 ? max_rank - min_rank : axis;
   BroadcastKernelForDifferentVecSize<ET, InT, OutT, Functor, NumOuts>(
       ctx, ins, outs, axis, func);
 }
......
......
@@ -25,8 +25,8 @@ struct BroadcastDimsSimplifier {
   typedef void (*MergeFunctor)(
       bool &, std::vector<DimVector> &, DimVector &, int, int);
-  int64_t N;
-  int64_t rank;
+  int N;
+  int rank;
   DimVector out_dims;
   std::vector<DimVector> in_dims;
......
@@ -103,41 +103,43 @@ struct BroadcastDimsSimplifier {
   // To compensate for input tensors' missing dimensions, starting at axis.
   void ExtendInputDimensions(int N, int axis) {
     for (auto &in_dim : in_dims) {
-      int64_t in_idx = 0;
       if (in_dim.size() < rank) {
-        DimVector tmp_dim(rank, 1);
-        for (; in_idx < in_dim.size();) {
-          if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) {
-            tmp_dim[axis] = in_dim[in_idx];
-            in_idx++;
-            axis++;
+        DimVector extended_in_dim(rank, 1);
+        int out_idx = axis;
+        for (int in_idx = 0; in_idx < in_dim.size(); in_idx++) {
+          if (in_dim[in_idx] == out_dims[out_idx] || in_dim[in_idx] == 1) {
+            extended_in_dim[out_idx] = in_dim[in_idx];
+            out_idx++;
           } else {
             PADDLE_THROW(phi::errors::InvalidArgument(
                 "The %d-th dimension of input tensor is expected to be equal "
                 "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                axis + 1,
+                "received %d. The input's shape is {%s}, the output's shape is "
+                "{%s}.",
+                in_idx,
+                out_idx,
                 out_dims[axis],
-                in_dim[in_idx]));
+                in_dim[in_idx],
+                phi::make_ddim(in_dim),
+                phi::make_ddim(out_dims)));
           }
         }
         in_dim.resize(rank);
-        std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin());
+        std::copy(
+            extended_in_dim.begin(), extended_in_dim.end(), in_dim.begin());
       } else {
-        for (; in_idx < rank;) {
-          if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) {
-            in_idx++;
-          } else {
-            PADDLE_THROW(phi::errors::InvalidArgument(
-                "The %d-th dimension of input tensor is expected to be equal "
-                "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                in_idx + 1,
-                out_dims[in_idx],
-                in_dim[in_idx]));
-          }
+        for (int in_idx = 0; in_idx < rank; in_idx++) {
+          PADDLE_ENFORCE_EQ(
+              in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1,
+              true,
+              phi::errors::InvalidArgument(
+                  "The %d-th dimension of input tensor is expected to be equal "
+                  "with the %d-th dimension of output tensor %d or 1, but "
+                  "received %d.",
+                  in_idx,
+                  in_idx,
+                  out_dims[in_idx],
+                  in_dim[in_idx]));
         }
       }
       std::reverse(in_dim.begin(), in_dim.end());
......
......
@@ -17,7 +17,28 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"

+namespace phi {
+
+template <typename T, typename Context>
+void ExpandGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const IntArray& shape,
+                      DenseTensor* x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  if (x_grad->dims() == out_grad.dims()) {
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+  } else {
+    std::vector<int> reduce_dims =
+        funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1);
+    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        ctx, out_grad, x_grad, kps::IdentityFunctor<T>(), reduce_dims);
+  }
+}
+
+} // namespace phi
+
 PD_REGISTER_KERNEL(expand_grad,
                    GPU,
......
@@ -26,5 +47,6 @@ PD_REGISTER_KERNEL(expand_grad,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t) {}
......
@@ -18,7 +18,66 @@
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"

+namespace phi {
+
+template <typename T, typename Context>
+void ExpandKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const IntArray& shape,
+                  DenseTensor* out) {
+  auto expand_shape = shape.GetData();
+  auto diff = expand_shape.size() - x.dims().size();
+  auto out_shape = phi::vectorize<int64_t>(x.dims());
+  out_shape.insert(out_shape.begin(), diff, 1);
+  for (size_t i = 0; i < out_shape.size(); ++i) {
+    PADDLE_ENFORCE_NE(
+        expand_shape[i],
+        0,
+        phi::errors::InvalidArgument("The expanded size cannot be zero."));
+    if (i < diff) {
+      PADDLE_ENFORCE_GT(
+          expand_shape[i],
+          0,
+          phi::errors::InvalidArgument(
+              "The expanded size (%d) for non-existing dimensions must be "
+              "positive for expand kernel.",
+              expand_shape[i]));
+      out_shape[i] = expand_shape[i];
+    } else if (expand_shape[i] > 0) {
+      if (out_shape[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            out_shape[i],
+            expand_shape[i],
+            phi::errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in shape for expand kernel.",
+                out_shape[i],
+                expand_shape[i]));
+      } else {
+        out_shape[i] = expand_shape[i];
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(
+          expand_shape[i],
+          -1,
+          phi::errors::InvalidArgument(
+              "When the value in shape is negative for expand_v2 op, "
+              "only -1 is supported, but the value received is %d.",
+              expand_shape[i]));
+    }
+  }
+
+  out->Resize(phi::make_ddim(out_shape));
+  ctx.template Alloc<T>(out);
+
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
+      ctx, ins, &outs, -1, kps::IdentityFunctor<T>());
+}
+
+} // namespace phi
+
 PD_REGISTER_KERNEL(expand,
                    GPU,
......
@@ -27,6 +86,7 @@ PD_REGISTER_KERNEL(expand,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t,
                    bool) {}