Unverified commit c4604025, authored by Yiqun Liu, committed by GitHub

Use BroadcastKernel and ReduceKernel to optimize expand and expand_grad. (#49419)

* Use BroadcastKernel and ReduceKernel to optimize expand and expand_grad.

* Correct the axis when there is only 1 input in BroadcastKernel.

* Add the calculation of the output's shape.
Parent 347d2123
......
@@ -909,9 +909,6 @@ void ExpandInferMeta(const MetaTensor& x,
   auto out_rank =
       std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
   std::vector<int64_t> out_shape(out_rank);
-  auto x_dim_vec = phi::vectorize<int>(x_dims);
-  auto diff = expand_shape.size() - x_dim_vec.size();
-  x_dim_vec.insert(x_dim_vec.begin(), diff, -1);
   for (size_t i = 0; i < expand_shape.size(); ++i) {
     if (x_dims[i] == -1) {
       out_shape[i] = -1;
......
......
@@ -1023,15 +1023,20 @@ void BroadcastKernel(const KPDevice &ctx,
                      std::vector<DenseTensor *> *outs,
                      int axis,
                      Functor func) {
-  std::vector<int> dims_size;
-  dims_size.reserve(ins.size());
+  // When there are multiple inputs, the output's rank should be equal to
+  // the maximum rank of all inputs.
+  int max_rank = 0;
+  int min_rank = phi::DDim::kMaxRank;
   for (auto *in : ins) {
-    dims_size.emplace_back(in->dims().size());
+    max_rank = std::max(max_rank, in->dims().size());
+    min_rank = std::min(min_rank, in->dims().size());
   }
-  axis = axis == -1 ? *std::max_element(dims_size.begin(), dims_size.end()) -
-                          *std::min_element(dims_size.begin(), dims_size.end())
-                    : axis;
+  if (ins.size() == 1) {
+    // When there is only 1 input, the input's rank may be less than the
+    // output's rank.
+    max_rank = std::max(max_rank, (*outs)[0]->dims().size());
+  }
+  axis = axis == -1 ? max_rank - min_rank : axis;
   BroadcastKernelForDifferentVecSize<ET, InT, OutT, Functor, NumOuts>(
       ctx, ins, outs, axis, func);
 }
......
......
@@ -25,8 +25,8 @@ struct BroadcastDimsSimplifier {
   typedef void (*MergeFunctor)(
       bool &, std::vector<DimVector> &, DimVector &, int, int);
-  int64_t N;
-  int64_t rank;
+  int N;
+  int rank;
   DimVector out_dims;
   std::vector<DimVector> in_dims;
......
@@ -103,41 +103,43 @@ struct BroadcastDimsSimplifier {
   // To compensate for input tensors' missing dimensions, starting at axis.
   void ExtendInputDimensions(int N, int axis) {
     for (auto &in_dim : in_dims) {
-      int64_t in_idx = 0;
       if (in_dim.size() < rank) {
-        DimVector tmp_dim(rank, 1);
-        for (; in_idx < in_dim.size();) {
-          if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) {
-            tmp_dim[axis] = in_dim[in_idx];
-            in_idx++;
-            axis++;
+        DimVector extended_in_dim(rank, 1);
+        int out_idx = axis;
+        for (int in_idx = 0; in_idx < in_dim.size(); in_idx++) {
+          if (in_dim[in_idx] == out_dims[out_idx] || in_dim[in_idx] == 1) {
+            extended_in_dim[out_idx] = in_dim[in_idx];
+            out_idx++;
           } else {
             PADDLE_THROW(phi::errors::InvalidArgument(
                 "The %d-th dimension of input tensor is expected to be equal "
                 "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                axis + 1,
+                "received %d. The input's shape is {%s}, the output's shape is "
+                "{%s}.",
+                in_idx,
+                out_idx,
                 out_dims[axis],
-                in_dim[in_idx]));
+                in_dim[in_idx],
+                phi::make_ddim(in_dim),
+                phi::make_ddim(out_dims)));
           }
         }
         in_dim.resize(rank);
-        std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin());
+        std::copy(
+            extended_in_dim.begin(), extended_in_dim.end(), in_dim.begin());
       } else {
-        for (; in_idx < rank;) {
-          if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) {
-            in_idx++;
-          } else {
-            PADDLE_THROW(phi::errors::InvalidArgument(
-                "The %d-th dimension of input tensor is expected to be equal "
-                "with the %d-th dimension of output tensor %d or 1, but "
-                "received %d.",
-                in_idx + 1,
-                in_idx + 1,
-                out_dims[in_idx],
-                in_dim[in_idx]));
-          }
+        for (int in_idx = 0; in_idx < rank; in_idx++) {
+          PADDLE_ENFORCE_EQ(
+              in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1,
+              true,
+              phi::errors::InvalidArgument(
+                  "The %d-th dimension of input tensor is expected to be equal "
+                  "with the %d-th dimension of output tensor %d or 1, but "
+                  "received %d.",
+                  in_idx,
+                  in_idx,
+                  out_dims[in_idx],
+                  in_dim[in_idx]));
         }
       }
       std::reverse(in_dim.begin(), in_dim.end());
......
......
@@ -17,7 +17,28 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"

+namespace phi {
+
+template <typename T, typename Context>
+void ExpandGradKernel(const Context& ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const IntArray& shape,
+                      DenseTensor* x_grad) {
+  ctx.template Alloc<T>(x_grad);
+  if (x_grad->dims() == out_grad.dims()) {
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad);
+  } else {
+    std::vector<int> reduce_dims =
+        funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1);
+    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+        ctx, out_grad, x_grad, kps::IdentityFunctor<T>(), reduce_dims);
+  }
+}
+
+} // namespace phi
+
 PD_REGISTER_KERNEL(expand_grad,
                    GPU,
......
@@ -26,5 +47,6 @@ PD_REGISTER_KERNEL(expand_grad,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t) {}
......
@@ -18,7 +18,66 @@
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/expand_kernel_impl.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"

+namespace phi {
+
+template <typename T, typename Context>
+void ExpandKernel(const Context& ctx,
+                  const DenseTensor& x,
+                  const IntArray& shape,
+                  DenseTensor* out) {
+  auto expand_shape = shape.GetData();
+  auto diff = expand_shape.size() - x.dims().size();
+  auto out_shape = phi::vectorize<int64_t>(x.dims());
+  out_shape.insert(out_shape.begin(), diff, 1);
+  for (size_t i = 0; i < out_shape.size(); ++i) {
+    PADDLE_ENFORCE_NE(
+        expand_shape[i],
+        0,
+        phi::errors::InvalidArgument("The expanded size cannot be zero."));
+    if (i < diff) {
+      PADDLE_ENFORCE_GT(
+          expand_shape[i],
+          0,
+          phi::errors::InvalidArgument(
+              "The expanded size (%d) for non-existing dimensions must be "
+              "positive for expand kernel.",
+              expand_shape[i]));
+      out_shape[i] = expand_shape[i];
+    } else if (expand_shape[i] > 0) {
+      if (out_shape[i] != 1) {
+        PADDLE_ENFORCE_EQ(
+            out_shape[i],
+            expand_shape[i],
+            phi::errors::InvalidArgument(
+                "The value (%d) of the non-singleton dimension does not match"
+                " the corresponding value (%d) in shape for expand kernel.",
+                out_shape[i],
+                expand_shape[i]));
+      } else {
+        out_shape[i] = expand_shape[i];
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(
+          expand_shape[i],
+          -1,
+          phi::errors::InvalidArgument(
+              "When the value in shape is negative for expand_v2 op, "
+              "only -1 is supported, but the value received is %d.",
+              expand_shape[i]));
+    }
+  }
+
+  out->Resize(phi::make_ddim(out_shape));
+  ctx.template Alloc<T>(out);
+
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
+      ctx, ins, &outs, -1, kps::IdentityFunctor<T>());
+}
+
+} // namespace phi
+
 PD_REGISTER_KERNEL(expand,
                    GPU,
......
@@ -27,6 +86,7 @@ PD_REGISTER_KERNEL(expand,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int,
                    int64_t,
                    bool) {}