未验证 提交 3c44e948 编写于 作者: H Hanchiao 提交者: GitHub

【Hackathon No.32】为 Paddle 优化 expand_as 前向&反向 op 在 GPU 上的计算性能 (#52700)

* Implement optimized kernel for OP-expand_as.

* Support fp16.
Co-authored-by: Timber-Ye <ye_hanqiao@163.com>
Co-authored-by: BrianQian1999 <brianqianhitsz@gmail.com>

* remove fp16 support

* remove MAX_RANK_SUPPORTED

---------
Co-authored-by: BrianQian1999 <brianqianhitsz@gmail.com>
上级 ea04bef8
......@@ -15,8 +15,43 @@
#include "paddle/phi/kernels/expand_as_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
namespace phi {
// Backward of expand_as: reduces `out_grad` back to the shape of `x` by
// summing over every axis that was broadcast in the forward pass.
//
// context:      device context used for allocation and kernel launch.
// x:            forward input; only its dims are consulted here.
// out_grad:     gradient w.r.t. the expanded output.
// target_shape: shape the forward op expanded to (not needed for the
//               backward computation itself).
// in_grad:      output, gradient w.r.t. `x`; allocated here with x's dims.
template <typename T, typename Context>
void ExpandAsGradKernel(const Context& context,
                        const DenseTensor& x,
                        const DenseTensor& out_grad,
                        const std::vector<int>& target_shape,
                        DenseTensor* in_grad) {
  auto in_dims = x.dims();
  auto out_dims = out_grad.dims();
  int out_rank = out_dims.size();
  PADDLE_ENFORCE_LE(
      out_rank,
      6,
      errors::InvalidArgument("The rank of the input 'Out@GRAD' for "
                              "expand_as_v2_grad op must be less than or equal "
                              "to 6, but the value received is %d.",
                              out_rank));
  context.template Alloc<T>(in_grad);
  if (in_dims == out_dims) {
    // No broadcasting happened in the forward pass, so the gradient passes
    // through unchanged.
    phi::Copy(context, out_grad, context.GetPlace(), false, in_grad);
  } else {
    // Sum out_grad over the broadcast axes (GetReduceDim picks them out by
    // comparing the two shapes).
    std::vector<int> reduce_dims = funcs::GetReduceDim(in_dims, out_dims, -1);
    funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        context, out_grad, in_grad, kps::IdentityFunctor<T>(), reduce_dims);
  }
}
} // namespace phi
PD_REGISTER_KERNEL(expand_as_grad,
GPU,
......
......@@ -15,8 +15,70 @@
#include "paddle/phi/kernels/expand_as_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
namespace phi {
// Forward expand_as: broadcasts `x` up to `target_shape` on the GPU.
//
// ctx:          device context used for allocation and kernel launch.
// x:            input tensor to be expanded.
// y:            optional reference tensor (its shape is already folded into
//               `target_shape` by infermeta; unused here).
// target_shape: shape to expand to; must have rank >= rank of `x`, and each
//               entry must be positive or -1 (keep the input dimension).
// out:          output tensor, resized to `target_shape` and filled via
//               broadcast.
template <typename T, typename Context>
void ExpandAsKernel(const Context& ctx,
                    const DenseTensor& x,
                    const paddle::optional<DenseTensor>& y,
                    const std::vector<int>& target_shape,
                    DenseTensor* out) {
  int rank = x.dims().size();
  int target_rank = static_cast<int>(target_shape.size());
  // Guard before the unsigned subtraction below: a target rank smaller than
  // the input rank would underflow `diff` and make the insert() allocate an
  // enormous number of elements.
  PADDLE_ENFORCE_GE(
      target_rank,
      rank,
      errors::InvalidArgument(
          "The rank (%d) of the target shape for expand_as_v2 op must be "
          "greater than or equal to the rank (%d) of the input 'x'.",
          target_rank,
          rank));
  auto vec_in_dims = phi::vectorize<int>(x.dims());
  unsigned int diff = target_rank - rank;
  // Left-pad the input shape with 1s so it aligns with target_shape.
  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  for (unsigned int i = 0; i < vec_in_dims.size(); ++i) {
    PADDLE_ENFORCE_NE(
        target_shape[i],
        0,
        errors::InvalidArgument("The value of target shape cannot be zero."));
    if (i < diff) {
      // Dimensions that do not exist in `x` must be given explicitly.
      PADDLE_ENFORCE_GT(
          target_shape[i],
          0,
          errors::InvalidArgument(
              "The expanded size (%d) for non-existing dimensions must be "
              "positive for expand_as_v2 op.",
              target_shape[i]));
    } else if (target_shape[i] > 0) {
      // Non-singleton input dimensions must match the target exactly.
      if (vec_in_dims[i] != 1) {
        PADDLE_ENFORCE_EQ(
            vec_in_dims[i],
            target_shape[i],
            errors::InvalidArgument(
                "The value (%d) of the non-singleton dimension does not match"
                " the corresponding value (%d) in shape for expand_as_v2 op.",
                vec_in_dims[i],
                target_shape[i]));
      }
    } else {
      // The only allowed negative value is -1 ("keep this dimension").
      PADDLE_ENFORCE_EQ(
          target_shape[i],
          -1,
          errors::InvalidArgument(
              "When the value in shape is negative for expand_as_v2 op, "
              "only -1 is supported, but the value received is %d.",
              target_shape[i]));
    }
  }
  out->Resize(phi::make_ddim(target_shape));
  ctx.template Alloc<T>(out);
  std::vector<const DenseTensor*> ins = {&x};
  std::vector<DenseTensor*> outs = {out};
  // BroadcastKernel performs the actual replication on the device; the
  // identity functor just copies each broadcast element through.
  phi::funcs::BroadcastKernel<ElementwiseType::kUnary, T, T>(
      ctx, ins, &outs, -1, kps::IdentityFunctor<T>());
}
} // namespace phi
PD_REGISTER_KERNEL(expand_as,
GPU,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册