Replace EigenBroadcast with ElementwiseBroadcast in ReduceGrad (#39255)

772be4f5 · niuliling123 · GitHub · b3e049f8 · 772be4f5 · 772be4f5
7 changed files
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
@@ -17,15 +17,9 @@
 template <typename T>
 using CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
+    ops::ReduceCudaGradKernel<T, kps::DivideFunctor>;
-                          ops::MeanGradFunctor, true>;
-using FP16CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                          paddle::platform::float16, ops::FP16MeanGradFunctor,
-                          true>;
 REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<bool>,
-                        FP16CUDAReduceMeanGradKernel,
+                        CUDAReduceMeanGradKernel<paddle::platform::float16>,
                        CUDAReduceMeanGradKernel<float>,
                        CUDAReduceMeanGradKernel<double>);
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -30,6 +30,7 @@ limitations under the License. */
 #if defined(__HIPCC__) || defined(__NVCC__)
 #include "paddle/pten/kernels/gpu/reduce.h"
+#include "paddle/pten/kernels/gpu/reduce_grad.h"
 #endif
 namespace paddle {
@@ -620,11 +621,12 @@ class ReduceGradOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-    int in_dtype = ctx.Attr<int>("in_dtype");
+    int out_dtype = ctx.Attr<int>("out_dtype");
    auto input_data_type =
-        (in_dtype >= 0) ? static_cast<framework::proto::VarType::Type>(in_dtype)
+        (out_dtype >= 0)
-                        : OperatorWithKernel::IndicateVarDataType(
+            ? static_cast<framework::proto::VarType::Type>(out_dtype)
-                              ctx, framework::GradVarName("Out"));
+            : OperatorWithKernel::IndicateVarDataType(
+                  ctx, framework::GradVarName("Out"));
 #ifdef PADDLE_WITH_MKLDNN
    auto CanMKLDNNReduceGradBeUsed = [&]() {
      auto dx_dims = ctx.Input<Tensor>("X")->dims();
@@ -730,6 +732,55 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
        dev_ctx, *input, reduce_all, dims_int64, false, pt_out_dtype, output);
  }
 };
+template <typename T, template <typename, typename> class TransformOp>
+class ReduceCudaGradKernel : public framework::OpKernel<T> {  // CUDA grad kernel: feeds d_out through TransformOp into d_x via pten::ReduceGrad
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
+    auto* in_x = context.Input<Tensor>("X");
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_dtype = context.Attr<int>("in_dtype");  // NOTE(review): local is named out_dtype but is read from the "in_dtype" attribute
+    // Resolve the reduced axes and accumulate reduce_num, the product of in_x's extents over those axes.
+    int dim_size = in_x->dims().size();
+    std::vector<int> reduce_dims = GetReduceDim(dims, dim_size, reduce_all);
+    auto update_dims = vectorize(d_x->dims());  // starts as d_x's shape; reduced axes are overwritten with 1 below
+    int reduce_num = 1;
+    for (auto i : reduce_dims) {
+      reduce_num *= (in_x->dims())[i];
+      update_dims[i] = 1;
+    }
+    // new_d_out shares d_out's data but is resized to update_dims (every reduced axis has extent 1).
+    framework::Tensor new_d_out(d_out->type());
+    new_d_out.ShareDataWith(*d_out);
+    new_d_out.Resize(paddle::framework::make_ddim(update_dims));
+    auto& dev_ctx = context.cuda_device_context();
+    if (out_dtype > 0) {  // explicit dtype requested; NOTE(review): strict '> 0' here vs '>= 0' in ReduceGradOp::GetExpectedKernelType - confirm 0 should fall back
+      d_x->mutable_data(
+          dev_ctx.GetPlace(),
+          static_cast<framework::proto::VarType::Type>(out_dtype));
+    } else {  // no explicit dtype: allocate d_x with d_out's type
+      d_x->mutable_data(
+          dev_ctx.GetPlace(),
+          static_cast<framework::proto::VarType::Type>(d_out->type()));
+    }
+    auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out);
+    auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x);
+    auto pt_out_dtype = pten::TransToPtenDataType(
+        static_cast<framework::proto::VarType::Type>(out_dtype));
+    if (out_dtype <= 0) {  // mirrors the allocation branch above: fall back to d_out's type for dispatch
+      pt_out_dtype = pten::TransToPtenDataType(
+          static_cast<framework::proto::VarType::Type>(d_out->type()));
+    }
+    using MPType = typename kps::details::MPTypeTrait<T>::Type;  // second type argument supplied to TransformOp
+    pten::ReduceGrad<T, TransformOp<T, MPType>>(
+        dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype,
+        TransformOp<T, MPType>(reduce_num));  // the functor instance is constructed from reduce_num
+  }
+};
 #endif
 }  // namespace operators

--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const {
-    int in_dtype = ctx.Attr<int>("in_dtype");
+    int in_dtype = ctx.Attr<int>("out_dtype");
    if (in_dtype >= 0) {
      return framework::OpKernelType(
          static_cast<framework::proto::VarType::Type>(in_dtype),

--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
@@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel<T> {
    auto dims = context.Attr<std::vector<int>>("dim");
    if (context.GetPlace().GetType() == platform::CPUPlace().GetType() &&
        dims.size() == 1) {
-      int in_dtype = context.Attr<int>("in_dtype");
+      int in_dtype = context.Attr<int>("out_dtype");
      if (in_dtype >= 0) {
        Tensor tmp_tensor;

--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
@@ -17,8 +17,7 @@
 template <typename T>
 using CUDAReduceSumGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
+    ops::ReduceCudaGradKernel<T, kps::IdentityFunctor>;
-                          ops::SumGradFunctor, true>;
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, CUDAReduceSumGradKernel<bool>,

--- a/paddle/pten/kernels/gpu/elementwise.h
+++ b/paddle/pten/kernels/gpu/elementwise.h
@@ -134,12 +134,19 @@ struct DimensionsTransform {
  explicit DimensionsTransform(const std::vector<const DenseTensor *> &ins,
                               const pten::framework::DDim &dims,
                               int axis) {
-    const int N = ins.size();
+    const int N = max(static_cast<int>(ins.size()), 2);
    dim_size = dims.size();
    out_dims = pten::framework::vectorize<int64_t>(dims);
    in_dims.resize(N);
-    for (int j = 0; j < N; ++j) {
+    if (ins.size() == 1) {
-      in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
+      // when ins.size() = 1, broadcast input to output
+      in_dims[0] = pten::framework::vectorize<int64_t>(ins[0]->dims());
+      in_dims[1] = out_dims;
+      // Add out_dims to in_dims to avoid errors in dims merging
+    } else {
+      for (int j = 0; j < N; ++j) {
+        in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
+      }
    }
    InputDimensionsExtend(N, axis);

--- a/paddle/pten/kernels/gpu/reduce_grad.h
+++ b/paddle/pten/kernels/gpu/reduce_grad.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <set>
+#include <vector>
+#include "paddle/pten/kernels/gpu/elementwise.h"
+namespace pten {
+template <typename InT, typename Functor>
+void ReduceGrad(const GPUContext& dev_ctx,  // launches a unary broadcast-elementwise kernel: d_out -> functor -> d_x
+                DenseTensor* d_out,  // sole kernel input
+                DenseTensor* d_x,  // sole kernel output
+                DataType out_dtype,  // dispatched on by PD_VISIT_ALL_TYPES; presumably bound to data_t - confirm against the macro
+                Functor functor) {
+  std::vector<const DenseTensor*> inputs = {d_out};
+  std::vector<DenseTensor*> outputs = {d_x};
+  PD_VISIT_ALL_TYPES(
+      out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] {  // the string is presumably a label for dispatch errors - confirm
+        LaunchBroadcastElementwiseCudaKernel<pten::ElementwiseType::kUnary,
+                                             InT,
+                                             data_t>(
+            dev_ctx, inputs, &outputs, 0, functor);  // literal 0 is presumably the broadcast axis - TODO confirm against the launcher's signature
+      }));
+}
+}  // namespace pten
+#endif