From af0eca92af3303316dd964b76c57ad5614ec4e9a Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 25 Jan 2022 15:38:59 +0800 Subject: [PATCH] Revert "Replace EigenBroadcast with ElementwiseBroadcast in ReduceGrad (#38959)" This reverts commit 9059ef6935c92306a481b6c26f66db2191a3913e. --- .../reduce_ops/reduce_mean_op.part.cu | 10 +++- paddle/fluid/operators/reduce_ops/reduce_op.h | 58 ++----------------- .../operators/reduce_ops/reduce_sum_op.cc | 2 +- .../operators/reduce_ops/reduce_sum_op.h | 2 +- .../reduce_ops/reduce_sum_op.part.cu | 3 +- paddle/pten/kernels/gpu/elementwise.h | 13 +---- paddle/pten/kernels/gpu/reduce.h | 21 +------ 7 files changed, 21 insertions(+), 88 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index a578c9f7d81..4cc2577f6b2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -17,9 +17,15 @@ template using CUDAReduceMeanGradKernel = - ops::ReduceCudaGradKernel; + ops::ReduceGradKernel; + +using FP16CUDAReduceMeanGradKernel = + ops::ReduceGradKernel; REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, + FP16CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ff6a4436384..661fb772f1c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -623,12 +623,11 @@ class ReduceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - int out_dtype = ctx.Attr("out_dtype"); + int in_dtype = ctx.Attr("in_dtype"); auto input_data_type = - (out_dtype >= 0) - ? static_cast(out_dtype) - : OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); + (in_dtype >= 0) ? static_cast(in_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); @@ -737,55 +736,6 @@ class ReduceCudaKernel : public framework::OpKernel { pt_out.get()); } }; - -template class TransformOp> -class ReduceCudaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - std::vector dims = context.Attr>("dim"); - auto* in_x = context.Input("X"); - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_dtype = context.Attr("in_dtype"); - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - framework::Tensor new_d_out(d_out->type()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(paddle::framework::make_ddim(update_dims)); - auto& dev_ctx = context.cuda_device_context(); - if (out_dtype > 0) { - d_x->mutable_data( - dev_ctx.GetPlace(), - static_cast(out_dtype)); - } else { - d_x->mutable_data( - dev_ctx.GetPlace(), - static_cast(d_out->type())); - } - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); - auto pt_out_dtype = pten::TransToPtenDataType( - static_cast(out_dtype)); - if (out_dtype <= 0) { - pt_out_dtype = pten::TransToPtenDataType( - static_cast(d_out->type())); - } - using MPType = typename kps::details::MPTypeTrait::Type; - pten::ReduceGrad>( - dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, - TransformOp(reduce_num)); - } -}; #endif } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 9a715eb98ef..562a5719d74 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - int in_dtype = ctx.Attr("out_dtype"); + int in_dtype = ctx.Attr("in_dtype"); if (in_dtype >= 0) { return framework::OpKernelType( static_cast(in_dtype), diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 79b3480afbc..9782ce28da4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel { auto dims = context.Attr>("dim"); if (context.GetPlace().GetType() == platform::CPUPlace().GetType() && dims.size() == 1) { - int in_dtype = context.Attr("out_dtype"); + int in_dtype = context.Attr("in_dtype"); if (in_dtype >= 0) { Tensor tmp_tensor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c3d3e0cf6ec..c629663b19e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -17,7 +17,8 @@ template using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; + ops::ReduceGradKernel; REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index db85c596da9..9a3ae7f12df 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -134,19 +134,12 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const pten::framework::DDim &dims, int axis) { - const int N = max(static_cast(ins.size()), 2); + const int N = ins.size(); dim_size = dims.size(); out_dims = pten::framework::vectorize(dims); in_dims.resize(N); - if (ins.size() == 1) { - // when ins.size() = 1, broadcast input to output - in_dims[0] = pten::framework::vectorize(ins[0]->dims()); - in_dims[1] = out_dims; - // Add out_dims to in_dims to avoid errors in dims merging - } else { - for (int j = 0; j < N; ++j) { - in_dims[j] = pten::framework::vectorize(ins[j]->dims()); - } + for (int j = 0; j < N; ++j) { + in_dims[j] = pten::framework::vectorize(ins[j]->dims()); } InputDimensionsExtend(N, axis); diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 49a5e9e9b33..10badf00a1e 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -45,7 +45,8 @@ namespace cub = hipcub; #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 @@ -1253,24 +1254,6 @@ void Reduce(const GPUContext& dev_ctx, x, out, TransformOp(reduce_num), reduce_dims, stream); } } - -template -void ReduceGrad(const GPUContext& dev_ctx, - DenseTensor* d_out, - DenseTensor* d_x, - DataType out_dtype, - Functor functor) { - std::vector inputs = {d_out}; - std::vector outputs = {d_x}; - PD_VISIT_ALL_TYPES( - out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] { - LaunchBroadcastElementwiseCudaKernel( - dev_ctx, inputs, &outputs, 0, functor); - })); -} - } // namespace pten #endif -- GitLab