diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 4cc2577f6b2ec2c53421257d207aba83b95965a3..a578c9f7d81083c533028b9c8912a24006ed0292 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -17,15 +17,9 @@ template using CUDAReduceMeanGradKernel = - ops::ReduceGradKernel; - -using FP16CUDAReduceMeanGradKernel = - ops::ReduceGradKernel; + ops::ReduceCudaGradKernel; REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - FP16CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 661fb772f1c573f480def3e7162f62ba56a56d02..ff6a443638452d3ac01541c59245c08aedf5db81 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -623,11 +623,12 @@ class ReduceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); auto input_data_type = - (in_dtype >= 0) ? static_cast(in_dtype) - : OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); + (out_dtype >= 0) + ? static_cast(out_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); @@ -736,6 +737,55 @@ class ReduceCudaKernel : public framework::OpKernel { pt_out.get()); } }; + +template class TransformOp> +class ReduceCudaGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + std::vector dims = context.Attr>("dim"); + auto* in_x = context.Input("X"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_dtype = context.Attr("in_dtype"); + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + framework::Tensor new_d_out(d_out->type()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(paddle::framework::make_ddim(update_dims)); + auto& dev_ctx = context.cuda_device_context(); + if (out_dtype > 0) { + d_x->mutable_data( + dev_ctx.GetPlace(), + static_cast(out_dtype)); + } else { + d_x->mutable_data( + dev_ctx.GetPlace(), + static_cast(d_out->type())); + } + auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); + auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_out_dtype = pten::TransToPtenDataType( + static_cast(out_dtype)); + if (out_dtype <= 0) { + pt_out_dtype = pten::TransToPtenDataType( + static_cast(d_out->type())); + } + using MPType = typename kps::details::MPTypeTrait::Type; + pten::ReduceGrad>( + dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, + TransformOp(reduce_num)); + } +}; #endif } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 562a5719d74d90062fb92c5198bacf82499c9948..9a715eb98ef994f89e201656e8c371d819b11f19 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - int in_dtype = ctx.Attr("in_dtype"); + int in_dtype = ctx.Attr("out_dtype"); if (in_dtype >= 0) { return framework::OpKernelType( static_cast(in_dtype), diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 9782ce28da4af0b17f38219c039dd4fd62f46846..79b3480afbcd7750297e9e02575a43fdb6809126 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel { auto dims = context.Attr>("dim"); if (context.GetPlace().GetType() == platform::CPUPlace().GetType() && dims.size() == 1) { - int in_dtype = context.Attr("in_dtype"); + int in_dtype = context.Attr("out_dtype"); if (in_dtype >= 0) { Tensor tmp_tensor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c629663b19ebd7f42f3a16e69bd4b46784ff67dd..c3d3e0cf6ecd51f3bb2baa063878f80444db3563 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -17,8 +17,7 @@ template using CUDAReduceSumGradKernel = - ops::ReduceGradKernel; + ops::ReduceCudaGradKernel; REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 9a3ae7f12dfcd62a1a18154971fa99ab72c5561d..db85c596da945f77a7dd0fc3f88129c73bb0d7e5 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -134,12 +134,19 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const pten::framework::DDim &dims, int axis) { - const int N = ins.size(); + const int N = max(static_cast(ins.size()), 2); dim_size = dims.size(); out_dims = pten::framework::vectorize(dims); in_dims.resize(N); - for (int j = 0; j < N; ++j) { - in_dims[j] = pten::framework::vectorize(ins[j]->dims()); + if (ins.size() == 1) { + // when ins.size() = 1, broadcast input to output + in_dims[0] = pten::framework::vectorize(ins[0]->dims()); + in_dims[1] = out_dims; + // Add out_dims to in_dims to avoid errors in dims merging + } else { + for (int j = 0; j < N; ++j) { + in_dims[j] = pten::framework::vectorize(ins[j]->dims()); + } } InputDimensionsExtend(N, axis); diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 10badf00a1e246a36b0e0b37525ac9ffba028e92..49a5e9e9b33a5e286d738e7198c8914c42c1136d 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -45,8 +45,7 @@ namespace cub = hipcub; #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/elementwise_base.h" - +#include "paddle/pten/kernels/gpu/elementwise.h" // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 @@ -1254,6 +1253,24 @@ void Reduce(const GPUContext& dev_ctx, x, out, TransformOp(reduce_num), reduce_dims, stream); } } + +template +void ReduceGrad(const GPUContext& dev_ctx, + DenseTensor* d_out, + DenseTensor* d_x, + DataType out_dtype, + Functor functor) { + std::vector inputs = {d_out}; + std::vector outputs = {d_x}; + PD_VISIT_ALL_TYPES( + out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] { + LaunchBroadcastElementwiseCudaKernel( + dev_ctx, inputs, &outputs, 0, functor); + })); +} + } // namespace pten #endif