diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index a578c9f7d81083c533028b9c8912a24006ed0292..4cc2577f6b2ec2c53421257d207aba83b95965a3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -17,9 +17,15 @@ template using CUDAReduceMeanGradKernel = - ops::ReduceCudaGradKernel; + ops::ReduceGradKernel; + +using FP16CUDAReduceMeanGradKernel = + ops::ReduceGradKernel; REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, + FP16CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ff6a443638452d3ac01541c59245c08aedf5db81..661fb772f1c573f480def3e7162f62ba56a56d02 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -623,12 +623,11 @@ class ReduceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - int out_dtype = ctx.Attr("out_dtype"); + int in_dtype = ctx.Attr("in_dtype"); auto input_data_type = - (out_dtype >= 0) - ? static_cast(out_dtype) - : OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); + (in_dtype >= 0) ? static_cast(in_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); @@ -737,55 +736,6 @@ class ReduceCudaKernel : public framework::OpKernel { pt_out.get()); } }; - -template class TransformOp> -class ReduceCudaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - std::vector dims = context.Attr>("dim"); - auto* in_x = context.Input("X"); - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_dtype = context.Attr("in_dtype"); - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - framework::Tensor new_d_out(d_out->type()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(paddle::framework::make_ddim(update_dims)); - auto& dev_ctx = context.cuda_device_context(); - if (out_dtype > 0) { - d_x->mutable_data( - dev_ctx.GetPlace(), - static_cast(out_dtype)); - } else { - d_x->mutable_data( - dev_ctx.GetPlace(), - static_cast(d_out->type())); - } - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); - auto pt_out_dtype = pten::TransToPtenDataType( - static_cast(out_dtype)); - if (out_dtype <= 0) { - pt_out_dtype = pten::TransToPtenDataType( - static_cast(d_out->type())); - } - using MPType = typename kps::details::MPTypeTrait::Type; - pten::ReduceGrad>( - dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, - TransformOp(reduce_num)); - } -}; #endif } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 9a715eb98ef994f89e201656e8c371d819b11f19..562a5719d74d90062fb92c5198bacf82499c9948 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - int in_dtype = ctx.Attr("out_dtype"); + int in_dtype = ctx.Attr("in_dtype"); if (in_dtype >= 0) { return framework::OpKernelType( static_cast(in_dtype), diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 79b3480afbcd7750297e9e02575a43fdb6809126..9782ce28da4af0b17f38219c039dd4fd62f46846 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel { auto dims = context.Attr>("dim"); if (context.GetPlace().GetType() == platform::CPUPlace().GetType() && dims.size() == 1) { - int in_dtype = context.Attr("out_dtype"); + int in_dtype = context.Attr("in_dtype"); if (in_dtype >= 0) { Tensor tmp_tensor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c3d3e0cf6ecd51f3bb2baa063878f80444db3563..c629663b19ebd7f42f3a16e69bd4b46784ff67dd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -17,7 +17,8 @@ template using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; + ops::ReduceGradKernel; REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index db85c596da945f77a7dd0fc3f88129c73bb0d7e5..9a3ae7f12dfcd62a1a18154971fa99ab72c5561d 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -134,19 +134,12 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const pten::framework::DDim &dims, int axis) { - const int N = max(static_cast(ins.size()), 2); + const int N = ins.size(); dim_size = dims.size(); out_dims = pten::framework::vectorize(dims); in_dims.resize(N); - if (ins.size() == 1) { - // when ins.size() = 1, broadcast input to output - in_dims[0] = pten::framework::vectorize(ins[0]->dims()); - in_dims[1] = out_dims; - // Add out_dims to in_dims to avoid errors in dims merging - } else { - for (int j = 0; j < N; ++j) { - in_dims[j] = pten::framework::vectorize(ins[j]->dims()); - } + for (int j = 0; j < N; ++j) { + in_dims[j] = pten::framework::vectorize(ins[j]->dims()); } InputDimensionsExtend(N, axis); diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 49a5e9e9b33a5e286d738e7198c8914c42c1136d..10badf00a1e246a36b0e0b37525ac9ffba028e92 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -45,7 +45,8 @@ namespace cub = hipcub; #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 @@ -1253,24 +1254,6 @@ void Reduce(const GPUContext& dev_ctx, x, out, TransformOp(reduce_num), reduce_dims, stream); } } - -template -void ReduceGrad(const GPUContext& dev_ctx, - DenseTensor* d_out, - DenseTensor* d_x, - DataType out_dtype, - Functor functor) { - std::vector inputs = {d_out}; - std::vector outputs = {d_x}; - PD_VISIT_ALL_TYPES( - out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] { - LaunchBroadcastElementwiseCudaKernel( - dev_ctx, inputs, &outputs, 0, functor); - })); -} - } // namespace pten #endif