diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec
index 0d106d8a6924281d347a0449cb5212fbcd0be5f1..389a174cc4a831a51814004a5984254deade380d 100644
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ b/paddle/fluid/op_use_default_grad_op_maker.spec
@@ -20,7 +20,6 @@ rank_loss
 reduce_max
 reduce_min
 reduce_prod
-reduce_sum
 reshape
 rnn_memory_helper
 sequence_softmax
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index d1b508792c255fc650459ebf308665551e1f8bde..e549d2bddfef07ab438f72ab7273418ef0f97728 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -61,6 +61,8 @@ class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase {
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceMeanGradNoNeedBufferVarInference,
+                                      "X");
 }  // namespace operators
 }  // namespace paddle
 
@@ -73,7 +75,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker {
 REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__,
                   ops::ReduceMeanOpGradDescMaker);
 REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp,
-                  ops::ReduceMeanDoubleGradMaker);
+                  ops::ReduceMeanDoubleGradMaker,
+                  ops::ReduceMeanGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(reduce_mean,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          float, ops::MeanFunctor>,
@@ -83,12 +86,13 @@ REGISTER_OP_CPU_KERNEL(reduce_mean,
                                          int, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          int64_t, ops::MeanFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::MeanGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::MeanGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::MeanGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::MeanGradFunctor>);
+
+template <typename T>
+using CPUReduceMeanGradKernel =
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, T,
+                          ops::MeanGradFunctor, true>;
+
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<float>,
+                       CPUReduceMeanGradKernel<double>,
+                       CPUReduceMeanGradKernel<int>,
+                       CPUReduceMeanGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
index 9324ec1e1db6f40e463b415e5d2bdc5cfe664ef4..12eceb33ec27298d60713e72c9cc2cf91a5e7cfb 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
@@ -15,12 +15,12 @@
 // .part used to speed up nvcc compile
 #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MeanGradFunctor>);
+template <typename T>
+using CUDAReduceMeanGradKernel =
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
+                          ops::MeanGradFunctor, true>;
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<float>,
+                        CUDAReduceMeanGradKernel<double>,
+                        CUDAReduceMeanGradKernel<int>,
+                        CUDAReduceMeanGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 67fd3e1dad4b9c6036ac2c8f7f0fe5ec951c8e98..838ac895e5d570999742e39bd23a74a2bf6616c1 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -75,7 +75,8 @@ class ReduceKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor,
+          bool kNoNeedBufferX = false, bool kNoNeedBufferY = false>
 class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -88,6 +89,17 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
+    // NOTE: EigenTensor::From() uses tensor->data(). If the op has a
+    // NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
+    // kNoNeedBufferY should be set to true, and a fake var with the same
+    // dims is substituted so the dropped buffer is never dereferenced.
+    if (kNoNeedBufferX) {
+      input0 = output;
+    }
+    if (kNoNeedBufferY) {
+      input1 = input2;
+    }
+
     // NOTE(dengkaipeng): Out is unnecessary in some reduce kernels and may
     // not be set as Input in the grad Maker; use Out_grad to replace it here
     if (!input1) input1 = input2;
@@ -220,6 +232,14 @@ class ReduceGradOp : public framework::OperatorWithKernel {
       ctx->ShareLoD("X", /*->*/ x_grad_name);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
 };
 
 class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index c7742f45dd147ea87413aa17680d671bede5dd6c..14bb2cf0013a25239c0166e4abb8ca27645bf681 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -13,8 +13,47 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
+#include <memory>
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+// NOTE: Input(Out) is unnecessary in reduce_sum_grad, and Input(X) needs no
+// buffer
+class ReduceSumOpGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("reduce_sum_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceSumGradNoNeedBufferVarInference,
+                                      "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+class ReduceSumOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_sum"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_sum"; }
+};
+
+REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker,
+                  ops::ReduceSumOpGradDescMaker);
+REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp,
+                  ops::ReduceSumGradNoNeedBufferVarInference);
 
-REGISTER_REDUCE_OP(reduce_sum);
 REGISTER_OP_CPU_KERNEL(
     reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
                                   ops::SumFunctor>,
@@ -23,13 +62,13 @@ REGISTER_OP_CPU_KERNEL(
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    reduce_sum_grad,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
-                             ops::SumGradFunctor>,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
-                             ops::SumGradFunctor>,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
-                             ops::SumGradFunctor>,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                             ops::SumGradFunctor>);
+
+template <typename T>
+using CPUReduceSumGradKernel =
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, T,
+                             ops::SumGradFunctor, true>;
+
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel<float>,
+                       CPUReduceSumGradKernel<double>,
+                       CPUReduceSumGradKernel<int>,
+                       CPUReduceSumGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
index 26f59c72b4b99ff92a63c2fc2f00a31df0f5df61..7343d01e29d9983f546d7c6fd6b4be837cc1dcc5 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
@@ -22,7 +22,8 @@ namespace paddle {
 namespace operators {
 
 // use for loop to speed up Eigen broadcast. 4 times faster than broadcast
-template <typename DeviceContext, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor,
+          bool kNoNeedBufferX = false>
 class ReduceSumGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -72,7 +73,7 @@ class ReduceSumGradKernel : public framework::OpKernel<T> {
     }
 
     // default use Eigen broadcast
-    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    ReduceGradKernel<DeviceContext, T, Functor, kNoNeedBufferX> kernel;
     kernel.Compute(context);
   }
 };
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
index eb3295731b047391a244bfb598c9d802bca1fc0c..0d689d710a19103cf667a76e592dfba9571cae5c 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
@@ -15,12 +15,12 @@
 #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::SumGradFunctor>);
+template <typename T>
+using CUDAReduceSumGradKernel =
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
+                          ops::SumGradFunctor, true>;
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel<float>,
+                        CUDAReduceSumGradKernel<double>,
+                        CUDAReduceSumGradKernel<int>,
+                        CUDAReduceSumGradKernel<int64_t>);
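
What the kNoNeedBufferX/kNoNeedBufferY switch buys: once reduce_sum_grad and
reduce_mean_grad declare "X" as a no-need-buffer var, the framework may free
X's allocation before the backward op runs, and EigenTensor::From() would
otherwise read a dangling pointer. The grad kernel only relies on X's dims,
so it swaps in a var that has the same dims and a live buffer (the X@GRAD
output for X, Out@GRAD for Out). Below is a minimal standalone C++ sketch of
that substitution; ToyTensor and ReduceSumGradCompute are hypothetical names
standing in for framework::Tensor and ReduceGradKernel, an illustration of
the idea rather than Paddle code:

#include <cstdio>
#include <vector>

// Toy stand-in for a tensor: dims plus an optional buffer.
struct ToyTensor {
  std::vector<int> dims;
  std::vector<float> data;  // empty when the buffer has been dropped
};

// Mirrors the kNoNeedBufferX branch in ReduceGradKernel: only X's dims are
// ever read, so when X's buffer is dropped we alias the output var, which
// shares X's dims and is freshly allocated.
template <bool kNoNeedBufferX>
void ReduceSumGradCompute(const ToyTensor* x, const ToyTensor* out_grad,
                          ToyTensor* x_grad) {
  if (kNoNeedBufferX) {
    x = x_grad;  // same dims, valid buffer; its values are never read
  }
  // Grad of a full reduce_sum: broadcast dOut to X's shape.
  int numel = 1;
  for (int d : x->dims) numel *= d;
  x_grad->data.assign(numel, out_grad->data[0]);
}

int main() {
  ToyTensor x{{2, 3}, {}};      // X's buffer was intentionally dropped
  ToyTensor dout{{1}, {1.0f}};  // grad of a full reduce_sum
  ToyTensor dx{{2, 3}, {}};
  ReduceSumGradCompute<true>(&x, &dout, &dx);
  for (float v : dx.data) std::printf("%.1f ", v);  // prints six 1.0 values
  std::printf("\n");
  return 0;
}

The same reasoning explains the new ReduceGradOp::GetExpectedKernelType: the
kernel dtype is now inferred from Out@GRAD rather than from X, since X may no
longer hold a real allocation by the time the grad op is scheduled.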