From af692c9140a8129cf73b6a0381b08dc4810d0860 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 5 Sep 2019 12:53:09 +0800
Subject: [PATCH] update reduce_sum and reduce_mean to save memory,
 test=develop (#19608)

---
 .../fluid/op_use_default_grad_op_maker.spec   |  1 -
 .../operators/reduce_ops/reduce_mean_op.cc    | 24 +++++---
 .../reduce_ops/reduce_mean_op.part.cu         | 18 +++---
 paddle/fluid/operators/reduce_ops/reduce_op.h | 22 ++++++-
 .../operators/reduce_ops/reduce_sum_op.cc     | 61 +++++++++++++++----
 .../operators/reduce_ops/reduce_sum_op.h      |  5 +-
 .../reduce_ops/reduce_sum_op.part.cu          | 18 +++---
 7 files changed, 106 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec
index 0d106d8a692..389a174cc4a 100644
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ b/paddle/fluid/op_use_default_grad_op_maker.spec
@@ -20,7 +20,6 @@ rank_loss
 reduce_max
 reduce_min
 reduce_prod
-reduce_sum
 reshape
 rnn_memory_helper
 sequence_softmax
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index d1b508792c2..e549d2bddfe 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -61,6 +61,8 @@ class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase {
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceMeanGradNoNeedBufferVarInference,
+                                      "X");
 }  // namespace operators
 }  // namespace paddle
 
@@ -73,7 +75,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker {
 REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__,
                   ops::ReduceMeanOpGradDescMaker);
 REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp,
-                  ops::ReduceMeanDoubleGradMaker);
+                  ops::ReduceMeanDoubleGradMaker,
+                  ops::ReduceMeanGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(reduce_mean,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          float, ops::MeanFunctor>,
@@ -83,12 +86,13 @@ REGISTER_OP_CPU_KERNEL(reduce_mean,
                                          int, ops::MeanFunctor>,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
                                          int64_t, ops::MeanFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::MeanGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::MeanGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::MeanGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::MeanGradFunctor>);
+
+template <typename T>
+using CPUReduceMeanGradKernel =
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, T,
+                          ops::MeanGradFunctor, true>;
+
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<float>,
+                       CPUReduceMeanGradKernel<double>,
+                       CPUReduceMeanGradKernel<int>,
+                       CPUReduceMeanGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
index 9324ec1e1db..12eceb33ec2 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
@@ -15,12 +15,12 @@
 // .part used to speed up nvcc compile
 #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MeanGradFunctor>);
+template <typename T>
+using CUDAReduceMeanGradKernel =
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
+                          ops::MeanGradFunctor, true>;
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<float>,
+                        CUDAReduceMeanGradKernel<double>,
+                        CUDAReduceMeanGradKernel<int>,
+                        CUDAReduceMeanGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 67fd3e1dad4..838ac895e5d 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -75,7 +75,8 @@ class ReduceKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor,
+          bool kNoNeedBufferX = false, bool kNoNeedBufferY = false>
 class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -88,6 +89,17 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
+    // NOTE: EigenTensor::From() uses tensor->data()
+    // if op has NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX or
+    // kNoNeedBufferY should set true
+    // and use fake var that has same dims.
+    if (kNoNeedBufferX) {
+      input0 = output;
+    }
+    if (kNoNeedBufferY) {
+      input1 = input2;
+    }
+
     // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
     // not be set as Input in grad Maker, use Out_grad to replace here
     if (!input1) input1 = input2;
@@ -220,6 +232,14 @@ class ReduceGradOp : public framework::OperatorWithKernel {
       ctx->ShareLoD("X", /*->*/ x_grad_name);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
 };
 
 class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index c7742f45dd1..14bb2cf0013 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -13,8 +13,47 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
+#include <memory>
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+// NOTE: Input(Out) is unnecessary in reduce_sum_grad, and Input(X) needs no
+// buffer
+class ReduceSumOpGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("reduce_sum_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceSumGradNoNeedBufferVarInference,
+                                      "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+class ReduceSumOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_sum"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_sum"; }
+};
+
+REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker,
+                  ops::ReduceSumOpGradDescMaker);
+REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp,
+                  ops::ReduceSumGradNoNeedBufferVarInference);
 
-REGISTER_REDUCE_OP(reduce_sum);
 REGISTER_OP_CPU_KERNEL(
     reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
                                   ops::SumFunctor>,
@@ -23,13 +62,13 @@ REGISTER_OP_CPU_KERNEL(
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    reduce_sum_grad,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
-                             ops::SumGradFunctor>,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
-                             ops::SumGradFunctor>,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
-                             ops::SumGradFunctor>,
-    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                             ops::SumGradFunctor>);
+
+template <typename T>
+using CPUReduceSumGradKernel =
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, T,
+                             ops::SumGradFunctor, true>;
+
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel<float>,
+                       CPUReduceSumGradKernel<double>,
+                       CPUReduceSumGradKernel<int>,
+                       CPUReduceSumGradKernel<int64_t>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
index 26f59c72b4b..7343d01e29d 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h
@@ -22,7 +22,8 @@ namespace paddle {
 namespace operators {
 
 // use for loop to speed up Eigen broadcast. 4 timer faster then broadcast
-template <typename DeviceContext, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor,
+          bool kNoNeedBufferX = false>
 class ReduceSumGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -72,7 +73,7 @@ class ReduceSumGradKernel : public framework::OpKernel<T> {
     }
 
     // default use Eigen broadcast
-    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    ReduceGradKernel<DeviceContext, T, Functor, kNoNeedBufferX> kernel;
     kernel.Compute(context);
   }
 };
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
index eb3295731b0..0d689d710a1 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu
@@ -15,12 +15,12 @@
 #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::SumGradFunctor>);
+template <typename T>
+using CUDAReduceSumGradKernel =
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
+                          ops::SumGradFunctor, true>;
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel<float>,
+                        CUDAReduceSumGradKernel<double>,
+                        CUDAReduceSumGradKernel<int>,
+                        CUDAReduceSumGradKernel<int64_t>);
-- 
GitLab
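
Why this saves memory: the reduce grad kernels read only the dims of X (and of Out),
never their buffers. Once the grad op declares NoNeedBufferVars, the framework can
free those buffers right after the forward pass and hand the kernel a shape-only
placeholder, which is what the "if (kNoNeedBufferX) input0 = output;" swap in the
patch relies on: any tensor with the same dims will do. The following standalone toy
sketch (illustrative names only, not Paddle source) shows the idea for reduce_sum,
whose gradient is a pure broadcast of dOut over X's shape:

// toy_reduce_sum_grad.cc - standalone sketch, not Paddle source.
// Demonstrates that reduce_sum's gradient needs only the *shape* of X:
// dX = broadcast(dOut), because every element of X contributes to the
// sum with weight 1. X's data buffer can therefore be freed after the
// forward pass, which is exactly what NoNeedBufferVars enables.
#include <cassert>
#include <cstdio>
#include <vector>

struct Tensor {
  std::vector<int> dims;
  std::vector<float> data;  // may stay empty when only dims are needed
};

// Gradient of Out = sum(X, last dim): reads x_meta.dims, never x_meta.data.
Tensor ReduceSumGrad(const Tensor& x_meta, const Tensor& dout,
                     int reduced_dim_size) {
  assert(x_meta.data.empty() && "only the shape of X is consulted");
  Tensor dx;
  dx.dims = x_meta.dims;
  dx.data.reserve(dout.data.size() * reduced_dim_size);
  for (float g : dout.data) {
    for (int i = 0; i < reduced_dim_size; ++i) dx.data.push_back(g);
  }
  return dx;
}

int main() {
  Tensor x{{2, 3}, {}};                 // forward buffer already released
  Tensor dout{{2}, {1.0f, 2.0f}};       // dL/dOut for Out = sum(X, dim=1)
  Tensor dx = ReduceSumGrad(x, dout, /*reduced_dim_size=*/3);
  for (float v : dx.data) std::printf("%g ", v);  // prints: 1 1 1 2 2 2
  std::printf("\n");
  return 0;
}

The same reasoning applies to reduce_mean (scale dOut by 1/n), which is why both
grad kernel registrations above pass true for the kNoNeedBufferX template flag.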