From e0d8c6ac6878097ca8a44bfffebd0dd2a99c3f52 Mon Sep 17 00:00:00 2001
From: chengduo
Date: Tue, 2 Jul 2019 09:49:13 +0800
Subject: [PATCH] Add find_no_grad_vars in backward.py (#17942)

* add not_been_used_vars to no_grad_set
test=develop
---
 .../fluid/op_use_default_grad_op_maker.spec  |  1 -
 .../operators/hierarchical_sigmoid_op.cc     | 49 +++++++++++++---
 python/paddle/fluid/backward.py              | 24 +++++++-
 .../test_backward_find_no_grad_vars.py       | 57 +++++++++++++++++++
 4 files changed, 122 insertions(+), 9 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_backward_find_no_grad_vars.py

diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec
index a2355d2deee..4ec0a35b290 100644
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ b/paddle/fluid/op_use_default_grad_op_maker.spec
@@ -15,7 +15,6 @@ fusion_seqexpand_concat_fc
 fusion_seqpool_concat
 fusion_squared_mat_sub
 gru
-hierarchical_sigmoid
 lrn
 lstm_unit
 max_pool2d_with_index
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 479b839e473..2b3e2e5c484 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -86,6 +86,10 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
   }
 };
 
+/*
+ * Inputs: X, W, Label, PathTable, PathCode, Bias
+ * Outputs: Out, PreOut, W_out
+ */
 template <typename AttrType>
 class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -162,6 +166,37 @@ Hierarchical Probabilistic Neural Network Language Model."
   }
 };
 
+/*
+ * Inputs: X, W, Label, PathTable, PathCode, PreOut, Out@GRAD
+ * Outputs: X@GRAD, W@GRAD, Bias@GRAD
+ */
+class HierarchicalSigmoidGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType(this->ForwardOpType() + "_grad");
+    // Inputs: X, W, Label, PathTable, PathCode, PreOut, Out@GRAD
+    op->SetInput("X", Input("X"));
+    op->SetInput("W", Input("W"));
+    op->SetInput("Bias", Input("Bias"));
+    op->SetInput("Label", Input("Label"));
+    op->SetInput("PathTable", Input("PathTable"));
+    op->SetInput("PathCode", Input("PathCode"));
+    op->SetInput("PreOut", Output("PreOut"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    // Outputs: X@GRAD, W@GRAD, Bias@GRAD
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    op->SetAttrMap(Attrs());
+
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -209,17 +244,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
     auto attr = ctx->GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
-      VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
-               << " is set to SelectedRows";
+      VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
+              << " is set to SelectedRows";
       ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS);
     } else {
-      VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
-               << " is set to LoDTensor";
+      VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
+              << " is set to LoDTensor";
       ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR);
     }
     if (hasBias) {
-      VLOG(30) << "hierarchical_sigmoid_grad op "
-               << framework::GradVarName("Bias") << " is set to LoDTensor";
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to LoDTensor";
       ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR);
     }
     ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0]));
@@ -232,7 +267,7 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
                   ops::HierarchicalSigmoidOpMaker<int>,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::HierarchicalSigmoidGradMaker);
 REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp,
                   ops::HierarchicalSigmoidGradOpGradVarTypeInference);
 REGISTER_OP_CPU_KERNEL(
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 3771361cd2d..1af0e436f8f 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -552,7 +552,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
     op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
-
+    no_grad_vars = _find_no_grad_vars(root_block, op_path, [loss],
+                                      block_no_grad_set)
+    block_no_grad_set.update(no_grad_vars)
     no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
 
     input_grad_names_set = None
@@ -630,6 +632,26 @@ def _as_list(x):
     return list(x) if isinstance(x, collections.Sequence) else [x]
 
 
+def _find_no_grad_vars(block, op_path, targets, no_grad_set):
+    """
+    Find the variables in op_path that are not used by any later op
+    and are not targets; such variables belong in no_grad_set.
+    """
+    output_names = set([out.name for out in targets])
+    no_grad_var = []
+    for i, op in reversed(list(enumerate(op_path))):
+        # If the op has a sub_block, it is too complicated to find the correct no_grad_var.
+        if not op.has_attr("sub_block"):
+            for out_var in op.desc.output_arg_names():
+                if out_var not in output_names and out_var not in op.desc.input_arg_names(
+                ) and not block.vars[out_var].stop_gradient:
+                    no_grad_var.append(out_var)
+        for name in op.desc.input_arg_names():
+            if name not in no_grad_set:
+                output_names.add(name)
+    return set(no_grad_var)
+
+
 def _find_op_path_(block, outputs, inputs, no_grad_set):
     """
     no_grad_set will also be changed
diff --git a/python/paddle/fluid/tests/unittests/test_backward_find_no_grad_vars.py b/python/paddle/fluid/tests/unittests/test_backward_find_no_grad_vars.py
new file mode 100644
index 00000000000..cc662dfbdde
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_backward_find_no_grad_vars.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +from simple_nets import init_data + + +def simple_net1(): + x = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feature = fluid.layers.fc(input=x, size=20, act=None) + part1, part2 = fluid.layers.split(feature, num_or_sections=[10, 10], dim=1) + # Note that: part2 is not used. + loss = fluid.layers.cross_entropy(input=part1, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestBackward(unittest.TestCase): + def check_backward(self, model): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + main = fluid.Program() + startup = fluid.Program() + batch_size = 2 + + with fluid.program_guard(main, startup): + loss = model() + + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + optimizer.minimize(loss) + + exe.run(fluid.default_startup_program()) + img, label = init_data(batch_size, img_shape=[784], label_range=9) + exe.run(feed={'image': img, 'label': label}) + + def test_backward(self): + self.check_backward(simple_net1) + + +if __name__ == '__main__': + unittest.main() -- GitLab
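For readers who want the gist of the new _find_no_grad_vars pass without loading fluid, here is a minimal, framework-free sketch of the same reverse sweep. The Op record and the find_no_grad_vars function below are illustrative stand-ins rather than fluid APIs, and the sub_block and stop_gradient checks of the real helper are omitted; the toy op list mirrors simple_net1 from the new unit test, where the second output of split is never consumed.

from collections import namedtuple

# Toy stand-in for an operator desc: a type name plus input/output variable names.
Op = namedtuple("Op", ["type", "inputs", "outputs"])


def find_no_grad_vars(op_path, targets, no_grad_set):
    """Collect outputs that neither the targets nor any later op ever consume."""
    used_names = set(targets)  # names that eventually feed the targets
    no_grad_vars = []
    for op in reversed(op_path):
        for out in op.outputs:
            # An output that is neither a target nor read downstream receives
            # no gradient, so it can safely be added to no_grad_set.
            if out not in used_names and out not in op.inputs:
                no_grad_vars.append(out)
        for name in op.inputs:
            if name not in no_grad_set:
                used_names.add(name)
    return set(no_grad_vars)


if __name__ == "__main__":
    # Mirrors simple_net1: split produces part2, which no later op reads.
    ops = [
        Op("mul", ["image", "fc_w"], ["fc_out"]),
        Op("split", ["fc_out"], ["part1", "part2"]),
        Op("cross_entropy", ["part1", "label"], ["xent"]),
        Op("mean", ["xent"], ["loss"]),
    ]
    print(find_no_grad_vars(ops, targets=["loss"], no_grad_set=set()))  # {'part2'}

In the patched append_backward, the variables found this way are merged into block_no_grad_set before gradient ops are appended, so an unused forward output such as part2 no longer needs a gradient variable at all.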