[Cherry pick] Remove unnecessary op when trainable is false (#19434)

* fix optimizer bug test=develop

[Cherry pick] Remove unnecessary op when trainable is false (#19434)
* fix optimizer bug test=develop
9048229b · chengduo · GitHub · 5b3d33bd · 9048229b · 9048229b
3 changed files
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -712,8 +712,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        parameters = parameter_list
    else:
        params = program.global_block().all_parameters()
-        program.global_block().iter_parameters()
-        parameters = [param.name for param in params]
+        parameters = [param.name for param in params if param.trainable]

    params_and_grads = []
    for param in parameters:

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -360,8 +360,9 @@ class Optimizer(object):
        global_block = framework.default_main_program().global_block()
        start = len(global_block.ops)
        self.helper = LayerHelper(self.__class__.__name__)
-        self._create_accumulators(global_block,
-                                  [p[0] for p in parameters_and_grads])
+        self._create_accumulators(
+            global_block,
+            [p[0] for p in parameters_and_grads if p[0].trainable])
        self._create_global_learning_rate()

        optimize_ops = []
@@ -587,6 +588,20 @@ class Optimizer(object):
            tuple: (optimize_ops, params_grads) which are, list of operators appended;
            and list of (param, grad) Variables pair for optimization.
        """
+        assert isinstance(loss, Variable), "The loss should be an Variable."
+        if no_grad_set is None:
+            no_grad_set = set()
+        elif isinstance(no_grad_set, set) or isinstance(
+                no_grad_set, list) or isinstance(no_grad_set, tuple):
+            no_grad_set = set(no_grad_set)
+        else:
+            assert "no_grad_set should be a set, but the passed type is {}".format(
+                type(no_grad_set))
+        parameters = loss.block.program.global_block().all_parameters()
+        param_no_trainable = set(
+            [param.name for param in parameters if param.trainable is False])
+        # If a parameter is not trainable, it should not have a gradient.
+        no_grad_set.update(param_no_trainable)
        params_grads = self.backward(
            loss,
            startup_program=startup_program,
@@ -1390,7 +1405,7 @@ class AdamOptimizer(Optimizer):
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
        for param, grad in param_and_grads:
-            if grad is None:
+            if grad is None or param.trainable is False:
                continue
            with param.block.program._optimized_guard(
                [param, grad]), name_scope("optimizer"):
@@ -1553,7 +1568,7 @@ class AdamaxOptimizer(Optimizer):
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
        for param, grad in parameters_and_grads:
-            if grad is None:
+            if grad is None or param.trainable is False:
                continue
            with param.block.program._optimized_guard(
                [param, grad]), name_scope('adamx'):

--- a/python/paddle/fluid/tests/unittests/test_trainable.py
+++ b/python/paddle/fluid/tests/unittests/test_trainable.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from collections import Counter
+import unittest
+import paddle.fluid as fluid
+from simple_nets import init_data
+
+
+def test_trainable():
+    """Build a small network whose FC weight is created non-trainable.
+
+    Declares the ``image`` and ``label`` data layers, passes the image
+    through a 10-unit fully connected layer whose ``param_attr`` has
+    ``trainable=False``, and returns the mean of the cross-entropy loss
+    computed against the label.
+    """
+    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    frozen_attr = fluid.ParamAttr(trainable=False)
+    feature = fluid.layers.fc(input=image, size=10, param_attr=frozen_attr)
+    ce_loss = fluid.layers.cross_entropy(input=feature, label=label)
+    return fluid.layers.mean(ce_loss)
+
+
+class TestTrainable(unittest.TestCase):
+    """Checks the ops generated when a parameter is not trainable."""
+
+    def check_trainable(self,
+                        model,
+                        feed_dict,
+                        op_count,
+                        optimizer=None):
+        """Build ``model``, minimize its loss and verify the op counts.
+
+        Args:
+            model: Callable taking no arguments that builds the network in
+                the current program and returns the loss variable.
+            feed_dict: Feed passed to ``Executor.run`` for one step.
+            op_count: Mapping from op type to the expected number of ops of
+                that type in the main program's global block; a count of 0
+                means the op type must be absent.
+            optimizer: Optimizer used to minimize the loss. Defaults to a
+                freshly created ``fluid.optimizer.Adam()``.
+        """
+        # Create the default per call; a default evaluated in the signature
+        # would be a single optimizer instance shared by every call.
+        if optimizer is None:
+            optimizer = fluid.optimizer.Adam()
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        main = fluid.Program()
+        startup = fluid.Program()
+
+        with fluid.program_guard(main, startup):
+            loss = model()
+            optimizer.minimize(loss)
+
+            # Compare the op types appended to the main program with the
+            # expected counts.
+            ops = Counter(op.type for op in main.global_block().ops)
+            for op_type, expected in op_count.items():
+                if expected == 0:
+                    self.assertNotIn(op_type, ops)
+                else:
+                    self.assertEqual(ops[op_type], expected)
+
+            exe.run(fluid.default_startup_program())
+            exe.run(feed=feed_dict)
+
+    def test_trainable(self):
+        """Run the check with the default Adam optimizer and with Adamax."""
+        batch_size = 2
+        img, label = init_data(batch_size, img_shape=[784], label_range=9)
+        feed_dict = {'image': img, 'label': label}
+        # The FC weight is not trainable and the input x is stop_gradient,
+        # so no 'mul_grad' op should be appended.
+        self.check_trainable(
+            test_trainable,
+            feed_dict,
+            op_count={'adam': 1,
+                      'scale': 2,
+                      'mul_grad': 0})
+        self.check_trainable(
+            test_trainable,
+            feed_dict,
+            op_count={'adamax': 1,
+                      'scale': 1,
+                      'mul_grad': 0},
+            optimizer=fluid.optimizer.Adamax(learning_rate=0.2))
+
+
+# Run the unit tests when this file is executed directly.
+if __name__ == '__main__':
+    unittest.main()