From 770395cb93b979765045da37bd4cbc422bd0b33a Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 26 Nov 2020 22:49:18 +0800 Subject: [PATCH] Split train_mode and has_grad for tracer (#29064) * split train_mode and has_grad * fix format * fix ci problems * fix sample code --- paddle/fluid/imperative/tracer.cc | 17 +++++++++++++---- paddle/fluid/pybind/imperative.cc | 2 +- python/paddle/fluid/dygraph/base.py | 6 +++--- python/paddle/fluid/dygraph/tracer.py | 2 +- python/paddle/fluid/layers/nn.py | 13 +++++++++---- .../transformer_dygraph_model.py | 16 +++++++--------- .../tests/unittests/test_imperative_basic.py | 5 +++-- .../unittests/test_imperative_decorator.py | 2 +- 8 files changed, 38 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 2f802a775b..4747d08a94 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -38,11 +38,20 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { } static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { - for (const auto& name_pair : outs) { - for (const auto& vb : name_pair.second) { - VLOG(6) << "Set output: " << vb->Name() << "'s OverridedStopGradient as " + for (const auto& pair : outs) { + for (const auto& var : pair.second) { + // NOTE(zhiqiu): this happends when None output are passed from python + // side. For example, fake_quantize_dequantize_moving_average_abs_max may + // pass None OutAccum in eval mode. + // It can be refined by generate several different pybind interface for + // one operator with different function signature. + if (var == nullptr) { + VLOG(4) << pair.first << " is NULL"; + continue; + } + VLOG(6) << "Set output: " << var->Name() << "'s OverridedStopGradient as " << generate_grad; - vb->InnerSetOverridedStopGradient(generate_grad); + var->InnerSetOverridedStopGradient(generate_grad); } } } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index d932b25aea..7e3e175c09 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1087,7 +1087,7 @@ void BindImperative(py::module *m_ptr) { &imperative::Tracer::SetEnableProgramDescTracing) .def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled, &imperative::Tracer::SetEnableAutoCast) - .def_property("_train_mode", &imperative::Tracer::HasGrad, + .def_property("_has_grad", &imperative::Tracer::HasGrad, &imperative::Tracer::SetHasGrad) .def_property( "_expected_place", diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 397f873f96..76f4a74dd3 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -190,12 +190,12 @@ def disable_dygraph(): def _switch_tracer_mode_guard_(is_train=True): tracer = framework._dygraph_tracer() if tracer: - mode = tracer._train_mode - tracer._train_mode = is_train + has_grad = tracer._has_grad + tracer._has_grad = is_train try: yield finally: - tracer._train_mode = mode + tracer._has_grad = has_grad else: yield diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 6b1d237881..2047968085 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -41,7 +41,7 @@ class Tracer(core.Tracer): def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False): self.trace(type, inputs, outputs, attrs, - framework._current_expected_place(), self._train_mode and + framework._current_expected_place(), self._has_grad and not stop_gradient) def train_mode(self): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 030f2f2651..60174ed759 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -945,7 +945,7 @@ def cos_sim(X, Y): @deprecated(since="2.0.0", update_to="paddle.nn.functional.dropout") def dropout(x, dropout_prob, - is_test=False, + is_test=None, seed=None, name=None, dropout_implementation="downgrade_in_infer"): @@ -964,7 +964,8 @@ def dropout(x, Args: x (Variable): The input tensor variable. The data type is float16 or float32 or float64. dropout_prob (float): Probability of setting units to zero. - is_test (bool): A flag indicating whether it is in test phrase or not. + is_test (bool): A flag indicating whether it is in test phrase or not. + Default None, in dynamic graph, it use global tracer mode; in static graph, it means False. seed (int): A Python integer used to create random seeds. If this parameter is set to None, a random seed is used. NOTE: If an integer seed is given, always the same output @@ -996,7 +997,10 @@ def dropout(x, .. code-block:: python + import paddle import paddle.fluid as fluid + + paddle.enable_static() x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") dropped = fluid.layers.dropout(x, dropout_prob=0.5) """ @@ -1017,9 +1021,10 @@ def dropout(x, if (seed is None or seed == 0) and default_main_program().random_seed != 0: seed = default_main_program().random_seed - _is_test = not _dygraph_tracer()._train_mode + if is_test is None: + is_test = not _dygraph_tracer()._train_mode out, mask = core.ops.dropout( - x, 'dropout_prob', dropout_prob, 'is_test', _is_test, 'fix_seed', + x, 'dropout_prob', dropout_prob, 'is_test', is_test, 'fix_seed', seed is not None, 'seed', seed if seed is not None else 0, 'dropout_implementation', dropout_implementation) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 5c4f6400cb..1fee1c1ef6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -64,7 +64,7 @@ class PrePostProcessLayer(Layer): elif cmd == "d": # add dropout if dropout_rate: self.functors.append(lambda x: layers.dropout( - x, dropout_prob=dropout_rate, is_test=False)) + x, dropout_prob=dropout_rate)) def forward(self, x, residual=None): for i, cmd in enumerate(self.process_cmd): @@ -137,8 +137,7 @@ class MultiHeadAttention(Layer): product += attn_bias weights = layers.softmax(product) if self.dropout_rate: - weights = layers.dropout( - weights, dropout_prob=self.dropout_rate, is_test=False) + weights = layers.dropout(weights, dropout_prob=self.dropout_rate) out = layers.matmul(weights, v) out = layers.transpose(out, perm=[0, 2, 1, 3]) out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) @@ -156,8 +155,7 @@ class FFN(Layer): def forward(self, x): hidden = self.fc1(x) if self.dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=self.dropout_rate, is_test=False) + hidden = layers.dropout(hidden, dropout_prob=self.dropout_rate) out = self.fc2(hidden) return out @@ -276,8 +274,8 @@ class WrapEncoder(Layer): pos_enc.stop_gradient = True emb = word_emb + pos_enc enc_input = layers.dropout( - emb, dropout_prob=self.emb_dropout, - is_test=False) if self.emb_dropout else emb + emb, + dropout_prob=self.emb_dropout, ) if self.emb_dropout else emb enc_output = self.encoder(enc_input, src_slf_attn_bias) return enc_output @@ -407,8 +405,8 @@ class WrapDecoder(Layer): pos_enc.stop_gradient = True emb = word_emb + pos_enc dec_input = layers.dropout( - emb, dropout_prob=self.emb_dropout, - is_test=False) if self.emb_dropout else emb + emb, + dropout_prob=self.emb_dropout, ) if self.emb_dropout else emb dec_output = self.decoder(dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, caches) dec_output = layers.reshape( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 8892c08a47..514154f1dd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -287,13 +287,14 @@ class TestImperative(unittest.TestCase): with paddle.no_grad(): self.assertTrue(l1.weight.stop_gradient is False) tmp = l1.weight * 2 - self.assertTrue(tmp.stop_gradient) + print(tmp) + self.assertFalse(tmp.stop_gradient) x = fluid.dygraph.to_variable(data) y = l0(x) + tmp o = l1(y) o.backward() - self.assertTrue(tmp._grad_ivar() is None) + self.assertTrue(tmp._grad_ivar() is not None) self.assertTrue(l0.weight._grad_ivar() is not None) def test_sum_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py index 13ca1840d0..7d20a9b952 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py @@ -30,7 +30,7 @@ class TestTracerMode(unittest.TestCase): @fluid.dygraph.no_grad def no_grad_func(self, a): - self.assertEqual(self.tracer._train_mode, False) + self.assertEqual(self.tracer._has_grad, False) return a @framework.dygraph_not_support -- GitLab