diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h deleted file mode 100644 index 0f04d6db8e63d5d069745ed1895df774e69d60d0..0000000000000000000000000000000000000000 --- a/paddle/fluid/imperative/backward_strategy.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Created by Jiabin on 2019-04-25. -// -#pragma once - -namespace paddle { -namespace imperative { -namespace detail { - -struct BackwardStrategy { - /* DyGraph now support two kinds of backward strategy, one is sorted sum - * gradient, another is sum gradient once they are created */ - // TODO(jiabin): add more Strategy when we support - bool sorted_sum_gradient_{false}; -}; - -} // namespace detail -} // namespace imperative -} // namespace paddle diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index de1246883f1019bc3e6adabadbc9e071926eb772..a91f14e56b719515bfd4d07896648e596a2282dd 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -30,12 +30,13 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(sort_sum_gradient); + namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy, - bool retain_graph) { - backward_strategy_ = strategy; +void BasicEngine::Init(VarBase* var, bool retain_graph) { + sorted_sum_gradient_ = FLAGS_sort_sum_gradient; retain_graph_ = retain_graph; init_node_ = var->GradVarBase()->GradNode(); var->GradVarBase()->ClearGradNode(); @@ -105,7 +106,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) { auto& accumulator = accumulators_[var.get()]; if (!accumulator) { - if (backward_strategy_.sorted_sum_gradient_) { + if (sorted_sum_gradient_) { accumulator.reset(new SortedGradientAccumulator(var.get())); } else { accumulator.reset(new EagerGradientAccumulator(var.get())); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 4d25d81235098cca37491b1d8e43b481adc2fd0a..d1aa69f16868d3bcc67458330594dd149564c0bf 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -18,7 +18,6 @@ #include #include #include -#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/gradient_accumulator.h" @@ -30,8 +29,7 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, const detail::BackwardStrategy& strategy, - bool retain_graph = false); + void Init(VarBase* var, bool retain_graph = false); void Execute() override; @@ -46,7 +44,7 @@ class BasicEngine : public Engine { private: std::shared_ptr init_node_; - detail::BackwardStrategy backward_strategy_; + bool sorted_sum_gradient_; std::unordered_map node_deps_; std::unordered_map> accumulators_; 
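With the BackwardStrategy struct deleted, BasicEngine::Init re-reads the process-wide FLAGS_sort_sum_gradient GFlag on every backward run, and that value selects SortedGradientAccumulator instead of the default EagerGradientAccumulator. Below is a minimal Python sketch of driving that choice, adapted from the BackwardStrategy docstring example removed later in this diff; fluid.set_flags is the same mechanism the updated unit tests use, and marking x_var as requiring gradients is added here (it is not in the removed example) so the accumulation actually runs:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    # Read by BasicEngine::Init at the start of each backward run: True picks
    # SortedGradientAccumulator (sum in reverse forward order), False keeps
    # the default EagerGradientAccumulator (sum as gradients arrive).
    fluid.set_flags({'FLAGS_sort_sum_gradient': True})

    with fluid.dygraph.guard():
        x_var = fluid.dygraph.to_variable(np.ones([2, 2], np.float32))
        x_var.stop_gradient = False
        # x_var feeds ten scale ops, so its gradient is a sum of ten terms;
        # the flag only controls the order in which those terms are added.
        sums_inputs = [fluid.layers.scale(x_var) for _ in range(10)]
        loss = fluid.layers.reduce_sum(fluid.layers.sums(sums_inputs))
        loss.backward()  # no BackwardStrategy argument any more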
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 4f133bf80c7904d9b6a84c933d431c2820b999e4..3afe5af7f6348654c4cad3d44952cef43ba93f7e 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -33,6 +33,8 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_bool(sort_sum_gradient); + namespace paddle { namespace imperative { @@ -529,8 +531,7 @@ class PartialGradTask { const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, - const detail::BackwardStrategy &strategy, bool create_graph, + const platform::Place &place, bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs); std::vector> Run(); @@ -577,7 +578,7 @@ class PartialGradTask { bool retain_graph_; bool allow_unused_; bool only_inputs_; - detail::BackwardStrategy strategy_; + bool sorted_sum_gradient_{FLAGS_sort_sum_gradient}; }; PartialGradTask::PartialGradTask( @@ -585,15 +586,14 @@ PartialGradTask::PartialGradTask( const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, const detail::BackwardStrategy &strategy, - bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) { + const platform::Place &place, bool create_graph, bool retain_graph, + bool allow_unused, bool only_inputs) { input_targets_ = input_targets; place_ = place; create_graph_ = create_graph; retain_graph_ = retain_graph; allow_unused_ = allow_unused; only_inputs_ = only_inputs; - strategy_ = strategy; PADDLE_ENFORCE_EQ(only_inputs_, true, platform::errors::Unimplemented( @@ -981,7 +981,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) { if (!accumulator) { accumulator.reset(new GradientAccumulationInfo( - var, strategy_.sorted_sum_gradient_, create_graph_)); + var, sorted_sum_gradient_, create_graph_)); } accumulator->IncreaseTotalRefCnt(); @@ -1033,11 +1033,11 @@ PartialGradEngine::PartialGradEngine( const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, const detail::BackwardStrategy &strategy, - bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) + const platform::Place &place, bool create_graph, bool retain_graph, + bool allow_unused, bool only_inputs) : task_(new PartialGradTask(input_targets, output_targets, output_grads, - no_grad_vars, place, strategy, create_graph, - retain_graph, allow_unused, only_inputs)) {} + no_grad_vars, place, create_graph, retain_graph, + allow_unused, only_inputs)) {} PartialGradEngine::~PartialGradEngine() { Clear(); } diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h index a7f28c49ec3950674cd43127f51934089a497412..b5da39f8d4237130fd4674eacb479aaf6b9ba348 100644 --- a/paddle/fluid/imperative/partial_grad_engine.h +++ b/paddle/fluid/imperative/partial_grad_engine.h @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/platform/place.h" @@ -33,8 +32,7 @@ class PartialGradEngine : public Engine { const std::vector> &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, - const detail::BackwardStrategy &strategy, bool 
create_graph, + const platform::Place &place, bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs); ~PartialGradEngine(); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 3c3ec2e6263396881597649d3ab643b5492d630a..892acffb712d9734e525a403881fda47ca0df23a 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { framework::AttributeMap reduce_attr_map; tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); - detail::BackwardStrategy back_st; imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get(), back_st); + engine.Init(reduce_sum_out.get()); engine.Execute(); framework::LoDTensor rlt; @@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); - detail::BackwardStrategy back_st; imperative::BasicEngine engine; - engine.Init(vout.get(), back_st); + engine.Init(vout.get()); engine.Execute(); // check the grad diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 8667375c6f2726f1099c6e57c6e793252b01d454..af8798a4b7cf5a8832ce9345cad45ce3096484e4 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -508,3 +508,16 @@ DEFINE_int32( "summary will be shown." "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and " "error message summary will be shown."); + +/** + * Debug related FLAG + * Name: sort_sum_gradient + * Since Version: 2.0.0 + * Value Range: bool, default=false + * Example: + * Note: If True, gradients are summed in the reverse order of + * the forward execution sequence. + */ +DEFINE_bool(sort_sum_gradient, false, + "Sum gradients in the reverse order of " + "the forward execution sequence."); diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index deca9625e63d05625c407a1282b396398bb78ccc..f1084018d9c79e46c33098dafdb48dc395dac652 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -38,6 +38,7 @@ DECLARE_bool(enable_rpc_profiler); DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_bool(reader_queue_speed_test_mode); DECLARE_int32(call_stack_level); +DECLARE_bool(sort_sum_gradient); // device management DECLARE_int32(paddle_num_threads); // executor @@ -340,7 +341,7 @@ static void RegisterGlobalVarGetterSetter() { REGISTER_PUBLIC_GLOBAL_VAR( FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph, FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf, - FLAGS_call_stack_level, FLAGS_cpu_deterministic, + FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use, diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 021d10ca7facb0bac11cd5d08eddea7e01b9b566..489dd198876204486fc94518fbef0c806d0543d4 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -30,7 +30,6 @@ limitations under the License.
*/ #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/imperative/amp_auto_cast.h" -#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/layer.h" @@ -507,50 +506,6 @@ void BindImperative(py::module *m_ptr) { []() { memory::allocation::MemoryMapFdSet::Instance().Clear(); }); #endif - py::class_ backward_strategy( - m, "BackwardStrategy", R"DOC( - - BackwardStrategy is a descriptor of how to run the backward process. - - **Note**: - **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode** - - Attribute: - **sort_sum_gradient**: - - If framework will sum the gradient by the reverse order of trace. eg. x_var ( :ref:`api_guide_Variable` ) will be the input of multiple OP such as :ref:`api_fluid_layers_scale` , this attr will decide if framework will sum gradient of `x_var` by the reverse order. - - By Default: False - - Examples: - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - x_var = fluid.dygraph.to_variable(x) - sums_inputs = [] - # x_var will be multi-scales' input here - for _ in range(10): - sums_inputs.append(fluid.layers.scale(x_var)) - ret2 = fluid.layers.sums(sums_inputs) - loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) - )DOC"); - backward_strategy.def(py::init()) - .def_property("sort_sum_gradient", - [](const imperative::detail::BackwardStrategy &self) { - return self.sorted_sum_gradient_; - }, - [](imperative::detail::BackwardStrategy &self, - bool sorted_sum_gradient) { - self.sorted_sum_gradient_ = sorted_sum_gradient; - }); - m.def("start_imperative_gperf_profiler", []() { imperative::StartProfile(); }); @@ -745,21 +700,18 @@ void BindImperative(py::module *m_ptr) { inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) loss2.clear_gradient() print("After clear {}".format(loss2.gradient())) )DOC") .def("_run_backward", - [](imperative::VarBase &self, - const imperative::detail::BackwardStrategy &bckst, - const imperative::Tracer &tracer, bool retain_graph) { + [](imperative::VarBase &self, const imperative::Tracer &tracer, + bool retain_graph) { // TODO(jiabin): when we impl more backward execution we can // select them auto *engine = tracer.GetEngine(); - engine->Init(&self, bckst, retain_graph); + engine->Init(&self, retain_graph); VLOG(3) << "Start backward"; engine->Execute(); VLOG(3) << "Finish backward"; @@ -1024,13 +976,11 @@ void BindImperative(py::module *m_ptr) { &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, - const imperative::detail::BackwardStrategy &strategy, - bool create_graph, bool retain_graph, bool allow_unused, - bool only_inputs) { + const platform::Place &place, bool create_graph, bool retain_graph, + bool allow_unused, bool only_inputs) { imperative::PartialGradEngine engine( input_targets, output_targets, output_grads, no_grad_vars, place, - strategy, create_graph, retain_graph, allow_unused, only_inputs); + create_graph, retain_graph, allow_unused, 
only_inputs); engine.Execute(); return engine.GetResult(); }, diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 4e1e04043ad7d2fd72bfe891b755a2503c2096b3..c22eee3df6f294d0e364b734c9472a0ef62270e4 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -225,7 +225,6 @@ from .framework import CPUPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS -from .framework import BackwardStrategy #DEFINE_ALIAS from .framework import to_variable #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2ed8642c86d95bf049920d281f4063da9779623e..9f748b7956f9faa6b1c948d87f0ef4659057a421 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -196,6 +196,7 @@ def __bootstrap__(): 'free_idle_chunk', 'free_when_no_cache_hit', 'call_stack_level', + 'sort_sum_gradient', ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index fc14e9b390e6ae4d695252f064f1f0697aaee258..cf270ced3b704179856b1ab04dbeae8a04fbc589 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -38,9 +38,6 @@ from .checkpoint import * from . import learning_rate_scheduler from .learning_rate_scheduler import * -from . import backward_strategy -from .backward_strategy import * - from . import jit from .jit import * @@ -69,7 +66,6 @@ __all__ += nn.__all__ __all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ -__all__ += backward_strategy.__all__ __all__ += jit.__all__ __all__ += io.__all__ __all__ += rnn.__all__ diff --git a/python/paddle/fluid/dygraph/backward_strategy.py b/python/paddle/fluid/dygraph/backward_strategy.py deleted file mode 100644 index bfcf66af31ce13b3394b5b091882b1976f9f003a..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/dygraph/backward_strategy.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.fluid import core - -__all__ = ["BackwardStrategy"] - -BackwardStrategy = core.BackwardStrategy diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index d4f1ca333945d8933a7a9df7ca93ea825e5cf110..0c4a1964838c608fc5dd46a1dfb16d3d3d7b6ed9 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -319,8 +319,7 @@ def grad(outputs, create_graph=False, only_inputs=True, allow_unused=False, - no_grad_vars=None, - backward_strategy=None): + no_grad_vars=None): ''' .. note:: **This API is ONLY available in Dygraph mode.** @@ -363,9 +362,6 @@ def grad(outputs, their gradients if allow_unused=True. Default False. 
no_grad_vars (Variable|list(Variable)|tuple(Variable)|set(Variable), optional): the Variables whose gradients are not needed to compute. Default None. - backward_strategy (BackwardStrategy, optional): The backward strategy to - compute gradients. See :ref:`api_fluid_dygraph_BackwardStrategy` for - details. Default None. Returns: tuple: a tuple of Variables, whose length is the same as the Variable number @@ -503,12 +499,6 @@ def grad(outputs, raise AssertionError( "no_grad_vars must be None, Variable or list/tuple/set of Variables") - if backward_strategy is None: - backward_strategy = core.BackwardStrategy() - - assert isinstance(backward_strategy, core.BackwardStrategy), \ - "backward_strategy must be type paddle.fluid.dygraph.BackwardStrategy" - assert isinstance(create_graph, bool), "create_graph must be True or False" if retain_graph is None: @@ -524,9 +514,9 @@ def grad(outputs, place = core.Place() place.set_place(framework._current_expected_place()) - return core.dygraph_partial_grad( - inputs, outputs, grad_outputs, no_grad_vars, place, backward_strategy, - create_graph, retain_graph, allow_unused, only_inputs) + return core.dygraph_partial_grad(inputs, outputs, grad_outputs, + no_grad_vars, place, create_graph, + retain_graph, allow_unused, only_inputs) @framework.dygraph_only diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9dbaab2580d21397fa7a4e03b03a5f1c4ac887f2..7cb17843396a6ed79c36126172a253864dbf3d0f 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -15,7 +15,6 @@ import inspect from .. import framework from .. import core -from . import BackwardStrategy from ..framework import Variable, Parameter, ParamBase from .base import switch_to_static_graph import numpy as np @@ -129,19 +128,18 @@ def monkey_patch_varbase(): framework._current_expected_place()) @framework.dygraph_only - def backward(self, backward_strategy=None, retain_graph=False): + def backward(self, retain_graph=False): """ **Notes**: **This API is ONLY available in Dygraph mode** - Run backward of current Graph which starts from current Variable + Run backward of current Graph which starts from current Tensor. Args: - backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would - like to add more ops to the built graph after calling this method(`backward`), set the parameter - `retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. - Defaults to False. + like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter + :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient. + Defaults to False. Returns: NoneType: None Examples: .. code-block:: python - import paddle.fluid as fluid import numpy as np + import paddle + paddle.disable_static() x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) - # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since - # there is no one need gradient on it.
- tmp.stop_gradient=False - inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) - loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + inputs = [] + for _ in range(10): + tmp = paddle.to_tensor(x) + # if we don't set tmp's stop_gradient to False, no path to the loss + # will carry a gradient, since nothing on it requires one. + tmp.stop_gradient=False + inputs.append(tmp) + ret = paddle.sums(inputs) + loss = paddle.reduce_sum(ret) + loss.backward() """ if framework.in_dygraph_mode(): - if backward_strategy is None: - backward_strategy = BackwardStrategy() - backward_strategy.sort_sum_gradient = False - - self._run_backward(backward_strategy, - framework._dygraph_tracer(), retain_graph) + self._run_backward(framework._dygraph_tracer(), retain_graph) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") @@ -205,9 +196,7 @@ def monkey_patch_varbase(): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) """ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ef50294b8e762ae84f9b37f2571458e6588c4bc6..fc4e91aad4fff1db325e17828d26ccd94c164c3d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1106,15 +1106,18 @@ class Variable(object): pass @fake_interface_only - def backward(self, backward_strategy=None): + def backward(self, retain_graph=False): """ **Notes**: **This API is ONLY available in Dygraph mode** - Run backward of current Graph which starts from current Variable + Run backward of current Graph which starts from current Tensor. Args: - backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward + retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would + like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter + :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient. + Defaults to False. Returns: NoneType: None Examples: .. code-block:: python - import paddle.fluid as fluid import numpy as np + import paddle + paddle.disable_static() x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) - # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since - # there is no one need gradient on it.
+ tmp.stop_gradient=False + inputs.append(tmp) + ret = paddle.sums(inputs) + loss = paddle.reduce_sum(ret) + loss.backward() """ pass @@ -1170,9 +1171,7 @@ class Variable(object): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) # example2: return tuple of ndarray @@ -1218,9 +1217,7 @@ class Variable(object): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + loss2.backward() print(loss2.gradient()) loss2.clear_gradient() print("After clear {}".format(loss2.gradient())) diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index bc858828058079e7d54d3c753807725ce654a778..74cc87bd9dbd691c6a1683ac44cba246e67c4af2 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -38,8 +38,7 @@ class TestDirectory(unittest.TestCase): 'paddle.enable_static', 'paddle.disable_static', 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', - 'paddle.static.save', 'paddle.static.load', - 'paddle.BackwardStrategy', 'paddle.ParallelEnv', + 'paddle.static.save', 'paddle.static.load', 'paddle.ParallelEnv', 'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', @@ -98,7 +97,6 @@ class TestDirectory(unittest.TestCase): 'paddle.imperative.enable', 'paddle.imperative.guard', 'paddle.imperative.grad', 'paddle.imperative.no_grad', 'paddle.imperative.save', 'paddle.imperative.load', - 'paddle.imperative.BackwardStrategy', 'paddle.imperative.ParallelEnv', 'paddle.imperative.prepare_context', 'paddle.imperative.DataParalell', 'paddle.imperative.jit', diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 2a25bf6f8abade11d9ad25894753f6d17066e7fd..837e82882e9df8f50ca83a5df20ddf0f03ee504b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -238,8 +238,7 @@ class TestImperativeAutoPrune(unittest.TestCase): out2 = linear2(b) out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) - backward_strategy = fluid.dygraph.BackwardStrategy() - out.backward(backward_strategy) + out.backward() self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) @@ -311,9 +310,8 @@ class TestImperativeAutoPrune(unittest.TestCase): out2 = linear2(b) out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - out.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + out.backward() self.assertTrue(linear.weight.gradient() is None) self.assertTrue(out1.gradient() is None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py 
b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index f83f8ef35215e5a0199c4d63744882126212b928..b74182d27ab8c89cc43d3fc1656ca13916d159c1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -314,9 +314,8 @@ class TestImperative(unittest.TestCase): inputs2.append(tmp) ret2 = fluid.layers.sums(inputs2) loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + loss2.backward() self.assertTrue(np.allclose(ret.numpy(), x * 10)) self.assertTrue(np.allclose(inputs[0].gradient(), x)) @@ -403,9 +402,8 @@ class TestImperative(unittest.TestCase): x2 = l2(var_inp2)[0] self.assertIsNotNone(x2) dy_out2 = x2.numpy() - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - x2.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + x2.backward() dy_grad2 = l2._x_for_debug.gradient() with new_program_scope(): @@ -442,9 +440,8 @@ class TestImperative(unittest.TestCase): mlp2 = MLP(input_size=2) out2 = mlp2(var_inp2) dy_out2 = out2.numpy() - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - out2.backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + out2.backward() dy_grad2 = mlp2._linear1.weight.gradient() with new_program_scope(): @@ -552,9 +549,8 @@ class TestImperative(unittest.TestCase): simple_rnn2 = SimpleRNN() outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2) dy_out2 = outs2[3].numpy() - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - outs2[3].backward(backward_strategy) + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + outs2[3].backward() dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient() dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient() dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index f76c3bd958081070939a85c390eeaeaa389ad5a4..af71d9d27b9a349e2b0e08c03dd04e3936d34afb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -275,8 +275,7 @@ class TestDygraphDeepCF(unittest.TestCase): deepcf2 = DeepCF(num_users, num_items, matrix) adam2 = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf2.parameters()) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) for e in range(NUM_EPOCHES): sys.stderr.write('epoch %d\n' % e) for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): @@ -289,7 +288,7 @@ class TestDygraphDeepCF(unittest.TestCase): fluid.layers.log_loss(prediction2, to_variable(labels_np[ slice:slice + BATCH_SIZE]))) - loss2.backward(backward_strategy) + loss2.backward() adam2.minimize(loss2) deepcf2.clear_gradients() dy_loss2 = loss2.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 429736803a192a7cdf01522406f95f8e7c892390..227cd5d4acb290baeb622a84d729b01bc45d48b1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -52,8 +52,7 @@ class TestDygraphDoubleGrad(TestCase): retain_graph=None, create_graph=False, allow_unused=False): - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = self.sort_sum_gradient + fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient}) return fluid.dygraph.grad( outputs=outputs, inputs=inputs, @@ -61,8 +60,7 @@ class TestDygraphDoubleGrad(TestCase): no_grad_vars=no_grad_vars, retain_graph=retain_graph, create_graph=create_graph, - allow_unused=allow_unused, - backward_strategy=backward_strategy) + allow_unused=allow_unused) @dygraph_guard def test_exception(self): @@ -310,8 +308,8 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): out = out + linear(input) return out - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + with fluid.dygraph.guard(): paddle.manual_seed(123) a = fluid.dygraph.to_variable(value) @@ -324,8 +322,7 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): inputs=[a], create_graph=False, only_inputs=True, - allow_unused=False, - backward_strategy=backward_strategy) + allow_unused=False) grad_1 = dx[0].numpy() @@ -335,7 +332,7 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): a.stop_gradient = False out = model_f(a) - out.backward(backward_strategy) + out.backward() grad_2 = a.gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index b7ebd23a0b74208e768ea4e67b69dc4a596c6764..80bdf2ea8a898716fa20be315ac57371191b1a61 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -179,9 +179,8 @@ class TestDygraphGAN(unittest.TestCase): with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True discriminator2 = Discriminator() generator2 = Generator() sgd2 = SGDOptimizer( @@ -201,7 +200,7 @@ class TestDygraphGAN(unittest.TestCase): x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32)))) d_loss2 = d_loss_real2 + d_loss_fake2 - d_loss2.backward(backward_strategy) + d_loss2.backward() sgd2.minimize(d_loss2) discriminator2.clear_gradients() generator2.clear_gradients() @@ -211,7 +210,7 @@ class TestDygraphGAN(unittest.TestCase): g_loss2 = fluid.layers.reduce_mean( fluid.layers.sigmoid_cross_entropy_with_logits( x=d_fake2, label=to_variable(np.ones([2, 1], np.float32)))) - g_loss2.backward(backward_strategy) + g_loss2.backward() sgd2.minimize(g_loss2) for p in discriminator2.parameters(): dy_params2[p.name] = p.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py index 4fe4d963ca5ee4cff1e7073d11361de69e68aa9f..317353684317f6fa0e8cf37cda58f2041e70befd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py @@ -62,8 +62,7 @@ class Test_Forward_Hook(unittest.TestCase): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = 
fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) input_word = np.array( [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, @@ -132,8 +131,7 @@ class Test_Forward_Hook(unittest.TestCase): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) global call_forward_hook global call_forward_pre_hook diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 69fd7d80327f1a666870dc76e041449366565b01..6349d71760934c9da3aed4896ea651c45af657ad 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -113,8 +113,9 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_loss = None helper = DyGraphProgramDescTracerTestHelper(self) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = is_sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': is_sort_sum_gradient + }) for i in range(batch_num): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -129,7 +130,7 @@ class TestDygraphSimpleNet(unittest.TestCase): if i == 0: for param in simple_net.parameters(): dy_param_init[param.name] = param.numpy() - dy_loss.backward(backward_strategy) + dy_loss.backward() sgd.minimize(dy_loss) sgd.clear_gradients() if i == batch_num - 1: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py index 4ce0ca350ddb9e8b9873a1650eefa1d5b2db4938..bda1958c0f3544bef51e51cf418ae6c07bdd7056 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py @@ -36,8 +36,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) mnist2 = MNIST() sgd2 = SGDOptimizer( @@ -69,7 +68,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): for param in mnist2.parameters(): dy_param_init_value2[param.name] = param.numpy() - avg_loss2.backward(backward_strategy) + avg_loss2.backward() sgd2.minimize(avg_loss2) mnist2.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 246b013f1ada6bc853711e146379b8bb2df5e363..499a4311f6e1714b239259d68217370edea20a2f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -403,8 +403,7 @@ class TestDygraphOCRAttention(unittest.TestCase): with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - 
backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) ocr_attention = OCRAttention() if Config.learning_rate_decay == "piecewise_decay": @@ -438,7 +437,7 @@ class TestDygraphOCRAttention(unittest.TestCase): for param in ocr_attention.parameters(): if param.name not in dy_param_init_value: dy_param_init_value[param.name] = param.numpy() - avg_loss.backward(backward_strategy) + avg_loss.backward() dy_grad_value = {} for param in ocr_attention.parameters(): if param.trainable: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index 8e85fe5dfefea3221fe0566ac506b1277263eec2..526c1706e2d08bdf779846a6f30706435eb4a503 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -45,8 +45,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( hidden_size=hidden_size, @@ -82,7 +81,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): if i == 0: for param in ptb_model.parameters(): dy_param_init[param.name] = param.numpy() - dy_loss.backward(backward_strategy) + dy_loss.backward() sgd.minimize(dy_loss) ptb_model.clear_gradients() if i == batch_num - 1: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 8cbd08ea3e245f70a6a4aceb3f6c9e0b83356981..d26d6f25aa8ffbbde3af9148bebba156eeef5e38 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -79,8 +79,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase): with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) resnet = ResNet() optimizer = optimizer_setting( train_parameters, parameter_list=resnet.parameters()) @@ -119,7 +118,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase): if param.name not in dy_param_init_value: dy_param_init_value[param.name] = param.numpy() - avg_loss.backward(backward_strategy) + avg_loss.backward() dy_grad_value = {} for param in resnet.parameters(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 9878e2f9ad772fe3d03addb4ced9f3b66a6cd58a..59ddb365e539603c1eba06ca8828fc244b6e542d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -48,8 +48,9 @@ class TestSimpleNet(unittest.TestCase): for dtype in ["float32", "float64"]: for sort_sum_gradient in [True, False]: paddle.disable_static(place) - backward_strategy = paddle.BackwardStrategy() - backward_strategy.sort_sum_gradient = sort_sum_gradient + fluid.set_flags({ 
+ 'FLAGS_sort_sum_gradient': sort_sum_gradient + }) # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') @@ -65,7 +66,7 @@ class TestSimpleNet(unittest.TestCase): self.assertTrue(emb.weight.gradient() is None) self.assertTrue(input_emb.gradient() is None) - input_emb.backward(backward_strategy) + input_emb.backward() adam.minimize(input_emb) self.assertTrue(emb.weight.gradient() is not None) @@ -84,8 +85,9 @@ class TestSimpleNet(unittest.TestCase): for place in places: for sort_sum_gradient in [True, False]: with fluid.dygraph.guard(place): - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': sort_sum_gradient + }) grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') @@ -101,7 +103,7 @@ class TestSimpleNet(unittest.TestCase): self.assertTrue(emb.weight.gradient() is None) self.assertTrue(input_emb.gradient() is None) - input_emb.backward(backward_strategy) + input_emb.backward() adam.minimize(input_emb) self.assertTrue(emb.weight.gradient() is not None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index a42a62019ba54a771d26ad853e39fcf8ca991180..3765cb784d6522cd0249a77045f8cbc841a2d9ac 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -119,8 +119,9 @@ class TestDygraphSimpleNet(unittest.TestCase): dy_param_init = dict() dy_loss = None - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = is_sort_sum_gradient + fluid.set_flags({ + 'FLAGS_sort_sum_gradient': is_sort_sum_gradient + }) for i in range(batch_num): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -135,7 +136,7 @@ class TestDygraphSimpleNet(unittest.TestCase): if i == 0: for param in simple_net.parameters(): dy_param_init[param.name] = param.numpy() - dy_loss.backward(backward_strategy) + dy_loss.backward() sgd.minimize(dy_loss) sgd.clear_gradients() if i == batch_num - 1: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 649dc1ad91d3878dacc551fd08527885c3f479aa..d603a7d6ca0dea8df2e60207211f2061f1fe616d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -479,8 +479,7 @@ class DyGraphTrainModel(object): self.cfg = cfg - self.backward_strategy = fluid.dygraph.BackwardStrategy() - self.backward_strategy.sort_sum_gradient = cfg.sort_sum_gradient + fluid.set_flags({'FLAGS_sort_sum_gradient': cfg.sort_sum_gradient}) def clear_gradients(self): if self.g_optimizer: @@ -497,7 +496,7 @@ class DyGraphTrainModel(object): g_loss = get_generator_loss(image_real, label_org, label_trg, self.generator, self.discriminator, self.cfg) - g_loss.backward(self.backward_strategy) + g_loss.backward() if self.g_optimizer: self.g_optimizer.minimize(g_loss) @@ -506,7 +505,7 @@ class DyGraphTrainModel(object): d_loss = get_discriminator_loss(image_real, label_org, label_trg, self.generator, self.discriminator, self.cfg) - 
d_loss.backward(self.backward_strategy) + d_loss.backward() if self.d_optimizer: self.d_optimizer.minimize(d_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index acc56b7db27f48cad92ed44cddfcfd4b9591dba3..f10d2df7f06f98334df62d3021403d686054b7d9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -121,8 +121,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) mnist = fluid.dygraph.static_runner.StaticModelRunner( model_dir=self.save_dirname, @@ -156,7 +155,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase): loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) - avg_loss.backward(backward_strategy) + avg_loss.backward() sgd.minimize(avg_loss) mnist.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py index 0792582175ef03cba3d3ba809132f3c591ecfe87..db47170c7bfff4575a9b4dcf694cd8ed722b0b8f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py @@ -111,9 +111,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase): fluid.default_startup_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed np.random.seed(self.seed) - - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) while_net = fluid.dygraph.static_runner.StaticModelRunner( self.save_dirname) @@ -141,7 +139,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase): loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) - avg_loss.backward(backward_strategy) + avg_loss.backward() sgd.minimize(avg_loss) while_net.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 29cc718f14ff98de2b668d313d380d784cbaa6ef..c59ce44ec96a87383ec12998767af70ac07ff743 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -951,8 +951,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): with guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) transformer = TransFormer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, @@ -1021,7 +1020,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): for param in transformer.parameters(): dy_param_init[param.name] = param.numpy() - dy_avg_cost.backward(backward_strategy) + dy_avg_cost.backward() 
optimizer.minimize(dy_avg_cost) transformer.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 858d56c1fc04f61c9dd281a633f7be9aceff8338..2ffe523ef6dda18a24813e702a1892c335ba6a68 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -52,8 +52,6 @@ class TestDygraphDoubleGrad(TestCase): retain_graph=None, create_graph=False, allow_unused=False): - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = self.sort_sum_gradient return paddle.grad( outputs=outputs, inputs=inputs, @@ -61,8 +59,7 @@ class TestDygraphDoubleGrad(TestCase): no_grad_vars=no_grad_vars, retain_graph=retain_graph, create_graph=create_graph, - allow_unused=allow_unused, - backward_strategy=backward_strategy) + allow_unused=allow_unused) @dygraph_guard def test_exception(self): diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index f01dc01973a603a0b6ea08358f73237c68924c78..95a0cb52046790e44150dd6f74733ae86a75a570 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ __all__ = [ ] __all__ += [ - 'BackwardStrategy', 'grad', 'LayerList', 'load', 'save', 'prepare_context', - 'to_variable', 'no_grad', 'ParallelEnv', 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'prepare_context', 'to_variable', + 'no_grad', 'ParallelEnv', 'DataParallel' ] __all__ += [ @@ -61,5 +61,3 @@ from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_AL from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS - -BackwardStrategy = core.BackwardStrategy
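Taken together, the user-visible migration is mechanical: delete the BackwardStrategy object and set one process-wide flag. The sketch below assembles the before/after from the docstring examples this diff adds (paddle.disable_static, paddle.to_tensor, paddle.sums, paddle.reduce_sum are the names those examples use); it is an illustration, not part of the patch:

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    # Before this change:
    #     backward_strategy = fluid.dygraph.BackwardStrategy()
    #     backward_strategy.sort_sum_gradient = True
    #     loss.backward(backward_strategy)
    # After: one global flag, and backward()/grad() lose the strategy argument.
    fluid.set_flags({'FLAGS_sort_sum_gradient': True})

    paddle.disable_static()
    x = np.ones([2, 2], np.float32)
    inputs = []
    for _ in range(10):
        tmp = paddle.to_tensor(x)
        tmp.stop_gradient = False  # let gradient flow back to each input
        inputs.append(tmp)
    loss = paddle.reduce_sum(paddle.sums(inputs))
    loss.backward()                # strategy argument removed
    print(inputs[0].gradient())    # each input receives an all-ones gradient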