Unverified commit f9066e6a, authored by Zhen Wang, committed by GitHub

Update the demo code and the doc of varbase.backward. (#26506)

* update the demo code and the doc of varbase.backward.

* update the doc of the fake interface `paddle.fluid.Variable`.

* remove BackwardStrategy.
Parent 1c898b66
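In user code the change looks like the following minimal sketch, assembled from the demo code updated in this commit (the variable names are illustrative):

    # Old API (removed by this commit):
    #   backward_strategy = fluid.dygraph.BackwardStrategy()
    #   backward_strategy.sort_sum_gradient = True
    #   loss.backward(backward_strategy)

    # New API: sorted gradient summation is a global flag, and backward()
    # no longer takes a strategy argument.
    import numpy as np
    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    fluid.set_flags({'FLAGS_sort_sum_gradient': True})  # optional; defaults to False

    x = np.ones([2, 2], np.float32)
    inputs = []
    for _ in range(10):
        tmp = paddle.to_tensor(x)
        tmp.stop_gradient = False  # keep a gradient path to the loss
        inputs.append(tmp)
    loss = paddle.reduce_sum(paddle.sums(inputs))
    loss.backward()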
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-04-25.
//
#pragma once
namespace paddle {
namespace imperative {
namespace detail {
struct BackwardStrategy {
/* DyGraph now support two kinds of backward strategy, one is sorted sum
* gradient, another is sum gradient once they are created */
// TODO(jiabin): add more Strategy when we support
bool sorted_sum_gradient_{false};
};
} // namespace detail
} // namespace imperative
} // namespace paddle
@@ -30,12 +30,13 @@
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/profiler.h"
+ DECLARE_bool(sort_sum_gradient);
namespace paddle {
namespace imperative {
- void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy,
- bool retain_graph) {
- backward_strategy_ = strategy;
+ void BasicEngine::Init(VarBase* var, bool retain_graph) {
+ sorted_sum_gradient_ = FLAGS_sort_sum_gradient;
retain_graph_ = retain_graph;
init_node_ = var->GradVarBase()->GradNode();
var->GradVarBase()->ClearGradNode();
@@ -105,7 +106,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
auto& accumulator = accumulators_[var.get()];
if (!accumulator) {
- if (backward_strategy_.sorted_sum_gradient_) {
+ if (sorted_sum_gradient_) {
accumulator.reset(new SortedGradientAccumulator(var.get()));
} else {
accumulator.reset(new EagerGradientAccumulator(var.get()));
...
@@ -18,7 +18,6 @@
#include <unordered_map>
#include <utility>
#include <vector>
- #include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
@@ -30,8 +29,7 @@ class OpBase;
class BasicEngine : public Engine {
public:
- void Init(VarBase* var, const detail::BackwardStrategy& strategy,
- bool retain_graph = false);
+ void Init(VarBase* var, bool retain_graph = false);
void Execute() override;
@@ -46,7 +44,7 @@ class BasicEngine : public Engine {
private:
std::shared_ptr<GradOpNode> init_node_;
- detail::BackwardStrategy backward_strategy_;
+ bool sorted_sum_gradient_;
std::unordered_map<GradOpNode*, size_t> node_deps_;
std::unordered_map<VariableWrapper*, std::unique_ptr<GradientAccumulator>>
accumulators_;
...
@@ -33,6 +33,8 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
+ DECLARE_bool(sort_sum_gradient);
namespace paddle {
namespace imperative {
@@ -529,8 +531,7 @@ class PartialGradTask {
const std::vector<std::shared_ptr<VarBase>> &output_targets,
const std::vector<std::shared_ptr<VarBase>> &output_grads,
const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
- const platform::Place &place,
- const detail::BackwardStrategy &strategy, bool create_graph,
+ const platform::Place &place, bool create_graph,
bool retain_graph, bool allow_unused, bool only_inputs);
std::vector<std::shared_ptr<VarBase>> Run();
@@ -577,7 +578,7 @@ class PartialGradTask {
bool retain_graph_;
bool allow_unused_;
bool only_inputs_;
- detail::BackwardStrategy strategy_;
+ bool sorted_sum_gradient_{FLAGS_sort_sum_gradient};
};
PartialGradTask::PartialGradTask(
@@ -585,15 +586,14 @@ PartialGradTask::PartialGradTask(
const std::vector<std::shared_ptr<VarBase>> &output_targets,
const std::vector<std::shared_ptr<VarBase>> &output_grads,
const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
- const platform::Place &place, const detail::BackwardStrategy &strategy,
- bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) {
+ const platform::Place &place, bool create_graph, bool retain_graph,
+ bool allow_unused, bool only_inputs) {
input_targets_ = input_targets;
place_ = place;
create_graph_ = create_graph;
retain_graph_ = retain_graph;
allow_unused_ = allow_unused;
only_inputs_ = only_inputs;
- strategy_ = strategy;
PADDLE_ENFORCE_EQ(only_inputs_, true,
platform::errors::Unimplemented(
@@ -981,7 +981,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) {
if (!accumulator) {
accumulator.reset(new GradientAccumulationInfo(
- var, strategy_.sorted_sum_gradient_, create_graph_));
+ var, sorted_sum_gradient_, create_graph_));
}
accumulator->IncreaseTotalRefCnt();
@@ -1033,11 +1033,11 @@ PartialGradEngine::PartialGradEngine(
const std::vector<std::shared_ptr<VarBase>> &output_targets,
const std::vector<std::shared_ptr<VarBase>> &output_grads,
const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
- const platform::Place &place, const detail::BackwardStrategy &strategy,
- bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs)
+ const platform::Place &place, bool create_graph, bool retain_graph,
+ bool allow_unused, bool only_inputs)
: task_(new PartialGradTask(input_targets, output_targets, output_grads,
- no_grad_vars, place, strategy, create_graph,
- retain_graph, allow_unused, only_inputs)) {}
+ no_grad_vars, place, create_graph, retain_graph,
+ allow_unused, only_inputs)) {}
PartialGradEngine::~PartialGradEngine() { Clear(); }
...
@@ -16,7 +16,6 @@
#include <memory>
#include <vector>
- #include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/platform/place.h"
@@ -33,8 +32,7 @@ class PartialGradEngine : public Engine {
const std::vector<std::shared_ptr<VarBase>> &output_targets,
const std::vector<std::shared_ptr<VarBase>> &output_grads,
const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
- const platform::Place &place,
- const detail::BackwardStrategy &strategy, bool create_graph,
+ const platform::Place &place, bool create_graph,
bool retain_graph, bool allow_unused, bool only_inputs);
~PartialGradEngine();
...
@@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
framework::AttributeMap reduce_attr_map;
tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
gpu_place, true);
- detail::BackwardStrategy back_st;
imperative::BasicEngine engine;
- engine.Init(reduce_sum_out.get(), back_st);
+ engine.Init(reduce_sum_out.get());
engine.Execute();
framework::LoDTensor rlt;
@@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) {
ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
- detail::BackwardStrategy back_st;
imperative::BasicEngine engine;
- engine.Init(vout.get(), back_st);
+ engine.Init(vout.get());
engine.Execute();
// check the grad
...
@@ -508,3 +508,16 @@ DEFINE_int32(
"summary will be shown."
"If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
"error message summary will be shown.");
+ /**
+  * Debug related FLAG
+  * Name: sort_sum_gradient
+  * Since Version: 2.0.0
+  * Value Range: bool, default=false
+  * Example:
+  * Note: If True, gradients are summed by the reverse order of
+  * the forward execution sequence.
+  */
+ DEFINE_bool(sort_sum_gradient, false,
+             "Sum gradients by the reverse order of "
+             "the forward execution sequence.");
@@ -38,6 +38,7 @@ DECLARE_bool(enable_rpc_profiler);
DECLARE_int32(multiple_of_cupti_buffer_size);
DECLARE_bool(reader_queue_speed_test_mode);
DECLARE_int32(call_stack_level);
+ DECLARE_bool(sort_sum_gradient);
// device management
DECLARE_int32(paddle_num_threads);
// executor
@@ -340,7 +341,7 @@ static void RegisterGlobalVarGetterSetter() {
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
- FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+ FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
...
@@ -30,7 +30,6 @@ limitations under the License. */
#include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
- #include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/data_loader.h"
#include "paddle/fluid/imperative/layer.h"
@@ -507,50 +506,6 @@ void BindImperative(py::module *m_ptr) {
[]() { memory::allocation::MemoryMapFdSet::Instance().Clear(); });
#endif
- py::class_<imperative::detail::BackwardStrategy> backward_strategy(
- m, "BackwardStrategy", R"DOC(
- BackwardStrategy is a descriptor of how to run the backward process.
- **Note**:
- **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode**
- Attribute:
- **sort_sum_gradient**:
- If framework will sum the gradient by the reverse order of trace. eg. x_var ( :ref:`api_guide_Variable` ) will be the input of multiple OP such as :ref:`api_fluid_layers_scale` , this attr will decide if framework will sum gradient of `x_var` by the reverse order.
- By Default: False
- Examples:
- .. code-block:: python
- import numpy as np
- import paddle.fluid as fluid
- x = np.ones([2, 2], np.float32)
- with fluid.dygraph.guard():
- x_var = fluid.dygraph.to_variable(x)
- sums_inputs = []
- # x_var will be multi-scales' input here
- for _ in range(10):
- sums_inputs.append(fluid.layers.scale(x_var))
- ret2 = fluid.layers.sums(sums_inputs)
- loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
- )DOC");
- backward_strategy.def(py::init())
- .def_property("sort_sum_gradient",
- [](const imperative::detail::BackwardStrategy &self) {
- return self.sorted_sum_gradient_;
- },
- [](imperative::detail::BackwardStrategy &self,
- bool sorted_sum_gradient) {
- self.sorted_sum_gradient_ = sorted_sum_gradient;
- });
m.def("start_imperative_gperf_profiler",
[]() { imperative::StartProfile(); });
@@ -745,21 +700,18 @@ void BindImperative(py::module *m_ptr) {
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ loss2.backward()
print(loss2.gradient())
loss2.clear_gradient()
print("After clear {}".format(loss2.gradient()))
)DOC")
.def("_run_backward",
- [](imperative::VarBase &self,
- const imperative::detail::BackwardStrategy &bckst,
- const imperative::Tracer &tracer, bool retain_graph) {
+ [](imperative::VarBase &self, const imperative::Tracer &tracer,
+ bool retain_graph) {
// TODO(jiabin): when we impl more backward execution we can
// select them
auto *engine = tracer.GetEngine();
- engine->Init(&self, bckst, retain_graph);
+ engine->Init(&self, retain_graph);
VLOG(3) << "Start backward";
engine->Execute();
VLOG(3) << "Finish backward";
@@ -1024,13 +976,11 @@ void BindImperative(py::module *m_ptr) {
&output_targets,
const std::vector<std::shared_ptr<imperative::VarBase>> &output_grads,
const std::vector<std::shared_ptr<imperative::VarBase>> &no_grad_vars,
- const platform::Place &place,
- const imperative::detail::BackwardStrategy &strategy,
- bool create_graph, bool retain_graph, bool allow_unused,
- bool only_inputs) {
+ const platform::Place &place, bool create_graph, bool retain_graph,
+ bool allow_unused, bool only_inputs) {
imperative::PartialGradEngine engine(
input_targets, output_targets, output_grads, no_grad_vars, place,
- strategy, create_graph, retain_graph, allow_unused, only_inputs);
+ create_graph, retain_graph, allow_unused, only_inputs);
engine.Execute();
return engine.GetResult();
},
...
@@ -225,7 +225,6 @@ from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS
- from .framework import BackwardStrategy #DEFINE_ALIAS
from .framework import to_variable #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS
from .framework import no_grad #DEFINE_ALIAS
...
@@ -196,6 +196,7 @@ def __bootstrap__():
'free_idle_chunk',
'free_when_no_cache_hit',
'call_stack_level',
+ 'sort_sum_gradient',
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
...
@@ -38,9 +38,6 @@ from .checkpoint import *
from . import learning_rate_scheduler
from .learning_rate_scheduler import *
- from . import backward_strategy
- from .backward_strategy import *
from . import jit
from .jit import *
@@ -69,7 +66,6 @@ __all__ += nn.__all__
__all__ += parallel.__all__
__all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
- __all__ += backward_strategy.__all__
__all__ += jit.__all__
__all__ += io.__all__
__all__ += rnn.__all__
...
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core
__all__ = ["BackwardStrategy"]
BackwardStrategy = core.BackwardStrategy
@@ -319,8 +319,7 @@ def grad(outputs,
create_graph=False,
only_inputs=True,
allow_unused=False,
- no_grad_vars=None,
- backward_strategy=None):
+ no_grad_vars=None):
'''
.. note::
**This API is ONLY available in Dygraph mode.**
@@ -363,9 +362,6 @@ def grad(outputs,
their gradients if allow_unused=True. Default False.
no_grad_vars (Variable|list(Variable)|tuple(Variable)|set(Variable), optional):
the Variables whose gradients are not needed to compute. Default None.
- backward_strategy (BackwardStrategy, optional): The backward strategy to
- compute gradients. See :ref:`api_fluid_dygraph_BackwardStrategy` for
- details. Default None.
Returns:
tuple: a tuple of Variables, whose length is the same as the Variable number
@@ -503,12 +499,6 @@ def grad(outputs,
raise AssertionError(
"no_grad_vars must be None, Variable or list/tuple/set of Variables")
- if backward_strategy is None:
- backward_strategy = core.BackwardStrategy()
- assert isinstance(backward_strategy, core.BackwardStrategy), \
- "backward_strategy must be type paddle.fluid.dygraph.BackwardStrategy"
assert isinstance(create_graph, bool), "create_graph must be True or False"
if retain_graph is None:
@@ -524,9 +514,9 @@ def grad(outputs,
place = core.Place()
place.set_place(framework._current_expected_place())
- return core.dygraph_partial_grad(
- inputs, outputs, grad_outputs, no_grad_vars, place, backward_strategy,
- create_graph, retain_graph, allow_unused, only_inputs)
+ return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
+ no_grad_vars, place, create_graph,
+ retain_graph, allow_unused, only_inputs)
@framework.dygraph_only
...
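With backward_strategy gone from the signature, a call to the updated fluid.dygraph.grad looks like the following sketch (the x * x computation and shapes are illustrative; the flag line is optional and replaces the old argument):

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        fluid.set_flags({'FLAGS_sort_sum_gradient': True})  # optional
        x = fluid.dygraph.to_variable(np.ones([2, 2], np.float32))
        x.stop_gradient = False
        y = x * x
        dx, = fluid.dygraph.grad(outputs=[y], inputs=[x], create_graph=False)
        # dx equals 2 * x element-wise here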
@@ -15,7 +15,6 @@
import inspect
from .. import framework
from .. import core
- from . import BackwardStrategy
from ..framework import Variable, Parameter, ParamBase
from .base import switch_to_static_graph
import numpy as np
@@ -129,19 +128,18 @@ def monkey_patch_varbase():
framework._current_expected_place())
@framework.dygraph_only
- def backward(self, backward_strategy=None, retain_graph=False):
+ def backward(self, retain_graph=False):
"""
**Notes**:
**This API is ONLY available in Dygraph mode**
- Run backward of current Graph which starts from current Variable
+ Run backward of current Graph which starts from current Tensor.
Args:
- backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward
retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
- like to add more ops to the built graph after calling this method(`backward`), set the parameter
- `retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
+ like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+ :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
Defaults to False.
Returns:
NoneType: None
@@ -149,32 +147,25 @@ def monkey_patch_varbase():
Examples:
.. code-block:: python
- import paddle.fluid as fluid
import numpy as np
+ import paddle
+ paddle.disable_static()
x = np.ones([2, 2], np.float32)
- with fluid.dygraph.guard():
- inputs2 = []
- for _ in range(10):
- tmp = fluid.dygraph.base.to_variable(x)
- # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
- # there is no one need gradient on it.
- tmp.stop_gradient=False
- inputs2.append(tmp)
- ret2 = fluid.layers.sums(inputs2)
- loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ inputs = []
+ for _ in range(10):
+ tmp = paddle.to_tensor(x)
+ # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
+ # there is no one need gradient on it.
+ tmp.stop_gradient=False
+ inputs.append(tmp)
+ ret = paddle.sums(inputs)
+ loss = paddle.reduce_sum(ret)
+ loss.backward()
"""
if framework.in_dygraph_mode():
- if backward_strategy is None:
- backward_strategy = BackwardStrategy()
- backward_strategy.sort_sum_gradient = False
- self._run_backward(backward_strategy,
- framework._dygraph_tracer(), retain_graph)
+ self._run_backward(framework._dygraph_tracer(), retain_graph)
else:
raise ValueError(
"Variable.backward() is only available in DyGraph mode")
@@ -205,9 +196,7 @@ def monkey_patch_varbase():
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ loss2.backward()
print(loss2.gradient())
"""
...
@@ -1106,15 +1106,18 @@ class Variable(object):
pass
@fake_interface_only
- def backward(self, backward_strategy=None):
+ def backward(self, retain_graph=False):
"""
**Notes**:
**This API is ONLY available in Dygraph mode**
- Run backward of current Graph which starts from current Variable
+ Run backward of current Graph which starts from current Tensor.
Args:
- backward_strategy( :ref:`api_fluid_dygraph_BackwardStrategy` ): The Backward Strategy to run backward
+ retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
+ like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+ :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
+ Defaults to False.
Returns:
NoneType: None
@@ -1122,23 +1125,21 @@ class Variable(object):
Examples:
.. code-block:: python
- import paddle.fluid as fluid
import numpy as np
+ import paddle
+ paddle.disable_static()
x = np.ones([2, 2], np.float32)
- with fluid.dygraph.guard():
- inputs2 = []
- for _ in range(10):
- tmp = fluid.dygraph.base.to_variable(x)
- # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
- # there is no one need gradient on it.
- tmp.stop_gradient=False
- inputs2.append(tmp)
- ret2 = fluid.layers.sums(inputs2)
- loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ inputs = []
+ for _ in range(10):
+ tmp = paddle.to_tensor(x)
+ # if we don't set tmp's stop_gradient as False then, all path to loss will has no gradient since
+ # there is no one need gradient on it.
+ tmp.stop_gradient=False
+ inputs.append(tmp)
+ ret = paddle.sums(inputs)
+ loss = paddle.reduce_sum(ret)
+ loss.backward()
"""
pass
@@ -1170,9 +1171,7 @@ class Variable(object):
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ loss2.backward()
print(loss2.gradient())
# example2: return tuple of ndarray
@@ -1218,9 +1217,7 @@ class Variable(object):
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ loss2.backward()
print(loss2.gradient())
loss2.clear_gradient()
print("After clear {}".format(loss2.gradient()))
...
@@ -38,8 +38,7 @@ class TestDirectory(unittest.TestCase):
'paddle.enable_static', 'paddle.disable_static',
'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
'paddle.no_grad', 'paddle.save', 'paddle.load',
- 'paddle.static.save', 'paddle.static.load',
- 'paddle.BackwardStrategy', 'paddle.ParallelEnv',
+ 'paddle.static.save', 'paddle.static.load', 'paddle.ParallelEnv',
'paddle.prepare_context', 'paddle.DataParallel', 'paddle.jit',
'paddle.jit.TracedLayer', 'paddle.jit.to_static',
'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer',
@@ -98,7 +97,6 @@ class TestDirectory(unittest.TestCase):
'paddle.imperative.enable', 'paddle.imperative.guard',
'paddle.imperative.grad', 'paddle.imperative.no_grad',
'paddle.imperative.save', 'paddle.imperative.load',
- 'paddle.imperative.BackwardStrategy',
'paddle.imperative.ParallelEnv',
'paddle.imperative.prepare_context',
'paddle.imperative.DataParalell', 'paddle.imperative.jit',
...
@@ -238,8 +238,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
out2 = linear2(b)
out1.stop_gradient = True
out = fluid.layers.concat(input=[out1, out2, c], axis=1)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- out.backward(backward_strategy)
+ out.backward()
self.assertTrue(linear.weight.gradient() is None)
self.assertTrue(out1.gradient() is None)
@@ -311,9 +310,8 @@
out2 = linear2(b)
out1.stop_gradient = True
out = fluid.layers.concat(input=[out1, out2, c], axis=1)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- out.backward(backward_strategy)
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+ out.backward()
self.assertTrue(linear.weight.gradient() is None)
self.assertTrue(out1.gradient() is None)
...
@@ -314,9 +314,8 @@ class TestImperative(unittest.TestCase):
inputs2.append(tmp)
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- loss2.backward(backward_strategy)
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+ loss2.backward()
self.assertTrue(np.allclose(ret.numpy(), x * 10))
self.assertTrue(np.allclose(inputs[0].gradient(), x))
@@ -403,9 +402,8 @@
x2 = l2(var_inp2)[0]
self.assertIsNotNone(x2)
dy_out2 = x2.numpy()
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- x2.backward(backward_strategy)
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+ x2.backward()
dy_grad2 = l2._x_for_debug.gradient()
with new_program_scope():
@@ -442,9 +440,8 @@
mlp2 = MLP(input_size=2)
out2 = mlp2(var_inp2)
dy_out2 = out2.numpy()
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- out2.backward(backward_strategy)
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+ out2.backward()
dy_grad2 = mlp2._linear1.weight.gradient()
with new_program_scope():
@@ -552,9 +549,8 @@
simple_rnn2 = SimpleRNN()
outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2)
dy_out2 = outs2[3].numpy()
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
- outs2[3].backward(backward_strategy)
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
+ outs2[3].backward()
dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient()
dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient()
dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
...
@@ -275,8 +275,7 @@ class TestDygraphDeepCF(unittest.TestCase):
deepcf2 = DeepCF(num_users, num_items, matrix)
adam2 = fluid.optimizer.AdamOptimizer(
0.01, parameter_list=deepcf2.parameters())
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
for e in range(NUM_EPOCHES):
sys.stderr.write('epoch %d\n' % e)
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
@@ -289,7 +288,7 @@
fluid.layers.log_loss(prediction2,
to_variable(labels_np[
slice:slice + BATCH_SIZE])))
- loss2.backward(backward_strategy)
+ loss2.backward()
adam2.minimize(loss2)
deepcf2.clear_gradients()
dy_loss2 = loss2.numpy()
...
@@ -52,8 +52,7 @@ class TestDygraphDoubleGrad(TestCase):
retain_graph=None,
create_graph=False,
allow_unused=False):
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = self.sort_sum_gradient
+ fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
return fluid.dygraph.grad(
outputs=outputs,
inputs=inputs,
@@ -61,8 +60,7 @@
no_grad_vars=no_grad_vars,
retain_graph=retain_graph,
create_graph=create_graph,
- allow_unused=allow_unused,
- backward_strategy=backward_strategy)
+ allow_unused=allow_unused)
@dygraph_guard
def test_exception(self):
@@ -310,8 +308,8 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
out = out + linear(input)
return out
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
with fluid.dygraph.guard():
paddle.manual_seed(123)
a = fluid.dygraph.to_variable(value)
@@ -324,8 +322,7 @@
inputs=[a],
create_graph=False,
only_inputs=True,
- allow_unused=False,
- backward_strategy=backward_strategy)
+ allow_unused=False)
grad_1 = dx[0].numpy()
@@ -335,7 +332,7 @@
a.stop_gradient = False
out = model_f(a)
- out.backward(backward_strategy)
+ out.backward()
grad_2 = a.gradient()
...
@@ -179,9 +179,8 @@ class TestDygraphGAN(unittest.TestCase):
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
discriminator2 = Discriminator()
generator2 = Generator()
sgd2 = SGDOptimizer(
@@ -201,7 +200,7 @@
x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))
d_loss2 = d_loss_real2 + d_loss_fake2
- d_loss2.backward(backward_strategy)
+ d_loss2.backward()
sgd2.minimize(d_loss2)
discriminator2.clear_gradients()
generator2.clear_gradients()
@@ -211,7 +210,7 @@
g_loss2 = fluid.layers.reduce_mean(
fluid.layers.sigmoid_cross_entropy_with_logits(
x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
- g_loss2.backward(backward_strategy)
+ g_loss2.backward()
sgd2.minimize(g_loss2)
for p in discriminator2.parameters():
dy_params2[p.name] = p.numpy()
...
@@ -62,8 +62,7 @@ class Test_Forward_Hook(unittest.TestCase):
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
input_word = np.array(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7,
@@ -132,8 +131,7 @@
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
global call_forward_hook
global call_forward_pre_hook
...
@@ -113,8 +113,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
dy_loss = None
helper = DyGraphProgramDescTracerTestHelper(self)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = is_sort_sum_gradient
+ fluid.set_flags({
+ 'FLAGS_sort_sum_gradient': is_sort_sum_gradient
+ })
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -129,7 +130,7 @@
if i == 0:
for param in simple_net.parameters():
dy_param_init[param.name] = param.numpy()
- dy_loss.backward(backward_strategy)
+ dy_loss.backward()
sgd.minimize(dy_loss)
sgd.clear_gradients()
if i == batch_num - 1:
...
@@ -36,8 +36,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
mnist2 = MNIST()
sgd2 = SGDOptimizer(
@@ -69,7 +68,7 @@
for param in mnist2.parameters():
dy_param_init_value2[param.name] = param.numpy()
- avg_loss2.backward(backward_strategy)
+ avg_loss2.backward()
sgd2.minimize(avg_loss2)
mnist2.clear_gradients()
...
@@ -403,8 +403,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay":
@@ -438,7 +437,7 @@
for param in ocr_attention.parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param.numpy()
- avg_loss.backward(backward_strategy)
+ avg_loss.backward()
dy_grad_value = {}
for param in ocr_attention.parameters():
if param.trainable:
...
@@ -45,8 +45,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
@@ -82,7 +81,7 @@
if i == 0:
for param in ptb_model.parameters():
dy_param_init[param.name] = param.numpy()
- dy_loss.backward(backward_strategy)
+ dy_loss.backward()
sgd.minimize(dy_loss)
ptb_model.clear_gradients()
if i == batch_num - 1:
...
@@ -79,8 +79,7 @@ class TestDygraphResnetSortGradient(unittest.TestCase):
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
resnet = ResNet()
optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
@@ -119,7 +118,7 @@
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param.numpy()
- avg_loss.backward(backward_strategy)
+ avg_loss.backward()
dy_grad_value = {}
for param in resnet.parameters():
...
@@ -48,8 +48,9 @@ class TestSimpleNet(unittest.TestCase):
for dtype in ["float32", "float64"]:
for sort_sum_gradient in [True, False]:
paddle.disable_static(place)
- backward_strategy = paddle.BackwardStrategy()
- backward_strategy.sort_sum_gradient = sort_sum_gradient
+ fluid.set_flags({
+ 'FLAGS_sort_sum_gradient': sort_sum_gradient
+ })
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
@@ -65,7 +66,7 @@
self.assertTrue(emb.weight.gradient() is None)
self.assertTrue(input_emb.gradient() is None)
- input_emb.backward(backward_strategy)
+ input_emb.backward()
adam.minimize(input_emb)
self.assertTrue(emb.weight.gradient() is not None)
@@ -84,8 +85,9 @@
for place in places:
for sort_sum_gradient in [True, False]:
with fluid.dygraph.guard(place):
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = sort_sum_gradient
+ fluid.set_flags({
+ 'FLAGS_sort_sum_gradient': sort_sum_gradient
+ })
grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
@@ -101,7 +103,7 @@
self.assertTrue(emb.weight.gradient() is None)
self.assertTrue(input_emb.gradient() is None)
- input_emb.backward(backward_strategy)
+ input_emb.backward()
adam.minimize(input_emb)
self.assertTrue(emb.weight.gradient() is not None)
...
@@ -119,8 +119,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
dy_param_init = dict()
dy_loss = None
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = is_sort_sum_gradient
+ fluid.set_flags({
+ 'FLAGS_sort_sum_gradient': is_sort_sum_gradient
+ })
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -135,7 +136,7 @@
if i == 0:
for param in simple_net.parameters():
dy_param_init[param.name] = param.numpy()
- dy_loss.backward(backward_strategy)
+ dy_loss.backward()
sgd.minimize(dy_loss)
sgd.clear_gradients()
if i == batch_num - 1:
...
@@ -479,8 +479,7 @@ class DyGraphTrainModel(object):
self.cfg = cfg
- self.backward_strategy = fluid.dygraph.BackwardStrategy()
- self.backward_strategy.sort_sum_gradient = cfg.sort_sum_gradient
+ fluid.set_flags({'FLAGS_sort_sum_gradient': cfg.sort_sum_gradient})
def clear_gradients(self):
if self.g_optimizer:
@@ -497,7 +496,7 @@
g_loss = get_generator_loss(image_real, label_org, label_trg,
self.generator, self.discriminator,
self.cfg)
- g_loss.backward(self.backward_strategy)
+ g_loss.backward()
if self.g_optimizer:
self.g_optimizer.minimize(g_loss)
@@ -506,7 +505,7 @@
d_loss = get_discriminator_loss(image_real, label_org, label_trg,
self.generator, self.discriminator,
self.cfg)
- d_loss.backward(self.backward_strategy)
+ d_loss.backward()
if self.d_optimizer:
self.d_optimizer.minimize(d_loss)
...
@@ -121,8 +121,7 @@ class TestImperativeStaticModelRunnerMnist(unittest.TestCase):
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = self.seed
fluid.default_main_program().random_seed = self.seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
mnist = fluid.dygraph.static_runner.StaticModelRunner(
model_dir=self.save_dirname,
@@ -156,7 +155,7 @@
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
- avg_loss.backward(backward_strategy)
+ avg_loss.backward()
sgd.minimize(avg_loss)
mnist.clear_gradients()
...
@@ -111,9 +111,7 @@ class TestImperativeStaticModelRunnerWhile(unittest.TestCase):
fluid.default_startup_program().random_seed = self.seed
fluid.default_main_program().random_seed = self.seed
np.random.seed(self.seed)
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
while_net = fluid.dygraph.static_runner.StaticModelRunner(
self.save_dirname)
@@ -141,7 +139,7 @@
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
- avg_loss.backward(backward_strategy)
+ avg_loss.backward()
sgd.minimize(avg_loss)
while_net.clear_gradients()
...
@@ -951,8 +951,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
with guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = True
+ fluid.set_flags({'FLAGS_sort_sum_gradient': True})
transformer = TransFormer(
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
@@ -1021,7 +1020,7 @@
for param in transformer.parameters():
dy_param_init[param.name] = param.numpy()
- dy_avg_cost.backward(backward_strategy)
+ dy_avg_cost.backward()
optimizer.minimize(dy_avg_cost)
transformer.clear_gradients()
...
@@ -52,8 +52,6 @@ class TestDygraphDoubleGrad(TestCase):
retain_graph=None,
create_graph=False,
allow_unused=False):
- backward_strategy = fluid.dygraph.BackwardStrategy()
- backward_strategy.sort_sum_gradient = self.sort_sum_gradient
return paddle.grad(
outputs=outputs,
inputs=inputs,
@@ -61,8 +59,7 @@
no_grad_vars=no_grad_vars,
retain_graph=retain_graph,
create_graph=create_graph,
- allow_unused=allow_unused,
- backward_strategy=backward_strategy)
+ allow_unused=allow_unused)
@dygraph_guard
def test_exception(self):
...
@@ -20,8 +20,8 @@ __all__ = [
]
__all__ += [
- 'BackwardStrategy', 'grad', 'LayerList', 'load', 'save', 'prepare_context',
- 'to_variable', 'no_grad', 'ParallelEnv', 'DataParallel'
+ 'grad', 'LayerList', 'load', 'save', 'prepare_context', 'to_variable',
+ 'no_grad', 'ParallelEnv', 'DataParallel'
]
__all__ += [
@@ -61,5 +61,3 @@ from ..fluid.dygraph.learning_rate_scheduler import ExponentialDecay #DEFINE_AL
from ..fluid.dygraph.learning_rate_scheduler import InverseTimeDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import PolynomialDecay #DEFINE_ALIAS
from ..fluid.dygraph.learning_rate_scheduler import CosineDecay #DEFINE_ALIAS
- BackwardStrategy = core.BackwardStrategy