Unverified commit 83b953f5, authored by chentianyu03, committed by GitHub

add custom init grad for backward function (#31540)

* add custom init grad for backward function

* add custom init grad for backward function

* handle when the grad_tensor is none

* handle when the grad_tensor is none

* fix the args type error on windows platform

* modify the args order and doc

* format code

* add grad_tensor to xpu

* modify the grad_tensor type check

* add paddle.backward api to support multi tensors gradient compute

* add paddle.backward api to support multi tensors gradient compute

* add paddle.autograd module and backward api

* change tensor.backward func args

* modify tensor backward api

* remove create_graph input args

* add doc and example code for backward api

* throw error when the same tensor appears more than once

* modify test Init func args

* modify the execute.Init func args in test files

* add paddle.autograd package in setup.py.in

* modify error msg, remove _run_backward method in class Tensor

* add test cases for backward api
Parent 9c5d0286
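In short, this commit extends `Tensor.backward` with an optional `grad_tensor` argument and adds a `paddle.autograd.backward` API that computes gradients for several tensors at once. A minimal sketch of the resulting user-facing behaviour, based on the docstrings and tests in the diff below (printed values are illustrative):

```python
import paddle

x = paddle.to_tensor(5., stop_gradient=False)
y = paddle.pow(x, 4.0)

# Seed backward with a custom initial gradient instead of the default 1.0.
y.backward(paddle.to_tensor(2.), retain_graph=True)
print(x.grad)  # 2 * 4 * 5**3 = 1000.

# The new paddle.autograd.backward API accepts a list of output tensors;
# a None entry in grad_tensors falls back to an all-ones initial gradient.
x.clear_grad()
paddle.autograd.backward([y], [None])
print(x.grad)  # 4 * 5**3 = 500.
```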
......@@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient);
namespace paddle {
namespace imperative {
void BasicEngine::Init(VarBase* var, bool retain_graph) {
void BasicEngine::Init(
const std::vector<std::shared_ptr<VarBase>>& tensors,
const std::vector<std::shared_ptr<VarBase>>& grad_tensors,
bool retain_graph) {
retain_graph_ = retain_graph;
init_node_ = var->GradVarBase()->GradNode();
PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false,
platform::errors::Unavailable(
"%s trying to backward through the same graph a second "
"time, but this graph have already been freed. Please "
"specify Tensor.backward(retain_graph=True) when "
"calling backward at the first time.",
var->Name()));
if (!retain_graph) {
VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name()
<< " because of retain_graph=False when calling backward";
var->GradVarBase()->SetGraphIsFreed(true);
var->GradVarBase()->ClearGradNode();
}
if (init_node_ == nullptr || var->OverridedStopGradient()) {
VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
"stop_gradient=True: "
<< var->Name();
return;
}
PADDLE_ENFORCE_EQ(
tensors.size(), grad_tensors.size(),
platform::errors::Unavailable(
"The size of tensors do not equal the size of grad_tensors,"
"the size of tensors is %s, but the size of grad_tensors is %s.",
tensors.size(), grad_tensors.size()));
for (size_t i = 0; i < tensors.size(); ++i) {
auto var = tensors[i];
auto grad_tensor = grad_tensors[i];
auto init_node = var->GradVarBase()->GradNode();
PADDLE_ENFORCE_EQ(
var->GradVarBase()->GraphIsFreed(), false,
platform::errors::Unavailable(
"%s trying to backward through the same graph a second "
"time, but this graph have already been freed. Please "
"specify Tensor.backward(retain_graph=True) when "
"calling backward at the first time.",
var->Name()));
if (!retain_graph) {
VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name()
<< " because of retain_graph=False when calling backward";
var->GradVarBase()->SetGraphIsFreed(true);
var->GradVarBase()->ClearGradNode();
}
VLOG(3) << "Init first node of backward";
if (init_node == nullptr || var->OverridedStopGradient()) {
VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
"stop_gradient=True: "
<< var->Name();
continue;
}
PADDLE_ENFORCE_EQ(
var->HasGradVar(), true,
platform::errors::NotFound("Grad variable not exist for variable %s",
var->Name()));
auto& fwd_var = var->Var().Get<framework::LoDTensor>();
auto* grad_var =
var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
VLOG(6) << "init loss grad:" << var->GradVarBase()->Name()
<< " as stop_gradient false";
var->GradVarBase()->InnerSetOverridedStopGradient(false);
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place());
grad_var->Resize(fwd_var.dims());
grad_var->mutable_data(fwd_var.place(), fwd_var.type());
operators::math::set_constant(*dev_ctx, grad_var, 1.0);
VLOG(3) << "Init node of backward";
PADDLE_ENFORCE_EQ(
var->HasGradVar(), true,
platform::errors::NotFound("Tensor %s has no gradient", var->Name()));
auto& fwd_var = var->Var().Get<framework::LoDTensor>();
auto* grad_var =
var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
VLOG(6) << "init loss grad:" << var->GradVarBase()->Name()
<< " as stop_gradient false";
var->GradVarBase()->InnerSetOverridedStopGradient(false);
auto* dev_ctx =
platform::DeviceContextPool::Instance().Get(fwd_var.place());
if (grad_tensor == nullptr) {
grad_var->Resize(fwd_var.dims());
grad_var->mutable_data(fwd_var.place(), fwd_var.type());
operators::math::set_constant(*dev_ctx, grad_var, 1.0);
} else {
paddle::framework::TensorCopy(
grad_tensor->Var().Get<framework::LoDTensor>(), fwd_var.place(),
*dev_ctx, grad_var);
}
init_nodes_.push_back(init_node);
}
}
void BasicEngine::CheckBackwardInputs(const OpBase& op) {
......@@ -224,8 +249,10 @@ void BasicEngine::PrepareDeps() {
std::queue<GradOpNode*> q;
std::unordered_set<GradOpNode*> visited;
q.push(init_node_.get());
visited.insert(init_node_.get());
for (size_t i = 0; i < init_nodes_.size(); ++i) {
q.push(init_nodes_[i].get());
visited.insert(init_nodes_[i].get());
}
while (!q.empty()) {
auto* cur_node = q.front();
......@@ -276,14 +303,16 @@ static std::shared_ptr<NameVarMap<VariableWrapper>> CallGradientHooks(
}
void BasicEngine::Execute() {
if (init_node_ == nullptr) {
if (init_nodes_.empty()) {
return;
}
PrepareDeps();
// Start execute Computation graph
std::queue<std::shared_ptr<GradOpNode>> q;
q.push(std::move(init_node_));
for (size_t i = 0; i < init_nodes_.size(); ++i) {
q.push(std::move(init_nodes_[i]));
}
size_t op_num = 0;
......@@ -505,7 +534,7 @@ void BasicEngine::Execute() {
}
void BasicEngine::Clear() {
init_node_.reset();
init_nodes_.clear();
node_deps_.clear();
accumulators_.clear();
accumulators_with_grad_node_.clear();
......
......@@ -30,7 +30,9 @@ class OpBase;
class BasicEngine : public Engine {
public:
void Init(VarBase* var, bool retain_graph = false);
void Init(const std::vector<std::shared_ptr<VarBase>>& tensors,
const std::vector<std::shared_ptr<VarBase>>& grad_tensors,
bool retain_graph = false);
void Execute() override;
......@@ -46,7 +48,7 @@ class BasicEngine : public Engine {
void Clear();
private:
std::shared_ptr<GradOpNode> init_node_;
std::vector<std::shared_ptr<GradOpNode>> init_nodes_;
std::unordered_map<GradOpNode*, size_t> node_deps_;
// The input and output of Inplace op are the same. If only `var` is used
// as the key, then the input and output of inplace op must be gradient
......
......@@ -92,8 +92,10 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) {
ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL);
// 3. backward
std::vector<std::shared_ptr<imperative::VarBase>> tensors{out};
std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
BasicEngine engine;
engine.Init(out.get());
engine.Init(tensors, grad_tensors);
engine.Execute();
framework::LoDTensor x_grad;
......@@ -191,8 +193,10 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() {
ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL);
// 3. backward
std::vector<std::shared_ptr<imperative::VarBase>> tensors{out};
std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
BasicEngine engine;
engine.Init(out.get());
engine.Init(tensors, grad_tensors);
engine.Execute();
framework::LoDTensor x_grad;
......
......@@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
gpu_place, true);
imperative::BasicEngine engine;
engine.Init(reduce_sum_out.get());
std::vector<std::shared_ptr<imperative::VarBase>> tensors{reduce_sum_out};
std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
engine.Init(tensors, grad_tensors);
engine.Execute();
framework::LoDTensor rlt;
......@@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) {
ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
std::vector<std::shared_ptr<imperative::VarBase>> tensors{vout};
std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
imperative::BasicEngine engine;
engine.Init(vout.get());
engine.Init(tensors, grad_tensors);
engine.Execute();
// check the grad
......
......@@ -805,6 +805,7 @@ void BindImperative(py::module *m_ptr) {
Bump the version whenever the Tensor is modified through an inplace operation.
)DOC")
.def("numpy",
[](imperative::VarBase &self) -> py::array {
const auto &tensor =
self.MutableVar()->Get<framework::LoDTensor>();
......@@ -1003,18 +1004,6 @@ void BindImperative(py::module *m_ptr) {
print(x.stop_gradient) # True
print(x.grad) # None
)DOC")
.def("_run_backward",
[](imperative::VarBase &self, const imperative::Tracer &tracer,
bool retain_graph) {
// TODO(jiabin): when we impl more backward execution we can
// select them
auto *engine = tracer.GetEngine();
engine->Init(&self, retain_graph);
VLOG(3) << "Start backward";
engine->Execute();
VLOG(3) << "Finish backward";
},
py::call_guard<py::gil_scoped_release>())
.def("_grad_name", &imperative::VarBase::GradVarName)
.def("_grad_value",
[](imperative::VarBase &self) {
......@@ -1549,6 +1538,19 @@ void BindImperative(py::module *m_ptr) {
},
py::call_guard<py::gil_scoped_release>());
m.def(
"dygraph_run_backward",
[](const std::vector<std::shared_ptr<imperative::VarBase>> &tensors,
const std::vector<std::shared_ptr<imperative::VarBase>> &grad_tensors,
bool retain_graph, const imperative::Tracer &tracer) {
auto *engine = tracer.GetEngine();
engine->Init(tensors, grad_tensors, retain_graph);
VLOG(3) << "Start backward";
engine->Execute();
VLOG(3) << "Finish backward";
},
py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::ParallelContext,
......
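The `dygraph_run_backward` binding above is the single entry point that both `paddle.autograd.backward` and the patched `Tensor.backward` route through. A minimal sketch of the Python-side call, mirroring the Python changes later in this diff; the helper name `run_backward` is only for illustration, and dygraph mode is assumed to be active:

```python
from paddle.fluid import core, framework

def run_backward(tensors, grad_tensors, retain_graph=False):
    # tensors and grad_tensors are parallel lists; a None entry in
    # grad_tensors makes the C++ engine seed that gradient with a
    # tensor of ones (see BasicEngine::Init above).
    core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
                              framework._dygraph_tracer())
```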
......@@ -44,6 +44,7 @@ import paddle.metric
import paddle.device
import paddle.regularizer
import paddle.incubate
import paddle.autograd
# TODO: define alias in tensor and framework directory
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..fluid.dygraph.base import grad #DEFINE_ALIAS
from . import backward_mode
from .backward_mode import backward
__all__ = ['grad']
__all__ += backward_mode.__all__
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core
from paddle.fluid import framework
import paddle
__all__ = ['backward']
@framework.dygraph_only
def backward(tensors, grad_tensors=None, retain_graph=False):
"""
Compute the backward gradients of given tensors.
Args:
tensors(list of Tensors): the tensors whose gradients are to be computed. The list must not contain duplicate tensors.
grad_tensors(list of Tensors or None, optional): the initial gradients of ``tensors``. If not None, it must have the same length as ``tensors``,
and any element that is None falls back to the default initial gradient, a tensor filled with 1.0.
If None, the initial gradients of all ``tensors`` default to tensors filled with 1.0.
Defaults to None.
retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
:code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
Defaults to False.
Returns:
NoneType: None
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32', stop_gradient=False)
y = paddle.to_tensor([[3, 2], [3, 4]], dtype='float32')
grad_tensor1 = paddle.to_tensor([[1,2], [2, 3]], dtype='float32')
grad_tensor2 = paddle.to_tensor([[1,1], [1, 1]], dtype='float32')
z1 = paddle.matmul(x, y)
z2 = paddle.matmul(x, y)
paddle.autograd.backward([z1, z2], [grad_tensor1, grad_tensor2], True)
print(x.grad)
#[[12. 18.]
# [17. 25.]]
x.clear_grad()
paddle.autograd.backward([z1, z2], [grad_tensor1, None], True)
print(x.grad)
#[[12. 18.]
# [17. 25.]]
x.clear_grad()
paddle.autograd.backward([z1, z2])
print(x.grad)
#[[10. 14.]
# [10. 14.]]
"""
def check_tensors(in_out_list, name):
assert in_out_list is not None, "{} should not be None".format(name)
if isinstance(in_out_list, (list, tuple)):
assert len(in_out_list) > 0, "{} cannot be empty".format(name)
for each_var in in_out_list:
assert isinstance(
each_var, paddle.
Tensor), "Elements of {} must be paddle.Tensor".format(name)
return in_out_list
else:
assert isinstance(
in_out_list,
paddle.Tensor), "{} must be Tensor or list of Tensor".format(
name)
return [in_out_list]
tensors = check_tensors(tensors, "tensors")
assert len(tensors) == len(
set(tensors)
), "The argument 'tensors' of paddle.autograd.backward contains duplicate paddle.Tensor object."
if grad_tensors is not None:
if not isinstance(grad_tensors, (list, tuple)):
grad_tensors = [grad_tensors]
for each_tensor in grad_tensors:
if each_tensor is not None:
assert isinstance(
each_tensor, paddle.Tensor
), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'."
else:
grad_tensors = [None] * len(tensors)
if len(grad_tensors) > 0:
assert len(tensors) == len(
grad_tensors), "The length of grad_tensors must be equal to tensors"
assert isinstance(retain_graph, bool), "retain_graph must be True or False"
core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
framework._dygraph_tracer())
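Because `check_tensors` wraps a bare Tensor into a single-element list, `paddle.autograd.backward` also accepts a single tensor and a single gradient (exercised by `test_backward_single_tensor` below). A small sketch:

```python
import paddle

x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]], stop_gradient=False)
y = paddle.to_tensor([[1.0, 0.0], [0.0, 1.0]])
z = paddle.matmul(x, y)
grad = paddle.ones_like(z)

# The two calls below are equivalent: a bare Tensor is normalized to [Tensor].
paddle.autograd.backward(z, grad, retain_graph=True)
paddle.autograd.backward([z], [grad])
```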
......@@ -26,6 +26,7 @@ import logging
from ..data_feeder import convert_dtype
import warnings
from ..framework import _get_paddle_place
import paddle
__all__ = [
'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
......
......@@ -163,7 +163,7 @@ def monkey_patch_varbase():
framework._current_expected_place())
@framework.dygraph_only
def backward(self, retain_graph=False):
def backward(self, grad_tensor=None, retain_graph=False):
"""
Run backward of current Graph which starts from current Tensor.
......@@ -172,17 +172,22 @@ def monkey_patch_varbase():
You can clear gradient by ``Tensor.clear_grad()`` .
Args:
grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
the initial gradient values of the current Tensor would be a Tensor filled with 1.0;
if `grad_tensor` is not None, it must have the same shape as the current Tensor.
The default value is None.
retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
:code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
Defaults to False.
Returns:
NoneType: None
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor(5., stop_gradient=False)
for i in range(5):
y = paddle.pow(x, 4.0)
......@@ -198,15 +203,36 @@ def monkey_patch_varbase():
print("{}".format(x.grad))
# 0.
grad_tensor=paddle.to_tensor(2.)
for i in range(5):
y = paddle.pow(x, 4.0)
y.backward(grad_tensor)
print("{}: {}".format(i, x.grad))
# 0: [1000.]
# 1: [2000.]
# 2: [3000.]
# 3: [4000.]
# 4: [5000.]
"""
if framework.in_dygraph_mode():
if grad_tensor is not None:
assert isinstance(
grad_tensor, paddle.
Tensor), "The type of grad_tensot must be paddle.Tensor"
assert grad_tensor.shape == self.shape, \
"Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format(
grad_tensor.name, grad_tensor.shape, self.name, self.shape)
if paddle.is_compiled_with_xpu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self)
scaled_loss._run_backward(framework._dygraph_tracer(),
retain_graph)
core.dygraph_run_backward([scaled_loss], [grad_tensor],
retain_graph,
framework._dygraph_tracer())
else:
self._run_backward(framework._dygraph_tracer(), retain_graph)
core.dygraph_run_backward([self], [grad_tensor], retain_graph,
framework._dygraph_tracer())
else:
raise ValueError(
"Variable.backward() is only available in DyGraph mode")
......
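For reference, the patched `Tensor.backward` validates `grad_tensor` before dispatching to `core.dygraph_run_backward`: it must be a `paddle.Tensor` whose shape matches the tensor backward is called on. A short illustrative sketch:

```python
import paddle

x = paddle.to_tensor(5., stop_gradient=False)
y = paddle.pow(x, 2.0)

y.backward(paddle.to_tensor(3.))   # OK: same shape as y, so x.grad becomes 3 * 2 * 5 = 30.
# y.backward(3.0)                  # AssertionError: grad_tensor must be a paddle.Tensor
# y.backward(paddle.ones([2, 2]))  # AssertionError: shape of grad_tensor does not match y
```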
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.dygraph as dg
from op_test import OpTest
class TestTensorBackward(unittest.TestCase):
def setUp(self):
self._dtypes = ["float32", "float64"]
self._places = [paddle.CPUPlace()]
if paddle.is_compiled_with_cuda():
self._places.append(paddle.CUDAPlace(0))
def test_tensor_backward(self):
for dtype in self._dtypes:
x = np.random.random([2, 100]).astype(dtype)
y = np.random.random([100, 2]).astype(dtype)
z = np.matmul(x, y)
grad = np.random.random(z.shape).astype(dtype)
for place in self._places:
with dg.guard(place):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
y_tensor = paddle.to_tensor(y)
z_tensor = paddle.matmul(x_tensor, y_tensor)
grad_tensor = paddle.to_tensor(grad)
z_tensor.backward(grad_tensor)
x_grad = np.matmul(grad, y.T)
self.assertTrue(np.allclose(x_grad, x_tensor.grad))
class TestBackwardAPI(unittest.TestCase):
def setUp(self):
self._dtypes = ["float32", "float64"]
self._places = [paddle.CPUPlace()]
if paddle.is_compiled_with_cuda():
self._places.append(paddle.CUDAPlace(0))
def test_backward_api(self):
for dtype in self._dtypes:
x = np.random.random([2, 2]).astype(dtype)
y = np.random.random([2, 2]).astype(dtype)
z = np.matmul(x, y)
grad = np.random.random(z.shape).astype(dtype)
for place in self._places:
with dg.guard(place):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
y_tensor = paddle.to_tensor(y)
z_tensor1 = paddle.matmul(x_tensor, y_tensor)
z_tensor2 = paddle.matmul(x_tensor, y_tensor)
grad_tensor = paddle.to_tensor(grad)
paddle.autograd.backward([z_tensor1, z_tensor2],
[grad_tensor, grad_tensor], True)
x_grad = np.matmul(grad, y.T)
self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad))
def test_backward_single_tensor(self):
for dtype in self._dtypes:
x = np.random.random([2, 2]).astype(dtype)
y = np.random.random([2, 2]).astype(dtype)
z = np.matmul(x, y)
grad = np.random.random(z.shape).astype(dtype)
for place in self._places:
with dg.guard(place):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
y_tensor = paddle.to_tensor(y)
z_tensor1 = paddle.matmul(x_tensor, y_tensor)
grad_tensor = paddle.to_tensor(grad)
paddle.autograd.backward(z_tensor1, grad_tensor, True)
x_grad = np.matmul(grad, y.T)
self.assertTrue(np.allclose(x_grad, x_tensor.grad))
def test_backward_none_grad_tensor(self):
for dtype in self._dtypes:
x = np.random.random([2, 2]).astype(dtype)
y = np.random.random([2, 2]).astype(dtype)
z = np.matmul(x, y)
grad = np.ones(z.shape).astype(dtype)
for place in self._places:
with dg.guard(place):
x_tensor = paddle.to_tensor(x, stop_gradient=False)
y_tensor = paddle.to_tensor(y)
z_tensor1 = paddle.matmul(x_tensor, y_tensor)
paddle.autograd.backward(z_tensor1, None)
x_grad = np.matmul(grad, y.T)
self.assertTrue(np.allclose(x_grad, x_tensor.grad))
if __name__ == '__main__':
unittest.main()
......@@ -216,6 +216,7 @@ packages=['paddle',
'paddle.static.amp',
'paddle.tensor',
'paddle.onnx',
'paddle.autograd',
]
with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
......