未验证 提交 dd63e5b4 编写于 作者: X Xiaoxu Chen 提交者: GitHub

reorganize the higher order autodiff api (#44119)

* move _gradients to primapi and rename to grad

* modify jvp to call forward_grad in primitive mode

* add primapi unittest and remove some unused test cases.

* fix circular import problem

* move paddle/autograd/functional into paddle/incubate/autograd/functional

* remove unused JacobianBatchLast class
上级 37216a8f
......@@ -26,8 +26,6 @@ else:
from .py_layer import LegacyPyLayerContext as PyLayerContext # noqa: F401
from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401
from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401
from .functional import vjp, jvp, Jacobian, Hessian # noqa: F401
from .functional import jacobian, hessian, batch_jacobian, batch_hessian, vhp # noqa: F401
__all__ = [ # noqa
'backward',
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import typing
from paddle.fluid import framework
def as_tensors(xs):
    """Normalize ``xs`` into a tuple where possible.

    A single ``framework.Variable`` is wrapped into a one-element tuple, any
    ``typing.Sequence`` is converted with ``tuple``, and every other object is
    returned unchanged.
    """
    if isinstance(xs, framework.Variable):
        return (xs, )
    if isinstance(xs, typing.Sequence):
        return tuple(xs)
    return xs
......@@ -2211,12 +2211,6 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
check_type(target_gradients, 'target_gradients',
(framework.Variable, list, tuple, type(None)),
'paddle.static.gradients')
from ..incubate.autograd.primx import _gradients
from ..incubate.autograd.utils import prim_enabled
if prim_enabled():
return _gradients(targets, inputs, target_gradients)
outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
return _as_list(outs)
......
......@@ -17,7 +17,7 @@ endforeach()
set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200)
set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160)
set_tests_properties(test_gradients_and_minimize PROPERTIES TIMEOUT 60)
set_tests_properties(test_minimize PROPERTIES TIMEOUT 60)
if(NOT WIN32)
set_tests_properties(test_autograd_functional_prim PROPERTIES TIMEOUT 60)
endif()
......@@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
import paddle.compat as cpt
import paddle.nn.functional as F
from paddle.autograd.utils import as_tensors
from paddle.incubate.autograd.utils import as_tensors
from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, _in_eager_without_dygraph_check
import config
......@@ -78,9 +78,9 @@ class TestAutogradFunctional(unittest.TestCase):
xs = self.gen_inputs(inputs)
if v is not None:
v = self.gen_inputs(v)
outputs, inputs_grad = paddle.autograd.vjp(func, xs, v)
outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs, v)
else:
outputs, inputs_grad = paddle.autograd.vjp(func, xs)
outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs)
return outputs, inputs_grad
def grad_test():
......@@ -116,14 +116,14 @@ class TestAutogradFunctional(unittest.TestCase):
xs = self.gen_inputs(inputs)
if v is not None:
v = self.gen_inputs(v)
outputs, outputs_grad = paddle.autograd.jvp(
outputs, outputs_grad = paddle.incubate.autograd.jvp(
func,
xs,
v,
create_graph=create_graph,
allow_unused=allow_unused)
else:
outputs, outputs_grad = paddle.autograd.jvp(
outputs, outputs_grad = paddle.incubate.autograd.jvp(
func,
xs,
create_graph=create_graph,
......@@ -233,7 +233,7 @@ class TestVJPException(unittest.TestCase):
def func_vjp(self):
with self.assertRaises(self.expected_exception):
paddle.autograd.vjp(self.fun, paddle.to_tensor(self.xs),
paddle.incubate.autograd.vjp(self.fun, paddle.to_tensor(self.xs),
paddle.to_tensor(self.v))
def test_all_cases(self):
......@@ -243,8 +243,10 @@ class TestVJPException(unittest.TestCase):
def jac(grad_fn, f, inputs):
assert grad_fn in [paddle.autograd.vjp, paddle.autograd.jvp]
if grad_fn is paddle.autograd.jvp:
assert grad_fn in [
paddle.incubate.autograd.vjp, paddle.incubate.autograd.jvp
]
if grad_fn is paddle.incubate.autograd.jvp:
vs = [paddle.zeros_like(x) for x in inputs]
else:
outputs = f(*inputs)
......@@ -265,7 +267,7 @@ def jac(grad_fn, f, inputs):
JJ_cols.append(d_outs)
# JJ is the fully unrolled jacobian
JJ = paddle.stack(JJ_cols)
if grad_fn is paddle.autograd.vjp:
if grad_fn is paddle.incubate.autograd.vjp:
JJ = JJ.t()
return JJ
......@@ -279,8 +281,8 @@ class TestJVP(TestAutogradFunctional):
] # noqa
for f, inputs in test_cases:
inputs = self.gen_inputs(inputs)
forward_jac = jac(paddle.autograd.jvp, f, inputs)
reverse_jac = jac(paddle.autograd.vjp, f, inputs)
forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs)
reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs)
self.check_results(forward_jac, reverse_jac)
def func_jvp_i2o1(self):
......@@ -289,8 +291,8 @@ class TestJVP(TestAutogradFunctional):
] # noqa
for f, inputs in test_cases:
inputs = self.gen_inputs(inputs)
forward_jac = jac(paddle.autograd.jvp, f, inputs)
reverse_jac = jac(paddle.autograd.vjp, f, inputs)
forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs)
reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs)
self.check_results(forward_jac, reverse_jac)
def func_jvp_i2o2(self):
......@@ -299,8 +301,8 @@ class TestJVP(TestAutogradFunctional):
] # noqa
for f, inputs in test_cases:
inputs = self.gen_inputs(inputs)
forward_jac = jac(paddle.autograd.jvp, f, inputs)
reverse_jac = jac(paddle.autograd.vjp, f, inputs)
forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs)
reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs)
self.check_results(forward_jac, reverse_jac)
def func_jvp_i2o2_omitting_v(self):
......@@ -309,9 +311,9 @@ class TestJVP(TestAutogradFunctional):
] # noqa
for f, inputs in test_cases:
inputs = self.gen_inputs(inputs)
results_omitting_v = paddle.autograd.jvp(f, inputs)
results_omitting_v = paddle.incubate.autograd.jvp(f, inputs)
v = [paddle.ones_like(x) for x in inputs]
results_with_v = paddle.autograd.jvp(f, inputs, v)
results_with_v = paddle.incubate.autograd.jvp(f, inputs, v)
self.check_results(results_omitting_v, results_with_v)
def test_all_cases(self):
......@@ -334,7 +336,7 @@ class TestJVP(TestAutogradFunctional):
('multi_in_single_out', paddle.matmul,
(np.random.rand(2, 2), np.random.rand(2, 2))),
))
class TestJacobianClassNoBatch(unittest.TestCase):
class TestJacobianNoBatch(unittest.TestCase):
def setUp(self):
self._dtype = self.xs[0].dtype if isinstance(
......@@ -349,7 +351,7 @@ class TestJacobianClassNoBatch(unittest.TestCase):
def func_jacobian(self):
xs = [paddle.to_tensor(x) for x in self.xs] if isinstance(
self.xs, typing.Sequence) else paddle.to_tensor(self.xs)
self._actual = paddle.autograd.Jacobian(self.func, xs, False)
self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, False)
self._expected = self._get_expected()
Index = collections.namedtuple('Index', ('type', 'value'))
......@@ -387,7 +389,7 @@ class TestJacobianClassNoBatch(unittest.TestCase):
('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)),
('multi_in_single_out', utils.square, np.random.rand(2, 3)),
))
class TestJacobianClassBatchFirst(unittest.TestCase):
class TestJacobianBatchFirst(unittest.TestCase):
def setUp(self):
self._dtype = self.xs[0].dtype if isinstance(
......@@ -402,7 +404,7 @@ class TestJacobianClassBatchFirst(unittest.TestCase):
def func_jacobian(self):
xs = [paddle.to_tensor(x) for x in self.xs] if isinstance(
self.xs, typing.Sequence) else paddle.to_tensor(self.xs)
self._actual = paddle.autograd.Jacobian(self.func, xs, True)
self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, True)
self._expected = self._get_expected()
Index = collections.namedtuple('Index', ('type', 'value'))
......@@ -444,7 +446,7 @@ class TestJacobianClassBatchFirst(unittest.TestCase):
self.func_jacobian()
class TestHessianClassNoBatch(unittest.TestCase):
class TestHessianNoBatch(unittest.TestCase):
@classmethod
def setUpClass(self):
......@@ -470,7 +472,7 @@ class TestHessianClassNoBatch(unittest.TestCase):
numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian)
self.x.stop_gradient = False
hessian = paddle.autograd.Hessian(func, self.x)
hessian = paddle.incubate.autograd.Hessian(func, self.x)
np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian,
self.rtol, self.atol)
......@@ -484,7 +486,7 @@ class TestHessianClassNoBatch(unittest.TestCase):
numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian)
self.x.stop_gradient = False
self.y.stop_gradient = False
hessian = paddle.autograd.Hessian(func, [self.x, self.y])
hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y])
np.testing.assert_allclose(hessian[:].numpy(),
numerical_hessian,
rtol=self.rtol,
......@@ -500,7 +502,7 @@ class TestHessianClassNoBatch(unittest.TestCase):
numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian)
self.x.stop_gradient = False
self.y.stop_gradient = False
hessian = paddle.autograd.Hessian(func, [self.x, self.y])
hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y])
np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian,
self.rtol, self.atol)
......@@ -514,7 +516,7 @@ class TestHessianClassNoBatch(unittest.TestCase):
func, self.x, self.numerical_delta, self.np_dtype)
numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian)
self.x.stop_gradient = False
hessian = paddle.autograd.Hessian(func, self.x)
hessian = paddle.incubate.autograd.Hessian(func, self.x)
assert hessian[:].stop_gradient == False
np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian,
self.rtol, self.atol)
......@@ -526,7 +528,7 @@ class TestHessianClassNoBatch(unittest.TestCase):
return x * x
with self.assertRaises(RuntimeError):
paddle.autograd.Hessian(func, paddle.ones([3]))
paddle.incubate.autograd.Hessian(func, paddle.ones([3]))
def test_all_cases(self):
with _test_eager_guard():
......@@ -544,7 +546,7 @@ class TestHessianClassNoBatch(unittest.TestCase):
self.func_out_not_single()
class TestHessianClassBatchFirst(unittest.TestCase):
class TestHessianBatchFirst(unittest.TestCase):
@classmethod
def setUpClass(self):
......@@ -572,7 +574,7 @@ class TestHessianClassBatchFirst(unittest.TestCase):
expected = utils._compute_numerical_batch_hessian(
func, self.x, self.numerical_delta, self.np_dtype)
H = paddle.autograd.Hessian(func, self.x, is_batched=True)
H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True)
actual = utils._np_transpose_matrix_format(H[:].numpy(),
utils.MatrixFormat.BNM,
utils.MatrixFormat.NBM)
......@@ -596,7 +598,8 @@ class TestHessianClassBatchFirst(unittest.TestCase):
self.x.stop_gradient = False
self.y.stop_gradient = False
H = paddle.autograd.Hessian(func, [self.x, self.y], is_batched=True)
H = paddle.incubate.autograd.Hessian(func, [self.x, self.y],
is_batched=True)
actual = utils._np_transpose_matrix_format(H[:].numpy(),
utils.MatrixFormat.BNM,
utils.MatrixFormat.NBM)
......@@ -620,7 +623,7 @@ class TestHessianClassBatchFirst(unittest.TestCase):
utils.MatrixFormat.NBM,
utils.MatrixFormat.BNM)
actual = paddle.autograd.Hessian(func, [self.x, self.y],
actual = paddle.incubate.autograd.Hessian(func, [self.x, self.y],
is_batched=True)[:]
np.testing.assert_allclose(actual,
......@@ -638,7 +641,7 @@ class TestHessianClassBatchFirst(unittest.TestCase):
x = self.x.clone()
x.stop_gradient = True
H = paddle.autograd.Hessian(func, self.x, is_batched=True)[:]
H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True)[:]
actual = utils._np_transpose_matrix_format(H[:].numpy(),
utils.MatrixFormat.BNM,
utils.MatrixFormat.NBM)
......@@ -652,7 +655,9 @@ class TestHessianClassBatchFirst(unittest.TestCase):
return (x * x)
with self.assertRaises(RuntimeError):
paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True)
paddle.incubate.autograd.Hessian(func,
paddle.ones((3, 3)),
is_batched=True)
def test_all_cases(self):
with _test_eager_guard():
......@@ -670,829 +675,5 @@ class TestHessianClassBatchFirst(unittest.TestCase):
self.func_out_not_single()
class TestHessian(unittest.TestCase):
    """Compares ``paddle.autograd.hessian`` with numerically computed Hessians.

    The ``func_*`` methods do not match unittest's ``test*`` naming, so they
    only run when ``test_all_cases`` invokes them.
    """

    @classmethod
    def setUpClass(cls):
        """Create the float32 inputs and read the second-order tolerances."""
        cls.shape = (2, 2)
        cls.dtype = 'float32'
        cls.np_dtype = np.float32
        # eps / rtol / atol all live under the same config entry.
        tolerance = config.TOLERANCE.get(cls.dtype).get("second_order_grad")
        cls.numerical_delta = tolerance.get("eps")
        cls.rtol = tolerance.get("rtol")
        cls.atol = tolerance.get("atol")
        cls.x = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.shape, dtype=cls.dtype)

    def func_single_input(self):
        """Hessian of ``sum(matmul(x, x))`` with respect to ``x``."""

        def func(x):
            return paddle.sum(paddle.matmul(x, x))

        expected = _compute_numerical_hessian(func, self.x,
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.hessian(func, self.x)
        np.testing.assert_allclose(actual.numpy(), expected[0][0], self.rtol,
                                   self.atol)

    def func_multi_input(self):
        """Every Hessian block of ``sum(matmul(x, y))`` is compared."""

        def func(x, y):
            return paddle.sum(paddle.matmul(x, y))

        expected = _compute_numerical_hessian(func, [self.x, self.y],
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.hessian(func, [self.x, self.y])
        for row in range(len(actual)):
            for col in range(len(actual[0])):
                np.testing.assert_allclose(actual[row][col].numpy(),
                                           expected[row][col], self.rtol,
                                           self.atol)

    def func_allow_unused_false(self):
        """An unused input without ``allow_unused`` should report an error."""

        def func(x, y):
            return paddle.sum(paddle.matmul(x, x))

        # NOTE(review): if no ValueError is raised this case passes without
        # checking anything - confirm that this is intended.
        try:
            self.x.stop_gradient = False
            self.y.stop_gradient = False
            paddle.autograd.hessian(func, [self.x, self.y])
        except ValueError as err:
            message = cpt.get_exception_message(err)
            assert message.find("allow_unused") > 0

    def func_allow_unused_true(self):
        """With ``allow_unused=True`` only the (0, 0) block is not None."""

        def func(x, y):
            return paddle.sum(paddle.matmul(x, x))

        expected = _compute_numerical_hessian(func, [self.x, self.y],
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.hessian(func, [self.x, self.y],
                                         allow_unused=True)
        for row in range(len(actual)):
            for col in range(len(actual[0])):
                if row == col == 0:
                    np.testing.assert_allclose(actual[row][col].numpy(),
                                               expected[row][col], self.rtol,
                                               self.atol)
                else:
                    assert actual[row][col] is None

    def func_create_graph_false(self):
        """Default call yields a result with ``stop_gradient`` set."""

        def func(x):
            return paddle.sum(paddle.matmul(x, x))

        expected = _compute_numerical_hessian(func, self.x,
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.hessian(func, self.x)
        assert actual.stop_gradient == True
        np.testing.assert_allclose(actual.numpy(), expected[0][0], self.rtol,
                                   self.atol)
        # Differentiating the result again may fail; only the message of such
        # a failure is inspected.
        try:
            paddle.grad(actual, self.x)
        except Exception as err:
            message = cpt.get_exception_message(err)
            assert message.find("has no gradient") > 0 or message.find(
                "does not appear") > 0

    def func_create_graph_true(self):
        """``create_graph=True`` keeps the result differentiable."""
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})

        def func(x):
            return paddle.sum(F.sigmoid(x))

        expected = _compute_numerical_hessian(func, self.x,
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.hessian(func, self.x, create_graph=True)
        assert actual.stop_gradient == False
        np.testing.assert_allclose(actual.numpy(), expected[0][0], self.rtol,
                                   self.atol)
        triple_grad = paddle.grad(actual, self.x)
        assert triple_grad is not None
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def _run_cases(self):
        """Rebuild the fixtures, then run every case once, in fixed order."""
        self.setUpClass()
        self.func_single_input()
        self.func_multi_input()
        self.func_allow_unused_false()
        self.func_allow_unused_true()
        self.func_create_graph_false()
        self.func_create_graph_true()

    def test_all_cases(self):
        """Run the whole case list under ``_test_eager_guard`` and again after.

        NOTE(review): the second pass is placed outside the guard; the block
        boundary was reconstructed from a listing without indentation, so
        confirm against the original file.
        """
        with _test_eager_guard():
            self._run_cases()
        self._run_cases()
class TestHessianFloat64(TestHessian):
    """Runs the inherited ``TestHessian`` cases with float64 fixtures."""

    @classmethod
    def setUpClass(cls):
        """Create the float64 inputs and read the matching tolerances."""
        cls.shape = (2, 2)
        cls.dtype = 'float64'
        cls.np_dtype = np.float64
        # eps / rtol / atol all live under the same config entry.
        tolerance = config.TOLERANCE.get(cls.dtype).get("second_order_grad")
        cls.numerical_delta = tolerance.get("eps")
        cls.rtol = tolerance.get("rtol")
        cls.atol = tolerance.get("atol")
        cls.x = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.shape, dtype=cls.dtype)
class TestBatchHessian(unittest.TestCase):
    """Compares ``paddle.autograd.batch_hessian`` with numerical results.

    The ``func_*`` methods do not match unittest's ``test*`` naming, so they
    only run when ``test_all_cases`` invokes them.
    """

    @classmethod
    def setUpClass(cls):
        """Create the float32 inputs and read the second-order tolerances."""
        cls.x_shape = (5, 2)
        cls.weight_shape = (2, 4)
        cls.y_shape = (5, 2)
        cls.dtype = 'float32'
        cls.np_dtype = np.float32
        # eps / rtol / atol all live under the same config entry.
        tolerance = config.TOLERANCE.get(cls.dtype).get("second_order_grad")
        cls.numerical_delta = tolerance.get("eps")
        cls.rtol = tolerance.get("rtol")
        cls.atol = tolerance.get("atol")
        cls.x = paddle.rand(shape=cls.x_shape, dtype=cls.dtype)
        cls.weight = paddle.rand(shape=cls.weight_shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.y_shape, dtype=cls.dtype)

    def func_single_input(self):
        """Batched Hessian for a single input, with ``create_graph=True``."""

        def func(x):
            return paddle.matmul(x * x, self.weight)[:, 0:1]

        expected = _compute_numerical_batch_hessian(func, self.x,
                                                    self.numerical_delta,
                                                    self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.batch_hessian(func, self.x, create_graph=True)
        np.testing.assert_allclose(actual, expected, self.rtol, self.atol)

    def func_multi_input(self):
        """Batched Hessian for two inputs, reshaped to the expected layout."""

        def func(x, y):
            return paddle.matmul(x * x * y * y, self.weight)[:, 0:1]

        expected = _compute_numerical_batch_hessian(func, [self.x, self.y],
                                                    self.numerical_delta,
                                                    self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.batch_hessian(func, [self.x, self.y])
        # The tensor is only built to obtain the shape of the expected value.
        shape_tensor = paddle.to_tensor(expected).astype("float64")
        reshaped = np.reshape(actual, (shape_tensor.shape))
        np.testing.assert_allclose(reshaped, expected, self.rtol, self.atol)

    def func_allow_unused_false(self):
        """An unused input without ``allow_unused`` should report an error."""

        def func(x, y):
            return paddle.matmul(x * x, self.weight)[:, 0:1]

        # NOTE(review): if no ValueError is raised this case passes without
        # checking anything - confirm that this is intended.
        try:
            self.x.stop_gradient = False
            self.y.stop_gradient = False
            paddle.autograd.batch_hessian(func, [self.x, self.y])
        except ValueError as err:
            message = cpt.get_exception_message(err)
            assert message.find("allow_unused") > 0

    def func_allow_unused_true(self):
        """With ``allow_unused=True`` only the (0, 0) block is not None."""

        def func(x, y):
            return paddle.matmul(x * x, self.weight)[:, 0:1]

        expected = _compute_numerical_batch_hessian(func, [self.x, self.y],
                                                    self.numerical_delta,
                                                    self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.batch_hessian(func, [self.x, self.y],
                                               allow_unused=True)
        for row in range(len(actual)):
            for col in range(len(actual[0])):
                if row == col == 0:
                    # The reference value stacks two neighbouring blocks.
                    stacked = np.stack(
                        (expected[row][col], expected[row][col + 1]), axis=0)
                    np.testing.assert_allclose(actual[row][col], stacked,
                                               self.rtol, self.atol)
                else:
                    assert actual[row][col] is None

    def func_create_graph_false(self):
        """Default call yields a result with ``stop_gradient`` set."""

        def func(x):
            return paddle.matmul(x * x, self.weight)[:, 0:1]

        expected = _compute_numerical_batch_hessian(func, self.x,
                                                    self.numerical_delta,
                                                    self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.batch_hessian(func, self.x)
        assert actual.stop_gradient == True
        np.testing.assert_allclose(actual.numpy(), expected, self.rtol,
                                   self.atol)
        # Differentiating the result again may fail; only the message of such
        # a failure is inspected.
        try:
            paddle.grad(actual, self.x)
        except Exception as err:
            message = cpt.get_exception_message(err)
            assert message.find("has no gradient") > 0 or message.find(
                "does not appear") > 0

    def func_create_graph_true(self):
        """``create_graph=True`` keeps the result differentiable."""

        def func(x):
            return paddle.matmul(x * x, self.weight)[:, 0:1]

        expected = _compute_numerical_batch_hessian(func, self.x,
                                                    self.numerical_delta,
                                                    self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.batch_hessian(func, self.x, create_graph=True)
        assert actual.stop_gradient == False
        np.testing.assert_allclose(actual.numpy(), expected, self.rtol,
                                   self.atol)
        triple_grad = paddle.grad(actual, self.x)
        assert triple_grad is not None

    def _run_cases(self):
        """Rebuild the fixtures, then run every case once, in fixed order."""
        self.setUpClass()
        self.func_single_input()
        self.func_multi_input()
        self.func_allow_unused_false()
        self.func_allow_unused_true()
        self.func_create_graph_false()
        self.func_create_graph_true()

    def test_all_cases(self):
        """Run the whole case list under ``_test_eager_guard`` and again after.

        NOTE(review): the second pass is placed outside the guard; the block
        boundary was reconstructed from a listing without indentation, so
        confirm against the original file.
        """
        with _test_eager_guard():
            self._run_cases()
        self._run_cases()
class TestBatchHessianFloat64(TestBatchHessian):
    """Runs the inherited ``TestBatchHessian`` cases with float64 fixtures."""

    @classmethod
    def setUpClass(cls):
        """Create the float64 inputs and read the matching tolerances."""
        cls.x_shape = (5, 2)
        cls.weight_shape = (2, 4)
        cls.y_shape = (5, 2)
        cls.dtype = 'float64'
        cls.np_dtype = np.float64
        # eps / rtol / atol all live under the same config entry.
        tolerance = config.TOLERANCE.get(cls.dtype).get("second_order_grad")
        cls.numerical_delta = tolerance.get("eps")
        cls.rtol = tolerance.get("rtol")
        cls.atol = tolerance.get("atol")
        cls.x = paddle.rand(shape=cls.x_shape, dtype=cls.dtype)
        cls.weight = paddle.rand(shape=cls.weight_shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.y_shape, dtype=cls.dtype)
class TestVHP(unittest.TestCase):
    """Compares ``paddle.autograd.vhp`` with numerically computed results.

    The ``func_*`` methods do not match unittest's ``test*`` naming, so they
    only run when ``test_all_cases`` invokes them.
    """

    @classmethod
    def setUpClass(cls):
        """Create float32 inputs and vectors and read the tolerances."""
        cls.shape = (2, 2)
        cls.dtype = 'float32'
        cls.np_dtype = np.float32
        # eps / rtol / atol all live under the same config entry.
        tolerance = config.TOLERANCE.get(cls.dtype).get("second_order_grad")
        cls.numerical_delta = tolerance.get("eps")
        cls.rtol = tolerance.get("rtol")
        cls.atol = tolerance.get("atol")
        cls.x = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.vx = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.vy = paddle.rand(shape=cls.shape, dtype=cls.dtype)

    def func_single_input(self):
        """One input and one vector: check the output and the first vhp entry."""

        def func(x):
            return paddle.sum(paddle.matmul(x, x))

        expected_output = func(self.x).numpy()
        expected_vhp = _compute_numerical_vhp(func, self.x, self.vx,
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        actual_output, actual_vhp = paddle.autograd.vhp(func, self.x, self.vx)
        np.testing.assert_allclose(actual_output.numpy(), expected_output,
                                   self.rtol, self.atol)
        np.testing.assert_allclose(actual_vhp[0].numpy(), expected_vhp[0],
                                   self.rtol, self.atol)

    def func_multi_input(self):
        """Two inputs and two vectors: every vhp entry is compared."""

        def func(x, y):
            return paddle.sum(paddle.matmul(x, y))

        expected_output = func(self.x, self.y).numpy()
        expected_vhp = _compute_numerical_vhp(func, [self.x, self.y],
                                              [self.vx, self.vy],
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual_output, actual_vhp = paddle.autograd.vhp(
            func, [self.x, self.y], [self.vx, self.vy])
        np.testing.assert_allclose(actual_output.numpy(), expected_output,
                                   self.rtol, self.atol)
        for idx in range(len(actual_vhp)):
            np.testing.assert_allclose(actual_vhp[idx].numpy(),
                                       expected_vhp[idx], self.rtol,
                                       self.atol)

    def func_v_default(self):
        """Omitting ``v`` is compared with a reference built from all-ones."""

        def func(x, y):
            return paddle.sum(paddle.matmul(x, y))

        expected_output = func(self.x, self.y).numpy()
        ones_x = paddle.ones(self.vx.shape, dtype=self.vx.dtype)
        ones_y = paddle.ones(self.vy.shape, dtype=self.vy.dtype)
        expected_vhp = _compute_numerical_vhp(func, [self.x, self.y],
                                              [ones_x, ones_y],
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual_output, actual_vhp = paddle.autograd.vhp(func,
                                                        [self.x, self.y])
        np.testing.assert_allclose(actual_output.numpy(), expected_output,
                                   self.rtol, self.atol)
        for idx in range(len(actual_vhp)):
            np.testing.assert_allclose(actual_vhp[idx].numpy(),
                                       expected_vhp[idx], self.rtol,
                                       self.atol)

    def func_allow_unused_true(self):
        """``func`` ignores ``y``; only the first vhp entry is compared."""

        def func(x, y):
            return paddle.sum(paddle.matmul(x, x))

        expected_output = func(self.x, self.y).numpy()
        expected_vhp = _compute_numerical_vhp(func, [self.x, self.y],
                                              [self.vx, self.vy],
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        # NOTE(review): despite the method name, no ``allow_unused`` argument
        # is passed here - confirm that this is intended.
        actual_output, actual_vhp = paddle.autograd.vhp(
            func, [self.x, self.y], [self.vx, self.vy])
        np.testing.assert_allclose(actual_output.numpy(), expected_output,
                                   self.rtol, self.atol)
        np.testing.assert_allclose(actual_vhp[0].numpy(), expected_vhp[0],
                                   self.rtol, self.atol)

    def func_create_graph_true(self):
        """The vhp result must stay differentiable with respect to ``x``."""
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})

        def func(x):
            return paddle.sum(F.sigmoid(x))

        expected_output = func(self.x).numpy()
        expected_vhp = _compute_numerical_vhp(func, self.x, self.vx,
                                              self.numerical_delta,
                                              self.np_dtype)
        self.x.stop_gradient = False
        # NOTE(review): despite the method name, no ``create_graph`` argument
        # is passed here - confirm that this is intended.
        actual_output, actual_vhp = paddle.autograd.vhp(func, self.x, self.vx)
        np.testing.assert_allclose(actual_output.numpy(), expected_output,
                                   self.rtol, self.atol)
        assert actual_vhp[0].stop_gradient == False
        np.testing.assert_allclose(actual_vhp[0].numpy(), expected_vhp[0],
                                   self.rtol, self.atol)
        triple_grad = paddle.grad(actual_vhp, self.x)
        assert triple_grad is not None
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def _run_cases(self):
        """Rebuild the fixtures, then run every case once, in fixed order."""
        self.setUpClass()
        self.func_v_default()
        self.func_multi_input()
        self.func_single_input()
        self.func_allow_unused_true()
        self.func_create_graph_true()

    def test_all_cases(self):
        """Run the whole case list under ``_test_eager_guard`` and again after.

        NOTE(review): the second pass is placed outside the guard; the block
        boundary was reconstructed from a listing without indentation, so
        confirm against the original file.
        """
        with _test_eager_guard():
            self._run_cases()
        self._run_cases()
class TestJacobian(unittest.TestCase):
    """Compares ``paddle.autograd.jacobian`` with numerical Jacobians.

    The ``func_*`` methods do not match unittest's ``test*`` naming, so they
    only run when ``test_all_cases`` invokes them.
    """

    @classmethod
    def setUpClass(cls):
        """Create the float32 inputs and the fixed tolerances."""
        cls.shape = (4, 4)
        cls.dtype = 'float32'
        cls.np_dtype = np.float32
        cls.numerical_delta = 1e-4
        cls.rtol = cls.atol = 1e-3
        cls.x = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.shape, dtype=cls.dtype)

    def func_single_input_and_single_output(self):
        """One input, one output: a single Jacobian block is compared."""

        def func(x):
            return paddle.matmul(x, x)

        expected = _compute_numerical_jacobian(func, self.x,
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.jacobian(func, self.x)
        np.testing.assert_allclose(actual.numpy(), expected[0][0], self.rtol,
                                   self.atol)

    def func_single_input_and_multi_output(self):
        """One input, two outputs: one Jacobian block per output."""

        def func(x):
            return paddle.matmul(x, x), x * x

        expected = _compute_numerical_jacobian(func, self.x,
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        actual = paddle.autograd.jacobian(func, self.x)
        for out_idx in range(len(actual)):
            np.testing.assert_allclose(actual[out_idx].numpy(),
                                       expected[out_idx][0], self.rtol,
                                       self.atol)

    def func_multi_input_and_single_output(self):
        """Two inputs, one output: one Jacobian block per input."""

        def func(x, y):
            return paddle.matmul(x, y)

        expected = _compute_numerical_jacobian(func, [self.x, self.y],
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.jacobian(func, [self.x, self.y])
        for in_idx in range(len(actual)):
            np.testing.assert_allclose(actual[in_idx].numpy(),
                                       expected[0][in_idx], self.rtol,
                                       self.atol)

    def func_multi_input_and_multi_output(self):
        """Two inputs, two outputs: every (output, input) block is compared."""
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})

        def func(x, y):
            return paddle.matmul(x, y), x * y

        expected = _compute_numerical_jacobian(func, [self.x, self.y],
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.jacobian(func, [self.x, self.y])
        for out_idx in range(len(actual)):
            for in_idx in range(len(actual[0])):
                np.testing.assert_allclose(actual[out_idx][in_idx].numpy(),
                                           expected[out_idx][in_idx],
                                           self.rtol, self.atol)
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})

    def func_allow_unused_false(self):
        """An unused input without ``allow_unused`` should report an error."""

        def func(x, y):
            return paddle.matmul(x, x)

        # NOTE(review): if no ValueError is raised this case passes without
        # checking anything - confirm that this is intended.
        try:
            self.x.stop_gradient = False
            self.y.stop_gradient = False
            paddle.autograd.jacobian(func, [self.x, self.y])
        except ValueError as err:
            message = cpt.get_exception_message(err)
            assert message.find("allow_unused") > 0

    def func_allow_unused_true(self):
        """With ``allow_unused=True`` the block for the unused input is None."""

        def func(x, y):
            return paddle.matmul(x, x)

        expected = _compute_numerical_jacobian(func, [self.x, self.y],
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.jacobian(func, [self.x, self.y],
                                          allow_unused=True)
        np.testing.assert_allclose(actual[0].numpy(), expected[0][0],
                                   self.rtol, self.atol)
        assert actual[1] is None

    def func_create_graph_false(self):
        """Default call yields blocks with ``stop_gradient`` set."""

        def func(x, y):
            return paddle.matmul(x, y)

        expected = _compute_numerical_jacobian(func, [self.x, self.y],
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.jacobian(func, [self.x, self.y])
        for in_idx in range(len(actual)):
            assert actual[in_idx].stop_gradient == True
            np.testing.assert_allclose(actual[in_idx].numpy(),
                                       expected[0][in_idx], self.rtol,
                                       self.atol)
        # Differentiating the result again may fail; only the message of such
        # a failure is inspected.
        try:
            paddle.grad(actual[0], [self.x, self.y])
        except Exception as err:
            message = cpt.get_exception_message(err)
            assert message.find("has no gradient") > 0 or message.find(
                "does not appear") > 0

    def func_create_graph_true(self):
        """``create_graph=True`` keeps every block differentiable."""

        def func(x, y):
            return paddle.matmul(x, y)

        expected = _compute_numerical_jacobian(func, [self.x, self.y],
                                               self.numerical_delta,
                                               self.np_dtype)
        self.x.stop_gradient = False
        self.y.stop_gradient = False
        actual = paddle.autograd.jacobian(func, [self.x, self.y],
                                          create_graph=True)
        for in_idx in range(len(actual)):
            assert actual[in_idx].stop_gradient == False
            np.testing.assert_allclose(actual[in_idx].numpy(),
                                       expected[0][in_idx], self.rtol,
                                       self.atol)
        double_grad = paddle.grad(actual[0], [self.x, self.y])
        assert double_grad is not None

    def _run_cases(self):
        """Rebuild the fixtures, then run every case once, in fixed order."""
        self.setUpClass()
        self.func_multi_input_and_multi_output()
        self.func_multi_input_and_single_output()
        self.func_single_input_and_multi_output()
        self.func_single_input_and_single_output()
        self.func_allow_unused_false()
        self.func_allow_unused_true()
        self.func_create_graph_false()
        self.func_create_graph_true()

    def test_all_cases(self):
        """Run the whole case list under ``_test_eager_guard`` and again after.

        NOTE(review): the second pass is placed outside the guard; the block
        boundary was reconstructed from a listing without indentation, so
        confirm against the original file.
        """
        with _test_eager_guard():
            self._run_cases()
        self._run_cases()
class TestJacobianFloat64(TestJacobian):
    """Runs the inherited ``TestJacobian`` cases with float64 fixtures."""

    @classmethod
    def setUpClass(cls):
        """Create the float64 inputs with the float64 step and tolerances."""
        cls.shape = (4, 4)
        cls.dtype = 'float64'
        cls.np_dtype = np.float64
        cls.numerical_delta = 1e-7
        cls.rtol = cls.atol = 1e-7
        cls.x = paddle.rand(shape=cls.shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.shape, dtype=cls.dtype)
class TestJacobianBatch(unittest.TestCase):
    """Compare ``paddle.autograd.batch_jacobian`` with a numerical reference.

    The ``func_*`` cases are not collected by unittest directly; they are
    driven by ``test_all_cases``, which runs the whole suite once inside
    ``_test_eager_guard()`` and once outside it.
    """

    @classmethod
    def setUpClass(cls):
        cls.x_shape = (4, 2)
        cls.weight_shape = (2, 4)
        cls.y_shape = (4, 2)
        cls.dtype = 'float32'
        cls.np_dtype = np.float32
        cls.numerical_delta = 1e-4
        cls.rtol = 1e-3
        cls.atol = 1e-3
        cls.x = paddle.rand(shape=cls.x_shape, dtype=cls.dtype)
        cls.weight = paddle.rand(shape=cls.weight_shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.y_shape, dtype=cls.dtype)

    def func_batch_single_input_and_batch_single_output(self):

        def func(x):
            return paddle.matmul(paddle.matmul(x, self.weight), self.y)

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        batch_jacobian = paddle.autograd.batch_jacobian(
            func,
            self.x,
        )

        # Compare the values element-wise. Reducing both sides with ``.all()``
        # first would only compare two booleans and could never detect a
        # wrong Jacobian.
        np.testing.assert_allclose(batch_jacobian.numpy(),
                                   numerical_jacobian[0][0], self.rtol,
                                   self.atol)

    def func_batch_single_input_and_batch_multi_output(self):

        def func(x):
            return paddle.matmul(paddle.matmul(x, self.weight), self.y), x * x

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        batch_jacobian = paddle.autograd.batch_jacobian(
            func,
            self.x,
        )

        # One Jacobian per output; the single input is column 0 of the
        # numerical reference.
        for i, jac in enumerate(batch_jacobian):
            np.testing.assert_allclose(jac.numpy(), numerical_jacobian[i][0],
                                       self.rtol, self.atol)

    def func_batch_multi_input_and_batch_single_output(self):

        def func(x, y):
            return x * y

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x, self.y], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        self.y.stop_gradient = False
        batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y])

        # One Jacobian per input; the single output is row 0 of the
        # numerical reference.
        for j, jac in enumerate(batch_jacobian):
            np.testing.assert_allclose(jac.numpy(), numerical_jacobian[0][j],
                                       self.rtol, self.atol)

    def func_batch_multi_input_and_batch_multi_output(self):

        def func(x, y):
            return x * y, x * y

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x, self.y], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        self.y.stop_gradient = False
        batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y])

        for i, jac_row in enumerate(batch_jacobian):
            np.testing.assert_allclose(jac_row, numerical_jacobian[i],
                                       self.rtol, self.atol)

    def func_allow_unused_false(self):

        def func(x, y):
            return x * x

        # NOTE(review): this case also passes when no exception is raised at
        # all; only the message of a raised ValueError is checked.
        try:
            self.x.stop_gradient = False
            self.y.stop_gradient = False
            jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y])
        except ValueError as e:
            error_msg = cpt.get_exception_message(e)
            assert error_msg.find("allow_unused") > 0

    def func_allow_unused_true(self):

        def func(x, y):
            return x * x

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x, self.y], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        self.y.stop_gradient = False
        jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y],
                                                  allow_unused=True)

        np.testing.assert_allclose(jacobian[0].numpy(),
                                   numerical_jacobian[0][0], self.rtol,
                                   self.atol)
        # ``y`` does not contribute to the output, so its entry is None.
        assert jacobian[1] is None

    def func_create_graph_false(self):

        def func(x, y):
            return x * y

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x, self.y], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        self.y.stop_gradient = False
        jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y])

        for j, jac in enumerate(jacobian):
            assert jac.stop_gradient == True
            np.testing.assert_allclose(jac.numpy(), numerical_jacobian[0][j],
                                       self.rtol, self.atol)

        # Without create_graph the result carries no graph, so a further
        # paddle.grad call is expected to fail with one of these messages.
        try:
            paddle.grad(jacobian[0], [self.x, self.y])
        except Exception as e:
            error_msg = cpt.get_exception_message(e)
            assert error_msg.find("has no gradient") > 0 or error_msg.find(
                "does not appear") > 0

    def func_create_graph_true(self):

        def func(x, y):
            return x * y

        numerical_jacobian = _compute_numerical_batch_jacobian(
            func, [self.x, self.y], self.numerical_delta, self.np_dtype)

        self.x.stop_gradient = False
        self.y.stop_gradient = False
        jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y],
                                                  create_graph=True)

        for j, jac in enumerate(jacobian):
            assert jac.stop_gradient == False
            np.testing.assert_allclose(jac.numpy(), numerical_jacobian[0][j],
                                       self.rtol, self.atol)

        double_grad = paddle.grad(jacobian[0], [self.x, self.y])
        assert double_grad is not None

    def _run_all_cases(self):
        """Rebuild the fixtures, then run every ``func_*`` case in order."""
        self.setUpClass()
        self.func_batch_single_input_and_batch_single_output()
        self.func_batch_single_input_and_batch_multi_output()
        self.func_batch_multi_input_and_batch_single_output()
        self.func_batch_multi_input_and_batch_multi_output()
        self.func_allow_unused_false()
        self.func_allow_unused_true()
        self.func_create_graph_false()
        self.func_create_graph_true()

    def test_all_cases(self):
        # First pass inside the eager guard, second pass outside it.
        with _test_eager_guard():
            self._run_all_cases()
        self._run_all_cases()
class TestJacobianBatchFloat64(TestJacobianBatch):
    """Re-run the ``TestJacobianBatch`` cases with float64 fixtures."""

    @classmethod
    def setUpClass(cls):
        cls.x_shape, cls.weight_shape, cls.y_shape = (12, 2), (2, 12), (12, 2)
        cls.dtype = 'float64'
        cls.np_dtype = np.float64

        # Step size and tolerances come from the shared test configuration.
        settings = config.TOLERANCE.get(cls.dtype).get('second_order_grad')
        cls.numerical_delta = settings.get('eps')
        cls.rtol = settings.get('rtol')
        cls.atol = settings.get('atol')

        # Random inputs, drawn in the order x, weight, y.
        cls.x = paddle.rand(shape=cls.x_shape, dtype=cls.dtype)
        cls.weight = paddle.rand(shape=cls.weight_shape, dtype=cls.dtype)
        cls.y = paddle.rand(shape=cls.y_shape, dtype=cls.dtype)
if __name__ == "__main__":
unittest.main()
......@@ -145,5 +145,130 @@ class TestHessianPrim(unittest.TestCase):
atol=self._atol)
@utils.place(config.DEVICES)
@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), (
    ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'),
    ('binary_float32', paddle.matmul,
     (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'),
    ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'),
    ('binary_float64', paddle.matmul,
     (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'),
))
class TestJvpPrim(unittest.TestCase):
    """Check that ``jvp`` gives the same result with prim mode on and off."""

    @classmethod
    def setUpClass(cls):
        first_order = config.TOLERANCE.get(cls.dtype).get('first_order_grad')
        cls._rtol = first_order.get('rtol')
        cls._atol = first_order.get('atol')
        # Cast the parameterized numpy inputs to the dtype under test.
        cls.args = [arg.astype(cls.dtype) for arg in cls.args]

    def setUp(self):
        paddle.enable_static()
        paddle.incubate.autograd.enable_prim()

    def tearDown(self):
        paddle.incubate.autograd.disable_prim()
        paddle.disable_static()

    def test_jacobian_prim(self):

        def run_static(fun, args):
            # Build a fresh program pair so the two runs do not share ops.
            main_prog = paddle.static.Program()
            startup_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, startup_prog):
                static_args = []
                for i, arg in enumerate(args):
                    var = paddle.static.data(f'arg{i}', arg.shape, self.dtype)
                    var.stop_gradient = False
                    static_args.append(var)
                _, jvp_res = paddle.incubate.autograd.jvp(fun, static_args)
                if paddle.incubate.autograd.prim_enabled():
                    paddle.incubate.autograd.prim2orig()
            feed = {f'arg{i}': arg for i, arg in enumerate(args)}
            exe = paddle.static.Executor()
            exe.run(startup_prog)
            return exe.run(main_prog, feed=feed, fetch_list=[jvp_res])

        paddle.incubate.autograd.enable_prim()
        prim_jvp = run_static(self.fun, self.args)
        paddle.incubate.autograd.disable_prim()
        orig_jvp = run_static(self.fun, self.args)

        np.testing.assert_allclose(orig_jvp,
                                   prim_jvp,
                                   rtol=self._rtol,
                                   atol=self._atol)
@utils.place(config.DEVICES)
@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), (
    ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'),
    ('binary_float32', paddle.matmul,
     (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'),
    ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'),
    ('binary_float64', paddle.matmul,
     (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'),
))
class TestVjpPrim(unittest.TestCase):
    """Check that ``vjp`` gives the same result with prim mode on and off."""

    @classmethod
    def setUpClass(cls):
        first_order = config.TOLERANCE.get(cls.dtype).get('first_order_grad')
        cls._rtol = first_order.get('rtol')
        cls._atol = first_order.get('atol')
        # Cast the parameterized numpy inputs to the dtype under test.
        cls.args = [arg.astype(cls.dtype) for arg in cls.args]

    def setUp(self):
        paddle.enable_static()
        paddle.incubate.autograd.enable_prim()

    def tearDown(self):
        paddle.incubate.autograd.disable_prim()
        paddle.disable_static()

    def test_jacobian_prim(self):

        def run_static(fun, args):
            # Build a fresh program pair so the two runs do not share ops.
            main_prog = paddle.static.Program()
            startup_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, startup_prog):
                static_args = []
                for i, arg in enumerate(args):
                    var = paddle.static.data(f'arg{i}', arg.shape, self.dtype)
                    var.stop_gradient = False
                    static_args.append(var)
                _, vjp_res = paddle.incubate.autograd.vjp(fun, static_args)
                if paddle.incubate.autograd.prim_enabled():
                    paddle.incubate.autograd.prim2orig()
            feed = {f'arg{i}': arg for i, arg in enumerate(args)}
            exe = paddle.static.Executor()
            exe.run(startup_prog)
            return exe.run(main_prog, feed=feed, fetch_list=[vjp_res])

        paddle.incubate.autograd.enable_prim()
        prim_vjp = run_static(self.fun, self.args)
        paddle.incubate.autograd.disable_prim()
        orig_vjp = run_static(self.fun, self.args)

        # One fetched gradient per input; compare them pairwise.
        for orig, prim in zip(orig_vjp, prim_vjp):
            np.testing.assert_allclose(orig,
                                       prim,
                                       rtol=self._rtol,
                                       atol=self._atol)
if __name__ == "__main__":
unittest.main()
......@@ -59,7 +59,8 @@ class TestVJP(unittest.TestCase):
with paddle.static.program_guard(mp, sp):
feed, static_xs, static_v = utils.gen_static_data_and_feed(
self.xs, self.v, stop_gradient=self.stop_gradient)
ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v)
ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs,
static_v)
exe.run(sp)
return exe.run(mp, feed=feed, fetch_list=[ys, xs_grads])
......@@ -103,7 +104,8 @@ class TestVJPException(unittest.TestCase):
with paddle.static.program_guard(mp, sp):
feed, static_xs, static_v = utils.gen_static_data_and_feed(
self.xs, self.v)
ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v)
ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs,
static_v)
self.exe.run(sp)
return self.exe.run(mp, feed, fetch_list=[ys, xs_grads])
......@@ -214,7 +216,7 @@ class TestJacobianFloat32(unittest.TestCase):
startup = fluid.Program()
with fluid.program_guard(main, startup):
xs = make_tensors(inps)
JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch)
JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch)
if batch:
_, nrow, ncol = JJ.shape
else:
......@@ -244,7 +246,7 @@ class TestJacobianFloat32(unittest.TestCase):
startup = fluid.Program()
with fluid.program_guard(main, startup):
xs = make_tensors(inps)
JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch)
JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch)
if batch:
nbatch, nrow, ncol = JJ.shape
rows = [JJ[:, i, :] for i in range(nrow)]
......@@ -269,7 +271,7 @@ class TestJacobianFloat32(unittest.TestCase):
startup = fluid.Program()
with fluid.program_guard(main, startup):
xs = make_tensors(inps)
JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch)
JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch)
if batch:
nbatch, nrow, ncol = JJ.shape
entries = [
......@@ -390,7 +392,7 @@ class TestHessianFloat32(unittest.TestCase):
startup = fluid.Program()
with fluid.program_guard(main, startup):
xs = make_tensors(inps)
HH = paddle.autograd.functional.Hessian(pd_f, xs, is_batched=batch)
HH = paddle.incubate.autograd.Hessian(pd_f, xs, is_batched=batch)
nrow, ncol = HH.shape
full_hessian = HH[:]
exe = fluid.Executor(self.place)
......
......@@ -13,82 +13,16 @@
# limitations under the License.
import unittest
import numpy as np
import numpy as np
import paddle
from paddle.incubate.autograd.primx import prim2orig
from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled
from paddle.incubate.autograd.utils import (disable_prim, enable_prim,
prim_enabled)
paddle.enable_static()
class TestGradients(unittest.TestCase):
    """High-order derivatives via ``paddle.static.gradients`` in prim mode."""

    @staticmethod
    def _run(main, startup, feed, fetch_list):
        """Run ``startup`` then ``main`` on CUDA device 0 if compiled in, else CPU."""
        place = paddle.CPUPlace()
        if paddle.device.is_compiled_with_cuda():
            place = paddle.CUDAPlace(0)
        exe = paddle.static.Executor(place)
        exe.run(startup)
        return exe.run(main, feed=feed, fetch_list=fetch_list)

    def test_third_order(self):
        enable_prim()
        # Always leave prim mode, even when the check below fails, so the
        # global switch does not leak into other tests.
        try:
            main = paddle.static.Program()
            startup = paddle.static.Program()
            with paddle.static.program_guard(main, startup):
                x = paddle.static.data(name='x', shape=[1], dtype='float32')
                x2 = paddle.multiply(x, x)
                x3 = paddle.multiply(x2, x)
                x4 = paddle.multiply(x3, x)

                grad1, = paddle.static.gradients([x4], [x])
                grad2, = paddle.static.gradients([grad1], [x])
                grad3, = paddle.static.gradients([grad2], [x])

                prim2orig(main.block(0))

            feed = {x.name: np.array([2.]).astype('float32')}
            fetch_list = [grad3.name]
            # d^3/dx^3 (x^4) = 24 * x, which is 48 at x = 2.
            result = [np.array([48.])]

            outs = self._run(main, startup, feed, fetch_list)
            # Actually assert; a bare np.allclose() call discards its result.
            # Tolerances mirror np.allclose's defaults.
            np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-8)
        finally:
            disable_prim()

    def test_fourth_order(self):
        enable_prim()
        try:
            main = paddle.static.Program()
            startup = paddle.static.Program()
            with paddle.static.program_guard(main, startup):
                x = paddle.static.data(name='x', shape=[1], dtype='float32')
                x2 = paddle.multiply(x, x)
                x3 = paddle.multiply(x2, x)
                x4 = paddle.multiply(x3, x)
                x5 = paddle.multiply(x4, x)
                out = paddle.sqrt(x5 + x4)

                grad1, = paddle.static.gradients([out], [x])
                grad2, = paddle.static.gradients([grad1], [x])
                grad3, = paddle.static.gradients([grad2], [x])
                grad4, = paddle.static.gradients([grad3], [x])

                prim2orig(main.block(0))

            feed = {
                x.name: np.array([2.]).astype('float32'),
            }
            fetch_list = [grad4.name]
            # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5)
            result = [np.array([-0.27263762711])]

            outs = self._run(main, startup, feed, fetch_list)
            np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-8)
        finally:
            disable_prim()
class TestMinimize(unittest.TestCase):
def model(self, x, w, bias, opt):
......
......@@ -37,7 +37,7 @@ import utils
('input_gradients_not_none', paddle.matmul,
(np.random.rand(3, 3), np.random.rand(3, 3)),
(np.random.rand(3, 3), np.random.rand(3, 3)), 'float64')))
class TestForwardGradients(unittest.TestCase):
class TestForwardGrad(unittest.TestCase):
@classmethod
def setUpClass(cls):
......@@ -55,7 +55,7 @@ class TestForwardGradients(unittest.TestCase):
paddle.incubate.autograd.disable_prim()
paddle.disable_static()
def test_forward_gradients(self):
def test_forward_grad(self):
def expected():
paddle.incubate.autograd.disable_prim()
......@@ -64,7 +64,8 @@ class TestForwardGradients(unittest.TestCase):
with paddle.static.program_guard(mp, sp):
feed, static_xs, static_v = utils.gen_static_data_and_feed(
self.xs, self.v, stop_gradient=False)
_, ys_grad = paddle.autograd.jvp(self.fun, static_xs, static_v)
_, ys_grad = paddle.incubate.autograd.jvp(
self.fun, static_xs, static_v)
exe = paddle.static.Executor()
exe.run(sp)
out = exe.run(mp, feed=feed, fetch_list=ys_grad)
......@@ -80,7 +81,8 @@ class TestForwardGradients(unittest.TestCase):
self.xs, self.v, stop_gradient=False)
ys = self.fun(*static_xs) if isinstance(
static_xs, typing.Sequence) else self.fun(static_xs)
ys_grad = primapi.forward_gradients(ys, static_xs, static_v)
ys_grad = paddle.incubate.autograd.forward_grad(
ys, static_xs, static_v)
paddle.incubate.autograd.prim2orig(mp.block(0))
exe = paddle.static.Executor()
exe.run(sp)
......@@ -106,7 +108,7 @@ class TestForwardGradients(unittest.TestCase):
self.xs, self.v, stop_gradient=False)
ys = self.fun(*static_xs) if isinstance(
static_xs, typing.Sequence) else self.fun(static_xs)
ys_grad = primapi.forward_gradients(ys, static_xs, static_v)
ys_grad = primapi.forward_grad(ys, static_xs, static_v)
paddle.incubate.autograd.prim2orig(mp.block(0))
exe = paddle.static.Executor()
exe.run(sp)
......@@ -116,14 +118,125 @@ class TestForwardGradients(unittest.TestCase):
def test_illegal_param(self):
paddle.incubate.autograd.enable_prim()
with self.assertRaises(TypeError):
primapi.forward_gradients(1, paddle.static.data('inputs',
shape=[1]))
primapi.forward_grad(1, paddle.static.data('inputs', shape=[1]))
with self.assertRaises(TypeError):
primapi.forward_gradients(paddle.static.data('targets', shape=[1]),
1)
primapi.forward_grad(paddle.static.data('targets', shape=[1]), 1)
paddle.incubate.autograd.disable_prim()
class TestGrad(unittest.TestCase):
    """Tests for ``paddle.incubate.autograd.grad`` on static programs.

    ``setUp`` switches to static mode with prim enabled and ``tearDown``
    undoes both, so individual tests do not toggle prim mode themselves
    unless they specifically need it off.
    """

    def setUp(self):
        paddle.enable_static()
        paddle.incubate.autograd.enable_prim()

    def tearDown(self):
        paddle.incubate.autograd.disable_prim()
        paddle.disable_static()

    @staticmethod
    def _run(main, startup, feed, fetch_list):
        """Run ``startup`` then ``main`` on CUDA device 0 if compiled in, else CPU."""
        place = paddle.CPUPlace()
        if paddle.device.is_compiled_with_cuda():
            place = paddle.CUDAPlace(0)
        exe = paddle.static.Executor(place)
        exe.run(startup)
        return exe.run(main, feed=feed, fetch_list=fetch_list)

    def test_third_order(self):
        main = paddle.static.Program()
        startup = paddle.static.Program()
        with paddle.static.program_guard(main, startup):
            x = paddle.static.data(name='x', shape=[1], dtype='float32')
            x2 = paddle.multiply(x, x)
            x3 = paddle.multiply(x2, x)
            x4 = paddle.multiply(x3, x)

            grad1, = paddle.incubate.autograd.grad([x4], [x])
            grad2, = paddle.incubate.autograd.grad([grad1], [x])
            grad3, = paddle.incubate.autograd.grad([grad2], [x])

            paddle.incubate.autograd.prim2orig(main.block(0))

        feed = {x.name: np.array([2.]).astype('float32')}
        fetch_list = [grad3.name]
        # d^3/dx^3 (x^4) = 24 * x, which is 48 at x = 2.
        result = [np.array([48.])]

        outs = self._run(main, startup, feed, fetch_list)
        # Actually assert; a bare np.allclose() call discards its result.
        # Tolerances mirror np.allclose's defaults.
        np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-8)

    def test_fourth_order(self):
        main = paddle.static.Program()
        startup = paddle.static.Program()
        with paddle.static.program_guard(main, startup):
            x = paddle.static.data(name='x', shape=[1], dtype='float32')
            x2 = paddle.multiply(x, x)
            x3 = paddle.multiply(x2, x)
            x4 = paddle.multiply(x3, x)
            x5 = paddle.multiply(x4, x)
            out = paddle.sqrt(x5 + x4)

            grad1, = paddle.incubate.autograd.grad([out], [x])
            grad2, = paddle.incubate.autograd.grad([grad1], [x])
            grad3, = paddle.incubate.autograd.grad([grad2], [x])
            grad4, = paddle.incubate.autograd.grad([grad3], [x])

            paddle.incubate.autograd.prim2orig(main.block(0))

        feed = {
            x.name: np.array([2.]).astype('float32'),
        }
        fetch_list = [grad4.name]
        # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5)
        result = [np.array([-0.27263762711])]

        outs = self._run(main, startup, feed, fetch_list)
        np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-8)

    def test_disable_prim(self):

        def second_order(grad_fn, x: np.ndarray):
            """Fetch d/dx and d2/dx2 of tanh(x) computed with ``grad_fn``."""
            paddle.incubate.autograd.disable_prim()
            main = paddle.static.Program()
            startup = paddle.static.Program()
            with paddle.static.program_guard(main, startup):
                var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype)
                var_x.stop_gradient = False
                y = paddle.tanh(var_x)
                y_grad = grad_fn(y, var_x)
                y_second_grad = grad_fn(y_grad, var_x)
            exe = paddle.static.Executor()
            exe.run(startup)
            return exe.run(main,
                           feed={'x': x},
                           fetch_list=[y_grad, y_second_grad])

        x = np.random.randn(100, 200)
        # With prim disabled, the incubate API is compared against
        # paddle.static.gradients as the reference.
        actual = second_order(paddle.incubate.autograd.grad, x)
        expect = second_order(paddle.static.gradients, x)
        for i, j in zip(actual, expect):
            np.testing.assert_allclose(i, j)
if __name__ == '__main__':
unittest.main()
......@@ -21,7 +21,7 @@ from paddle.incubate.autograd.primops import (neg, set_value, add, sub, mul,
concat, reduce, matmul,
slice_select, slice_assign,
gather, scatter_add, fill_const)
from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients
from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig
from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled
......
......@@ -22,7 +22,7 @@ import contextlib
import collections
import numpy as np
import paddle
from paddle.autograd.utils import as_tensors
from paddle.incubate.autograd.utils import as_tensors
##########################################################
......
......@@ -11,11 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.autograd.functional import Hessian, Jacobian, jvp, vjp
from .functional import Hessian, Jacobian, jvp, vjp
from .primapi import forward_grad, grad
from .primx import prim2orig
from .utils import enable_prim, disable_prim, prim_enabled
from .utils import disable_prim, enable_prim, prim_enabled
__all__ = [ # noqa
'vjp', 'jvp', 'Jacobian', 'Hessian', 'prim2orig', 'enable_prim',
'disable_prim', 'prim_enabled'
'vjp', 'jvp', 'Jacobian', 'Hessian', 'enable_prim', 'disable_prim',
'forward_grad', 'grad'
]
......@@ -17,7 +17,7 @@ import typing
import paddle
from paddle.fluid import framework
from paddle.autograd.utils import as_tensors
from paddle.incubate.autograd import primapi, utils
def vjp(func, xs, v=None):
......@@ -70,6 +70,7 @@ def vjp(func, xs, v=None):
# ``_seprate`` breaks the dependencies between ``xs`` and other
# variables. See more ``_seprate`` .
if paddle.fluid._non_static_mode() or not utils.prim_enabled():
xs, v = _separate(xs), _separate(v)
ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs)
_check_v_shape(v, ys)
......@@ -130,9 +131,14 @@ def jvp(func, xs, v=None):
_check_inputs(func, xs, v)
# ``_seprate`` breaks the dependencies between ``xs`` and other
# variables. See more ``_seprate`` .
if paddle.fluid._non_static_mode() or not utils.prim_enabled():
xs, v = _separate(xs), _separate(v)
ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs)
_check_v_shape(v, xs)
if not paddle.fluid._non_static_mode() and utils.prim_enabled():
return ys, primapi.forward_grad(ys, xs, v)
else:
return ys, _double_backward_trick(ys, xs, v)
......@@ -349,14 +355,13 @@ class _Jacobian(object):
def __init__(self, func, xs):
# Skip separating in prim mode temporarily, as detach and clone are not
# primitive operators.
if not paddle.fluid._non_static_mode(
) and paddle.incubate.autograd.prim_enabled():
if not paddle.fluid._non_static_mode() and utils.prim_enabled():
self._xs = xs
else:
self._xs = _separate(xs)
self._ys = func(*as_tensors(self._xs))
self._flatten_xs = self._flatten(as_tensors(self._xs))
self._flatten_ys = self._flatten(as_tensors(self._ys))
self._ys = func(*utils.as_tensors(self._xs))
self._flatten_xs = self._flatten(utils.as_tensors(self._xs))
self._flatten_ys = self._flatten(utils.as_tensors(self._ys))
self._cache = {}
@property
......@@ -440,32 +445,6 @@ class _JacobianNoBatch(_Jacobian):
))
class _JacobianBatchLast(_Jacobian):
    # The docstring is raw so that ``\to`` stays LaTeX instead of being read
    # as a TAB escape followed by "o".
    r"""Compute Jacobian matrix with batch at last axis.

    Suppose the mapping is :math:`f: R^{M,B} \to R^{N,B}`, the output shape is
    ``(N, M, B)`` .
    """

    def __init__(self, func, xs):
        super(_JacobianBatchLast, self).__init__(func, xs)

    @property
    def shape(self):
        # (rows of flattened ys, rows of flattened xs, trailing axis of xs).
        return (self._flatten_ys.shape[0], self._flatten_xs.shape[0],
                self._flatten_xs.shape[1])

    @property
    def _lazy_axis(self):
        return 0

    def _flatten(self, xs):
        # Collapse every tensor to 2-D, keeping its last axis, and stack the
        # results along axis 0.
        return paddle.concat(
            tuple(x.reshape((-1, x.shape[-1])) for x in as_tensors(xs)), 0)

    def _evaluate(self, row):
        # Gradient of one row of the flattened outputs w.r.t. the inputs.
        return self._flatten(_grad(self._flatten_ys[row, :], self._xs))
class _JacobianBatchFirst(_Jacobian):
"""Compute Jacobian matrix with batch at first axis.
Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is
......@@ -486,7 +465,7 @@ class _JacobianBatchFirst(_Jacobian):
def _flatten(self, xs):
return paddle.concat(
tuple(x.reshape((x.shape[0], -1)) for x in as_tensors(xs)), 1)
tuple(x.reshape((x.shape[0], -1)) for x in utils.as_tensors(xs)), 1)
def _evaluate(self, row_index):
return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs))
......@@ -537,12 +516,6 @@ def _multi_index(indexes, shape):
return tuple(positive_indexes)
def _stack_tensor_or_return_none(origin_list):
    """Stack ``origin_list`` along a new axis 0, or return None.

    The decision is made from the first element only: if it is a
    ``paddle.fluid.framework.Variable`` the whole list is stacked, otherwise
    None is returned. An empty list trips the assertion.
    """
    assert len(origin_list) > 0, "Can't not stack an empty list"
    head = origin_list[0]
    if not isinstance(head, paddle.fluid.framework.Variable):
        return None
    return paddle.stack(origin_list, axis=0)
def _replace_none_with_zero_tensor(xs, refs):
if xs is None:
xs = paddle.zeros_like(refs)
......@@ -594,7 +567,7 @@ def _grad(ys, xs, v=None):
if paddle.fluid._non_static_mode():
xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True)
else:
xs_grad = paddle.static.gradients(ys, xs, v)
xs_grad = paddle.incubate.autograd.grad(ys, xs, v)
if isinstance(xs, paddle.fluid.framework.Variable):
xs_grad = xs_grad[0]
......@@ -690,7 +663,7 @@ def _check_v_shape(v, refs):
if v is None:
return
v, refs = as_tensors(v), as_tensors(refs)
v, refs = utils.as_tensors(v), utils.as_tensors(refs)
if len(refs) != len(v):
raise RuntimeError(f"The argument v is a tuple of invalid length:"
f"should be {len(refs)} but got {len(v)}.")
......@@ -700,663 +673,3 @@ def _check_v_shape(v, refs):
raise RuntimeError(
f"The v[{index}] has invalid shape: should "
f"be {element_ref.shape} but got {element_v.shape}.")
@framework.dygraph_only
def jacobian(func, inputs, create_graph=False, allow_unused=False):
    '''
    .. note::
        **This API is ONLY available in the imperative mode.**

    Compute the Jacobian matrix of ``func`` with respect to ``inputs``.

    Parameters:
        func (function): a Python function that takes a Tensor or a Tensor
            list/tuple as inputs and returns a Tensor or a Tensor tuple.
        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
            Tensor list/tuple of the function ``func``.
        create_graph (bool, optional): whether to create the gradient graphs
            of the computing process. When it is True, higher order derivatives
            are supported to compute; when it is False, the gradient graphs of
            the computing process would be discarded. Defaults to ``False``.
        allow_unused (bool, optional): whether to raise error or return None if
            some Tensors of `inputs` are unreachable in the graph. Error would
            be raised if allow_unused=False, and None would be returned as
            their gradients if allow_unused=True. Default False.

    Returns:
        Jacobian (Tensor or nested tuple of Tensors): if function ``func``
        takes a Tensor as inputs and returns a Tensor as outputs, Jacobian
        will be a single Tensor containing the Jacobian matrix for the
        linearized inputs and outputs. If one of the inputs and outputs is
        a Tensor, and another is a Tensor list/tuple, then the Jacobian will
        be a tuple of Tensors. If both of inputs and outputs are Tensor
        list/tuple, then the Jacobian will be a tuple of tuple of Tensors
        where ``Jacobian[i][j]`` will contain the Jacobian matrix of the
        linearized ``i``th output and ``j``th input and will have same
        dtype and device as the corresponding input. ``Jacobian[i][j]`` will
        have as size ``m * n``, where ``m`` and ``n`` denote the numbers of
        elements of ``i``th output and ``j``th input respectively.

    Examples 1:
        .. code-block:: python

            import paddle

            def func(x):
                return paddle.matmul(x, x)

            x = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            jacobian = paddle.autograd.jacobian(func, x)
            print(jacobian)
            # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #        [[2., 1., 1., 0.],
            #         [1., 2., 0., 1.],
            #         [1., 0., 2., 1.],
            #         [0., 1., 1., 2.]])

    Examples 2:
        .. code-block:: python

            import paddle

            def func(x, y):
                return paddle.matmul(x, y)

            x = paddle.ones(shape=[2, 2], dtype='float32')
            y = paddle.ones(shape=[2, 2], dtype='float32') * 2
            x.stop_gradient = False
            y.stop_gradient = False
            jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True)
            print(jacobian)
            # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
            #        [[2., 2., 0., 0.],
            #         [2., 2., 0., 0.],
            #         [0., 0., 2., 2.],
            #         [0., 0., 2., 2.]]),
            #  Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
            #        [[1., 0., 1., 0.],
            #         [0., 1., 0., 1.],
            #         [1., 0., 1., 0.],
            #         [0., 1., 0., 1.]]))

    Examples 3:
        .. code-block:: python

            import paddle

            def func(x, y):
                return paddle.matmul(x, y), x * x

            x = paddle.ones(shape=[2, 2], dtype='float32')
            y = paddle.ones(shape=[2, 2], dtype='float32') * 2
            x.stop_gradient = False
            y.stop_gradient = False
            jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True)
            print(jacobian)
            # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #         [[2., 2., 0., 0.],
            #          [2., 2., 0., 0.],
            #          [0., 0., 2., 2.],
            #          [0., 0., 2., 2.]]),
            #   Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #         [[1., 0., 1., 0.],
            #          [0., 1., 0., 1.],
            #          [1., 0., 1., 0.],
            #          [0., 1., 0., 1.]])),
            #  (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #         [[2., 0., 0., 0.],
            #          [0., 2., 0., 0.],
            #          [0., 0., 2., 0.],
            #          [0., 0., 0., 2.]]), None))

    '''
    inputs = as_tensors(inputs)
    outputs = as_tensors(func(*inputs))
    fin_size = len(inputs)
    fout_size = len(outputs)

    # Linearize every output so one scalar can be differentiated at a time.
    flat_outputs = tuple(
        paddle.reshape(output, shape=[-1]) for output in outputs)

    per_output = []
    for flat_output in flat_outputs:
        # columns[j] collects, row by row, the gradients w.r.t. input j.
        columns = [[] for _ in range(fin_size)]
        for k in range(len(flat_output)):
            row_k = paddle.grad(flat_output[k],
                                inputs,
                                create_graph=create_graph,
                                retain_graph=True,
                                allow_unused=allow_unused)
            for j, column in enumerate(columns):
                grad_j = row_k[j]
                if isinstance(grad_j, paddle.Tensor):
                    column.append(paddle.reshape(grad_j, shape=[-1]))
                else:
                    column.append(None)
        per_output.append(
            tuple(_stack_tensor_or_return_none(column) for column in columns))
    jacobian = tuple(per_output)

    # Unwrap the nesting for single-input and/or single-output functions.
    single_input = fin_size == 1
    single_output = fout_size == 1
    if single_input and single_output:
        return jacobian[0][0]
    if single_input:
        return tuple(row[0] for row in jacobian)
    if single_output:
        return jacobian[0]
    return jacobian
@framework.dygraph_only
def batch_jacobian(func, inputs, create_graph=False, allow_unused=False):
    '''
    .. note::
        **This API is ONLY available in the imperative mode.**

    Compute the batch Jacobian matrix of ``func`` with respect to ``inputs``.
    Note that the first dimension of inputs is the batch size.

    Parameters:
        func (function): a Python function that takes a Tensor or a Tensor
            list/tuple as inputs(the first dimension is batch size) and
            returns a Tensor or a Tensor tuple.
        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
            Tensor list/tuple of the function ``func``, Noted that
            the first dimension of inputs is batch size.
        create_graph (bool, optional): whether to create the gradient graphs
            of the computing process. When it is True, higher order derivatives
            are supported to compute; when it is False, the gradient graphs of
            the computing process would be discarded. Defaults to ``False``.
        allow_unused (bool, optional): whether to raise error or return None if
            some Tensors of `inputs` are unreachable in the graph. Error would
            be raised if allow_unused=False, and None would be returned as
            their gradients if allow_unused=True. Default False.

    Returns:
        Jacobian (Tensor or nested tuple of Tensors): if function ``func``
        takes a Tensor as inputs and returns a Tensor as outputs, Jacobian
        will be a single Tensor containing the Jacobian matrix for the
        linearized inputs and outputs. If one of the inputs and outputs is
        a Tensor, and another is a Tensor list/tuple, then the Jacobian will
        be a tuple of Tensors. If both of inputs and outputs are Tensor
        list/tuple, then the Jacobian will be a tuple of tuple of Tensors.
        Noted that the first dimension of inputs is batch size.

        For example,
        the inputs shape and outputs shape of function ``func`` is [batch_size, num]
        and [batch_size, num] respectively, then the Jacobian will be a Tensor with
        a shape of [num, batch_size * num], where ``Jacobian[i][j]`` will contain
        the Jacobian matrix of the ``i``th column output and the ``j``th input and
        will have same dtype and device as the corresponding input.
        Other situations can be deduced by analogy.

    Examples 1:
        .. code-block:: python

            import paddle

            x = paddle.ones(shape=(4, 2), dtype='float64')
            weight = paddle.ones(shape=(2, 4), dtype='float64')
            y = paddle.ones(shape=(4, 2), dtype='float64')

            def func(x):
                return paddle.matmul(paddle.matmul(x, weight), y)

            x.stop_gradient = False
            batch_jacobian = paddle.autograd.batch_jacobian(func, x)
            print(batch_jacobian)
            # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #        [[4., 4., 4., 4., 4., 4., 4., 4.],
            #         [4., 4., 4., 4., 4., 4., 4., 4.]])

    Examples 2:
        .. code-block:: python

            import paddle

            x = paddle.ones(shape=(4, 2), dtype='float64')
            weight = paddle.ones(shape=(2, 4), dtype='float64')
            y = paddle.ones(shape=(4, 2), dtype='float64')

            def func(x):
                return paddle.matmul(paddle.matmul(x, weight), y), x * x

            x.stop_gradient = False
            batch_jacobian = paddle.autograd.batch_jacobian(func, x)
            print(batch_jacobian)
            # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #        [[4., 4., 4., 4., 4., 4., 4., 4.],
            #         [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #        [[2., 0., 2., 0., 2., 0., 2., 0.],
            #         [0., 2., 0., 2., 0., 2., 0., 2.]]))

    Examples 3:
        .. code-block:: python

            import paddle

            x = paddle.ones(shape=(4, 2), dtype='float64')
            weight = paddle.ones(shape=(2, 4), dtype='float64')
            y = paddle.ones(shape=(4, 2), dtype='float64')

            def func(x, y):
                return x * y

            x.stop_gradient = False
            y.stop_gradient = False
            batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y])
            print(batch_jacobian)
            # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #        [[1., 0., 1., 0., 1., 0., 1., 0.],
            #         [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #        [[1., 0., 1., 0., 1., 0., 1., 0.],
            #         [0., 1., 0., 1., 0., 1., 0., 1.]]))

    '''
    inputs = as_tensors(inputs)
    outputs = as_tensors(func(*inputs))

    # The batch size is taken from the first input; every input and every
    # output must share it.
    batch_size = inputs[0].shape[0]
    for in_tensor in inputs:
        assert in_tensor.shape[
            0] == batch_size, "The first dimension of input should equals to the same batch size!"
    for out_tensor in outputs:
        assert out_tensor.shape[
            0] == batch_size, "The first dimension of output should equals to the same batch size!"

    fin_size = len(inputs)
    fout_size = len(outputs)

    # Keep the batch axis and linearize the rest of every output.
    flat_outputs = tuple(
        paddle.reshape(output, shape=[batch_size, -1]) for output in outputs)

    per_output = []
    for flat_output in flat_outputs:
        # columns[j] collects, one output column at a time, the gradients
        # w.r.t. input j.
        columns = [[] for _ in range(fin_size)]
        for k in range(flat_output.shape[1]):
            row_k = paddle.grad(flat_output[:, k],
                                inputs,
                                create_graph=create_graph,
                                retain_graph=True,
                                allow_unused=allow_unused)
            for j, column in enumerate(columns):
                grad_j = row_k[j]
                if isinstance(grad_j, paddle.Tensor):
                    column.append(paddle.reshape(grad_j, shape=[-1]))
                else:
                    column.append(None)
        per_output.append(
            tuple(_stack_tensor_or_return_none(column) for column in columns))
    jacobian = tuple(per_output)

    # Unwrap the nesting for single-input and/or single-output functions.
    single_input = fin_size == 1
    single_output = fout_size == 1
    if single_input and single_output:
        return jacobian[0][0]
    if single_input:
        return tuple(row[0] for row in jacobian)
    if single_output:
        return jacobian[0]
    return jacobian
@framework.dygraph_only
def batch_hessian(func, inputs, create_graph=False, allow_unused=False):
    '''
    .. note::
        **This API is ONLY available in the imperative mode.**

    Computes the batched Hessian matrix of ``func`` with respect to
    ``inputs``. The first dimension of every input is the batch size.

    Parameters:
        func (function): a Python function that takes a Tensor or a Tensor
            list/tuple as inputs (the first dimension is the batch size) and
            returns a Tensor with shape [batch_size, 1].
        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
            Tensor list/tuple of the function ``func``. The first dimension
            of each input is the batch size.
        create_graph (bool, optional): whether to create the gradient graphs
            of the computing process. When it is True, higher order
            derivatives are supported to compute; when it is False, the
            gradient graphs of the computing process would be discarded.
            Defaults to ``False``.
        allow_unused (bool, optional): whether to raise error or return None
            if some Tensors of ``inputs`` are unreachable in the graph. Error
            would be raised if allow_unused=False, and None would be returned
            as their gradients if allow_unused=True. Default False.

    Returns:
        Hessian (Tensor or a tuple of tuple of Tensors): if ``func`` takes a
        single Tensor as ``inputs``, the result is a single Tensor holding the
        Hessian matrix for the linearized ``inputs`` Tensor. If ``func`` takes
        a Tensor list/tuple as ``inputs``, the result is a tuple of tuple of
        Tensors.

        The computation is done in two steps: an inner function (a wrapper
        around ``paddle.grad``) produces the first order differentiation of
        ``outputs`` with respect to each input, and ``batch_jacobian`` is then
        applied to that inner function and the original inputs.

        For example, if the inputs shape and outputs shape of ``func`` are
        [batch_size, num] and [batch_size, 1] respectively, the first order
        differentiation has shape [batch_size, num] and the batched Hessian
        is a Tensor with shape [num, batch_size * num]. ``Hessian[i][j]``
        contains the Jacobian matrix of the ``i``-th column of the first
        order differentiation and the ``j``-th input, with the same dtype and
        device as the corresponding input. Other situations can be deduced
        by analogy.

    Examples 1:
        .. code-block:: python

            import paddle
            x = paddle.ones(shape=(4, 2), dtype='float64')
            weight = paddle.ones(shape=(2, 4), dtype='float64')
            y = paddle.ones(shape=(4, 2), dtype='float64')
            def func(x):
                return paddle.matmul(x * x, weight)[:, 0:1]
            x.stop_gradient = False
            batch_hessian = paddle.autograd.batch_hessian(func, x)
            print(batch_hessian)
            # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #        [[2., 0., 2., 0., 2., 0., 2., 0.],
            #         [0., 2., 0., 2., 0., 2., 0., 2.]])

    Examples 2:
        .. code-block:: python

            import paddle
            x = paddle.ones(shape=(4, 2), dtype='float64')
            weight = paddle.ones(shape=(2, 4), dtype='float64')
            y = paddle.ones(shape=(4, 2), dtype='float64')
            def func(x, y):
                return paddle.matmul(x * x * y * y, weight)[:, 0:1]
            x.stop_gradient = False
            y.stop_gradient = False
            batch_hessian = paddle.autograd.batch_hessian(func, [x, y])
            print(batch_hessian)
            # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #          [[2., 0., 2., 0., 2., 0., 2., 0.],
            #           [0., 2., 0., 2., 0., 2., 0., 2.]]),
            #   Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #          [[4., 0., 4., 0., 4., 0., 4., 0.],
            #           [0., 4., 0., 4., 0., 4., 0., 4.]])),
            #  (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #          [[4., 0., 4., 0., 4., 0., 4., 0.],
            #           [0., 4., 0., 4., 0., 4., 0., 4.]]),
            #   Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #          [[2., 0., 2., 0., 2., 0., 2., 0.],
            #           [0., 2., 0., 2., 0., 2., 0., 2.]])))

    Examples 3:
        .. code-block:: python

            import paddle
            x = paddle.ones(shape=(4, 2), dtype='float64')
            weight = paddle.ones(shape=(2, 4), dtype='float64')
            y = paddle.ones(shape=(4, 2), dtype='float64')
            def func(x, y):
                return paddle.matmul(x * x, weight)[:, 0:1]
            x.stop_gradient = False
            y.stop_gradient = False
            batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True)
            print(batch_hessian)
            # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
            #          [[2., 0., 2., 0., 2., 0., 2., 0.],
            #           [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None))
    '''
    inputs = as_tensors(inputs)
    outputs = func(*inputs)

    # Every input must share the leading (batch) dimension of the first one.
    batch_size = inputs[0].shape[0]
    for tensor in inputs:
        assert tensor.shape[
            0] == batch_size, "The first dimension of input should equals to the same batch size!"

    # ``func`` must yield exactly one scalar per batch entry.
    output_error = "The function to compute batched Hessian matrix should return a Tensor of shape [batch_size, 1]"
    assert isinstance(outputs, paddle.Tensor), output_error
    assert outputs.shape == [batch_size, 1], output_error

    def first_order_grads(*ins):
        # create_graph is always True here, independent of the caller's
        # ``create_graph``, because this result is differentiated once more
        # by batch_jacobian below.
        grads = paddle.grad(outputs,
                            ins,
                            create_graph=True,
                            retain_graph=True,
                            allow_unused=allow_unused)
        # NOTE(review): presumably swaps a None gradient (unused input) for a
        # zero tensor matching the input -- confirm against the helper.
        return tuple(
            _replace_none_with_zero_tensor(grads[idx], primal)
            for idx, primal in enumerate(inputs))

    return batch_jacobian(first_order_grads,
                          inputs,
                          create_graph=create_graph,
                          allow_unused=allow_unused)
@framework.dygraph_only
def hessian(func, inputs, create_graph=False, allow_unused=False):
    '''
    .. note::
        **This API is ONLY available in the imperative mode.**

    Computes the Hessian matrix of ``func`` with respect to ``inputs``.

    The implementation wraps ``paddle.grad`` in an inner function that
    returns the first order differentiation of ``func``'s output, and then
    applies ``jacobian`` to that inner function and the original inputs.

    Parameters:
        func (function): a Python function that takes a Tensor or a Tensor
            list/tuple as inputs and returns a Tensor with a single element.
        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
            Tensor list/tuple of the function ``func``.
        create_graph (bool, optional): whether to create the gradient graphs
            of the computing process. When it is True, higher order
            derivatives are supported to compute; when it is False, the
            gradient graphs of the computing process would be discarded.
            Defaults to ``False``.
        allow_unused (bool, optional): whether to raise error or return None
            if some Tensors of ``inputs`` are unreachable in the graph. Error
            would be raised if allow_unused=False, and None would be returned
            as their gradients if allow_unused=True. Default False.

    Returns:
        Hessian (Tensor or a tuple of tuple of Tensors): if ``func`` takes a
        single Tensor as ``inputs``, the result is a single Tensor holding the
        Hessian matrix for the linearized ``inputs`` Tensor. If ``func`` takes
        a Tensor list/tuple as ``inputs``, the result is a tuple of tuple of
        Tensors where ``Hessian[i][j]`` contains the Hessian matrix of the
        ``i``-th input and the ``j``-th input with size ``m * n``. Here ``m``
        and ``n`` denote the number of elements of the ``i``-th input and the
        ``j``-th input respectively.

    Examples 1:
        .. code-block:: python

            import paddle
            def func(x):
                return paddle.sum(paddle.matmul(x, x))
            x = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            hessian = paddle.autograd.hessian(func, x)
            print(hessian)
            # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #        [[2., 1., 1., 0.],
            #         [1., 0., 2., 1.],
            #         [1., 2., 0., 1.],
            #         [0., 1., 1., 2.]])

    Examples 2:
        .. code-block:: python

            import paddle
            def func(x, y):
                return paddle.sum(paddle.matmul(x, y))
            x = paddle.ones(shape=[2, 2], dtype='float32')
            y = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            y.stop_gradient = False
            hessian = paddle.autograd.hessian(func, [x, y])
            print(hessian)
            # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #          [[0., 0., 0., 0.],
            #           [0., 0., 0., 0.],
            #           [0., 0., 0., 0.],
            #           [0., 0., 0., 0.]]),
            #   Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #          [[1., 1., 0., 0.],
            #           [0., 0., 1., 1.],
            #           [1., 1., 0., 0.],
            #           [0., 0., 1., 1.]])),
            #  (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #          [[1., 0., 1., 0.],
            #           [1., 0., 1., 0.],
            #           [0., 1., 0., 1.],
            #           [0., 1., 0., 1.]]),
            #   Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #          [[0., 0., 0., 0.],
            #           [0., 0., 0., 0.],
            #           [0., 0., 0., 0.],
            #           [0., 0., 0., 0.]])))

    Examples 3:
        .. code-block:: python

            import paddle
            def func(x, y):
                return paddle.sum(paddle.matmul(x, x))
            x = paddle.ones(shape=[2, 2], dtype='float32')
            y = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            y.stop_gradient = False
            hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True)
            print(hessian)
            # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #          [[2., 1., 1., 0.],
            #           [1., 0., 2., 1.],
            #           [1., 2., 0., 1.],
            #           [0., 1., 1., 2.]]), None), (None, None))
    '''
    inputs = as_tensors(inputs)
    outputs = func(*inputs)

    # ``func`` must reduce to a one-element Tensor of shape [1].
    output_error = "The function to compute Hessian matrix should return a Tensor with a single element"
    assert isinstance(outputs, paddle.Tensor), output_error
    assert outputs.shape == [1], output_error

    def first_order_grads(*ins):
        # create_graph is always True here, independent of the caller's
        # ``create_graph``, because this result is differentiated once more
        # by jacobian below.
        grads = paddle.grad(outputs,
                            ins,
                            create_graph=True,
                            retain_graph=True,
                            allow_unused=allow_unused)
        # NOTE(review): presumably swaps a None gradient (unused input) for a
        # zero tensor matching the input -- confirm against the helper.
        return tuple(
            _replace_none_with_zero_tensor(grads[idx], primal)
            for idx, primal in enumerate(inputs))

    return jacobian(first_order_grads,
                    inputs,
                    create_graph=create_graph,
                    allow_unused=allow_unused)
def vhp(func, inputs, v=None, create_graph=False, allow_unused=False):
    '''
    .. note::
        **This API is ONLY available in the imperative mode.**

    Computes the product between a vector ``v`` and the Hessian matrix of
    ``func`` with respect to ``inputs``.

    Parameters:
        func (function): a Python function that takes a Tensor or a Tensor
            list/tuple as inputs and returns a Tensor with a single element.
        inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or
            Tensor list/tuple of the function ``func``.
        v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used
            to compute vector hessian product. ``v`` should have same shape
            and dtype with ``inputs``. If ``v`` is None, it will be set as
            Tensor|list(Tensor) with all elements 1. Defaults to "None".
        create_graph (bool, optional): whether to create the gradient graphs
            of the computing process. When it is True, higher order
            derivatives are supported to compute; when it is False, the
            gradient graphs of the computing process would be discarded.
            Defaults to ``False``.
        allow_unused (bool, optional): whether to raise error or return None
            if some Tensors of ``inputs`` are unreachable in the graph. Error
            would be raised if allow_unused=False, and None would be returned
            as their gradients if allow_unused=True. Default False.

    Returns:
        output (tuple): tuple with:
            func_output (Tensor): output of ``func(inputs)``
            vhp (list(Tensor)): result of the vector hessian product
                with the same shape and dtype as the inputs.

    Examples 1:
        .. code-block:: python

            import paddle
            def func(x):
                return paddle.sum(paddle.matmul(x, x))
            x = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            vx = paddle.ones(shape=[2, 2], dtype='float32') * 2
            vhp_rslt = paddle.autograd.vhp(func, x, v=vx)
            print(vhp_rslt)
            # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
            #        [8.]),
            #  Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #        [[8., 8.],
            #         [8., 8.]]))

    Examples 2:
        .. code-block:: python

            import paddle
            def func(x):
                return paddle.sum(paddle.matmul(x, x))
            x = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            vhp_rslt = paddle.autograd.vhp(func, x)
            print(vhp_rslt)
            # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
            #        [8.]),
            #  Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #        [[4., 4.],
            #         [4., 4.]]))

    Examples 3:
        .. code-block:: python

            import paddle
            def func(x, y):
                return paddle.sum(paddle.matmul(x, x))
            x = paddle.ones(shape=[2, 2], dtype='float32')
            x.stop_gradient = False
            y = paddle.ones(shape=[2, 2], dtype='float32')
            y.stop_gradient = False
            vx = paddle.ones(shape=[2, 2], dtype='float32') * 2
            vy = paddle.ones(shape=[2, 2], dtype='float32') * 3
            vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True)
            print(vhp_rslt)
            # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
            #        [8.]),
            #  [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
            #         [[8., 8.],
            #          [8., 8.]]), None])
    '''
    # NOTE(review): ``create_graph`` and ``allow_unused`` are accepted but are
    # not referenced anywhere in this body -- confirm whether they should be
    # forwarded to ``_grad``.
    primals = as_tensors(inputs)
    vectors = None if v is None else as_tensors(v)
    primals = _separate(primals)
    vectors = _separate(vectors)

    outputs = func(*primals)
    ys = as_tensors(outputs)

    # ``func`` must yield exactly one Tensor of shape [1].
    output_error = "The function to compute vhp should return a Tensor with a single element"
    assert len(ys) == 1, output_error
    assert isinstance(ys[0], framework.Variable), output_error
    assert ys[0].shape == [1], output_error

    # Differentiate twice: first the output w.r.t. the inputs, then that
    # result w.r.t. the inputs again, passing the user vector as the third
    # argument of ``_grad``.
    first_order = _grad(ys, primals)
    product = _grad(first_order, primals, vectors)
    return outputs, product
......@@ -14,28 +14,26 @@
import typing
import paddle.autograd.utils as tensor_utils
import paddle.incubate.autograd.utils as prim_utils
from paddle.fluid import framework
from paddle.incubate.autograd import primx
from paddle.fluid import backward, framework
from paddle.incubate.autograd import primx, utils
@framework.static_only
def forward_gradients(targets, inputs, input_gradients=None):
def forward_grad(outputs, inputs, grad_inputs=None):
"""Forward mode of automatic differentiation.
.. note::
**ONLY available in the static mode and primitive operators.**
Args:
targets: The target tensor or tensors
outputs: The output tensor or tensors
inputs: The input tensor or tensors
input_gradients: The gradient Tensor or Tensors of inputs which has
grad_inputs: The gradient Tensor or Tensors of inputs which has
the same shape with inputs, Defaults to None, in this case is
equivalent to all ones .
Returns:
target_gradients (Tensor|Sequence[Tensor]): The gradients for targets.
grad_outputs (Tensor|Sequence[Tensor]): The gradients for outputs.
Examples:
......@@ -53,7 +51,7 @@ def forward_gradients(targets, inputs, input_gradients=None):
with paddle.static.program_guard(main_program, startup_program):
x = paddle.static.data('x', shape=[1], dtype='float32')
y = x * x
y_grad = paddle.incubate.autograd.forward_gradients(y, x)
y_grad = paddle.incubate.autograd.forward_grad(y, x)
paddle.incubate.autograd.prim2orig()
exe = paddle.static.Executor()
......@@ -65,20 +63,20 @@ def forward_gradients(targets, inputs, input_gradients=None):
paddle.incubate.autograd.disable_prim()
paddle.disable_static()
"""
if not prim_utils.prim_enabled():
raise RuntimeError('forward_gradients must be running on primitive'
if not utils.prim_enabled():
raise RuntimeError('forward_grad must be running on primitive'
'operators, use enable_prim to turn it on.')
if not isinstance(targets, (framework.Variable, typing.Sequence)):
raise TypeError(f'Expected targets is Tensor|Sequence[Tesnor], '
f'but got {type(targets)}.')
if not isinstance(outputs, (framework.Variable, typing.Sequence)):
raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], '
f'but got {type(outputs)}.')
if not isinstance(inputs, (framework.Variable, typing.Sequence)):
raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], '
f'but got {type(inputs)}.')
ys, xs, xs_dot = tensor_utils.as_tensors(targets), tensor_utils.as_tensors(
inputs), tensor_utils.as_tensors(input_gradients)
ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors(
inputs), utils.as_tensors(grad_inputs)
block = framework.default_main_program().current_block()
if any(x.block != block for x in xs + ys):
......@@ -90,4 +88,95 @@ def forward_gradients(targets, inputs, input_gradients=None):
ad = primx.Transform(ys[0].block)
_, ys_dot = ad.linearize(xs, ys, xs_dot)
return ys_dot[0] if isinstance(targets, framework.Variable) else ys_dot
return ys_dot[0] if isinstance(outputs, framework.Variable) else ys_dot
@framework.static_only
def grad(outputs, inputs, grad_outputs=None):
    """Reverse mode of automatic differentiation.

    .. note::
        **ONLY available in the static mode and primitive operators**

    When primitive operators are not enabled (``utils.prim_enabled()`` is
    false) the call is delegated unchanged to ``backward.gradients``.

    Args:
        outputs (Tensor|Sequence[Tensor]): The output Tensor or Tensors.
        inputs (Tensor|Sequence[Tensor]): The input Tensor or Tensors.
        grad_outputs (Tensor|Sequence[Tensor]): The gradient Tensor or
            Tensors of outputs which has the same shape with outputs, Defaults
            to None, in this case is equivalent to all ones .

    Returns:
        grad_inputs (Tensor|Tensors): The gradients for inputs. In primitive
        mode a single Tensor is returned when ``inputs`` is a single Tensor,
        otherwise the whole collection of gradients is returned.

    Raises:
        TypeError: In primitive mode, if ``outputs`` or ``inputs`` is neither
            a Tensor nor a sequence.
        RuntimeError: In primitive mode, if a variable in ``inputs`` or
            ``outputs`` is not in the current block of the main program, or
            if the outputs do not depend on the inputs.
        ValueError: In primitive mode, if the op that produced an input's
            tangent cannot be located in the block.

    Examples:

        .. code-block:: python

            import numpy as np
            import paddle

            paddle.enable_static()
            paddle.incubate.autograd.enable_prim()

            startup_program = paddle.static.Program()
            main_program = paddle.static.Program()
            with paddle.static.program_guard(main_program, startup_program):
                x = paddle.static.data('x', shape=[1], dtype='float32')
                x.stop_gradient = False
                y = x * x
                x_grad = paddle.incubate.autograd.grad(y, x)
                paddle.incubate.autograd.prim2orig()

            exe = paddle.static.Executor()
            exe.run(startup_program)
            x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad])
            print(x_grad)
            # [array([4.], dtype=float32)]

            paddle.incubate.autograd.disable_prim()
            paddle.disable_static()
    """
    if not utils.prim_enabled():
        return backward.gradients(outputs, inputs, grad_outputs)

    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
        raise TypeError('Expected outputs is Tensor|Sequence[Tensor], '
                        f'but got {type(outputs)}.')

    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
        raise TypeError('Expected inputs is Tensor|Sequence[Tensor], '
                        f'but got {type(inputs)}.')

    ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors(
        inputs), utils.as_tensors(grad_outputs)
    block = framework.default_main_program().current_block()
    if any((x is not None and x.block != block) for x in xs + ys):
        raise RuntimeError(
            'Variable in inputs and outputs should be None or in current block of main program'
        )

    # TODO(Tongxin) without any prior knowledge about whether the program
    # is completely lowered to primitive ops, it's mandatory to run the lowering
    # pass once and again. This is obviously inefficient and needs to be
    # optimized.
    primx.orig2prim(block)
    ad = primx.Transform(block)
    xs_dot, ys_dot = ad.linearize(xs, ys)
    if any(var is None for var in ys_dot):
        raise RuntimeError(
            'Grads cannot be computed. The given outputs does not depend on inputs'
        )
    ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar)

    # remove xs_dot and their constructor ops
    op_indexes = []
    for var in xs_dot:
        if var is not None:
            op_index = block.ops.index(var.op)
            # NOTE(review): if ``block.ops`` is a plain list, ``index`` raises
            # ValueError by itself on a miss and this guard never fires --
            # confirm before removing it.
            if op_index < 0:
                raise ValueError(
                    f'op_index should be greater than or equal to 0, but op_index={op_index}.'
                )
            op_indexes.append(op_index)

    ad.erase_ops(sorted(op_indexes))
    ad.erase_dots(xs_dot)

    return xs_bar[0] if isinstance(inputs, framework.Variable) else xs_bar
......@@ -14,6 +14,7 @@
import paddle
from paddle.fluid.layer_helper import LayerHelper
from .primreg import REGISTER_FN
......
......@@ -22,7 +22,7 @@ from .primreg import op_position_inputs, op_position_output, lookup_orig2prim, l
from .primrules import _orig2prim, _prim2orig, _jvp, _transpose
from .utils import get_input_var_list, get_output_var_list, flatten, flatten_and_remove_none
from collections import OrderedDict
from paddle.autograd.utils import as_tensors
from paddle.incubate.autograd.utils import as_tensors
def topo_path(xs, ys, block=None):
......@@ -577,47 +577,3 @@ def prim2orig(block=None):
assert block == default_main_program().current_block(
), f'block is neither None nor current block of main program'
_lower(block, reverse=True)
def _gradients(ys, xs, ys_bar=None):
    """Compute gradients of ``ys`` with respect to ``xs`` on primitive ops.

    Intended as a drop-in replacement of paddle.gradients that works on
    primitive ops instead.

    Args:
        ys: the target tensor or tensors
        xs: the input tensor or tensors
        ys_bar: the optional gradient tensors of `ys`

    Returns:
        xs_bar: a list gradients of input `xs`
    """
    ys = as_tensors(ys)
    xs = as_tensors(xs)
    ys_bar = as_tensors(ys_bar)

    # Every non-None variable has to live in the current block of the main
    # program.
    block = default_main_program().current_block()
    for var in xs + ys:
        assert var is None or var.block == block, f'variable in xs and ys should be None or in current block of main program'

    # TODO(Tongxin) without any prior knowledge about whether the program
    # is completely lowered to primitive ops, it's mandatory to run the lowering
    # pass once and again. This is obviously inefficient and needs to be
    # optimized.
    orig2prim(block)
    transform = Transform(block)
    xs_dot, ys_dot = transform.linearize(xs, ys)

    output_is_detached = any(dot is None for dot in ys_dot)
    assert not output_is_detached, f'Gradients cannot be computed. The given output `ys` does not depend on input `xs`.'

    ys_bar, xs_bar = transform.transpose(ys_dot, xs_dot, ys_bar)

    # remove xs_dot and their constructor ops
    constructor_positions = []
    for dot in xs_dot:
        if dot is None:
            continue
        op_index = block.ops.index(dot.op)
        assert op_index >= 0, f'op_index should be greater than or equal to 0, but op_index={op_index}.'
        constructor_positions.append(op_index)

    transform.erase_ops(sorted(constructor_positions))
    transform.erase_dots(xs_dot)

    return xs_bar
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import typing
import paddle
from paddle.fluid import framework as framework
......@@ -170,3 +171,12 @@ def flatten(inp):
def flatten_and_remove_none(inp):
    """Return the items of ``flatten(inp)`` as a list, with ``None`` entries dropped."""
    return [item for item in flatten(inp) if item is not None]
def as_tensors(xs):
    """Normalize ``xs`` into a tuple where possible.

    A single ``framework.Variable`` is wrapped in a one-element tuple, any
    ``typing.Sequence`` is converted to a tuple of its items, and every other
    value (for example ``None``) is returned unchanged.
    """
    if isinstance(xs, framework.Variable):
        return (xs, )
    if isinstance(xs, typing.Sequence):
        return tuple(xs)
    return xs
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册