From dd63e5b46911cff800159176085a9c7b82b5430a Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 11 Jul 2022 20:22:49 +0800 Subject: [PATCH] reorganize the higher order autodiff api (#44119) * move _gradients to primapi and rename to grad * modify jvp to call forward_grad in primitive mode * add primapi unittest and remove some unused test cases. * fix circular import problem * move paddle/autograd/functional into paddle/incubate.autograd/functional * remove unused JacobianBatchLast class --- python/paddle/autograd/__init__.py | 2 - python/paddle/autograd/functional.py | 1362 ----------------- python/paddle/autograd/utils.py | 26 - python/paddle/fluid/backward.py | 6 - .../tests/unittests/autograd/CMakeLists.txt | 2 +- .../test_autograd_functional_dynamic.py | 899 +---------- .../autograd/test_autograd_functional_prim.py | 125 ++ .../test_autograd_functional_static.py | 14 +- ...ients_and_minimize.py => test_minimize.py} | 72 +- .../tests/unittests/autograd/test_primapi.py | 131 +- .../tests/unittests/autograd/test_primops.py | 2 +- .../fluid/tests/unittests/autograd/utils.py | 2 +- python/paddle/incubate/autograd/__init__.py | 9 +- python/paddle/incubate/autograd/functional.py | 675 ++++++++ python/paddle/incubate/autograd/primapi.py | 123 +- python/paddle/incubate/autograd/primops.py | 1 + python/paddle/incubate/autograd/primx.py | 46 +- python/paddle/incubate/autograd/utils.py | 10 + 18 files changed, 1099 insertions(+), 2408 deletions(-) delete mode 100644 python/paddle/autograd/functional.py delete mode 100644 python/paddle/autograd/utils.py rename python/paddle/fluid/tests/unittests/autograd/{test_gradients_and_minimize.py => test_minimize.py} (56%) create mode 100644 python/paddle/incubate/autograd/functional.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 6669e4f4c70..8bc7b113686 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -26,8 +26,6 @@ else: from .py_layer import LegacyPyLayerContext as PyLayerContext # noqa: F401 from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import vjp, jvp, Jacobian, Hessian # noqa: F401 -from .functional import jacobian, hessian, batch_jacobian, batch_hessian, vhp # noqa: F401 __all__ = [ # noqa 'backward', diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py deleted file mode 100644 index aa3e99978b7..00000000000 --- a/python/paddle/autograd/functional.py +++ /dev/null @@ -1,1362 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import typing - -import paddle -from paddle.fluid import framework -from paddle.autograd.utils import as_tensors - - -def vjp(func, xs, v=None): - r"""Computes the Vector-Jacobian product, a functional form of - reverse mode automatic differentiation. 
- - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): A function that takes ``xs`` as inputs parameter and - returns a sequence of Tensors or a Tensor. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate - ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled - in the VJP computation. ``v`` matches the size and shape of - ``func`` 's output. Defaults to None, which is equivalent to all - ones the same size of ``func`` 's output. - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - vjp(Tensor|tuple[Tensor]): The vjp result. - - Examples: - - .. code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, vjp_result = paddle.incubate.autograd.vjp(func, x) - print(vjp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) - print(vjp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - """ - _check_inputs(func, xs, v) - - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . - xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, ys) - - return ys, _grad(ys, xs, v) - - -def jvp(func, xs, v=None): - r""" - Computes the Jacobian-Vector product for a function at the given - inputs and a vector in the tangent space induced by the inputs. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): The ``func`` takes as input a Tensor or a Sequence - of Tensors and returns a Tensor or a Sequence of Tensors. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to - evaluate ``func``. The ``xs`` is accepted as one Tensor or a - Sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled - in the JVP computation. The ``v`` matches the size and shape of - ``xs`` . Default value is None and in this case is equivalent to - all ones the same size of ``xs`` . - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - jvp(Tensor|tuple[Tensor]): The jvp result. - - Examples: - - .. code-block:: python - - import paddle - - - def func(x): - return paddle.matmul(x, x) - - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, jvp_result = paddle.incubate.autograd.jvp(func, x) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - - """ - _check_inputs(func, xs, v) - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . 
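# A minimal sketch of the double backward trick that ``_double_backward_trick``
# below implements, written with plain ``paddle.grad`` calls. It assumes dygraph
# mode and reuses the function and values from the ``jvp`` docstring example
# above; it is an illustrative aside, not part of the patched file.
import paddle

def func(x):
    return paddle.matmul(x, x)

x = paddle.ones(shape=[2, 2], dtype='float32')
x.stop_gradient = False
v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]])
y = func(x)
# First reverse pass w.r.t. a dummy cotangent u; its value does not matter, but
# stop_gradient must be False so the second pass can differentiate through it.
u = paddle.zeros_like(y)
u.stop_gradient = False
x_bar, = paddle.grad(y, x, u, create_graph=True)
# Second reverse pass w.r.t. u recovers J(x) @ v, i.e. the JVP.
jvp_result, = paddle.grad(x_bar, u, v, create_graph=True)
print(jvp_result)  # expected to match the jvp docstring example: [[2., 1.], [1., 0.]]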
- xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, xs) - return ys, _double_backward_trick(ys, xs, v) - - -def _double_backward_trick(ys, xs, v): - """Double backward trick for computing ``jvp`` by ``vjp`` - see details: https://j-towns.github.io/2017/06/12/A-new-trick.html - """ - # The value of ys_grad is not important, it can be any random value in - # theory, but it's required to set stop_gradient=False. - ys_grad = _zeros_like_with_grad(ys) - xs_grad = _grad(ys, xs, ys_grad) - return _grad(xs_grad, ys_grad, v) - - -def _zeros_like_with_grad(xs): - """Create a zero or zeros sequence Tensor like ``xs`` with a flag - ``stop_graident=False`` . - """ - if not isinstance(xs, typing.Sequence): - ys = paddle.zeros_like(xs) - ys.stop_gradient = False - else: - ys = [] - for x in xs: - y = paddle.zeros_like(x) - y.stop_gradient = False - ys.append(y) - return ys - - -class Jacobian(object): - r""" - Computes the Jacobian matrix of a given function. - - If the function has multiple inputs and multiple outputs, during internal - implementation, all input tensors are concatenated after being flatten, - the batch dimension is retained, and the output is subject to the same - processing rules. - - Once the Jacobian ``J`` is constructed, you can use a multidimensional index - to retrieve the submatrix of ``J``, as same as slicing a Tensor. The - submatrix is lazily evaluated along row axis, and will be cached once - evaluated. - - For examples, supposing ``is_batched=True``, you can retrieve the submatrix - by following methods: - - * J[:], retrieving the full matrix. - * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input - variable. - * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output - variable. - * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output - variable and the j'th input variable. - - Notes: - - Eclipsis index is not supported currently. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - - func (Callable): A python function that takes a Tensor or a sequence of - Tensors as inputs(the first dimension is batch size) and - returns a Tensor a sequence of Tensors. - xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Jacobian (Object): A python object retains the Jacobian matrix. - - Examples: - - .. code-block:: python - - import paddle - - - def func(x, y): - return paddle.matmul(x, y) - - - x = paddle.to_tensor([[1., 2.], [3., 4.]]) - J = paddle.incubate.autograd.Jacobian(func, [x, x]) - print(J[:, :]) - # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[1., 3., 0., 0., 1., 0., 2., 0.], - # [2., 4., 0., 0., 0., 1., 0., 2.], - # [0., 0., 1., 3., 3., 0., 4., 0.], - # [0., 0., 2., 4., 0., 3., 0., 4.]]) - - print(J[0, :]) - # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 3., 0., 0., 1., 0., 2., 0.]) - print(J[:, 0]) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 2., 0., 0.]) - - """ - - def __init__(self, func, xs, is_batched=False): - if not is_batched: - self._jacobian = _JacobianNoBatch(func, xs) - else: - self._jacobian = _JacobianBatchFirst(func, xs) - - def __getitem__(self, indexes): - return self._jacobian[indexes] - - @property - def shape(self): - """The shape of flattened Jacobian matrix. 
- """ - return self._jacobian.shape - - -class Hessian(object): - """ - Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . - - If the function has multiple inputs, during internal implementation, - all input tensors are concatenated after being flatten, the batch dimension - is retained. - - The Hessian submatrix is lazily evaluated, and can be retrieved with a - multidimensional indexes. See details ``Jacobian`` . - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func (Callable): A python function that takes a Tensor or a Tensor - sequence as inputs and returns a Tensor with shape - ``[batch_size, 1]`` with batch or ``[1]`` without batch. - xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of - the function ``func``. - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Hessian (Object): A python object retains the Hessian matrix. - - - Examples: - - .. code-block:: python - - import paddle - - - def reducer(x): - return paddle.sum(x * x) - - - x = paddle.rand([2, 2]) - h = paddle.incubate.autograd.Hessian(reducer, x) - print(h[:]) - # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]) - """ - - def __init__(self, func, xs, is_batched=False): - - def _jac_func(*xs): - jac = Jacobian(func, xs, is_batched=is_batched) - if (is_batched and jac.shape[1] != 1) or (not is_batched - and jac.shape[0] != 1): - raise RuntimeError( - "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." - ) - return jac[:, 0, :] if is_batched else jac[0, :] - - self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) - - def __getitem__(self, indexes): - return self.symbolic[indexes] - - @property - def shape(self): - """The shape of flattened Hessian matrix. - """ - return self.symbolic.shape - - -class _Jacobian(object): - """The base class for computing Jacobian matrix. - - ``_Jacobian`` implementes the core logic of multidimensional index and lazy - evaluation for Jacobian matrix, subclass only need to overwrite following - methods: - - * ``_lazy_axis()``, return the axis along which will be lazy - evaluating. - * ``_flatten(xs)``, flattens the inputs ``xs``. - * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . - - Notes: - - Because currently PaddlePaddle only support reverse differentiation by - ``paddle.grad``, so lazy evaluation is only supported along the row of - Jacobian matrix, which means that slicing along row will get better - performance. - - """ - - def __init__(self, func, xs): - # Skip separating in prim mode temporarily, as detach and clone are not - # primitive operators. 
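# An illustrative sketch of the lazy evaluation described in the class docstring
# above: indexing a single row of a ``Jacobian`` evaluates (and caches) only that
# row, while ``J[:, :]`` materialises the full matrix. Values follow the public
# ``Jacobian`` docstring example; assumes a dygraph-mode paddle install and is
# not part of the patched file.
import paddle

def func(x, y):
    return paddle.matmul(x, y)

x = paddle.to_tensor([[1., 2.], [3., 4.]])
J = paddle.incubate.autograd.Jacobian(func, [x, x])
row0 = J[0, :]     # triggers one reverse-mode gradient call for row 0 only
print(row0)        # expected: [1., 3., 0., 0., 1., 0., 2., 0.]
full = J[:, :]     # evaluates the remaining rows; row 0 is reused from the cache
print(full.shape)  # expected: [4, 8]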
- if not paddle.fluid._non_static_mode( - ) and paddle.incubate.autograd.prim_enabled(): - self._xs = xs - else: - self._xs = _separate(xs) - self._ys = func(*as_tensors(self._xs)) - self._flatten_xs = self._flatten(as_tensors(self._xs)) - self._flatten_ys = self._flatten(as_tensors(self._ys)) - self._cache = {} - - @property - def shape(self): - raise NotImplementedError - - @property - def _lazy_axis(self): - """"The axis of lazily evaluated.""" - raise NotImplementedError - - def _lazy_indexes(self, indexes): - idx = indexes[self._lazy_axis] - return (idx, ) if isinstance(idx, int) else tuple( - range(idx.start, idx.stop, idx.step)) - - def _flatten(self, xs): - raise NotImplementedError - - def _shifted_indexes(self, indexes, lazy_axis_size=0): - idx = indexes[self._lazy_axis] - shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( - 0, lazy_axis_size, 1) - return indexes[:self._lazy_axis] + ( - shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] - - def __getitem__(self, indexes): - indexes = _multi_index(indexes, self.shape) - - if isinstance(indexes[self._lazy_axis], int): - other_indexes = indexes[:self._lazy_axis] + \ - indexes[self._lazy_axis+1:] - return self._cached_evaluate( - indexes[self._lazy_axis])[other_indexes] - lazy_indexes = self._lazy_indexes(indexes) - # Using concat and reshape to replace stack operator temporarily, as - # it is not a primitive operator. - shape = list(self.shape) - shape[self._lazy_axis] = len(lazy_indexes) - part_jac = paddle.concat( - [self._cached_evaluate(i) for i in lazy_indexes], - axis=self._lazy_axis).reshape(shape) - return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] - - def _cached_evaluate(self, k): - v = self._cache.get(k) - if v is None: - v = self._evaluate(k) - self._cache[k] = v - return v - - def _evaluate(self, index): - """Evaluate one slice at along lazy axis.""" - raise NotImplementedError - - -class _JacobianNoBatch(_Jacobian): - """Compute Jacobian matrix without batch dimension. - Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is - ``(N, M)`` . - """ - - def __init__(self, func, xs): - super(_JacobianNoBatch, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) - - def _evaluate(self, row_index): - return self._flatten(_grad( - self._flatten_ys[row_index], - self._xs, - )) - - -class _JacobianBatchLast(_Jacobian): - """Compute Jacobian matrix with batch at last axis. - Suppose the mapping is :math:`f: R^{M,B} \to R^{N,B}`, the output shape is - ``(N, M, B)`` . - """ - - def __init__(self, func, xs): - super(_JacobianBatchLast, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((-1, x.shape[-1])) for x in as_tensors(xs)), 0) - - def _evaluate(self, row): - return self._flatten(_grad(self._flatten_ys[row, :], self._xs)) - - -class _JacobianBatchFirst(_Jacobian): - """Compute Jacobian matrix with batch at first axis. - Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is - ``(B, N, M)`` . 
- """ - - def __init__(self, func, xs): - super(_JacobianBatchFirst, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 1 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((x.shape[0], -1)) for x in as_tensors(xs)), 1) - - def _evaluate(self, row_index): - return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) - - -def _multi_index(indexes, shape): - """A tool for parsing N-dimensional index into a standard format. - - Currently supporting following input format: - * ([positive|negative|slice], ...), the right-most elements can be - omited. - - The standard format after converted is slice tuple which contains N elements: - * ([positive|slice], ..., [positive|slice]) - - Notes: - Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. - - Args: - indexes (tuple): The input indexes. - shape (tuple): The input shape. - - Returns: - tuple: The standard format index as the above description. - """ - indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) - if any(isinstance(i, type(Ellipsis)) for i in indexes): - raise IndexError('Ellipsis index currently is not supported.') - # Fill the right-most elements. - indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) - # Convert to positive index. - positive_indexes = [] - for i, index in enumerate(indexes): - if isinstance(index, slice): - index = slice(index.start or 0, index.stop or shape[i], index.step - or 1) - positive_indexes.append( - slice( - index.start + shape[i] if index.start < 0 else index.start, - index.stop + shape[i] if index.stop < 0 else index.stop, - # Negative step means index backward, no need to convert to - # positive interger. - index.step)) - elif isinstance(index, int): - positive_indexes.append(index + shape[i] if index < 0 else index) - else: - raise TypeError(f'Not supported index type {index}.') - return tuple(positive_indexes) - - -def _stack_tensor_or_return_none(origin_list): - assert len(origin_list) > 0, "Can't not stack an empty list" - return paddle.stack(origin_list, axis=0) if isinstance( - origin_list[0], paddle.fluid.framework.Variable) else None - - -def _replace_none_with_zero_tensor(xs, refs): - if xs is None: - xs = paddle.zeros_like(refs) - xs.stop_gradient = refs.stop_gradient - return xs - elif isinstance(xs, typing.Sequence): - return tuple( - _replace_none_with_zero_tensor(x, refs[i]) - for i, x in enumerate(xs)) - else: - return xs - - -def _grad(ys, xs, v=None): - """A gradient function that can be used in dynamic graph and static graph. - - The ``grad`` combines ``paddle.grad`` used in dynamic graph and - ``paddle.static.gradients`` used in static graph, and do following changes: - - * The ``allow_unused`` flag is removed and set defaults to true internally, - none in outputs will be replaced by zero tensor. - * The ``create_graph`` flag is removed and set defaults to true internally, - only makes sense in dynamic graph. - * When xs is a single Tensor, ``paddle.grad`` returns a list which only - contains one Tensor. It may confuse users, thus in this case we improve - to return a single Tensor in _grad interface. - - Args: - ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of - the graph to compute gradients. - xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to - compute gradients. 
The returned values of this API are the - gradients of inputs . - v (Tensor|Sequence[Tensor]|None,optional): The initial gradient values - of outputs . If grad_outputs is None, the initial gradient values of - outputs would be Tensors filled with 1; if grad_outputs is not None, - it must have the same length as outputs , and in this case, the - initial gradient value of the i-th outputs would be: (1) a Tensor - filled with 1 when the i-th element of grad_outputs is None; - (2) the i-th element of grad_outputs when the i-th element of - grad_outputs is a Tensor. Default None. - - Returns: - Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the - same as the Tensor number inside inputs, and the i-th returned - Tensor is the sum of gradients of outputs with respect to the i-th - inputs. - """ - if paddle.fluid._non_static_mode(): - xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) - else: - xs_grad = paddle.static.gradients(ys, xs, v) - - if isinstance(xs, paddle.fluid.framework.Variable): - xs_grad = xs_grad[0] - - return _replace_none_with_zero_tensor(xs_grad, xs) - - -def _separate(xs): - """ - ``_separate`` separates ``xs`` from the computation graph through ``clone`` - or ``deteach`` . - - Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on - computional graph, which will reduce gradients along all path from ys to xs. - - However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and - only compute gradients with a given ``func`` . - - For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: - ``x0 -> y0``, ``x0 -> x1 -> y0`` . - ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and - ``y0->x1->x0``, and ``vjp`` only need reduce along ``y0->x0``. - - So, it's needed to clone or detach xs for breaking the dependencies with - other variables. - - Examples: - - .. code-block:: python - - import paddle - from paddle.autograd.functional import _separate - - - def func(x, y): - return x * y - - - x = paddle.ones((1,)) - x.stop_gradient = False - - y = func(x, x) - print(paddle.grad(y, x)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [2.])] - - x1, x2 = _separate((x, x)) - y = func(x1, x2) - print(paddle.grad(y, x1)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [1.])] - - """ - if isinstance(xs, typing.Sequence): - return tuple(_single_separate(x) for x in xs) - else: - return _single_separate(xs) - - -def _single_separate(x): - if x is None: # x maybe none because grad input's v defaults to none. - return x - if not x.stop_gradient: - return paddle.clone(x) - else: # use detach to share memory when no need gradients. 
- x = x.detach() - x.stop_gradient = False - return x - return x - - -def _check_inputs(func, xs, v=None): - if not callable(func): - raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.") - - if not isinstance(xs, (framework.Variable, typing.Sequence)): - raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor]," - f"but got {type(xs)}.") - if isinstance(xs, typing.Sequence) and not all( - isinstance(x, framework.Variable) for x in xs): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): - raise TypeError( - f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.") - - if isinstance(v, typing.Sequence) and not all( - isinstance(e, framework.Variable) for e in v): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - -def _check_v_shape(v, refs): - if v is None: - return - - v, refs = as_tensors(v), as_tensors(refs) - if len(refs) != len(v): - raise RuntimeError(f"The argument v is a tuple of invalid length:" - f"should be {len(refs)} but got {len(v)}.") - - for index, (element_v, element_ref) in enumerate(zip(v, refs)): - if element_v.shape != element_ref.shape: - raise RuntimeError( - f"The v[{index}] has invalid shape: should " - f"be {element_ref.shape} but got {element_v.shape}.") - - -@framework.dygraph_only -def jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Jacobian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor or a Tensor tuple. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors - where ``Jacobian[i][j]`` will contain the Jacobian matrix of the - linearized ``i``th output and ``j``th input and will have same - dtype and device as the corresponding input. ``Jacobian[i][j]`` will - have as size ``m * n``, where ``m`` and ``n`` denote the numbers of - elements of ``i``th output and ``j``th input respectively. - - - Examples 1: - .. 
code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, x) - print(jacobian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 2., 0., 1.], - # [1., 0., 2., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True) - print(jacobian) - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])) - - Examples 3: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y), x * x - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True) - print(jacobian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]), None)) - - ''' - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[-1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(len(flat_output)): - row_k = paddle.grad(flat_output[k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Jacobian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor or a Tensor tuple. 
- inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``, Noted that - the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors. - Noted that the first dimension of inputs is batch size. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, num] respectively, then the Jacobian will be a Tensor with - a shape of [num, batch_size * num], where ``Jacobian[i][j]`` will contain - the Jacobian matrix of the ``i``th column output and the ``j``th input and - will have same dtype and device as the corresponding input. - Other situations can be deduced by analogy. - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y) - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y), x * x - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]])) - - Examples 3: - .. 
code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return x * y - - x.stop_gradient = False - y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y]) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]])) - - ''' - - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - for output in outputs: - assert output.shape[ - 0] == batch_size, "The first dimension of output should equals to the same batch size!" - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[batch_size, -1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(flat_output.shape[1]): - - row_k = paddle.grad(flat_output[:, k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Hessian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor with shape [batch_size, 1]. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - Noted that the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors. 
Noted that the first dimension of inputs - is batch size and the execution step is to obtain the result of the - first order differentiation, and then differentiate the batch input. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, 1] respectively, then the batched Hessian will be a Tensor with - a shape of [num, batch_size * num]. - - Why the final shape in this case is that? - because batch_hessian will create a inner func(the wrapper of paddle.grad() func) - to computes the sum of gradients of `outputs` with respect to each `inputs`, - this inner func will get the first order differentiation and shape is [batch_size, num], - then call batch_jacobian to compute jacobian between the first order differentiation - and the origin inputs. The final result ``Hessian[i][j]`` will contain the Jacobian - matrix of the ``i``th column output(Noted that this output means the first order - differentiation) and the ``j``th input and will have same dtype and device as the - corresponding input. Other situations can be deduced by analogy. - - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(x * x, weight)[:, 0:1] - - - x.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, x) - print(batch_hessian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x * y * y, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y]) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]])), - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]))) - - - Examples 3: - .. 
code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - batch_size, 1 - ], "The function to compute batched Hessian matrix should return a Tensor of shape [batch_size, 1]" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return batch_jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -@framework.dygraph_only -def hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the - Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. - Here ``m`` and ``n`` denote the number of elements of the ``i`` th input - and the ``j`` th input respectively. - - Examples 1: - .. code-block:: python - - import paddle - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - hessian = paddle.autograd.hessian(func, x) - print(hessian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. 
code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y]) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 1., 0., 0.], - # [0., 0., 1., 1.], - # [1., 1., 0., 0.], - # [0., 0., 1., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [0., 1., 0., 1.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]))) - - Examples 3: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - 1 - ], "The function to compute Hessian matrix should return a Tensor with a single element" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the product between a vector ``v`` and the - Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used - to compute vector hessian product. ``v`` should have same shape - and dtype with ``inputs``. If ``v`` is None, it will be set as - Tensor|list(Tensor) with all elements 1. Defaults to "None". - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. 
- Returns: - output (tuple): tuple with: - func_output (Tensor): output of ``func(inputs)`` - vhp (list(Tensor)): result of the vector hessian product - with the same shape and dtype as the inputs. - Examples 1: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vhp_rslt = paddle.autograd.vhp(func, x, v=vx) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]])) - - Examples 2: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vhp_rslt = paddle.autograd.vhp(func, x) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4.], - # [4., 4.]])) - - Examples 3: - .. code-block:: python - import paddle - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y = paddle.ones(shape=[2, 2], dtype='float32') - y.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vy = paddle.ones(shape=[2, 2], dtype='float32') * 3 - vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]]), None]) - ''' - xs = as_tensors(inputs) - if v is not None: - v = as_tensors(v) - xs, v = _separate(xs), _separate(v) - outputs = func(*xs) - ys = as_tensors(outputs) - assert len(ys) == 1 and isinstance( - ys[0], framework.Variable - ) and ys[0].shape == [ - 1 - ], "The function to compute vhp should return a Tensor with a single element" - jac = _grad(ys, xs) - vhp = _grad(jac, xs, v) - return outputs, vhp diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py deleted file mode 100644 index 6b8865f4d7d..00000000000 --- a/python/paddle/autograd/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import typing - -from paddle.fluid import framework - - -def as_tensors(xs): - if isinstance(xs, framework.Variable): - return (xs, ) - elif isinstance(xs, typing.Sequence): - return tuple(xs) - else: - return xs diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index c37ac87da71..5ed01a01144 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -2211,12 +2211,6 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): check_type(target_gradients, 'target_gradients', (framework.Variable, list, tuple, type(None)), 'paddle.static.gradients') - - from ..incubate.autograd.primx import _gradients - from ..incubate.autograd.utils import prim_enabled - if prim_enabled(): - return _gradients(targets, inputs, target_gradients) - outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 832ecc61ee1..45c0a08efe8 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -17,7 +17,7 @@ endforeach() set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) -set_tests_properties(test_gradients_and_minimize PROPERTIES TIMEOUT 60) +set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) if(NOT WIN32) set_tests_properties(test_autograd_functional_prim PROPERTIES TIMEOUT 60) endif() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index a98b509f963..6c67b78d6a5 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.compat as cpt import paddle.nn.functional as F -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, _in_eager_without_dygraph_check import config @@ -78,9 +78,9 @@ class TestAutogradFunctional(unittest.TestCase): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, inputs_grad = paddle.autograd.vjp(func, xs, v) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs, v) else: - outputs, inputs_grad = paddle.autograd.vjp(func, xs) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs) return outputs, inputs_grad def grad_test(): @@ -116,14 +116,14 @@ class TestAutogradFunctional(unittest.TestCase): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, v, create_graph=create_graph, allow_unused=allow_unused) else: - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, create_graph=create_graph, @@ -233,8 +233,8 @@ class TestVJPException(unittest.TestCase): def func_vjp(self): with self.assertRaises(self.expected_exception): - paddle.autograd.vjp(self.fun, paddle.to_tensor(self.xs), - paddle.to_tensor(self.v)) + paddle.incubate.autograd.vjp(self.fun, paddle.to_tensor(self.xs), + 
paddle.to_tensor(self.v)) def test_all_cases(self): with _test_eager_guard(): @@ -243,8 +243,10 @@ class TestVJPException(unittest.TestCase): def jac(grad_fn, f, inputs): - assert grad_fn in [paddle.autograd.vjp, paddle.autograd.jvp] - if grad_fn is paddle.autograd.jvp: + assert grad_fn in [ + paddle.incubate.autograd.vjp, paddle.incubate.autograd.jvp + ] + if grad_fn is paddle.incubate.autograd.jvp: vs = [paddle.zeros_like(x) for x in inputs] else: outputs = f(*inputs) @@ -265,7 +267,7 @@ def jac(grad_fn, f, inputs): JJ_cols.append(d_outs) # JJ is the fully unrolled jacobian JJ = paddle.stack(JJ_cols) - if grad_fn is paddle.autograd.vjp: + if grad_fn is paddle.incubate.autograd.vjp: JJ = JJ.t() return JJ @@ -279,8 +281,8 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o1(self): @@ -289,8 +291,8 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2(self): @@ -299,8 +301,8 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2_omitting_v(self): @@ -309,9 +311,9 @@ class TestJVP(TestAutogradFunctional): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - results_omitting_v = paddle.autograd.jvp(f, inputs) + results_omitting_v = paddle.incubate.autograd.jvp(f, inputs) v = [paddle.ones_like(x) for x in inputs] - results_with_v = paddle.autograd.jvp(f, inputs, v) + results_with_v = paddle.incubate.autograd.jvp(f, inputs, v) self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): @@ -334,7 +336,7 @@ class TestJVP(TestAutogradFunctional): ('multi_in_single_out', paddle.matmul, (np.random.rand(2, 2), np.random.rand(2, 2))), )) -class TestJacobianClassNoBatch(unittest.TestCase): +class TestJacobianNoBatch(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -349,7 +351,7 @@ class TestJacobianClassNoBatch(unittest.TestCase): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, False) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, False) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -387,7 +389,7 @@ class TestJacobianClassNoBatch(unittest.TestCase): ('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)), ('multi_in_single_out', utils.square, np.random.rand(2, 3)), )) -class TestJacobianClassBatchFirst(unittest.TestCase): +class TestJacobianBatchFirst(unittest.TestCase): def 
setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -402,7 +404,7 @@ class TestJacobianClassBatchFirst(unittest.TestCase): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, True) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, True) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -444,7 +446,7 @@ class TestJacobianClassBatchFirst(unittest.TestCase): self.func_jacobian() -class TestHessianClassNoBatch(unittest.TestCase): +class TestHessianNoBatch(unittest.TestCase): @classmethod def setUpClass(self): @@ -470,7 +472,7 @@ class TestHessianClassNoBatch(unittest.TestCase): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -484,7 +486,7 @@ class TestHessianClassNoBatch(unittest.TestCase): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, rtol=self.rtol, @@ -500,7 +502,7 @@ class TestHessianClassNoBatch(unittest.TestCase): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -514,7 +516,7 @@ class TestHessianClassNoBatch(unittest.TestCase): func, self.x, self.numerical_delta, self.np_dtype) numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) assert hessian[:].stop_gradient == False np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -526,7 +528,7 @@ class TestHessianClassNoBatch(unittest.TestCase): return x * x with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones([3])) + paddle.incubate.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): with _test_eager_guard(): @@ -544,7 +546,7 @@ class TestHessianClassNoBatch(unittest.TestCase): self.func_out_not_single() -class TestHessianClassBatchFirst(unittest.TestCase): +class TestHessianBatchFirst(unittest.TestCase): @classmethod def setUpClass(self): @@ -572,7 +574,7 @@ class TestHessianClassBatchFirst(unittest.TestCase): expected = utils._compute_numerical_batch_hessian( func, self.x, self.numerical_delta, self.np_dtype) - H = paddle.autograd.Hessian(func, self.x, is_batched=True) + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -596,7 +598,8 @@ class TestHessianClassBatchFirst(unittest.TestCase): self.x.stop_gradient = False self.y.stop_gradient = False - H = paddle.autograd.Hessian(func, [self.x, self.y], is_batched=True) + H = 
paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -620,8 +623,8 @@ class TestHessianClassBatchFirst(unittest.TestCase): utils.MatrixFormat.NBM, utils.MatrixFormat.BNM) - actual = paddle.autograd.Hessian(func, [self.x, self.y], - is_batched=True)[:] + actual = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True)[:] np.testing.assert_allclose(actual, expected, @@ -638,7 +641,7 @@ class TestHessianClassBatchFirst(unittest.TestCase): x = self.x.clone() x.stop_gradient = True - H = paddle.autograd.Hessian(func, self.x, is_batched=True)[:] + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True)[:] actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -652,7 +655,9 @@ class TestHessianClassBatchFirst(unittest.TestCase): return (x * x) with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True) + paddle.incubate.autograd.Hessian(func, + paddle.ones((3, 3)), + is_batched=True) def test_all_cases(self): with _test_eager_guard(): @@ -670,829 +675,5 @@ class TestHessianClassBatchFirst(unittest.TestCase): self.func_out_not_single() -class TestHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], self.rtol, - self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y], - allow_unused=True) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - 
np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestHessianFloat64(TestHessian): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestBatchHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - np.testing.assert_allclose(hessian, numerical_hessian, self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return 
paddle.matmul(x * x * y * y, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - - shape_tensor = paddle.to_tensor(numerical_hessian).astype("float64") - hessian_reshape = np.reshape(hessian, (shape_tensor.shape)) - np.testing.assert_allclose(hessian_reshape, numerical_hessian, - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y], - allow_unused=True) - - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - numerical_hessian = np.stack( - (numerical_hessian[i][j], numerical_hessian[i][j + 1]), - axis=0) - np.testing.assert_allclose(hessian[i][j], numerical_hessian, - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestBatchHessianFloat64(TestBatchHessian): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = 
config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - -class TestVHP(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_v_default(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype) - vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype) - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], [vx, vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def 
func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - assert vhp[0].stop_gradient == False - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - triple_grad = paddle.grad(vhp, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - - -class TestJacobian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input_and_single_output(self): - - def func(x): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - np.testing.assert_allclose(jacobian.numpy(), numerical_jacobian[0][0], - self.rtol, self.atol) - - def func_single_input_and_multi_output(self): - - def func(x): - return paddle.matmul(x, x), x * x - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - for i in range(len(jacobian)): - np.testing.assert_allclose(jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_multi_input_and_single_output(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_multi_input_and_multi_output(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x, y): - return paddle.matmul(x, y), x * y - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for i in range(len(jacobian)): - for j in range(len(jacobian[0])): - np.testing.assert_allclose(jacobian[i][j].numpy(), - numerical_jacobian[i][j], self.rtol, - self.atol) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x, x) - - try: - self.x.stop_gradient = False - 
self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - allow_unused=True) - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianFloat64(TestJacobian): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = 1e-7 - self.rtol = 1e-7 - self.atol = 1e-7 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestJacobianBatch(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (4, 2) - self.weight_shape = (2, 4) - self.y_shape = (4, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, 
dtype=self.dtype) - - def func_batch_single_input_and_batch_single_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y) - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - self.assertTrue( - np.allclose(batch_jacobian.numpy().all(), - numerical_jacobian[0][0].all())) - - def func_batch_single_input_and_batch_multi_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y), x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_single_output(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for j in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_multi_output(self): - - def func(x, y): - return x * y, x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i], numerical_jacobian[i], - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return x * x - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - allow_unused=True) - - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def 
func_create_graph_true(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianBatchFloat64(TestJacobianBatch): - - @classmethod - def setUpClass(self): - self.x_shape = (12, 2) - self.weight_shape = (2, 12) - self.y_shape = (12, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('eps') - self.rtol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('rtol') - self.atol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('atol') - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py index f75460df6b5..d17420c9045 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py @@ -145,5 +145,130 @@ class TestHessianPrim(unittest.TestCase): atol=self._atol) +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestJvpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def 
test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, jvp_res = paddle.incubate.autograd.jvp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + jvp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[jvp_res]) + return jvp_res + + paddle.incubate.autograd.enable_prim() + prim_jvp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_jvp = wrapper(self.fun, self.args) + + np.testing.assert_allclose(orig_jvp, + prim_jvp, + rtol=self._rtol, + atol=self._atol) + + +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestVjpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, vjp_res = paddle.incubate.autograd.vjp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + vjp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[vjp_res]) + return vjp_res + + paddle.incubate.autograd.enable_prim() + prim_vjp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_vjp = wrapper(self.fun, self.args) + + for orig, prim in zip(orig_vjp, prim_vjp): + np.testing.assert_allclose(orig, + prim, + rtol=self._rtol, + atol=self._atol) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py index 4e01ad5382c..9b2098d37b8 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py @@ -59,7 +59,8 @@ class TestVJP(unittest.TestCase): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=self.stop_gradient) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + 
static_v) exe.run(sp) return exe.run(mp, feed=feed, fetch_list=[ys, xs_grads]) @@ -103,7 +104,8 @@ class TestVJPException(unittest.TestCase): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) self.exe.run(sp) return self.exe.run(mp, feed, fetch_list=[ys, xs_grads]) @@ -214,7 +216,7 @@ class TestJacobianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: _, nrow, ncol = JJ.shape else: @@ -244,7 +246,7 @@ class TestJacobianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape rows = [JJ[:, i, :] for i in range(nrow)] @@ -269,7 +271,7 @@ class TestJacobianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape entries = [ @@ -390,7 +392,7 @@ class TestHessianFloat32(unittest.TestCase): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - HH = paddle.autograd.functional.Hessian(pd_f, xs, is_batched=batch) + HH = paddle.incubate.autograd.Hessian(pd_f, xs, is_batched=batch) nrow, ncol = HH.shape full_hessian = HH[:] exe = fluid.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py similarity index 56% rename from python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py rename to python/paddle/fluid/tests/unittests/autograd/test_minimize.py index 67ebe01d9f0..10259802c69 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py @@ -13,82 +13,16 @@ # limitations under the License. 
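# A minimal static-graph sketch of the relocated Jacobian class exercised by
# the tests above; the function, shapes and feed values here are illustrative
# only and are not taken from the test suite.
import numpy as np
import paddle

paddle.enable_static()
main, startup = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data('x', shape=[2, 2], dtype='float32')
    x.stop_gradient = False
    J = paddle.incubate.autograd.Jacobian(paddle.tanh, x, is_batched=False)
    full_jacobian = J[:]  # rows are evaluated lazily and cached
exe = paddle.static.Executor()
exe.run(startup)
jac, = exe.run(main, feed={'x': np.ones((2, 2), dtype='float32')},
               fetch_list=[full_jacobian])  # ndarray of shape (4, 4)
paddle.disable_static()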
import unittest -import numpy as np +import numpy as np import paddle from paddle.incubate.autograd.primx import prim2orig -from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled +from paddle.incubate.autograd.utils import (disable_prim, enable_prim, + prim_enabled) paddle.enable_static() -class TestGradients(unittest.TestCase): - - def test_third_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - - grad1, = paddle.static.gradients([x4], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - - prim2orig(main.block(0)) - - feed = {x.name: np.array([2.]).astype('float32')} - fetch_list = [grad3.name] - result = [np.array([48.])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - def test_fourth_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - x5 = paddle.multiply(x4, x) - out = paddle.sqrt(x5 + x4) - - grad1, = paddle.static.gradients([out], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - grad4, = paddle.static.gradients([grad3], [x]) - - prim2orig(main.block(0)) - - feed = { - x.name: np.array([2.]).astype('float32'), - } - fetch_list = [grad4.name] - # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) - result = [np.array([-0.27263762711])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - class TestMinimize(unittest.TestCase): def model(self, x, w, bias, opt): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index 0137f4103fb..dc52c5bc33b 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -37,7 +37,7 @@ import utils ('input_gradients_not_none', paddle.matmul, (np.random.rand(3, 3), np.random.rand(3, 3)), (np.random.rand(3, 3), np.random.rand(3, 3)), 'float64'))) -class TestForwardGradients(unittest.TestCase): +class TestForwardGrad(unittest.TestCase): @classmethod def setUpClass(cls): @@ -55,7 +55,7 @@ class TestForwardGradients(unittest.TestCase): paddle.incubate.autograd.disable_prim() paddle.disable_static() - def test_forward_gradients(self): + def test_forward_grad(self): def expected(): paddle.incubate.autograd.disable_prim() @@ -64,7 +64,8 @@ class TestForwardGradients(unittest.TestCase): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=False) - _, ys_grad = paddle.autograd.jvp(self.fun, static_xs, static_v) + _, ys_grad = paddle.incubate.autograd.jvp( + self.fun, static_xs, 
static_v) exe = paddle.static.Executor() exe.run(sp) out = exe.run(mp, feed=feed, fetch_list=ys_grad) @@ -80,7 +81,8 @@ class TestForwardGradients(unittest.TestCase): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -106,7 +108,7 @@ class TestForwardGradients(unittest.TestCase): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = primapi.forward_grad(ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -116,14 +118,125 @@ class TestForwardGradients(unittest.TestCase): def test_illegal_param(self): paddle.incubate.autograd.enable_prim() with self.assertRaises(TypeError): - primapi.forward_gradients(1, paddle.static.data('inputs', - shape=[1])) + primapi.forward_grad(1, paddle.static.data('inputs', shape=[1])) with self.assertRaises(TypeError): - primapi.forward_gradients(paddle.static.data('targets', shape=[1]), - 1) + primapi.forward_grad(paddle.static.data('targets', shape=[1]), 1) paddle.incubate.autograd.disable_prim() +class TestGrad(unittest.TestCase): + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_third_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + + grad1, = paddle.incubate.autograd.grad([x4], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = {x.name: np.array([2.]).astype('float32')} + fetch_list = [grad3.name] + result = [np.array([48.])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + paddle.incubate.autograd.disable_prim() + + def test_fourth_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + out = paddle.sqrt(x5 + x4) + + grad1, = paddle.incubate.autograd.grad([out], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + grad4, = paddle.incubate.autograd.grad([grad3], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = { + x.name: np.array([2.]).astype('float32'), + } + fetch_list = [grad4.name] + # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) + result = [np.array([-0.27263762711])] + + place = paddle.CPUPlace() + if 
paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + paddle.incubate.autograd.disable_prim() + + def test_disable_prim(self): + + def actual(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.incubate.autograd.grad(y, var_x) + y_second_grad = paddle.incubate.autograd.grad(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + def expect(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.static.gradients(y, var_x) + y_second_grad = paddle.static.gradients(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + x = np.random.randn(100, 200) + for i, j in zip(actual(x), expect(x)): + np.testing.assert_allclose(i, j) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index ccbd630bfd0..f14664237f3 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -21,7 +21,7 @@ from paddle.incubate.autograd.primops import (neg, set_value, add, sub, mul, concat, reduce, matmul, slice_select, slice_assign, gather, scatter_add, fill_const) -from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients +from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 8a0e51f60f4..6afd0ff3922 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -22,7 +22,7 @@ import contextlib import collections import numpy as np import paddle -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors ########################################################## diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index 718bc018d9f..c5ff3b18d4d 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
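# A condensed sketch of the primitive-based `paddle.incubate.autograd.grad`
# driven end to end, mirroring TestGrad.test_third_order above but stopping at
# the second derivative; the feed value is illustrative only.
import numpy as np
import paddle

paddle.enable_static()
paddle.incubate.autograd.enable_prim()
main, startup = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name='x', shape=[1], dtype='float32')
    x2 = paddle.multiply(x, x)
    x3 = paddle.multiply(x2, x)
    grad1, = paddle.incubate.autograd.grad([x3], [x])    # 3 * x^2
    grad2, = paddle.incubate.autograd.grad([grad1], [x])  # 6 * x
    # Lower the remaining primitive ops so the Executor can run the program.
    paddle.incubate.autograd.prim2orig(main.block(0))
exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup)
outs = exe.run(main, feed={'x': np.array([2.], dtype='float32')},
               fetch_list=[grad2])  # expect a value close to [12.]
paddle.incubate.autograd.disable_prim()
paddle.disable_static()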
-from paddle.autograd.functional import Hessian, Jacobian, jvp, vjp +from .functional import Hessian, Jacobian, jvp, vjp +from .primapi import forward_grad, grad from .primx import prim2orig -from .utils import enable_prim, disable_prim, prim_enabled +from .utils import disable_prim, enable_prim, prim_enabled __all__ = [ # noqa - 'vjp', 'jvp', 'Jacobian', 'Hessian', 'prim2orig', 'enable_prim', - 'disable_prim', 'prim_enabled' + 'vjp', 'jvp', 'Jacobian', 'Hessian', 'enable_prim', 'disable_prim', + 'forward_grad', 'grad' ] diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py new file mode 100644 index 00000000000..6c740005f82 --- /dev/null +++ b/python/paddle/incubate/autograd/functional.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import typing + +import paddle +from paddle.fluid import framework +from paddle.incubate.autograd import primapi, utils + + +def vjp(func, xs, v=None): + r"""Computes the Vector-Jacobian product, a functional form of + reverse mode automatic differentiation. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): A function that takes ``xs`` as inputs parameter and + returns a sequence of Tensors or a Tensor. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate + ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled + in the VJP computation. ``v`` matches the size and shape of + ``func`` 's output. Defaults to None, which is equivalent to all + ones the same size of ``func`` 's output. + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - vjp(Tensor|tuple[Tensor]): The vjp result. + + Examples: + + .. code-block:: python + + import paddle + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, vjp_result = paddle.incubate.autograd.vjp(func, x) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + """ + _check_inputs(func, xs, v) + + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, ys) + + return ys, _grad(ys, xs, v) + + +def jvp(func, xs, v=None): + r""" + Computes the Jacobian-Vector product for a function at the given + inputs and a vector in the tangent space induced by the inputs. 
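# When primitive mode is off, the jvp defined below falls back to the
# double-backward trick (a JVP expressed through two reverse-mode passes).
# A bare paddle.grad sketch of that trick in dynamic graph; the values here
# match the docstring example but are otherwise illustrative.
import paddle

x = paddle.ones([2, 2])
x.stop_gradient = False
y = paddle.matmul(x, x)
v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]])     # tangent vector for x

u = paddle.zeros_like(y)
u.stop_gradient = False                            # dummy cotangent, value unused
vjp_u, = paddle.grad(y, x, u, create_graph=True)   # J^T u, keeps the graph on u
jvp_v, = paddle.grad(vjp_u, u, v)                  # d(J^T u)/du contracted with v = J v
print(jvp_v)                                       # [[2., 1.], [1., 0.]]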
+ + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): The ``func`` takes as input a Tensor or a Sequence + of Tensors and returns a Tensor or a Sequence of Tensors. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to + evaluate ``func``. The ``xs`` is accepted as one Tensor or a + Sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled + in the JVP computation. The ``v`` matches the size and shape of + ``xs`` . Default value is None and in this case is equivalent to + all ones the same size of ``xs`` . + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - jvp(Tensor|tuple[Tensor]): The jvp result. + + Examples: + + .. code-block:: python + + import paddle + + + def func(x): + return paddle.matmul(x, x) + + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, jvp_result = paddle.incubate.autograd.jvp(func, x) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + + """ + _check_inputs(func, xs, v) + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, xs) + + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + return ys, primapi.forward_grad(ys, xs, v) + else: + return ys, _double_backward_trick(ys, xs, v) + + +def _double_backward_trick(ys, xs, v): + """Double backward trick for computing ``jvp`` by ``vjp`` + see details: https://j-towns.github.io/2017/06/12/A-new-trick.html + """ + # The value of ys_grad is not important, it can be any random value in + # theory, but it's required to set stop_gradient=False. + ys_grad = _zeros_like_with_grad(ys) + xs_grad = _grad(ys, xs, ys_grad) + return _grad(xs_grad, ys_grad, v) + + +def _zeros_like_with_grad(xs): + """Create a zero or zeros sequence Tensor like ``xs`` with a flag + ``stop_graident=False`` . + """ + if not isinstance(xs, typing.Sequence): + ys = paddle.zeros_like(xs) + ys.stop_gradient = False + else: + ys = [] + for x in xs: + y = paddle.zeros_like(x) + y.stop_gradient = False + ys.append(y) + return ys + + +class Jacobian(object): + r""" + Computes the Jacobian matrix of a given function. + + If the function has multiple inputs and multiple outputs, during internal + implementation, all input tensors are concatenated after being flatten, + the batch dimension is retained, and the output is subject to the same + processing rules. + + Once the Jacobian ``J`` is constructed, you can use a multidimensional index + to retrieve the submatrix of ``J``, as same as slicing a Tensor. The + submatrix is lazily evaluated along row axis, and will be cached once + evaluated. + + For examples, supposing ``is_batched=True``, you can retrieve the submatrix + by following methods: + + * J[:], retrieving the full matrix. + * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input + variable. + * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output + variable. 
+ * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output + variable and the j'th input variable. + + Notes: + + Eclipsis index is not supported currently. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + + func (Callable): A python function that takes a Tensor or a sequence of + Tensors as inputs(the first dimension is batch size) and + returns a Tensor a sequence of Tensors. + xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Jacobian (Object): A python object retains the Jacobian matrix. + + Examples: + + .. code-block:: python + + import paddle + + + def func(x, y): + return paddle.matmul(x, y) + + + x = paddle.to_tensor([[1., 2.], [3., 4.]]) + J = paddle.incubate.autograd.Jacobian(func, [x, x]) + print(J[:, :]) + # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[1., 3., 0., 0., 1., 0., 2., 0.], + # [2., 4., 0., 0., 0., 1., 0., 2.], + # [0., 0., 1., 3., 3., 0., 4., 0.], + # [0., 0., 2., 4., 0., 3., 0., 4.]]) + + print(J[0, :]) + # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 3., 0., 0., 1., 0., 2., 0.]) + print(J[:, 0]) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 2., 0., 0.]) + + """ + + def __init__(self, func, xs, is_batched=False): + if not is_batched: + self._jacobian = _JacobianNoBatch(func, xs) + else: + self._jacobian = _JacobianBatchFirst(func, xs) + + def __getitem__(self, indexes): + return self._jacobian[indexes] + + @property + def shape(self): + """The shape of flattened Jacobian matrix. + """ + return self._jacobian.shape + + +class Hessian(object): + """ + Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . + + If the function has multiple inputs, during internal implementation, + all input tensors are concatenated after being flatten, the batch dimension + is retained. + + The Hessian submatrix is lazily evaluated, and can be retrieved with a + multidimensional indexes. See details ``Jacobian`` . + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func (Callable): A python function that takes a Tensor or a Tensor + sequence as inputs and returns a Tensor with shape + ``[batch_size, 1]`` with batch or ``[1]`` without batch. + xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of + the function ``func``. + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Hessian (Object): A python object retains the Hessian matrix. + + + Examples: + + .. code-block:: python + + import paddle + + + def reducer(x): + return paddle.sum(x * x) + + + x = paddle.rand([2, 2]) + h = paddle.incubate.autograd.Hessian(reducer, x) + print(h[:]) + # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 0., 0., 0.], + # [0., 2., 0., 0.], + # [0., 0., 2., 0.], + # [0., 0., 0., 2.]]) + """ + + def __init__(self, func, xs, is_batched=False): + + def _jac_func(*xs): + jac = Jacobian(func, xs, is_batched=is_batched) + if (is_batched and jac.shape[1] != 1) or (not is_batched + and jac.shape[0] != 1): + raise RuntimeError( + "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." 
+ ) + return jac[:, 0, :] if is_batched else jac[0, :] + + self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) + + def __getitem__(self, indexes): + return self.symbolic[indexes] + + @property + def shape(self): + """The shape of flattened Hessian matrix. + """ + return self.symbolic.shape + + +class _Jacobian(object): + """The base class for computing Jacobian matrix. + + ``_Jacobian`` implementes the core logic of multidimensional index and lazy + evaluation for Jacobian matrix, subclass only need to overwrite following + methods: + + * ``_lazy_axis()``, return the axis along which will be lazy + evaluating. + * ``_flatten(xs)``, flattens the inputs ``xs``. + * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . + + Notes: + + Because currently PaddlePaddle only support reverse differentiation by + ``paddle.grad``, so lazy evaluation is only supported along the row of + Jacobian matrix, which means that slicing along row will get better + performance. + + """ + + def __init__(self, func, xs): + # Skip separating in prim mode temporarily, as detach and clone are not + # primitive operators. + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + self._xs = xs + else: + self._xs = _separate(xs) + self._ys = func(*utils.as_tensors(self._xs)) + self._flatten_xs = self._flatten(utils.as_tensors(self._xs)) + self._flatten_ys = self._flatten(utils.as_tensors(self._ys)) + self._cache = {} + + @property + def shape(self): + raise NotImplementedError + + @property + def _lazy_axis(self): + """"The axis of lazily evaluated.""" + raise NotImplementedError + + def _lazy_indexes(self, indexes): + idx = indexes[self._lazy_axis] + return (idx, ) if isinstance(idx, int) else tuple( + range(idx.start, idx.stop, idx.step)) + + def _flatten(self, xs): + raise NotImplementedError + + def _shifted_indexes(self, indexes, lazy_axis_size=0): + idx = indexes[self._lazy_axis] + shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( + 0, lazy_axis_size, 1) + return indexes[:self._lazy_axis] + ( + shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] + + def __getitem__(self, indexes): + indexes = _multi_index(indexes, self.shape) + + if isinstance(indexes[self._lazy_axis], int): + other_indexes = indexes[:self._lazy_axis] + \ + indexes[self._lazy_axis+1:] + return self._cached_evaluate( + indexes[self._lazy_axis])[other_indexes] + lazy_indexes = self._lazy_indexes(indexes) + # Using concat and reshape to replace stack operator temporarily, as + # it is not a primitive operator. + shape = list(self.shape) + shape[self._lazy_axis] = len(lazy_indexes) + part_jac = paddle.concat( + [self._cached_evaluate(i) for i in lazy_indexes], + axis=self._lazy_axis).reshape(shape) + return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] + + def _cached_evaluate(self, k): + v = self._cache.get(k) + if v is None: + v = self._evaluate(k) + self._cache[k] = v + return v + + def _evaluate(self, index): + """Evaluate one slice at along lazy axis.""" + raise NotImplementedError + + +class _JacobianNoBatch(_Jacobian): + """Compute Jacobian matrix without batch dimension. + Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is + ``(N, M)`` . 
+    """
+
+    def __init__(self, func, xs):
+        super(_JacobianNoBatch, self).__init__(func, xs)
+
+    @property
+    def shape(self):
+        return (self._flatten_ys.shape[0], self._flatten_xs.shape[0])
+
+    @property
+    def _lazy_axis(self):
+        return 0
+
+    def _flatten(self, xs):
+        return paddle.concat(tuple(x.reshape((-1, )) for x in xs))
+
+    def _evaluate(self, row_index):
+        return self._flatten(_grad(
+            self._flatten_ys[row_index],
+            self._xs,
+        ))
+
+
+class _JacobianBatchFirst(_Jacobian):
+    r"""Compute Jacobian matrix with batch at first axis.
+    Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is
+    ``(B, N, M)`` .
+    """
+
+    def __init__(self, func, xs):
+        super(_JacobianBatchFirst, self).__init__(func, xs)
+
+    @property
+    def shape(self):
+        return (self._flatten_xs.shape[0], self._flatten_ys.shape[1],
+                self._flatten_xs.shape[1])
+
+    @property
+    def _lazy_axis(self):
+        return 1
+
+    def _flatten(self, xs):
+        return paddle.concat(
+            tuple(x.reshape((x.shape[0], -1)) for x in utils.as_tensors(xs)), 1)
+
+    def _evaluate(self, row_index):
+        return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs))
+
+
+def _multi_index(indexes, shape):
+    """A tool for parsing N-dimensional index into a standard format.
+
+    Currently the following input formats are supported:
+        * ([positive|negative|slice], ...), the right-most elements can be
+          omitted.
+
+    The standard format after conversion is a slice tuple which contains N
+    elements:
+        * ([positive|slice], ..., [positive|slice])
+
+    Notes:
+        Ellipsis indexes such as ``(..., i), (i, ...)`` are not supported.
+
+    Args:
+        indexes (tuple): The input indexes.
+        shape (tuple): The input shape.
+
+    Returns:
+        tuple: The standard format index as the above description.
+    """
+    indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, )
+    if any(isinstance(i, type(Ellipsis)) for i in indexes):
+        raise IndexError('Ellipsis index currently is not supported.')
+    # Fill the right-most elements.
+    indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes))
+    # Convert to positive index.
+    positive_indexes = []
+    for i, index in enumerate(indexes):
+        if isinstance(index, slice):
+            index = slice(index.start or 0, index.stop or shape[i], index.step
+                          or 1)
+            positive_indexes.append(
+                slice(
+                    index.start + shape[i] if index.start < 0 else index.start,
+                    index.stop + shape[i] if index.stop < 0 else index.stop,
+                    # Negative step means index backward, no need to convert to
+                    # positive integer.
+                    index.step))
+        elif isinstance(index, int):
+            positive_indexes.append(index + shape[i] if index < 0 else index)
+        else:
+            raise TypeError(f'Not supported index type {index}.')
+    return tuple(positive_indexes)
+
+
+def _replace_none_with_zero_tensor(xs, refs):
+    if xs is None:
+        xs = paddle.zeros_like(refs)
+        xs.stop_gradient = refs.stop_gradient
+        return xs
+    elif isinstance(xs, typing.Sequence):
+        return tuple(
+            _replace_none_with_zero_tensor(x, refs[i])
+            for i, x in enumerate(xs))
+    else:
+        return xs
+
+
+def _grad(ys, xs, v=None):
+    """A gradient function that can be used in dynamic graph and static graph.
+
+    The ``grad`` combines ``paddle.grad`` used in dynamic graph and
+    ``paddle.static.gradients`` used in static graph, and makes the following
+    changes:
+
+    * The ``allow_unused`` flag is removed and defaults to True internally;
+      None in outputs will be replaced by zero tensors.
+    * The ``create_graph`` flag is removed and defaults to True internally;
+      it only makes sense in dynamic graph mode.
+    * When xs is a single Tensor, ``paddle.grad`` returns a list which only
+      contains one Tensor. It may confuse users, so in this case ``_grad``
+      returns a single Tensor instead.
+
+    Args:
+        ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of
+            the graph to compute gradients.
+        xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of
+            the graph to compute gradients. The returned values of this API
+            are the gradients of inputs.
+        v (Tensor|Sequence[Tensor]|None, optional): The initial gradient values
+            of outputs. If grad_outputs is None, the initial gradient values of
+            outputs would be Tensors filled with 1; if grad_outputs is not None,
+            it must have the same length as outputs, and in this case, the
+            initial gradient value of the i-th outputs would be: (1) a Tensor
+            filled with 1 when the i-th element of grad_outputs is None;
+            (2) the i-th element of grad_outputs when the i-th element of
+            grad_outputs is a Tensor. Default None.
+
+    Returns:
+        Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the
+            same as the Tensor number inside inputs, and the i-th returned
+            Tensor is the sum of gradients of outputs with respect to the i-th
+            inputs.
+    """
+    if paddle.fluid._non_static_mode():
+        xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True)
+    else:
+        xs_grad = paddle.incubate.autograd.grad(ys, xs, v)
+
+    if isinstance(xs, paddle.fluid.framework.Variable):
+        xs_grad = xs_grad[0]
+
+    return _replace_none_with_zero_tensor(xs_grad, xs)
+
+
+def _separate(xs):
+    """
+    ``_separate`` separates ``xs`` from the computation graph through ``clone``
+    or ``detach`` .
+
+    Internally, ``paddle.grad(xs, ys)`` is a stateful API implemented on top of
+    the computational graph, which reduces gradients along all paths from ys
+    to xs.
+
+    However, functional autograd APIs such as ``vjp`` and ``jvp`` are
+    stateless, and only compute gradients of the given ``func`` .
+
+    For example, given a ``func`` :math:`y0=f(x0)`, suppose the forward paths
+    are ``x0 -> y0`` and ``x0 -> x1 -> y0`` .
+    ``paddle.grad(y0, x0)`` will reduce gradients along both ``y0->x0`` and
+    ``y0->x1->x0``, while ``vjp`` only needs to reduce along ``y0->x0``.
+
+    So it is necessary to clone or detach xs to break the dependencies on
+    other variables.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.autograd.functional import _separate
+
+
+            def func(x, y):
+                return x * y
+
+
+            x = paddle.ones((1,))
+            x.stop_gradient = False
+
+            y = func(x, x)
+            print(paddle.grad(y, x))
+            # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [2.])]
+
+            x1, x2 = _separate((x, x))
+            y = func(x1, x2)
+            print(paddle.grad(y, x1))
+            # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            #        [1.])]
+
+    """
+    if isinstance(xs, typing.Sequence):
+        return tuple(_single_separate(x) for x in xs)
+    else:
+        return _single_separate(xs)
+
+
+def _single_separate(x):
+    if x is None:  # x may be None because grad input's v defaults to None.
+        return x
+    if not x.stop_gradient:
+        return paddle.clone(x)
+    else:  # use detach to share memory when gradients are not needed.
+        x = x.detach()
+        x.stop_gradient = False
+        return x
+    return x
+
+
+def _check_inputs(func, xs, v=None):
+    if not callable(func):
+        raise TypeError(f"Expected 'func' is Callable, but got {type(func)}.")
+
+    if not isinstance(xs, (framework.Variable, typing.Sequence)):
+        raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor], "
+                        f"but got {type(xs)}.")
+    if isinstance(xs, typing.Sequence) and not all(
+            isinstance(x, framework.Variable) for x in xs):
+        raise TypeError("All elements of 'xs' should be Tensor.")
+
+    if not isinstance(v, (framework.Variable, typing.Sequence, type(None))):
+        raise TypeError(
+            f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.")
+
+    if isinstance(v, typing.Sequence) and not all(
+            isinstance(e, framework.Variable) for e in v):
+        raise TypeError("All elements of 'v' should be Tensor.")
+
+
+def _check_v_shape(v, refs):
+    if v is None:
+        return
+
+    v, refs = utils.as_tensors(v), utils.as_tensors(refs)
+    if len(refs) != len(v):
+        raise RuntimeError(f"The argument v is a tuple of invalid length: "
+                           f"should be {len(refs)} but got {len(v)}.")
+
+    for index, (element_v, element_ref) in enumerate(zip(v, refs)):
+        if element_v.shape != element_ref.shape:
+            raise RuntimeError(
+                f"The v[{index}] has invalid shape: should "
+                f"be {element_ref.shape} but got {element_v.shape}.")
diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py
index 75a70b09731..5b3ad0dd78a 100644
--- a/python/paddle/incubate/autograd/primapi.py
+++ b/python/paddle/incubate/autograd/primapi.py
@@ -14,28 +14,26 @@
 
 import typing
 
-import paddle.autograd.utils as tensor_utils
-import paddle.incubate.autograd.utils as prim_utils
-from paddle.fluid import framework
-from paddle.incubate.autograd import primx
+from paddle.fluid import backward, framework
+from paddle.incubate.autograd import primx, utils
 
 
 @framework.static_only
-def forward_gradients(targets, inputs, input_gradients=None):
+def forward_grad(outputs, inputs, grad_inputs=None):
     """Forward mode of automatic differentiation.
 
     .. note::
         **ONLY available in the static mode and primitive operators.**
 
     Args:
-        targets: The target tensor or tensors
+        outputs: The output tensor or tensors
         inputs: The input tensor or tensors
-        input_gradients: The gradient Tensor or Tensors of inputs which has
+        grad_inputs: The gradient Tensor or Tensors of inputs which has
            the same shape with inputs, Defaults to None, in this case
            is equivalent to all ones .
 
     Returns:
-        target_gradients (Tensor|Sequence[Tensor]): The gradients for targets.
+        grad_outputs (Tensor|Sequence[Tensor]): The gradients for outputs.
     Examples:
 
@@ -53,7 +51,7 @@ def forward_gradients(targets, inputs, input_gradients=None):
         with paddle.static.program_guard(main_program, startup_program):
             x = paddle.static.data('x', shape=[1], dtype='float32')
             y = x * x
-            y_grad = paddle.incubate.autograd.forward_gradients(y, x)
+            y_grad = paddle.incubate.autograd.forward_grad(y, x)
 
             paddle.incubate.autograd.prim2orig()
 
             exe = paddle.static.Executor()
@@ -65,20 +63,20 @@ def forward_gradients(targets, inputs, input_gradients=None):
         paddle.incubate.autograd.disable_prim()
         paddle.disable_static()
     """
-    if not prim_utils.prim_enabled():
-        raise RuntimeError('forward_gradients must be running on primitive'
+    if not utils.prim_enabled():
+        raise RuntimeError('forward_grad must be running on primitive '
                            'operators, use enable_prim to turn it on.')
 
-    if not isinstance(targets, (framework.Variable, typing.Sequence)):
-        raise TypeError(f'Expected targets is Tensor|Sequence[Tesnor], '
-                        f'but got {type(targets)}.')
+    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
+        raise TypeError(f'Expected outputs is Tensor|Sequence[Tensor], '
+                        f'but got {type(outputs)}.')
 
     if not isinstance(inputs, (framework.Variable, typing.Sequence)):
         raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], '
                         f'but got {type(inputs)}.')
 
-    ys, xs, xs_dot = tensor_utils.as_tensors(targets), tensor_utils.as_tensors(
-        inputs), tensor_utils.as_tensors(input_gradients)
+    ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors(
+        inputs), utils.as_tensors(grad_inputs)
 
     block = framework.default_main_program().current_block()
     if any(x.block != block for x in xs + ys):
@@ -90,4 +88,95 @@ def forward_gradients(targets, inputs, input_gradients=None):
     ad = primx.Transform(ys[0].block)
     _, ys_dot = ad.linearize(xs, ys, xs_dot)
 
-    return ys_dot[0] if isinstance(targets, framework.Variable) else ys_dot
+    return ys_dot[0] if isinstance(outputs, framework.Variable) else ys_dot
+
+
+@framework.static_only
+def grad(outputs, inputs, grad_outputs=None):
+    """Reverse mode of automatic differentiation.
+
+    .. note::
+        **ONLY available in the static mode and primitive operators**
+
+    Args:
+        outputs (Tensor|Sequence[Tensor]): The output Tensor or Tensors.
+        inputs (Tensor|Sequence[Tensor]): The input Tensor or Tensors.
+        grad_outputs (Tensor|Sequence[Tensor]): The gradient Tensor or
+            Tensors of outputs which have the same shape as outputs. Defaults
+            to None, in which case it is equivalent to all ones.
+
+    Returns:
+        grad_inputs (Tensor|Sequence[Tensor]): The gradients for inputs.
+
+    Examples:
+
+    .. code-block:: python
+
+        import numpy as np
+        import paddle
+
+        paddle.enable_static()
+        paddle.incubate.autograd.enable_prim()
+
+        startup_program = paddle.static.Program()
+        main_program = paddle.static.Program()
+
+        with paddle.static.program_guard(main_program, startup_program):
+            x = paddle.static.data('x', shape=[1], dtype='float32')
+            x.stop_gradient = False
+            y = x * x
+            x_grad = paddle.incubate.autograd.grad(y, x)
+            paddle.incubate.autograd.prim2orig()
+
+        exe = paddle.static.Executor()
+        exe.run(startup_program)
+        x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad])
+        print(x_grad)
+        # [array([4.], dtype=float32)]
+
+        paddle.incubate.autograd.disable_prim()
+        paddle.disable_static()
+    """
+
+    if not utils.prim_enabled():
+        return backward.gradients(outputs, inputs, grad_outputs)
+
+    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
+        raise TypeError(f'Expected outputs is Tensor|Sequence[Tensor], '
+                        f'but got {type(outputs)}.')
+
+    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
+        raise TypeError(f'Expected inputs is Tensor|Sequence[Tensor], '
+                        f'but got {type(inputs)}.')
+
+    ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors(
+        inputs), utils.as_tensors(grad_outputs)
+    block = framework.default_main_program().current_block()
+    if any((x is not None and x.block != block) for x in xs + ys):
+        raise RuntimeError(
+            'Variable in inputs and outputs should be None or in current block of main program'
+        )
+
+    # TODO(Tongxin) without any prior knowledge about whether the program
+    # is completely lowered to primitive ops, it's mandatory to run the lowering
+    # pass once and again. This is obviously inefficient and needs to be
+    # optimized.
+    primx.orig2prim(block)
+    ad = primx.Transform(block)
+    xs_dot, ys_dot = ad.linearize(xs, ys)
+    if any(var is None for var in ys_dot):
+        raise RuntimeError(
+            'Grads cannot be computed. The given outputs do not depend on inputs'
+        )
+    ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar)
+
+    # remove xs_dot and their constructor ops
+    op_indexes = []
+    for var in xs_dot:
+        if var is not None:
+            op_index = block.ops.index(var.op)
+            if op_index < 0:
+                raise ValueError(
+                    f'op_index should be greater than or equal to 0, but op_index={op_index}.'
+ ) + op_indexes.append(op_index) + + ad.erase_ops(sorted(op_indexes)) + ad.erase_dots(xs_dot) + + return xs_bar[0] if isinstance(inputs, framework.Variable) else xs_bar diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index 6017ac35989..b9a3ac45996 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -14,6 +14,7 @@ import paddle from paddle.fluid.layer_helper import LayerHelper + from .primreg import REGISTER_FN diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index d5037dcf649..260a97cdc16 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -22,7 +22,7 @@ from .primreg import op_position_inputs, op_position_output, lookup_orig2prim, l from .primrules import _orig2prim, _prim2orig, _jvp, _transpose from .utils import get_input_var_list, get_output_var_list, flatten, flatten_and_remove_none from collections import OrderedDict -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors def topo_path(xs, ys, block=None): @@ -577,47 +577,3 @@ def prim2orig(block=None): assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' _lower(block, reverse=True) - - -def _gradients(ys, xs, ys_bar=None): - """ A drop-in replacement of paddle.gradients but instead computing - on primitive ops. - - Args: - ys: the target tensor or tensors - xs: the input tensor or tensors - ys_bar: the optional gradient tensors of `ys` - - Returns: - xs_bar: a list gradients of input `xs` - """ - - ys, xs, ys_bar = as_tensors(ys), as_tensors(xs), as_tensors(ys_bar) - block = default_main_program().current_block() - for el in xs + ys: - assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' - # TODO(Tongxin) without any prior knowledge about whether the program - # is completely lowered to primitive ops, it's mandatory to run the lowering - # pass once and again. This is obviously inefficient and needs to be - # optimized. - orig2prim(block) - - ad = Transform(block) - - xs_dot, ys_dot = ad.linearize(xs, ys) - if any(var is None for var in ys_dot): - assert False, f'Gradients cannot be computed. The given output `ys` does not depend on input `xs`.' - ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) - # remove xs_dot and their constructor ops - - op_indexes = [] - for var in xs_dot: - if var is not None: - op_index = block.ops.index(var.op) - assert op_index >= 0, f'op_index should be greater than or equal to 0, but op_index={op_index}.' - op_indexes.append(op_index) - - ad.erase_ops(sorted(op_indexes)) - ad.erase_dots(xs_dot) - - return xs_bar diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 9d6a8c4f6a3..96faf7f7440 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import typing import paddle from paddle.fluid import framework as framework @@ -170,3 +171,12 @@ def flatten(inp): def flatten_and_remove_none(inp): flattened = flatten(inp) return [var for var in flattened if var is not None] + + +def as_tensors(xs): + if isinstance(xs, framework.Variable): + return (xs, ) + elif isinstance(xs, typing.Sequence): + return tuple(xs) + else: + return xs -- GitLab
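
For reviewers who want to exercise the reorganized API end to end, here is a minimal sketch (not part of the patch). It assumes the ``paddle.incubate.autograd.Jacobian`` export shown in the docstrings above and covers the batched path, which the in-patch examples do not; the printed values depend on the random input.

.. code-block:: python

    import paddle


    def func(x):
        # Elementwise square; the first axis of x is the batch axis.
        return x * x


    x = paddle.rand((4, 3))

    # With is_batched=True the flattened Jacobian has shape
    # (batch_size, flattened_output_size, flattened_input_size) = (4, 3, 3).
    J = paddle.incubate.autograd.Jacobian(func, x, is_batched=True)
    print(J.shape)

    # Rows are evaluated lazily, so indexing a single row along the lazy axis
    # only computes that slice instead of the full matrix.
    print(J[:, 0, :])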