Unverified commit 84fe0454, authored by: G gouzil, committed by: GitHub

[xdoctest] No.44-47 and No.50-59 doc style (#55813)

Parent 30a02d27
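This commit converts the example blocks in these distribution docstrings from comment-style expected output to the `>>>` doctest style checked by xdoctest. As a rough sketch of the pattern (taken from the Cauchy entropy example in the first hunk below), a block that previously carried its output in comments,

    import paddle
    from paddle.distribution import Cauchy

    rv = Cauchy(loc=0.1, scale=1.2)
    print(rv.entropy())
    # Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    # 2.71334577)

now states the expected output directly after the prompt lines so the doctest runner can verify it:

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.entropy())
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    2.71334577)

Examples with nondeterministic output additionally seed the generator (paddle.seed(2023)) or wrap the sampling calls in doctest +SKIP directives, as in the Laplace, Multinomial, TanhTransform and TransformedDistribution hunks below.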
@@ -39,20 +39,20 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.entropy())
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    2.71334577)

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.entropy())
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [2.53102422, 3.22417140])

"""
def __init__(self, loc, scale, name=None):

@@ -114,32 +114,32 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.sample([10]).shape)
    [10]

    >>> # init Cauchy with 0-Dim tensor
    >>> rv = Cauchy(loc=paddle.full((), 0.1), scale=paddle.full((), 1.2))
    >>> print(rv.sample([10]).shape)
    [10]

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.sample([10]).shape)
    [10, 2]

    >>> # sample 2-Dim data
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.sample([10, 2]).shape)
    [10, 2]

    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.sample([10, 2]).shape)
    [10, 2, 2]

"""
name = name if name is not None else (self.name + '_sample')
with paddle.no_grad():

@@ -159,32 +159,32 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.rsample([10]).shape)
    [10]

    >>> # init Cauchy with 0-Dim tensor
    >>> rv = Cauchy(loc=paddle.full((), 0.1), scale=paddle.full((), 1.2))
    >>> print(rv.rsample([10]).shape)
    [10]

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.rsample([10]).shape)
    [10, 2]

    >>> # sample 2-Dim data
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.rsample([10, 2]).shape)
    [10, 2]

    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.rsample([10, 2]).shape)
    [10, 2, 2]

"""
name = name if name is not None else (self.name + '_rsample')

@@ -222,32 +222,32 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.prob(paddle.to_tensor(1.5)))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.11234467)

    >>> # broadcast to value
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.prob(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.11234467, 0.01444674])

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.prob(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.10753712, 0.02195240])

    >>> # init Cauchy with N-Dim tensor with broadcast
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.prob(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.10753712, 0.02195240])

"""
name = self.name + '_prob'

@@ -271,32 +271,32 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.log_prob(paddle.to_tensor(1.5)))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    -2.18618369)

    >>> # broadcast to value
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [-2.18618369, -4.23728657])

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [-2.22991920, -3.81887865])

    >>> # init Cauchy with N-Dim tensor with broadcast
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [-2.22991920, -3.81887865])

"""
name = self.name + '_log_prob'

@@ -338,32 +338,32 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.cdf(paddle.to_tensor(1.5)))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.77443725)

    >>> # broadcast to value
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.77443725, 0.92502367])

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.80256844, 0.87888104])

    >>> # init Cauchy with N-Dim tensor with broadcast
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.80256844, 0.87888104])

"""
name = self.name + '_cdf'

@@ -399,20 +399,20 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> # init Cauchy with float
    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> print(rv.entropy())
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    2.71334577)

    >>> # init Cauchy with N-Dim tensor
    >>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
    >>> print(rv.entropy())
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [2.53102422, 3.22417140])

"""
name = self.name + '_entropy'

@@ -438,14 +438,14 @@ class Cauchy(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Cauchy

    >>> rv = Cauchy(loc=0.1, scale=1.2)
    >>> rv_other = Cauchy(loc=paddle.to_tensor(1.2), scale=paddle.to_tensor([2.3, 3.4]))
    >>> print(rv.kl_divergence(rv_other))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.19819736, 0.31532931])

"""
name = self.name + '_kl_divergence'
...
@@ -31,21 +31,21 @@ class Independent(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import independent

    >>> beta = paddle.distribution.Beta(paddle.to_tensor([0.5, 0.5]), paddle.to_tensor([0.5, 0.5]))
    >>> print(beta.batch_shape, beta.event_shape)
    (2,) ()
    >>> print(beta.log_prob(paddle.to_tensor(0.2)))
    Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
    [-0.22843921, -0.22843921])

    >>> reinterpreted_beta = independent.Independent(beta, 1)
    >>> print(reinterpreted_beta.batch_shape, reinterpreted_beta.event_shape)
    () (2,)
    >>> print(reinterpreted_beta.log_prob(paddle.to_tensor([0.2, 0.2])))
    Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
    -0.45687842)

"""
def __init__(self, base, reinterpreted_batch_rank):
...
@@ -53,14 +53,14 @@ def kl_divergence(p, q):

.. code-block:: python

    >>> import paddle

    >>> p = paddle.distribution.Beta(alpha=0.5, beta=0.5)
    >>> q = paddle.distribution.Beta(alpha=0.3, beta=0.7)
    >>> print(paddle.distribution.kl_divergence(p, q))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.21193528)

"""
return _dispatch(type(p), type(q))(p, q)

@@ -82,11 +82,11 @@ def register_kl(cls_p, cls_q):
Examples:
.. code-block:: python

    >>> import paddle

    >>> @paddle.distribution.register_kl(paddle.distribution.Beta, paddle.distribution.Beta)
    >>> def kl_beta_beta():
    ...     pass # insert implementation here

"""
if not issubclass(cls_p, Distribution) or not issubclass(
cls_q, Distribution
...
@@ -44,12 +44,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> paddle.seed(2023)
    >>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
    >>> m.sample() # Laplace distributed with loc=0, scale=1
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    1.31554604)

"""

@@ -173,13 +173,13 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
    >>> value = paddle.to_tensor(0.1)
    >>> m.log_prob(value)
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    -0.79314721)

"""
loc, scale, value = self._validate_value(value)

@@ -205,12 +205,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
    >>> m.entropy()
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    1.69314718)

"""
return 1 + paddle.log(2 * self.scale)

@@ -236,13 +236,13 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
    >>> value = paddle.to_tensor(0.1)
    >>> m.cdf(value)
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.54758132)

"""
loc, scale, value = self._validate_value(value)
iterm = (

@@ -275,13 +275,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
    >>> value = paddle.to_tensor(0.1)
    >>> m.icdf(value)
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    -1.60943794)

"""
loc, scale, value = self._validate_value(value)
term = value - 0.5

@@ -300,12 +299,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> paddle.seed(2023)
    >>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
    >>> m.sample() # Laplace distributed with loc=0, scale=1
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    1.31554604)

"""
shape = shape if isinstance(shape, tuple) else tuple(shape)
with paddle.no_grad():

@@ -323,12 +322,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> paddle.seed(2023)
    >>> m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
    >>> m.rsample((1,)) # Laplace distributed with loc=0, scale=1
    Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[1.31554604]])

"""
eps = self._get_eps()

@@ -395,13 +394,13 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> m1 = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
    >>> m2 = paddle.distribution.Laplace(paddle.to_tensor([1.0]), paddle.to_tensor([0.5]))
    >>> m1.kl_divergence(m2)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1.04261160])

"""
var_ratio = other.scale / self.scale
...
@@ -49,36 +49,44 @@ class LogNormal(TransformedDistribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import LogNormal

    >>> # Define a single scalar LogNormal distribution.
    >>> dist = LogNormal(loc=0., scale=3.)
    >>> # Define a batch of two scalar valued LogNormals.
    >>> # The underlying Normal of first has mean 1 and standard deviation 11, the underlying Normal of second 2 and 22.
    >>> dist = LogNormal(loc=[1., 2.], scale=[11., 22.])
    >>> # Get 3 samples, returning a 3 x 2 tensor.
    >>> dist.sample((3, ))

    >>> # Define a batch of two scalar valued LogNormals.
    >>> # Their underlying Normal have mean 1, but different standard deviations.
    >>> dist = LogNormal(loc=1., scale=[11., 22.])

    >>> # Complete example
    >>> value_tensor = paddle.to_tensor([0.8], dtype="float32")
    >>> lognormal_a = LogNormal([0.], [1.])
    >>> lognormal_b = LogNormal([0.5], [2.])
    >>> sample = lognormal_a.sample((2, ))
    >>> # a random tensor created by lognormal distribution with shape: [2, 1]
    >>> entropy = lognormal_a.entropy()
    >>> print(entropy)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1.41893852])
    >>> lp = lognormal_a.log_prob(value_tensor)
    >>> print(lp)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [-0.72069150])
    >>> p = lognormal_a.probs(value_tensor)
    >>> print(p)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.48641577])
    >>> kl = lognormal_a.kl_divergence(lognormal_b)
    >>> print(kl)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.34939718])

"""
def __init__(self, loc, scale):
...
@@ -53,18 +53,17 @@ class Multinomial(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> paddle.seed(2023)

    >>> multinomial = paddle.distribution.Multinomial(10, paddle.to_tensor([0.2, 0.3, 0.5]))
    >>> print(multinomial.sample((2, 3)))
    Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[[1., 5., 4.],
    [0., 4., 6.],
    [1., 3., 6.]],
    [[2., 2., 6.],
    [0., 6., 4.],
    [3., 3., 4.]]])

"""
def __init__(self, total_count, probs):
...
@@ -54,36 +54,44 @@ class Normal(distribution.Distribution):
Examples:
.. code-block:: python

    >>> import paddle
    >>> from paddle.distribution import Normal

    >>> # Define a single scalar Normal distribution.
    >>> dist = Normal(loc=0., scale=3.)
    >>> # Define a batch of two scalar valued Normals.
    >>> # The first has mean 1 and standard deviation 11, the second 2 and 22.
    >>> dist = Normal(loc=[1., 2.], scale=[11., 22.])
    >>> # Get 3 samples, returning a 3 x 2 tensor.
    >>> dist.sample([3])

    >>> # Define a batch of two scalar valued Normals.
    >>> # Both have mean 1, but different standard deviations.
    >>> dist = Normal(loc=1., scale=[11., 22.])

    >>> # Complete example
    >>> value_tensor = paddle.to_tensor([0.8], dtype="float32")
    >>> normal_a = Normal([0.], [1.])
    >>> normal_b = Normal([0.5], [2.])
    >>> sample = normal_a.sample([2])
    >>> # a random tensor created by normal distribution with shape: [2, 1]
    >>> entropy = normal_a.entropy()
    >>> print(entropy)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1.41893852])
    >>> lp = normal_a.log_prob(value_tensor)
    >>> print(lp)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [-1.23893857])
    >>> p = normal_a.probs(value_tensor)
    >>> print(p)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.28969154])
    >>> kl = normal_a.kl_divergence(normal_b)
    >>> print(kl)
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.34939718])

"""
def __init__(self, loc, scale, name=None):
...
@@ -360,34 +360,34 @@ class AbsTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> abs = paddle.distribution.AbsTransform()

    >>> print(abs.forward(paddle.to_tensor([-1., 0., 1.])))
    Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1., 0., 1.])

    >>> print(abs.inverse(paddle.to_tensor([1.])))
    (Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [-1.]), Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1.]))

    >>> # The |dX/dY| is constant 1. So Log|dX/dY| == 0
    >>> print(abs.inverse_log_det_jacobian(paddle.to_tensor(1.)))
    (Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.), Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.))

    >>> #Special case handling of 0.
    >>> print(abs.inverse(paddle.to_tensor([0.])))
    (Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.]), Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.]))
    >>> print(abs.inverse_log_det_jacobian(paddle.to_tensor(0.)))
    (Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.), Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.))

"""
_type = Type.SURJECTION

@@ -423,20 +423,20 @@ class AffineTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.to_tensor([1., 2.])
    >>> affine = paddle.distribution.AffineTransform(paddle.to_tensor(0.), paddle.to_tensor(1.))

    >>> print(affine.forward(x))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1., 2.])

    >>> print(affine.inverse(affine.forward(x)))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1., 2.])

    >>> print(affine.forward_log_det_jacobian(x))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    0.)

"""
_type = Type.BIJECTION

@@ -503,28 +503,28 @@ class ChainTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.to_tensor([0., 1., 2., 3.])

    >>> chain = paddle.distribution.ChainTransform((
    ...     paddle.distribution.AffineTransform(
    ...         paddle.to_tensor(0.), paddle.to_tensor(1.)),
    ...     paddle.distribution.ExpTransform()
    >>> ))
    >>> print(chain.forward(x))
    Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1. , 2.71828175 , 7.38905621 , 20.08553696])

    >>> print(chain.inverse(chain.forward(x)))
    Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0., 1., 2., 3.])

    >>> print(chain.forward_log_det_jacobian(x))
    Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0., 1., 2., 3.])

    >>> print(chain.inverse_log_det_jacobian(chain.forward(x)))
    Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
    [ 0., -1., -2., -3.])

"""
def __init__(self, transforms):
@@ -625,24 +625,24 @@ class ExpTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> exp = paddle.distribution.ExpTransform()

    >>> print(exp.forward(paddle.to_tensor([1., 2., 3.])))
    Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [2.71828175 , 7.38905621 , 20.08553696])

    >>> print(exp.inverse(paddle.to_tensor([1., 2., 3.])))
    Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0. , 0.69314718, 1.09861231])

    >>> print(exp.forward_log_det_jacobian(paddle.to_tensor([1., 2., 3.])))
    Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1., 2., 3.])

    >>> print(exp.inverse_log_det_jacobian(paddle.to_tensor([1., 2., 3.])))
    Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [ 0. , -0.69314718, -1.09861231])

"""
_type = Type.BIJECTION

@@ -695,20 +695,20 @@ class IndependentTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.to_tensor([[1., 2., 3.], [4., 5., 6.]])

    >>> # Exponential transform with event_rank = 1
    >>> multi_exp = paddle.distribution.IndependentTransform(
    ...     paddle.distribution.ExpTransform(), 1)

    >>> print(multi_exp.forward(x))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[2.71828175 , 7.38905621 , 20.08553696 ],
    [54.59814835 , 148.41316223, 403.42880249]])

    >>> print(multi_exp.forward_log_det_jacobian(x))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [6. , 15.])

"""
def __init__(self, base, reinterpreted_batch_rank):

@@ -773,20 +773,20 @@ class PowerTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.to_tensor([1., 2.])
    >>> power = paddle.distribution.PowerTransform(paddle.to_tensor(2.))

    >>> print(power.forward(x))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1., 4.])

    >>> print(power.inverse(power.forward(x)))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [1., 2.])

    >>> print(power.forward_log_det_jacobian(x))
    Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.69314718, 1.38629436])

"""
_type = Type.BIJECTION

@@ -840,24 +840,24 @@ class ReshapeTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.ones((1,2,3))
    >>> reshape_transform = paddle.distribution.ReshapeTransform((2, 3), (3, 2))

    >>> print(reshape_transform.forward_shape((1,2,3)))
    (1, 3, 2)

    >>> print(reshape_transform.forward(x))
    Tensor(shape=[1, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[[1., 1.],
    [1., 1.],
    [1., 1.]]])

    >>> print(reshape_transform.inverse(reshape_transform.forward(x)))
    Tensor(shape=[1, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[[1., 1., 1.],
    [1., 1., 1.]]])

    >>> print(reshape_transform.forward_log_det_jacobian(x))
    Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.])

"""
_type = Type.BIJECTION

@@ -956,22 +956,22 @@ class SigmoidTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.ones((2,3))
    >>> t = paddle.distribution.SigmoidTransform()

    >>> print(t.forward(x))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[0.73105860, 0.73105860, 0.73105860],
    [0.73105860, 0.73105860, 0.73105860]])

    >>> print(t.inverse(t.forward(x)))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[1.00000012, 1.00000012, 1.00000012],
    [1.00000012, 1.00000012, 1.00000012]])

    >>> print(t.forward_log_det_jacobian(x))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[-1.62652326, -1.62652326, -1.62652326],
    [-1.62652326, -1.62652326, -1.62652326]])

"""
@property

@@ -1003,18 +1003,18 @@ class SoftmaxTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.ones((2,3))
    >>> t = paddle.distribution.SoftmaxTransform()

    >>> print(t.forward(x))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[0.33333334, 0.33333334, 0.33333334],
    [0.33333334, 0.33333334, 0.33333334]])

    >>> print(t.inverse(t.forward(x)))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[-1.09861231, -1.09861231, -1.09861231],
    [-1.09861231, -1.09861231, -1.09861231]])

"""
_type = Type.OTHER
@@ -1061,32 +1061,32 @@ class StackTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.stack(
    ...     (paddle.to_tensor([1., 2., 3.]), paddle.to_tensor([1, 2., 3.])), 1)
    >>> t = paddle.distribution.StackTransform(
    ...     (paddle.distribution.ExpTransform(),
    ...     paddle.distribution.PowerTransform(paddle.to_tensor(2.))),
    ...     1
    >>> )

    >>> print(t.forward(x))
    Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[2.71828175 , 1. ],
    [7.38905621 , 4. ],
    [20.08553696, 9. ]])

    >>> print(t.inverse(t.forward(x)))
    Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[1., 1.],
    [2., 2.],
    [3., 3.]])

    >>> print(t.forward_log_det_jacobian(x))
    Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[1. , 0.69314718],
    [2. , 1.38629436],
    [3. , 1.79175949]])

"""
def __init__(self, transforms, axis=0):

@@ -1176,20 +1176,20 @@ class StickBreakingTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> x = paddle.to_tensor([1.,2.,3.])
    >>> t = paddle.distribution.StickBreakingTransform()

    >>> print(t.forward(x))
    Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.47536686, 0.41287899, 0.10645414, 0.00530004])

    >>> print(t.inverse(t.forward(x)))
    Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [0.99999988, 2. , 2.99999881])

    >>> print(t.forward_log_det_jacobian(x))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    -9.10835075)

"""
_type = Type.BIJECTION

@@ -1241,28 +1241,30 @@ class TanhTransform(Transform):

.. code-block:: python

    >>> import paddle

    >>> tanh = paddle.distribution.TanhTransform()

    >>> x = paddle.to_tensor([[1., 2., 3.], [4., 5., 6.]])

    >>> # doctest: +SKIP('random sample')
    >>> print(tanh.forward(x))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[0.76159418, 0.96402758, 0.99505472],
    [0.99932921, 0.99990916, 0.99998784]])

    >>> print(tanh.inverse(tanh.forward(x)))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[1. , 2. , 2.99999666],
    [3.99993253, 4.99977016, 6.00527668]])

    >>> print(tanh.forward_log_det_jacobian(x))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[-0.86756170 , -2.65000558 , -4.61865711 ],
    [-6.61437654 , -8.61379623 , -10.61371803]])

    >>> print(tanh.inverse_log_det_jacobian(tanh.forward(x)))
    Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
    [[0.86756176 , 2.65000558 , 4.61866283 ],
    [6.61441946 , 8.61399269 , 10.61451530]])
    >>> # doctest: -SKIP

"""
_type = Type.BIJECTION
...
@@ -29,21 +29,24 @@ class TransformedDistribution(distribution.Distribution):

.. code-block:: python

    >>> import paddle
    >>> paddle.seed(2023)
    >>> from paddle.distribution import transformed_distribution

    >>> d = transformed_distribution.TransformedDistribution(
    ...     paddle.distribution.Normal(0., 1.),
    ...     [paddle.distribution.AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))]
    ... )

    >>> # doctest: +SKIP('random sample')
    >>> print(d.sample([10]))
    Tensor(shape=[10], dtype=float32, place=Place(cpu), stop_gradient=True,
    [ 3.22699189, 1.12264419, 0.50283587, 1.83812487, -2.00740123,
    -2.70338631, 1.26663208, 4.47909021, -0.11529565, 4.32719326])
    >>> print(d.log_prob(paddle.to_tensor(0.5)))
    Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
    -1.64333570)
    >>> # doctest: -SKIP

"""
def __init__(self, base, transforms):
...