Unverified commit 84fe0454, authored by gouzil, committed by GitHub

[xdoctest] No.44-47 and No.50-59 doc style (#55813)

Parent 30a02d27
......@@ -39,20 +39,20 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.entropy())
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 2.71334577)
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.entropy())
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [2.53102422, 3.22417140])
>>> import paddle
>>> from paddle.distribution import Cauchy
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.entropy())
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
2.71334577)
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.entropy())
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[2.53102422, 3.22417140])
"""
def __init__(self, loc, scale, name=None):
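The two printed entropies match the Cauchy closed form H = log(4π · scale), which does not depend on loc; a quick standard-library cross-check (an independent sketch, not part of this diff):

```python
import math

# entropy of Cauchy(loc, scale) is log(4 * pi * scale); loc drops out
for scale in (1.2, 1.0, 2.0):
    print(math.log(4 * math.pi * scale))
# ~ 2.7133458, 2.5310242, 3.2241714 (matching the float32 outputs above)
```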
......@@ -114,32 +114,32 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
>>> import paddle
>>> from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.sample([10]).shape)
# [10]
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.sample([10]).shape)
[10]
# init Cauchy with 0-Dim tensor
rv = Cauchy(loc=paddle.full((), 0.1), scale=paddle.full((), 1.2))
print(rv.sample([10]).shape)
# [10]
>>> # init Cauchy with 0-Dim tensor
>>> rv = Cauchy(loc=paddle.full((), 0.1), scale=paddle.full((), 1.2))
>>> print(rv.sample([10]).shape)
[10]
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.sample([10]).shape)
# [10, 2]
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.sample([10]).shape)
[10, 2]
# sample 2-Dim data
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.sample([10, 2]).shape)
# [10, 2]
>>> # sample 2-Dim data
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.sample([10, 2]).shape)
[10, 2]
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.sample([10, 2]).shape)
# [10, 2, 2]
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.sample([10, 2]).shape)
[10, 2, 2]
"""
name = name if name is not None else (self.name + '_sample')
with paddle.no_grad():
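The shapes in these examples follow the usual convention that the returned shape is sample_shape + batch_shape, where batch_shape comes from broadcasting loc against scale; a plain-Python sketch of the rule:

```python
# sample_shape prepends to the broadcast batch_shape: with a scalar loc and a
# scale of shape [2], batch_shape is [2], so sample([10]) has shape [10, 2]
sample_shape, batch_shape = [10], [2]
print(sample_shape + batch_shape)  # [10, 2]
```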
......@@ -159,32 +159,32 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
>>> import paddle
>>> from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.rsample([10]).shape)
# [10]
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.rsample([10]).shape)
[10]
# init Cauchy with 0-Dim tensor
rv = Cauchy(loc=paddle.full((), 0.1), scale=paddle.full((), 1.2))
print(rv.rsample([10]).shape)
# [10]
>>> # init Cauchy with 0-Dim tensor
>>> rv = Cauchy(loc=paddle.full((), 0.1), scale=paddle.full((), 1.2))
>>> print(rv.rsample([10]).shape)
[10]
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.rsample([10]).shape)
# [10, 2]
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.rsample([10]).shape)
[10, 2]
# sample 2-Dim data
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.rsample([10, 2]).shape)
# [10, 2]
>>> # sample 2-Dim data
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.rsample([10, 2]).shape)
[10, 2]
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.rsample([10, 2]).shape)
# [10, 2, 2]
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.rsample([10, 2]).shape)
[10, 2, 2]
"""
name = name if name is not None else (self.name + '_rsample')
......@@ -222,32 +222,32 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.prob(paddle.to_tensor(1.5)))
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.11234467)
# broadcast to value
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.prob(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.11234467, 0.01444674])
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.prob(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.10753712, 0.02195240])
# init Cauchy with N-Dim tensor with broadcast
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.prob(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.10753712, 0.02195240])
>>> import paddle
>>> from paddle.distribution import Cauchy
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.prob(paddle.to_tensor(1.5)))
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
0.11234467)
>>> # broadcast to value
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.prob(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.11234467, 0.01444674])
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.prob(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.10753712, 0.02195240])
>>> # init Cauchy with N-Dim tensor with broadcast
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.prob(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.10753712, 0.02195240])
"""
name = self.name + '_prob'
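These densities agree with the Cauchy pdf f(x) = 1 / (π · scale · (1 + ((x − loc)/scale)²)); a standard-library check of the scalar cases:

```python
import math

def cauchy_pdf(x, loc, scale):
    # density of Cauchy: 1 / (pi * scale * (1 + ((x - loc)/scale)**2))
    z = (x - loc) / scale
    return 1.0 / (math.pi * scale * (1.0 + z * z))

print(cauchy_pdf(1.5, 0.1, 1.2))  # ~ 0.1123447
print(cauchy_pdf(5.1, 0.1, 1.2))  # ~ 0.0144467
```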
......@@ -271,32 +271,32 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.log_prob(paddle.to_tensor(1.5)))
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -2.18618369)
# broadcast to value
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-2.18618369, -4.23728657])
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-2.22991920, -3.81887865])
# init Cauchy with N-Dim tensor with broadcast
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-2.22991920, -3.81887865])
>>> import paddle
>>> from paddle.distribution import Cauchy
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.log_prob(paddle.to_tensor(1.5)))
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-2.18618369)
>>> # broadcast to value
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[-2.18618369, -4.23728657])
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[-2.22991920, -3.81887865])
>>> # init Cauchy with N-Dim tensor with broadcast
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.log_prob(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[-2.22991920, -3.81887865])
"""
name = self.name + '_log_prob'
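log_prob is simply the log of the density above; the same closed form reproduces the printed values:

```python
import math

def cauchy_log_prob(x, loc, scale):
    # log-density of Cauchy: -log(pi * scale * (1 + ((x - loc)/scale)**2))
    z = (x - loc) / scale
    return -math.log(math.pi * scale * (1.0 + z * z))

print(cauchy_log_prob(1.5, 0.1, 1.2))  # ~ -2.1861837
print(cauchy_log_prob(1.5, 0.1, 1.0))  # ~ -2.2299192
print(cauchy_log_prob(5.1, 0.1, 2.0))  # ~ -3.8188786
```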
......@@ -338,32 +338,32 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.cdf(paddle.to_tensor(1.5)))
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.77443725)
# broadcast to value
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.77443725, 0.92502367])
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.80256844, 0.87888104])
# init Cauchy with N-Dim tensor with broadcast
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.80256844, 0.87888104])
>>> import paddle
>>> from paddle.distribution import Cauchy
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.cdf(paddle.to_tensor(1.5)))
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
0.77443725)
>>> # broadcast to value
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.77443725, 0.92502367])
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor([0.1, 0.1]), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.80256844, 0.87888104])
>>> # init Cauchy with N-Dim tensor with broadcast
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.cdf(paddle.to_tensor([1.5, 5.1])))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.80256844, 0.87888104])
"""
name = self.name + '_cdf'
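The CDF values follow F(x) = arctan((x − loc)/scale)/π + 1/2:

```python
import math

def cauchy_cdf(x, loc, scale):
    # CDF of Cauchy: arctan((x - loc)/scale) / pi + 0.5
    return math.atan((x - loc) / scale) / math.pi + 0.5

print(cauchy_cdf(1.5, 0.1, 1.2))  # ~ 0.7744373
print(cauchy_cdf(5.1, 0.1, 1.2))  # ~ 0.9250236
```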
......@@ -399,20 +399,20 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
>>> import paddle
>>> from paddle.distribution import Cauchy
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.entropy())
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 2.71334577)
>>> # init Cauchy with float
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> print(rv.entropy())
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
2.71334577)
# init Cauchy with N-Dim tensor
rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
print(rv.entropy())
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [2.53102422, 3.22417140])
>>> # init Cauchy with N-Dim tensor
>>> rv = Cauchy(loc=paddle.to_tensor(0.1), scale=paddle.to_tensor([1.0, 2.0]))
>>> print(rv.entropy())
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[2.53102422, 3.22417140])
"""
name = self.name + '_entropy'
......@@ -438,14 +438,14 @@ class Cauchy(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import Cauchy
>>> import paddle
>>> from paddle.distribution import Cauchy
rv = Cauchy(loc=0.1, scale=1.2)
rv_other = Cauchy(loc=paddle.to_tensor(1.2), scale=paddle.to_tensor([2.3, 3.4]))
print(rv.kl_divergence(rv_other))
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.19819736, 0.31532931])
>>> rv = Cauchy(loc=0.1, scale=1.2)
>>> rv_other = Cauchy(loc=paddle.to_tensor(1.2), scale=paddle.to_tensor([2.3, 3.4]))
>>> print(rv.kl_divergence(rv_other))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.19819736, 0.31532931])
"""
name = self.name + '_kl_divergence'
......
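Both entries match the Cauchy–Cauchy closed form KL(p‖q) = log[((γ_p + γ_q)² + (μ_p − μ_q)²) / (4 · γ_p · γ_q)]; a standard-library cross-check:

```python
import math

def cauchy_kl(loc_p, scale_p, loc_q, scale_q):
    num = (scale_p + scale_q) ** 2 + (loc_p - loc_q) ** 2
    return math.log(num / (4.0 * scale_p * scale_q))

print(cauchy_kl(0.1, 1.2, 1.2, 2.3))  # ~ 0.1981974
print(cauchy_kl(0.1, 1.2, 1.2, 3.4))  # ~ 0.3153293
```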
......@@ -31,21 +31,21 @@ class Independent(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import independent
beta = paddle.distribution.Beta(paddle.to_tensor([0.5, 0.5]), paddle.to_tensor([0.5, 0.5]))
print(beta.batch_shape, beta.event_shape)
# (2,) ()
print(beta.log_prob(paddle.to_tensor(0.2)))
# Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [-0.22843921, -0.22843921])
reinterpreted_beta = independent.Independent(beta, 1)
print(reinterpreted_beta.batch_shape, reinterpreted_beta.event_shape)
# () (2,)
print(reinterpreted_beta.log_prob(paddle.to_tensor([0.2, 0.2])))
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# -0.45687842)
>>> import paddle
>>> from paddle.distribution import independent
>>> beta = paddle.distribution.Beta(paddle.to_tensor([0.5, 0.5]), paddle.to_tensor([0.5, 0.5]))
>>> print(beta.batch_shape, beta.event_shape)
(2,) ()
>>> print(beta.log_prob(paddle.to_tensor(0.2)))
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[-0.22843921, -0.22843921])
>>> reinterpreted_beta = independent.Independent(beta, 1)
>>> print(reinterpreted_beta.batch_shape, reinterpreted_beta.event_shape)
() (2,)
>>> print(reinterpreted_beta.log_prob(paddle.to_tensor([0.2, 0.2])))
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-0.45687842)
"""
def __init__(self, base, reinterpreted_batch_rank):
......
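`Independent` reinterprets batch dimensions as event dimensions, so its log-probability is the sum of the base distribution's per-component log-probabilities; the two printed results above are consistent:

```python
# Independent sums the base log-probs over the reinterpreted dimensions
per_component = [-0.22843921, -0.22843921]  # beta.log_prob from the example
print(sum(per_component))                   # -0.45687842, the Independent log_prob
```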
......@@ -53,14 +53,14 @@ def kl_divergence(p, q):
.. code-block:: python
import paddle
>>> import paddle
p = paddle.distribution.Beta(alpha=0.5, beta=0.5)
q = paddle.distribution.Beta(alpha=0.3, beta=0.7)
>>> p = paddle.distribution.Beta(alpha=0.5, beta=0.5)
>>> q = paddle.distribution.Beta(alpha=0.3, beta=0.7)
print(paddle.distribution.kl_divergence(p, q))
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# 0.21193528)
>>> print(paddle.distribution.kl_divergence(p, q))
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
0.21193528)
"""
return _dispatch(type(p), type(q))(p, q)
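For reference, the Beta–Beta KL has a standard closed form in terms of log-beta and digamma functions; a cross-check of the printed value, assuming SciPy is available (an independent sketch, not Paddle's code path):

```python
from scipy.special import betaln, psi  # psi is the digamma function

def beta_kl(a1, b1, a2, b2):
    # standard closed form for KL(Beta(a1, b1) || Beta(a2, b2))
    return (betaln(a2, b2) - betaln(a1, b1)
            + (a1 - a2) * psi(a1)
            + (b1 - b2) * psi(b1)
            + (a2 - a1 + b2 - b1) * psi(a1 + b1))

print(beta_kl(0.5, 0.5, 0.3, 0.7))  # ~ 0.2119353
```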
......@@ -82,11 +82,11 @@ def register_kl(cls_p, cls_q):
Examples:
.. code-block:: python
import paddle
>>> import paddle
@paddle.distribution.register_kl(paddle.distribution.Beta, paddle.distribution.Beta)
def kl_beta_beta():
pass # insert implementation here
>>> @paddle.distribution.register_kl(paddle.distribution.Beta, paddle.distribution.Beta)
>>> def kl_beta_beta():
... pass # insert implementation here
"""
if not issubclass(cls_p, Distribution) or not issubclass(
cls_q, Distribution
......
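`register_kl` populates the (type_p, type_q) table that `_dispatch` consults in `kl_divergence` above; a minimal sketch of that dispatch pattern in generic Python (illustrative names, not Paddle's internals):

```python
_REGISTRY = {}

def register_kl(cls_p, cls_q):
    def decorator(fn):
        _REGISTRY[(cls_p, cls_q)] = fn
        return fn
    return decorator

def kl_divergence(p, q):
    # look up the rule registered for this exact type pair
    return _REGISTRY[(type(p), type(q))](p, q)
```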
......@@ -44,12 +44,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
m.sample() # Laplace distributed with loc=0, scale=1
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 3.68546247)
>>> import paddle
>>> paddle.seed(2023)
>>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
>>> m.sample() # Laplace distributed with loc=0, scale=1
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
1.31554604)
"""
......@@ -173,13 +173,13 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
>>> import paddle
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
value = paddle.to_tensor(0.1)
m.log_prob(value)
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -0.79314721)
>>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
>>> value = paddle.to_tensor(0.1)
>>> m.log_prob(value)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-0.79314721)
"""
loc, scale, value = self._validate_value(value)
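The value matches the Laplace log-density −log(2 · scale) − |x − loc| / scale:

```python
import math

def laplace_log_prob(x, loc, scale):
    # log-density of Laplace: -log(2 * scale) - |x - loc| / scale
    return -math.log(2.0 * scale) - abs(x - loc) / scale

print(laplace_log_prob(0.1, 0.0, 1.0))  # ~ -0.7931472
```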
......@@ -205,12 +205,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
>>> import paddle
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
m.entropy()
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 1.69314718)
>>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
>>> m.entropy()
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
1.69314718)
"""
return 1 + paddle.log(2 * self.scale)
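For loc = 0, scale = 1 this is just 1 + log 2:

```python
import math
print(1.0 + math.log(2.0))  # ~ 1.6931472, the entropy printed above
```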
......@@ -236,13 +236,13 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
>>> import paddle
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
value = paddle.to_tensor(0.1)
m.cdf(value)
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.54758132)
>>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
>>> value = paddle.to_tensor(0.1)
>>> m.cdf(value)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
0.54758132)
"""
loc, scale, value = self._validate_value(value)
iterm = (
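These values follow the Laplace CDF, F(x) = 1/2 − 1/2 · sign(x − loc) · expm1(−|x − loc|/scale):

```python
import math

def laplace_cdf(x, loc, scale):
    # F(x) = 0.5 - 0.5 * sign(x - loc) * expm1(-|x - loc| / scale)
    z = (x - loc) / scale
    return 0.5 - 0.5 * math.copysign(1.0, z) * math.expm1(-abs(z))

print(laplace_cdf(0.1, 0.0, 1.0))  # ~ 0.5475813
```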
......@@ -275,13 +275,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
value = paddle.to_tensor(0.1)
m.icdf(value)
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -1.60943794)
>>> import paddle
>>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
>>> value = paddle.to_tensor(0.1)
>>> m.icdf(value)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-1.60943794)
"""
loc, scale, value = self._validate_value(value)
term = value - 0.5
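And the inverse CDF, loc − scale · sign(p − 1/2) · log1p(−2 · |p − 1/2|), reproduces the printed quantile:

```python
import math

def laplace_icdf(p, loc, scale):
    term = p - 0.5
    return loc - scale * math.copysign(1.0, term) * math.log1p(-2.0 * abs(term))

print(laplace_icdf(0.1, 0.0, 1.0))  # ~ -1.6094379
```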
......@@ -300,12 +299,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
m.sample() # Laplace distributed with loc=0, scale=1
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 3.68546247)
>>> import paddle
>>> paddle.seed(2023)
>>> m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
>>> m.sample() # Laplace distributed with loc=0, scale=1
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
1.31554604)
"""
shape = shape if isinstance(shape, tuple) else tuple(shape)
with paddle.no_grad():
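Sampling here reduces to inverse-transform sampling with the icdf above; a generic sketch of that construction (not necessarily Paddle's exact kernel, which also clamps the uniform draw by an `eps`, as seen in `rsample` below):

```python
import math
import random

def sample_laplace(loc=0.0, scale=1.0):
    u = random.uniform(-0.5, 0.5)  # uniform draw centred on zero
    return loc - scale * math.copysign(1.0, u) * math.log1p(-2.0 * abs(u))

print(sample_laplace())  # one Laplace(0, 1) variate
```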
......@@ -323,12 +322,12 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
m.rsample((1,)) # Laplace distributed with loc=0, scale=1
# Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [[0.04337667]])
>>> import paddle
>>> paddle.seed(2023)
>>> m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
>>> m.rsample((1,)) # Laplace distributed with loc=0, scale=1
Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
[[1.31554604]])
"""
eps = self._get_eps()
......@@ -395,13 +394,13 @@ class Laplace(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
>>> import paddle
m1 = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
m2 = paddle.distribution.Laplace(paddle.to_tensor([1.0]), paddle.to_tensor([0.5]))
m1.kl_divergence(m2)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [1.04261160])
>>> m1 = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
>>> m2 = paddle.distribution.Laplace(paddle.to_tensor([1.0]), paddle.to_tensor([0.5]))
>>> m1.kl_divergence(m2)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[1.04261160])
"""
var_ratio = other.scale / self.scale
......
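The result matches the Laplace–Laplace closed form KL(p‖q) = log(b₂/b₁) + |μ₁ − μ₂|/b₂ + (b₁/b₂) · exp(−|μ₁ − μ₂|/b₁) − 1; a standard-library cross-check:

```python
import math

def laplace_kl(loc1, scale1, loc2, scale2):
    d = abs(loc1 - loc2)
    return (math.log(scale2 / scale1) + d / scale2
            + (scale1 / scale2) * math.exp(-d / scale1) - 1.0)

print(laplace_kl(0.0, 1.0, 1.0, 0.5))  # ~ 1.0426116
```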
......@@ -49,36 +49,44 @@ class LogNormal(TransformedDistribution):
Examples:
.. code-block:: python
import paddle
from paddle.distribution import LogNormal
# Define a single scalar LogNormal distribution.
dist = LogNormal(loc=0., scale=3.)
# Define a batch of two scalar valued LogNormals.
# The underlying Normal of first has mean 1 and standard deviation 11, the underlying Normal of second 2 and 22.
dist = LogNormal(loc=[1., 2.], scale=[11., 22.])
# Get 3 samples, returning a 3 x 2 tensor.
dist.sample((3, ))
# Define a batch of two scalar valued LogNormals.
# Their underlying Normal have mean 1, but different standard deviations.
dist = LogNormal(loc=1., scale=[11., 22.])
# Complete example
value_tensor = paddle.to_tensor([0.8], dtype="float32")
lognormal_a = LogNormal([0.], [1.])
lognormal_b = LogNormal([0.5], [2.])
sample = lognormal_a.sample((2, ))
# a random tensor created by lognormal distribution with shape: [2, 1]
entropy = lognormal_a.entropy()
# [1.4189385] with shape: [1]
lp = lognormal_a.log_prob(value_tensor)
# [-0.72069150] with shape: [1]
p = lognormal_a.probs(value_tensor)
# [0.48641577] with shape: [1]
kl = lognormal_a.kl_divergence(lognormal_b)
# [0.34939718] with shape: [1]
>>> import paddle
>>> from paddle.distribution import LogNormal
>>> # Define a single scalar LogNormal distribution.
>>> dist = LogNormal(loc=0., scale=3.)
>>> # Define a batch of two scalar valued LogNormals.
>>> # The underlying Normal of the first has mean 1 and standard deviation 11; that of the second, mean 2 and standard deviation 22.
>>> dist = LogNormal(loc=[1., 2.], scale=[11., 22.])
>>> # Get 3 samples, returning a 3 x 2 tensor.
>>> dist.sample((3, ))
>>> # Define a batch of two scalar valued LogNormals.
>>> # Their underlying Normals have mean 1, but different standard deviations.
>>> dist = LogNormal(loc=1., scale=[11., 22.])
>>> # Complete example
>>> value_tensor = paddle.to_tensor([0.8], dtype="float32")
>>> lognormal_a = LogNormal([0.], [1.])
>>> lognormal_b = LogNormal([0.5], [2.])
>>> sample = lognormal_a.sample((2, ))
>>> # a random tensor drawn from the LogNormal distribution, with shape [2, 1]
>>> entropy = lognormal_a.entropy()
>>> print(entropy)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[1.41893852])
>>> lp = lognormal_a.log_prob(value_tensor)
>>> print(lp)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[-0.72069150])
>>> p = lognormal_a.probs(value_tensor)
>>> print(p)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.48641577])
>>> kl = lognormal_a.kl_divergence(lognormal_b)
>>> print(kl)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.34939718])
"""
def __init__(self, loc, scale):
......
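The `lp` value can be cross-checked against the LogNormal log-density, i.e. the underlying Normal evaluated at log(x) minus a log(x) Jacobian term:

```python
import math

def lognormal_log_prob(x, loc, scale):
    z = (math.log(x) - loc) / scale
    # underlying Normal log-density at log(x), minus the log(x) Jacobian term
    return -math.log(x) - math.log(scale) - 0.5 * math.log(2.0 * math.pi) - 0.5 * z * z

print(lognormal_log_prob(0.8, 0.0, 1.0))  # ~ -0.7206915
```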
......@@ -53,18 +53,17 @@ class Multinomial(distribution.Distribution):
.. code-block:: python
import paddle
multinomial = paddle.distribution.Multinomial(10, paddle.to_tensor([0.2, 0.3, 0.5]))
print(multinomial.sample((2, 3)))
# Tensor(shape=[2, 3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [[[1., 4., 5.],
# [0., 2., 8.],
# [2., 4., 4.]],
# [[1., 6., 3.],
# [3., 3., 4.],
# [3., 4., 3.]]])
>>> import paddle
>>> paddle.seed(2023)
>>> multinomial = paddle.distribution.Multinomial(10, paddle.to_tensor([0.2, 0.3, 0.5]))
>>> print(multinomial.sample((2, 3)))
Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
[[[1., 5., 4.],
[0., 4., 6.],
[1., 3., 6.]],
[[2., 2., 6.],
[0., 6., 4.],
[3., 3., 4.]]])
"""
def __init__(self, total_count, probs):
......
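Each sampled row is a vector of per-category counts that must sum to `total_count` (10 here); the printed tensor is consistent:

```python
rows = [[1., 5., 4.], [0., 4., 6.], [1., 3., 6.],
        [2., 2., 6.], [0., 6., 4.], [3., 3., 4.]]
assert all(sum(r) == 10 for r in rows)  # every draw distributes all 10 trials
```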
......@@ -54,36 +54,44 @@ class Normal(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
from paddle.distribution import Normal
# Define a single scalar Normal distribution.
dist = Normal(loc=0., scale=3.)
# Define a batch of two scalar valued Normals.
# The first has mean 1 and standard deviation 11, the second 2 and 22.
dist = Normal(loc=[1., 2.], scale=[11., 22.])
# Get 3 samples, returning a 3 x 2 tensor.
dist.sample([3])
# Define a batch of two scalar valued Normals.
# Both have mean 1, but different standard deviations.
dist = Normal(loc=1., scale=[11., 22.])
# Complete example
value_tensor = paddle.to_tensor([0.8], dtype="float32")
normal_a = Normal([0.], [1.])
normal_b = Normal([0.5], [2.])
sample = normal_a.sample([2])
# a random tensor created by normal distribution with shape: [2, 1]
entropy = normal_a.entropy()
# [1.4189385] with shape: [1]
lp = normal_a.log_prob(value_tensor)
# [-1.2389386] with shape: [1]
p = normal_a.probs(value_tensor)
# [0.28969154] with shape: [1]
kl = normal_a.kl_divergence(normal_b)
# [0.34939718] with shape: [1]
>>> import paddle
>>> from paddle.distribution import Normal
>>> # Define a single scalar Normal distribution.
>>> dist = Normal(loc=0., scale=3.)
>>> # Define a batch of two scalar valued Normals.
>>> # The first has mean 1 and standard deviation 11, the second 2 and 22.
>>> dist = Normal(loc=[1., 2.], scale=[11., 22.])
>>> # Get 3 samples, returning a 3 x 2 tensor.
>>> dist.sample([3])
>>> # Define a batch of two scalar valued Normals.
>>> # Both have mean 1, but different standard deviations.
>>> dist = Normal(loc=1., scale=[11., 22.])
>>> # Complete example
>>> value_tensor = paddle.to_tensor([0.8], dtype="float32")
>>> normal_a = Normal([0.], [1.])
>>> normal_b = Normal([0.5], [2.])
>>> sample = normal_a.sample([2])
>>> # a random tensor drawn from the Normal distribution, with shape [2, 1]
>>> entropy = normal_a.entropy()
>>> print(entropy)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[1.41893852])
>>> lp = normal_a.log_prob(value_tensor)
>>> print(lp)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[-1.23893857])
>>> p = normal_a.probs(value_tensor)
>>> print(p)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.28969154])
>>> kl = normal_a.kl_divergence(normal_b)
>>> print(kl)
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.34939718])
"""
def __init__(self, loc, scale, name=None):
......
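The printed values agree with the Normal closed forms for entropy, log-density, and KL divergence; a standard-library cross-check:

```python
import math

mu_a, sigma_a = 0.0, 1.0   # normal_a
mu_b, sigma_b = 0.5, 2.0   # normal_b
x = 0.8

entropy = 0.5 * math.log(2.0 * math.pi * math.e * sigma_a ** 2)
log_prob = (-0.5 * math.log(2.0 * math.pi * sigma_a ** 2)
            - (x - mu_a) ** 2 / (2 * sigma_a ** 2))
kl = (math.log(sigma_b / sigma_a)
      + (sigma_a ** 2 + (mu_a - mu_b) ** 2) / (2 * sigma_b ** 2) - 0.5)

print(entropy, log_prob, kl)  # ~ 1.4189385  -1.2389385  0.3493972
```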
......@@ -29,21 +29,24 @@ class TransformedDistribution(distribution.Distribution):
.. code-block:: python
import paddle
from paddle.distribution import transformed_distribution
d = transformed_distribution.TransformedDistribution(
paddle.distribution.Normal(0., 1.),
[paddle.distribution.AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))]
)
print(d.sample([10]))
# Tensor(shape=[10], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [-0.10697651, 3.33609009, -0.86234951, 5.07457638, 0.75925219,
# -4.17087793, 2.22579336, -0.93845034, 0.66054249, 1.50957513])
print(d.log_prob(paddle.to_tensor(0.5)))
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# -1.64333570)
>>> import paddle
>>> paddle.seed(2023)
>>> from paddle.distribution import transformed_distribution
>>> d = transformed_distribution.TransformedDistribution(
... paddle.distribution.Normal(0., 1.),
... [paddle.distribution.AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))]
... )
>>> # doctest: +SKIP('random sample')
>>> print(d.sample([10]))
Tensor(shape=[10], dtype=float32, place=Place(cpu), stop_gradient=True,
[ 3.22699189, 1.12264419, 0.50283587, 1.83812487, -2.00740123,
-2.70338631, 1.26663208, 4.47909021, -0.11529565, 4.32719326])
>>> print(d.log_prob(paddle.to_tensor(0.5)))
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-1.64333570)
>>> # doctest: -SKIP
"""
def __init__(self, base, transforms):
......
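The log-probability above is consistent with the affine change-of-variables rule, log p_Y(y) = log p_X((y − loc)/scale) − log|scale|, applied to the base Normal(0, 1):

```python
import math

loc, scale, y = 1.0, 2.0, 0.5
x = (y - loc) / scale  # pull y back through the affine map
base_log_prob = -0.5 * math.log(2.0 * math.pi) - 0.5 * x * x
print(base_log_prob - math.log(abs(scale)))  # ~ -1.6433357
```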