Unverified commit 50f0acc0, authored by zhouweiwei2014 and committed by GitHub

[Zero-Dim] update 0d tensor api en doc, test=document_fix (#53823)

Parent 640cff0a
......@@ -393,7 +393,7 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank):
for var_name in act_grad_names:
var_dim_mapping = op_dist_attr.get_input_dims_mapping(var_name)
# consider that the variable's shape is [], which is 0D
# consider that the variable's shape is [], which is 0-D
# TODO utilize the batch_dim attr instead of "0" in future
batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1
......
......@@ -108,7 +108,7 @@ def _all_gather_in_static_mode(tensor_list, tensor, group, sync_op):
},
)
tensor_list.clear()
# 0D use stack/unstack while others use concat/split
# 0-D use stack/unstack while others use concat/split
if len(tensor.shape) == 0:
tensor_list.extend(paddle.unstack(out, 0))
else:
......
......@@ -78,7 +78,7 @@ def _all_to_all_in_static_mode(
if isinstance(in_tensor_or_tensor_list, list):
if len(in_tensor_or_tensor_list) == 0:
raise RuntimeError("The input tensor_list should not be empty.")
# 0D use stack/unstack while others use concat/split
# 0-D use stack/unstack while others use concat/split
if len(in_tensor_or_tensor_list[0].shape) == 0:
in_tensor = paddle.stack(in_tensor_or_tensor_list, axis=0)
else:
......@@ -115,7 +115,7 @@ def _all_to_all_in_static_mode(
if isinstance(out_tensor_or_tensor_list, list):
if not sync_op:
dist.wait(out_tensor, use_calc_stream=False)
# 0D use stack/unstack while others use concat/split
# 0-D use stack/unstack while others use concat/split
if len(in_tensor_or_tensor_list[0].shape) == 0:
out_tensor_or_tensor_list.extend(paddle.unstack(out_tensor, 0))
else:
......
......@@ -91,7 +91,7 @@ def _scatter_in_static_mode(
)
else:
tensor_list = [tensor for _ in range(nranks)]
# 0D use stack/unstack while others use concat/split
# 0-D use stack/unstack while others use concat/split
if len(tensor_list[0].shape) == 0:
input_tensor = paddle.stack(tensor_list, axis=0)
else:
......
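The four hunks above apply the same rule. As a standalone illustrative aside (not part of the commit, plain single-process Paddle, no distributed setup), 0-D tensors are packed with paddle.stack and taken apart with paddle.unstack, because concat/split operate along an axis that a 0-D tensor does not have:

import paddle

# Two 0-D tensors standing in for the per-rank values a collective would gather.
scalars = [paddle.to_tensor(1.0), paddle.to_tensor(2.0)]

# concat/split need an axis, which a 0-D tensor lacks, so stack/unstack are used.
packed = paddle.stack(scalars, axis=0)       # shape [2]
unpacked = paddle.unstack(packed, axis=0)    # a list of two 0-D tensors again

print(packed.shape)        # [2]
print(unpacked[0].shape)   # []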
......@@ -79,16 +79,16 @@ class Bernoulli(exponential_family.ExponentialFamily):
rv = Bernoulli(probs=0.3)
print(rv.mean)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.30000001])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.30000001)
print(rv.variance)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.21000001])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.21000001)
print(rv.entropy())
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.61086434])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.61086434)
"""
def __init__(self, probs, name=None):
......@@ -247,12 +247,12 @@ class Bernoulli(exponential_family.ExponentialFamily):
# The smaller the `temperature`, the distribution of `rsample` closer to `sample`, with `probs` of 0.3.
print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=1.0)).sum())
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [361.06829834])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 361.06829834)
print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=0.1)).sum())
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [288.66418457])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 288.66418457)
"""
name = self.name + '_rsample'
if not _non_static_mode():
......@@ -420,8 +420,8 @@ class Bernoulli(exponential_family.ExponentialFamily):
rv = Bernoulli(0.3)
print(rv.entropy())
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.61086434])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.61086434)
"""
name = self.name + '_entropy'
......@@ -455,8 +455,8 @@ class Bernoulli(exponential_family.ExponentialFamily):
rv_other = Bernoulli(0.7)
print(rv.kl_divergence(rv_other))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.33891910])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.33891910)
"""
name = self.name + '_kl_divergence'
if not _non_static_mode():
......
......@@ -61,13 +61,13 @@ class Beta(exponential_family.ExponentialFamily):
beta = paddle.distribution.Beta(alpha=0.5, beta=0.5)
print(beta.mean)
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [0.50000000])
# 0.50000000)
print(beta.variance)
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [0.12500000])
# 0.12500000)
print(beta.entropy())
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [0.12500000])
# 0.12500000)
# tensor input with broadcast
beta = paddle.distribution.Beta(alpha=paddle.to_tensor([0.2, 0.4]), beta=0.6)
......
......@@ -45,7 +45,7 @@ class Cauchy(distribution.Distribution):
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.entropy())
# Tensor(shape=1, dtype=float32, place=Place(cpu), stop_gradient=True,
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 2.71334577)
# init Cauchy with N-Dim tensor
......@@ -228,8 +228,8 @@ class Cauchy(distribution.Distribution):
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.prob(paddle.to_tensor(1.5)))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.11234467])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.11234467)
# broadcast to value
rv = Cauchy(loc=0.1, scale=1.2)
......@@ -277,8 +277,8 @@ class Cauchy(distribution.Distribution):
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.log_prob(paddle.to_tensor(1.5)))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-2.18618369])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -2.18618369)
# broadcast to value
rv = Cauchy(loc=0.1, scale=1.2)
......@@ -344,8 +344,8 @@ class Cauchy(distribution.Distribution):
# init Cauchy with float
rv = Cauchy(loc=0.1, scale=1.2)
print(rv.cdf(paddle.to_tensor(1.5)))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.77443725])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.77443725)
# broadcast to value
rv = Cauchy(loc=0.1, scale=1.2)
......
......@@ -63,10 +63,10 @@ class Dirichlet(exponential_family.ExponentialFamily):
print(dirichlet.entropy())
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [-1.24434423])
# -1.24434423)
print(dirichlet.prob(paddle.to_tensor([.3, .5, .6])))
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [10.80000114])
# 10.80000114)
"""
......
......@@ -55,16 +55,16 @@ class Geometric(distribution.Distribution):
geom = Geometric(0.5)
geom.mean
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [2.])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 2.)
geom.variance
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [2.])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 2.)
geom.stddev
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [1.41421354])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 1.41421354)
"""
def __init__(self, probs):
......@@ -145,8 +145,8 @@ class Geometric(distribution.Distribution):
geom = Geometric(0.5)
geom.pmf(2)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.25000000])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.25000000)
"""
if isinstance(k, (numbers.Integral, framework.Variable)):
return paddle.pow((1.0 - self.probs), k - 1.0) * self.probs
......@@ -176,8 +176,8 @@ class Geometric(distribution.Distribution):
geom = Geometric(0.5)
geom.log_pmf(2)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-1.38629436])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -1.38629436)
"""
if isinstance(k, (numbers.Integral, framework.Variable)):
return paddle.log(self.pmf(k))
......@@ -266,8 +266,8 @@ class Geometric(distribution.Distribution):
geom = Geometric(0.5)
geom.entropy()
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [1.38629436])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 1.38629436)
"""
x = (1.0 - self.probs) * paddle.log(1.0 - self.probs)
y = self.probs * paddle.log(self.probs)
......@@ -296,8 +296,8 @@ class Geometric(distribution.Distribution):
geom = Geometric(0.5)
geom.cdf(4)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.93750000])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.93750000)
"""
if isinstance(k, (numbers.Integral, framework.Variable)):
return 1.0 - paddle.pow((1.0 - self.probs), k)
......@@ -329,8 +329,8 @@ class Geometric(distribution.Distribution):
geom_p = Geometric(0.5)
geom_q = Geometric(0.1)
geom_p.kl_divergence(geom_q)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.51082563])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.51082563)
"""
if isinstance(other, Geometric):
p, q = self.probs, other.probs
......
......@@ -61,7 +61,7 @@ class Gumbel(TransformedDistribution):
dist.cdf(value)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [0.54523915])
dist.entropy()
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, [1.57721567])
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [1.57721567])
dist.rsample([2])
# Tensor(shape=[2, 1], dtype=float32, place=Place(gpu:0), stop_gradient=True, [[0.80463481], [0.91893655]])
......
......@@ -44,8 +44,8 @@ class Independent(distribution.Distribution):
print(reinterpreted_beta.batch_shape, reinterpreted_beta.event_shape)
# () (2,)
print(reinterpreted_beta.log_prob(paddle.to_tensor([0.2, 0.2])))
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [-0.45687842])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# -0.45687842)
"""
def __init__(self, base, reinterpreted_batch_rank):
......
......@@ -60,7 +60,7 @@ def kl_divergence(p, q):
print(paddle.distribution.kl_divergence(p, q))
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [0.21193528])
# 0.21193528)
"""
return _dispatch(type(p), type(q))(p, q)
......
......@@ -46,10 +46,10 @@ class Laplace(distribution.Distribution):
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
m.sample() # Laplace distributed with loc=0, scale=1
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# [3.68546247])
# 3.68546247)
"""
......@@ -175,11 +175,11 @@ class Laplace(distribution.Distribution):
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
value = paddle.to_tensor([0.1])
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
value = paddle.to_tensor(0.1)
m.log_prob(value)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-0.79314721])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -0.79314721)
"""
loc, scale, value = self._validate_value(value)
......@@ -207,10 +207,10 @@ class Laplace(distribution.Distribution):
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
m.entropy()
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# [1.69314718])
# 1.69314718)
"""
return 1 + paddle.log(2 * self.scale)
......@@ -238,11 +238,11 @@ class Laplace(distribution.Distribution):
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
value = paddle.to_tensor([0.1])
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
value = paddle.to_tensor(0.1)
m.cdf(value)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.54758132])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.54758132)
"""
loc, scale, value = self._validate_value(value)
iterm = (
......@@ -277,11 +277,11 @@ class Laplace(distribution.Distribution):
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
value = paddle.to_tensor([0.1])
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
value = paddle.to_tensor(0.1)
m.icdf(value)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-1.60943794])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# -1.60943794)
"""
loc, scale, value = self._validate_value(value)
term = value - 0.5
......@@ -302,10 +302,10 @@ class Laplace(distribution.Distribution):
import paddle
m = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
m = paddle.distribution.Laplace(paddle.to_tensor(0.0), paddle.to_tensor(1.0))
m.sample() # Laplace distributed with loc=0, scale=1
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# [3.68546247])
# 3.68546247)
"""
shape = shape if isinstance(shape, tuple) else tuple(shape)
with paddle.no_grad():
......@@ -400,7 +400,7 @@ class Laplace(distribution.Distribution):
m1 = paddle.distribution.Laplace(paddle.to_tensor([0.0]), paddle.to_tensor([1.0]))
m2 = paddle.distribution.Laplace(paddle.to_tensor([1.0]), paddle.to_tensor([0.5]))
m1.kl_divergence(m2)
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [1.04261160])
"""
......
......@@ -72,13 +72,13 @@ class LogNormal(TransformedDistribution):
sample = lognormal_a.sample((2, ))
# a random tensor created by lognormal distribution with shape: [2, 1]
entropy = lognormal_a.entropy()
# [1.4189385] with shape: []
# [1.4189385] with shape: [1]
lp = lognormal_a.log_prob(value_tensor)
# [-0.72069150] with shape: [1]
p = lognormal_a.probs(value_tensor)
# [0.48641577] with shape: [1]
kl = lognormal_a.kl_divergence(lognormal_b)
# [0.34939718] with shape: []
# [0.34939718] with shape: [1]
"""
def __init__(self, loc, scale):
......
......@@ -77,13 +77,13 @@ class Normal(distribution.Distribution):
sample = normal_a.sample([2])
# a random tensor created by normal distribution with shape: [2, 1]
entropy = normal_a.entropy()
# [1.4189385] with shape: []
# [1.4189385] with shape: [1]
lp = normal_a.log_prob(value_tensor)
# [-1.2389386] with shape: [1]
p = normal_a.probs(value_tensor)
# [0.28969154] with shape: [1]
kl = normal_a.kl_divergence(normal_b)
# [0.34939718] with shape: []
# [0.34939718] with shape: [1]
"""
def __init__(self, loc, scale, name=None):
......
......@@ -435,8 +435,8 @@ class AffineTransform(Transform):
# Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1., 2.])
print(affine.forward_log_det_jacobian(x))
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.)
"""
_type = Type.BIJECTION
......@@ -1189,8 +1189,8 @@ class StickBreakingTransform(Transform):
# Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.99999988, 2. , 2.99999881])
print(t.forward_log_det_jacobian(x))
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [-9.10835075])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# -9.10835075)
"""
_type = Type.BIJECTION
......
......@@ -42,8 +42,8 @@ class TransformedDistribution(distribution.Distribution):
# [-0.10697651, 3.33609009, -0.86234951, 5.07457638, 0.75925219,
# -4.17087793, 2.22579336, -0.93845034, 0.66054249, 1.50957513])
print(d.log_prob(paddle.to_tensor(0.5)))
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [-1.64333570])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# -1.64333570)
"""
def __init__(self, base, transforms):
......
......@@ -84,7 +84,7 @@ class Uniform(distribution.Distribution):
sample = uniform.sample([2])
# a random tensor created by uniform distribution with shape: [2, 1]
entropy = uniform.entropy()
# [0.6931472] with shape: []
# [0.6931472] with shape: [1]
lp = uniform.log_prob(value_tensor)
# [-0.6931472] with shape: [1]
p = uniform.probs(value_tensor)
......
......@@ -387,7 +387,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
def _create_loss_op_desc_(loss):
# 0D Tensor or 0-Size Tensor
# 0-D Tensor or 0-Size Tensor
if len(loss.shape) == 0 or 0 in loss.shape:
create_shape = loss.shape
else:
......
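A minimal sketch (an illustrative aside, not part of the commit) of the two cases the updated comment distinguishes: a 0-D Tensor has shape [] and exactly one element, while a 0-size Tensor has a zero somewhere in its shape and no elements.

import paddle

zero_dim = paddle.to_tensor(1.0)     # shape [], one element
zero_size = paddle.zeros([0, 3])     # shape [0, 3], zero elements

print(len(zero_dim.shape) == 0)      # True  -> 0-D Tensor
print(0 in zero_size.shape)          # True  -> 0-size Tensor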
......@@ -789,7 +789,7 @@ class ReduceLROnPlateau(LearningRateDecay):
Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate
by 2 to 10 times once model performance has no longer improvement.
The ``loss`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``loss``
The ``loss`` is the one which has been passed into ``step`` , it must be a 0-D Tensor with shape []. When ``loss``
stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * decay_rate`` .
(Specially, ``mode`` can also be set to ``'max`` , in this case, when ``loss`` stop ascending for a ``patience`` number
of epochs, the learning rate will be reduced.)
......@@ -943,7 +943,7 @@ class ReduceLROnPlateau(LearningRateDecay):
Args:
loss (Variable): A ``Variable`` that will be monitored to determine whether the learning rate will reduce.
If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. It should
be 1-D Tensor with shape [1].
be 0-D Tensor with shape [].
Specially, if ``mode`` has been set to ``'max'`` , the learning rate will reduce when it stops ascending.
Returns:
None
......@@ -952,7 +952,7 @@ class ReduceLROnPlateau(LearningRateDecay):
Please refer to the example of current LearningRateDecay.
"""
# loss must be 1-D Tensor with shape [1]
# loss.size must be 1
check_type(loss, 'loss', Variable, 'ReduceLROnPlateau.step')
assert np.prod(loss.shape) == 1, (
"The number of elements of loss should be 1, but the current loss.shape is {}, whose number of elements is not 1. "
......
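For reference, a hedged sketch using the current-API counterpart paddle.optimizer.lr.ReduceLROnPlateau rather than the legacy class in this hunk (the substitution is an assumption for illustration): the monitored loss can now be the 0-D Tensor that reductions such as paddle.mean return.

import paddle

linear = paddle.nn.Linear(10, 1)
scheduler = paddle.optimizer.lr.ReduceLROnPlateau(learning_rate=0.1, patience=2)
opt = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())

x = paddle.rand([4, 10])
loss = paddle.mean(linear(x))   # a 0-D Tensor with shape []
loss.backward()
opt.step()
opt.clear_grad()
scheduler.step(loss)            # the 0-D loss is accepted as the monitored metric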
......@@ -131,7 +131,7 @@ def monkey_patch_math_tensor():
return int(np.array(var).flatten()[0])
def _len_(var):
assert var.ndim > 0, "len() of a 0D tensor is wrong"
assert var.ndim > 0, "len() of a 0-D tensor is wrong"
if var.type == core.VarDesc.VarType.VOCAB:
return len(var.value().get_map_tensor())
elif var.type == core.VarDesc.VarType.STRINGS:
......
......@@ -516,7 +516,7 @@ def monkey_patch_tensor():
y = paddle.pow(x, 4.0)
y.backward()
print("grad of x: {}".format(x.grad))
# Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, [500.])
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=False, 500.)
"""
msg = (
......@@ -638,12 +638,12 @@ def monkey_patch_tensor():
y = copy.deepcopy(x)
print(x)
# Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True,
# [2.])
# Tensor(shape=[], dtype=float32, place=CPUPlace, stop_gradient=True,
# 2.)
print(y)
# Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True,
# [2.])
# Tensor(shape=[], dtype=float32, place=CPUPlace, stop_gradient=True,
# 2.)
"""
if not self.is_leaf:
......
......@@ -2476,7 +2476,7 @@ class Variable(metaclass=VariableMetaClass):
def size(self):
"""
Returns the number of elements for current Variable, which is a int64 Variable with shape [1]
Returns the number of elements for current Variable, which is an int64 Variable with shape [].
Returns:
Variable, the number of elements for current Variable
......
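The dygraph counterpart of the documented behavior, as a hedged sketch (paddle.numel is used here for illustration; it is not the static Variable.size API in this hunk): the element count comes back as an int64 Tensor with shape [].

import paddle

x = paddle.rand([3, 4])
n = paddle.numel(x)
print(n.shape, int(n))   # [] 12  -- a 0-D int64 Tensor holding the element count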
......@@ -120,7 +120,6 @@ class TestNumelAPI(unittest.TestCase):
},
fetch_list=[out_1, out_2],
)
# TODO(zhouwei): will change shape [1] to [] to support zero-dim
assert np.array_equal(
res_1, np.array(np.size(input_1)).astype("int64")
)
......
......@@ -83,7 +83,6 @@ class TestSizeAPI(unittest.TestCase):
},
fetch_list=[out_1, out_2],
)
# TODO(zhouwei): will change shape [1] to [] to support zero-dim
assert np.array_equal(
res_1, np.array(np.size(input_1)).astype("int64")
)
......
......@@ -517,7 +517,7 @@ def convert_len(var):
`shape_op` in var.block.
"""
if isinstance(var, Variable):
assert var.ndim > 0, "len() of a 0D tensor is wrong"
assert var.ndim > 0, "len() of a 0-D tensor is wrong"
if var.type in [
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.SELECTED_ROWS,
......
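A small sketch of the rule both assertions enforce (an illustrative aside, not part of the commit): len() is only defined for tensors with at least one dimension, so a 0-D tensor rejects it.

import paddle

vec = paddle.to_tensor([1.0, 2.0, 3.0])
print(len(vec))                  # 3, the size of the first dimension

scalar = paddle.to_tensor(1.0)   # 0-D tensor, no dimensions
try:
    len(scalar)
except Exception as err:         # the exact exception type depends on the execution mode
    print(err)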
......@@ -798,7 +798,7 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None):
predictions = paddle.to_tensor([[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], dtype='float32')
label = paddle.to_tensor([[2], [0]], dtype="int64")
result = paddle.metric.accuracy(input=predictions, label=label, k=1)
# [0.5]
# 0.5
"""
if label.dtype == paddle.int32:
label = paddle.cast(label, paddle.int64)
......
......@@ -61,7 +61,7 @@ def dice_loss(input, label, epsilon=0.00001, name=None):
For more information, please refer to :ref:`api_guide_Name`
Returns:
Tensor, which shape is [1], data type is the same as `input` .
0-D Tensor, whose shape is [], data type is the same as `input` .
Example:
.. code-block:: python
......@@ -327,7 +327,7 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
Returns:
A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1].
A 0-D Tensor representing the npair loss, the data type is the same as anchor, the shape is [].
Examples:
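A hedged sketch of the 0-D return documented above for npair_loss (dice_loss behaves the same way); the shapes and random inputs are illustrative only, not part of the commit.

import paddle
import paddle.nn.functional as F

anchor = paddle.rand([4, 8])      # [batch_size, feature_dim]
positive = paddle.rand([4, 8])
labels = paddle.rand([4])         # 1-D labels, one per sample
loss = F.npair_loss(anchor, positive, labels)
print(loss.shape)                 # [] -- a 0-D Tensor, per the updated doc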
......@@ -634,7 +634,7 @@ def binary_cross_entropy(
input = paddle.to_tensor([0.5, 0.6, 0.7], 'float32')
label = paddle.to_tensor([1.0, 0.0, 1.0], 'float32')
output = paddle.nn.functional.binary_cross_entropy(input, label)
print(output) # [0.65537095]
print(output) # 0.65537095
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -774,7 +774,7 @@ def binary_cross_entropy_with_logits(
logit = paddle.to_tensor([5.0, 1.0, 3.0])
label = paddle.to_tensor([1.0, 0.0, 1.0])
output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label)
print(output) # [0.45618808]
print(output) # 0.45618808
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -1077,7 +1077,7 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
label = paddle.rand([3, 3]).astype('float32')
output = paddle.nn.functional.smooth_l1_loss(input, label)
print(output)
# [0.068004]
# 0.068004
"""
if in_dygraph_mode():
......@@ -1147,7 +1147,7 @@ def margin_ranking_loss(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[]`, otherwise the shape is the same as `input` . The same dtype as input tensor.
Examples:
......@@ -1159,7 +1159,7 @@ def margin_ranking_loss(
other = paddle.to_tensor([[2, 1], [2, 4]], dtype='float32')
label = paddle.to_tensor([[1, -1], [-1, -1]], dtype='float32')
loss = paddle.nn.functional.margin_ranking_loss(input, other, label)
print(loss) # [0.75]
print(loss) # 0.75
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......@@ -1264,7 +1264,7 @@ def l1_loss(input, label, reduction='mean', name=None):
Returns:
Tensor, the L1 Loss of Tensor ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [].
Examples:
.. code-block:: python
......@@ -1276,8 +1276,8 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label)
print(l1_loss)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.34999999])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.34999999)
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='none')
print(l1_loss)
......@@ -1287,8 +1287,8 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
print(l1_loss)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.39999998])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.39999998)
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -1377,7 +1377,7 @@ def nll_loss(
log_out = log_softmax(input)
label = paddle.to_tensor([0, 2, 1, 1, 0], "int64")
result = nll_loss(log_out, label)
print(result) # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True, [1.07202101])
print(result) # Tensor(shape=[], dtype=float32, place=CPUPlace, stop_gradient=True, 1.07202101)
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......@@ -1578,9 +1578,9 @@ def kl_div(input, label, reduction='mean', name=None):
If `reduction` is ``'none'``, the output loss is the same shape as the input, and the loss at each point is calculated separately. There is no reduction to the result.
If `reduction` is ``'mean'``, the output loss is the shape of [1], and the output is the average of all losses.
If `reduction` is ``'mean'``, the output loss is the shape of [], and the output is the average of all losses.
If `reduction` is ``'sum'``, the output loss is the shape of [1], and the output is the sum of all losses.
If `reduction` is ``'sum'``, the output loss is the shape of [], and the output is the sum of all losses.
If `reduction` is ``'batchmean'``, the output loss is the shape of [N], N is the batch size, and the output is the sum of all losses divided by the batch size.
......@@ -1611,17 +1611,17 @@ def kl_div(input, label, reduction='mean', name=None):
x = paddle.uniform(shape, min=-10, max=10).astype('float32')
target = paddle.uniform(shape, min=-10, max=10).astype('float32')
# 'batchmean' reduction, loss shape will be [1]
# 'batchmean' reduction, loss shape will be [], which is a 0-D Tensor
pred_loss = F.kl_div(x, target, reduction='batchmean')
# shape=[1]
# shape=[]
# 'mean' reduction, loss shape will be [1]
# 'mean' reduction, loss shape will be [], which is a 0-D Tensor
pred_loss = F.kl_div(x, target, reduction='mean')
# shape=[1]
# shape=[]
# 'sum' reduction, loss shape will be [1]
# 'sum' reduction, loss shape will be [], which is a 0-D Tensor
pred_loss = F.kl_div(x, target, reduction='sum')
# shape=[1]
# shape=[]
# 'none' reduction, loss shape is same with input shape
pred_loss = F.kl_div(x, target, reduction='none')
......@@ -1724,7 +1724,7 @@ def mse_loss(input, label, reduction='mean', name=None):
label = paddle.to_tensor(1.7)
output = mse_loss(input, label)
print(output)
# [0.04000002]
# 0.04000002
"""
......@@ -1780,7 +1780,7 @@ def ctc_loss(
norm_by_times (bool, optional): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if reduction mode is 'mean'. Default: False.
Returns:
Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is []. Data type is the same as ``log_probs``.
Examples:
......@@ -1834,8 +1834,8 @@ def ctc_loss(
blank=0,
reduction='mean')
print(loss)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.13760614])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.13760614)
"""
......@@ -1929,7 +1929,7 @@ def rnnt_loss(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor, The RNN-T loss between ``logprobs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``logprobs``.
Tensor, The RNN-T loss between ``logprobs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is []. Data type is the same as ``logprobs``.
Examples:
......@@ -1961,8 +1961,8 @@ def rnnt_loss(
costs = fn(acts, labels, lengths, label_lengths)
print(costs)
# Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=False,
# [4.49566677])
# Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=False,
# 4.49566677)
"""
def warprnnt(
......@@ -2078,7 +2078,7 @@ def margin_cross_entropy(
softmax is shard_softmax when using model parallel, otherwise
softmax is in the same shape with input logits. If
``reduction == None``, the shape of loss is ``[N, 1]``, otherwise
the shape is ``[1]``.
the shape is ``[]``.
Examples:
......@@ -2633,8 +2633,8 @@ def cross_entropy(
input,
label)
print(dy_ret)
# Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# [5.34043430])
# Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# 5.34043430)
.. code-block:: python
......@@ -2659,8 +2659,8 @@ def cross_entropy(
weight=weight,
reduction=reduction)
print(paddle_loss_mean)
# Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# [1.11043464])
# Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# 1.11043464)
"""
......@@ -3012,7 +3012,7 @@ def sigmoid_focal_loss(
For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as ``logit``. The same dtype as ``logit`` tensor.
Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[]`, otherwise the shape is the same as ``logit``. The same dtype as ``logit`` tensor.
Examples:
......@@ -3026,7 +3026,7 @@ def sigmoid_focal_loss(
fg_label = paddle.greater_equal(label, one)
fg_num = paddle.sum(paddle.cast(fg_label, dtype='float32'))
output = paddle.nn.functional.sigmoid_focal_loss(logit, label, normalizer=fg_num)
print(output) # [0.65782464]
print(output) # 0.65782464
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -3183,7 +3183,7 @@ def multi_label_soft_margin_loss(
# Tensor([3.49625897, 0.71111226, 0.43989015])
loss = F.multi_label_soft_margin_loss(input, label, reduction='mean')
print(loss)
# Tensor([1.54908717])
# Tensor(1.54908717)
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......@@ -3307,7 +3307,7 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None):
loss = F.hinge_embedding_loss(input, label, margin=1.0, reduction='mean')
print(loss)
# Tensor([0.22222222])
# Tensor(0.22222222)
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -3377,7 +3377,7 @@ def cosine_embedding_loss(
Returns:
Tensor, the cosine embedding Loss of Tensor ``input1`` ``input2`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N], the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [].
Examples:
.. code-block:: python
......@@ -3389,10 +3389,10 @@ def cosine_embedding_loss(
label = paddle.to_tensor([1, -1], 'int64')
output = paddle.nn.functional.cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='mean')
print(output) # [0.21155193]
print(output) # 0.21155193
output = paddle.nn.functional.cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='sum')
print(output) # [0.42310387]
print(output) # 0.42310387
output = paddle.nn.functional.cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='none')
print(output) # [0.42310387, 0. ]
......@@ -3528,7 +3528,7 @@ def triplet_margin_with_distance_loss(
loss = F.triplet_margin_with_distance_loss(input, positive, negative, margin=1.0, reduction='mean')
print(loss)
# Tensor([0.19165580])
# Tensor(0.19165580)
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -3678,7 +3678,7 @@ def triplet_margin_loss(
loss = F.triplet_margin_loss(input, positive, negative, margin=1.0, reduction='mean')
print(loss)
# Tensor([0.19165580])
# Tensor(0.19165580)
"""
if reduction not in ['sum', 'mean', 'none']:
......@@ -3886,7 +3886,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Returns:
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [].
Examples:
.. code-block:: python
......@@ -3897,8 +3897,8 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
label = paddle.to_tensor([[1.0, -1.0, 1.0],[-1.0, 1.0, 1.0]], 'float32')
output = paddle.nn.functional.soft_margin_loss(input, label)
print(output)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.64022040])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.64022040)
input = paddle.uniform(shape=(5, 5), dtype="float32", min=0.1, max=0.8)
label = paddle.randint(0, 2, shape=(5, 5), dtype="int64")
......@@ -3998,7 +3998,7 @@ def gaussian_nll_loss(
Returns:
output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [].
Examples::
.. code-block:: python
......
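A hedged sketch of the documented return (an illustrative aside, not part of the commit; the shapes are arbitrary): with the default 'mean' reduction, gaussian_nll_loss reduces to a 0-D Tensor.

import paddle
import paddle.nn.functional as F

input = paddle.rand([5, 2])
label = paddle.rand([5, 2])
variance = paddle.ones([5, 2])
loss = F.gaussian_nll_loss(input, label, variance)   # default reduction='mean'
print(loss.shape)    # [] -- a 0-D Tensor, per the updated doc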
......@@ -98,8 +98,8 @@ class BCEWithLogitsLoss(Layer):
bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
output = bce_logit_loss(logit, label)
print(output)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.45618814])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.45618814)
"""
......@@ -319,8 +319,8 @@ class CrossEntropyLoss(Layer):
input,
label)
print(dy_ret)
# Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# [5.34043430])
# Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# 5.34043430)
.. code-block:: python
......@@ -345,8 +345,8 @@ class CrossEntropyLoss(Layer):
weight=weight,
reduction=reduction)
print(paddle_loss_mean)
# Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# [1.11043464])
# Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=True,
# 1.11043464)
"""
......@@ -564,7 +564,7 @@ class MSELoss(Layer):
label = paddle.to_tensor([1.7])
output = mse_loss(input, label)
print(output)
# [0.04000002]
# 0.04000002
"""
......@@ -637,7 +637,7 @@ class L1Loss(Layer):
- label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64.
- output (Tensor): The L1 Loss of ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [].
Examples:
.. code-block:: python
......@@ -650,14 +650,14 @@ class L1Loss(Layer):
l1_loss = paddle.nn.L1Loss()
output = l1_loss(input, label)
print(output)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.34999999])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.34999999)
l1_loss = paddle.nn.L1Loss(reduction='sum')
output = l1_loss(input, label)
print(output)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.39999998])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.39999998)
l1_loss = paddle.nn.L1Loss(reduction='none')
output = l1_loss(input, label)
......@@ -747,8 +747,8 @@ class BCELoss(Layer):
bce_loss = paddle.nn.BCELoss()
output = bce_loss(input, label)
print(output)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.65537101])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.65537101)
"""
......@@ -835,7 +835,7 @@ class NLLLoss(Layer):
The data type is int64.
- output (Tensor): the `negative log likelihood loss` between input `x` and `label`.
If `reduction` is `'none'`, the shape is `[N, *]`.
If `reduction` is `'sum'` or `'mean'`, the shape is `[1]`.
If `reduction` is `'sum'` or `'mean'`, the shape is `[]`.
Examples:
.. code-block:: python
......@@ -853,7 +853,7 @@ class NLLLoss(Layer):
log_out = log_softmax(input)
label = paddle.to_tensor([0, 2, 1, 1, 0], "int64")
result = nll_loss(log_out, label)
print(result) # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True, [1.07202101])
print(result) # Tensor(shape=[], dtype=float32, place=CPUPlace, stop_gradient=True, 1.07202101)
"""
......@@ -991,9 +991,9 @@ class KLDivLoss(Layer):
If `reduction` is ``'none'``, the output loss is the same shape as the input, and the loss at each point is calculated separately. There is no reduction to the result.
If `reduction` is ``'mean'``, the output loss is the shape of [1], and the output is the average of all losses.
If `reduction` is ``'mean'``, the output loss is the shape of [], and the output is the average of all losses.
If `reduction` is ``'sum'``, the output loss is the shape of [1], and the output is the sum of all losses.
If `reduction` is ``'sum'``, the output loss is the shape of [], and the output is the sum of all losses.
If `reduction` is ``'batchmean'``, the output loss is the shape of [N], N is the batch size, and the output is the sum of all losses divided by the batch size.
......@@ -1012,7 +1012,7 @@ class KLDivLoss(Layer):
label (Tensor): ``(N, *)``, same shape as input.
output (Tensor): tensor with shape: [1] by default.
output (Tensor): tensor with shape: [] by default.
Examples:
.. code-block:: python
......@@ -1024,20 +1024,20 @@ class KLDivLoss(Layer):
x = paddle.uniform(shape, min=-10, max=10).astype('float32')
target = paddle.uniform(shape, min=-10, max=10).astype('float32')
# 'batchmean' reduction, loss shape will be [1]
# 'batchmean' reduction, loss shape will be []
kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
pred_loss = kldiv_criterion(x, target)
# shape=[1]
# shape=[]
# 'mean' reduction, loss shape will be [1]
# 'mean' reduction, loss shape will be []
kldiv_criterion = nn.KLDivLoss(reduction='mean')
pred_loss = kldiv_criterion(x, target)
# shape=[1]
# shape=[]
# 'sum' reduction, loss shape will be [1]
# 'sum' reduction, loss shape will be []
kldiv_criterion = nn.KLDivLoss(reduction='sum')
pred_loss = kldiv_criterion(x, target)
# shape=[1]
# shape=[]
# 'none' reduction, loss shape is same with X shape
kldiv_criterion = nn.KLDivLoss(reduction='none')
......@@ -1090,7 +1090,7 @@ class MarginRankingLoss(Layer):
label: N-D Tensor, label have the same shape and dtype as `input`.
output: If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
output: If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the out shape is :math:`[]`, otherwise the shape is the same as `input` . The same dtype as input tensor.
Returns:
A callable object of MarginRankingLoss.
......@@ -1108,7 +1108,7 @@ class MarginRankingLoss(Layer):
loss = margin_rank_loss(input, other, label)
print(loss)
# [0.75]
# 0.75
"""
def __init__(self, margin=0.0, reduction='mean', name=None):
......@@ -1149,7 +1149,7 @@ class CTCLoss(Layer):
- norm_by_times (bool, optional): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if reduction mode is 'mean'. Default: False.
Returns:
Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is []. Data type is the same as ``log_probs``.
Examples:
......@@ -1197,8 +1197,8 @@ class CTCLoss(Layer):
input_lengths,
label_lengths)
print(loss)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.13760614])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.13760614)
"""
def __init__(self, blank=0, reduction='mean'):
......@@ -1242,7 +1242,7 @@ class RNNTLoss(Layer):
label_lengths: Tensor of (batch) containing label length of each example
Returns:
Tensor, The RNN-T loss between ``logprobs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``logprobs``.
Tensor, The RNN-T loss between ``logprobs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is []. Data type is the same as ``logprobs``.
Examples:
.. code-block:: python
......@@ -1272,8 +1272,8 @@ class RNNTLoss(Layer):
costs = fn(acts, labels, lengths, label_lengths)
print(costs)
# Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=False,
# [4.49566677])
# Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=False,
# 4.49566677)
"""
def __init__(
......@@ -1352,7 +1352,7 @@ class SmoothL1Loss(Layer):
loss = paddle.nn.SmoothL1Loss()
output = loss(input, label)
print(output)
# [0.049606]
# 0.049606
"""
def __init__(self, reduction='mean', delta=1.0, name=None):
......@@ -1428,7 +1428,7 @@ class MultiLabelSoftMarginLoss(Layer):
multi_label_soft_margin_loss = nn.MultiLabelSoftMarginLoss(reduction='mean')
loss = multi_label_soft_margin_loss(input, label)
print(loss)
# Tensor([1.54908717])
# Tensor(1.54908717)
"""
def __init__(self, weight=None, reduction="mean", name=None):
......@@ -1529,7 +1529,7 @@ class HingeEmbeddingLoss(Layer):
hinge_embedding_loss = nn.HingeEmbeddingLoss(margin=1.0, reduction='mean')
loss = hinge_embedding_loss(input, label)
print(loss)
# Tensor([0.22222222])
# Tensor(0.22222222)
"""
def __init__(self, margin=1.0, reduction="mean", name=None):
......@@ -1590,7 +1590,7 @@ class CosineEmbeddingLoss(Layer):
Available dtypes are int32, int64, float32, float64.
output (Tensor): Tensor, the cosine embedding Loss of Tensor ``input1`` ``input2`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N], the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [].
Examples:
.. code-block:: python
......@@ -1603,11 +1603,11 @@ class CosineEmbeddingLoss(Layer):
cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='mean')
output = cosine_embedding_loss(input1, input2, label)
print(output) # [0.21155193]
print(output) # 0.21155193
cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='sum')
output = cosine_embedding_loss(input1, input2, label)
print(output) # [0.42310387]
print(output) # 0.42310387
cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='none')
output = cosine_embedding_loss(input1, input2, label)
......@@ -1717,7 +1717,7 @@ class TripletMarginWithDistanceLoss(Layer):
triplet_margin_with_distance_loss = TripletMarginWithDistanceLoss(reduction='mean')
loss = triplet_margin_with_distance_loss(input, positive, negative,)
print(loss)
# Tensor([0.19165580])
# Tensor(0.19165580)
"""
......@@ -1825,7 +1825,7 @@ class TripletMarginLoss(Layer):
triplet_margin_loss = paddle.nn.TripletMarginLoss(margin=1.0, swap=True, reduction='mean', )
loss = triplet_margin_loss(input, positive, negative,)
print(loss)
# Tensor([0.19165580])
# Tensor(0.19165580)
"""
......@@ -1995,7 +1995,7 @@ class SoftMarginLoss(Layer):
``input``. The target labels which values should be numbers -1 or 1.
Available dtype is int32, int64, float32, float64.
- Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1].
same as ``input`` , else the shape of output is [].
Returns:
A callable object of SoftMarginLoss.
......@@ -2010,8 +2010,8 @@ class SoftMarginLoss(Layer):
soft_margin_loss = paddle.nn.SoftMarginLoss()
output = soft_margin_loss(input, label)
print(output)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.64022040])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.64022040)
input_np = paddle.uniform(shape=(5, 5), min=0.1, max=0.8, dtype="float64")
label_np = paddle.randint(high=2, shape=(5, 5), dtype="int64")
......
......@@ -66,10 +66,10 @@ def accuracy(input, label, k=1, correct=None, total=None):
exe.run(static.default_startup_program())
x = np.random.rand(3, 32, 32).astype("float32")
y = np.array([[1],[0],[1]])
output= exe.run(feed={"input": x,"label": y},
fetch_list=[result[0]])
output = exe.run(feed={"input": x,"label": y},
fetch_list=[result])
print(output)
#[array([0.], dtype=float32)]
# [array(0.33333334, dtype=float32)]
"""
if _non_static_mode():
......
......@@ -717,10 +717,10 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
We use the dtype conversion rules following this:
Keep dtype
np.number ───────────► paddle.Tensor
(0D-Tensor)
(0-D Tensor)
default_dtype
Python Number ───────────────► paddle.Tensor
(0D-Tensor)
(0-D Tensor)
Keep dtype
np.ndarray ───────────► paddle.Tensor
......@@ -753,7 +753,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
# 1)
x = paddle.to_tensor(1, stop_gradient=False)
print(x)
# Tensor(shape=[], dtype=int64, place=CPUPlace, stop_gradient=False,
# 1)
......
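A short sketch of the conversion rules drawn above (an illustrative aside, not part of the commit): a NumPy scalar keeps its dtype, a plain Python number uses the default dtype, and both become 0-D Tensors.

import numpy as np
import paddle

a = paddle.to_tensor(np.float64(1.5))   # keeps float64
b = paddle.to_tensor(1.5)               # uses the default dtype (float32)
print(a.shape, a.dtype)                 # [] paddle.float64
print(b.shape, b.dtype)                 # [] paddle.float32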
......@@ -333,8 +333,8 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
# compute inf-order norm
out_pnorm = paddle.linalg.norm(x, p=float("inf"))
# out_pnorm = Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [12.])
# out_pnorm = Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 12.)
out_pnorm = paddle.linalg.norm(x, p=float("inf"), axis=0)
# out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
......@@ -344,8 +344,8 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
# compute -inf-order norm
out_pnorm = paddle.linalg.norm(x, p=-float("inf"))
# out_pnorm: Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.])
# out_pnorm: Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 0.)
out_pnorm = paddle.linalg.norm(x, p=-float("inf"), axis=0)
# out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
......@@ -690,16 +690,16 @@ def dist(x, y, p=2, name=None):
x = paddle.to_tensor([[3, 3],[3, 3]], dtype="float32")
y = paddle.to_tensor([[3, 3],[3, 1]], dtype="float32")
out = paddle.dist(x, y, 0)
print(out) # out = [1.]
print(out) # out = 1.
out = paddle.dist(x, y, 2)
print(out) # out = [2.]
print(out) # out = 2.
out = paddle.dist(x, y, float("inf"))
print(out) # out = [2.]
print(out) # out = 2.
out = paddle.dist(x, y, float("-inf"))
print(out) # out = [0.]
print(out) # out = 0.
"""
if in_dygraph_mode():
return _C_ops.dist(x, y, p)
......@@ -745,48 +745,48 @@ def cond(x, p=None, name=None):
# compute conditional number when p is None
out = paddle.linalg.cond(x)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.41421342])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.41421342)
# compute conditional number when order of the norm is 'fro'
out_fro = paddle.linalg.cond(x, p='fro')
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [3.16227770])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 3.16227770)
# compute conditional number when order of the norm is 'nuc'
out_nuc = paddle.linalg.cond(x, p='nuc')
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [9.24263859])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 9.24263859)
# compute conditional number when order of the norm is 1
out_1 = paddle.linalg.cond(x, p=1)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [2.])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 2.)
# compute conditional number when order of the norm is -1
out_minus_1 = paddle.linalg.cond(x, p=-1)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.)
# compute conditional number when order of the norm is 2
out_2 = paddle.linalg.cond(x, p=2)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.41421342])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.41421342)
# compute conditional number when order of the norm is -1
out_minus_2 = paddle.linalg.cond(x, p=-2)
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [0.70710683])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 0.70710683)
# compute conditional number when order of the norm is inf
out_inf = paddle.linalg.cond(x, p=float("inf"))
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [2.])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 2.)
# compute conditional number when order of the norm is -inf
out_minus_inf = paddle.linalg.cond(x, p=-float("inf"))
# Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [1.])
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 1.)
a = paddle.randn([2, 4, 4])
# Tensor(shape=[2, 4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
......@@ -1095,13 +1095,13 @@ def dot(x, y, name=None):
x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([4, 5, 6])
z = paddle.dot(x, y)
print(z) # [32]
print(z) # 32
# 2-D Tensor * 2-D Tensor
x = paddle.to_tensor([[1, 2, 3], [2, 4, 6]])
y = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
z = paddle.dot(x, y)
print(z) # [[32], [64]]
print(z) # [32, 64]
"""
if in_dygraph_mode():
......@@ -1163,7 +1163,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
import paddle
xt = paddle.rand((3,4))
xt = paddle.rand((3, 4))
paddle.linalg.cov(xt)
'''
......@@ -1485,7 +1485,7 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
a = paddle.eye(10)
b = paddle.linalg.matrix_rank(a)
print(b)
# b = [10]
# b = 10
c = paddle.ones(shape=[3, 4, 5, 5])
d = paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True)
......
......@@ -288,13 +288,8 @@ def is_empty(x, name=None):
input = paddle.rand(shape=[4, 32, 32], dtype='float32')
res = paddle.is_empty(x=input)
print("res:", res)
# ('res:', Tensor: eager_tmp_1
# - place: CPUPlace
# - shape: [1]
# - layout: NCHW
# - dtype: bool
# - data: [0])
# res: Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
# False)
"""
if in_dygraph_mode():
......@@ -339,9 +334,9 @@ def equal_all(x, y, name=None):
y = paddle.to_tensor([1, 2, 3])
z = paddle.to_tensor([1, 4, 3])
result1 = paddle.equal_all(x, y)
print(result1) # result1 = [True ]
print(result1) # result1 = True
result2 = paddle.equal_all(x, z)
print(result2) # result2 = [False ]
print(result2) # result2 = False
"""
if in_dygraph_mode():
return _C_ops.equal_all(x, y)
......@@ -388,21 +383,21 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
y = paddle.to_tensor([10000.1, 1e-08])
result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
equal_nan=False, name="ignore_nan")
# [False]
# False
result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
equal_nan=True, name="equal_nan")
# [False]
# False
x = paddle.to_tensor([1.0, float('nan')])
y = paddle.to_tensor([1.0, float('nan')])
result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
equal_nan=False, name="ignore_nan")
# [False]
# False
result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
equal_nan=True, name="equal_nan")
# [True]
# True
"""
if in_dygraph_mode():
......
......@@ -268,11 +268,11 @@ def slice(input, axes, starts, ends):
Args:
input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``.
axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to .
starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of
it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor.
starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, each element of
it should be an integer or a 0-D int Tensor with shape []. If ``starts`` is a Tensor, it should be a 1-D Tensor.
It represents starting indices of corresponding axis in ``axes``.
ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of
it should be integers or Tensors with shape [1]. If ``ends`` is an Tensor, it should be an 1-D Tensor .
ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, each element of
it should be an integer or a 0-D int Tensor with shape []. If ``ends`` is a Tensor, it should be a 1-D Tensor.
It represents ending indices of corresponding axis in ``axes``.
Returns:
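A hedged sketch of the argument form documented above (an illustrative aside, not part of the commit): the elements of starts/ends may be 0-D int Tensors.

import paddle

x = paddle.rand([4, 5, 6])
start = paddle.to_tensor(1, dtype='int32')   # 0-D int Tensor
end = paddle.to_tensor(3, dtype='int32')     # 0-D int Tensor
out = paddle.slice(x, axes=[1], starts=[start], ends=[end])
print(out.shape)   # [4, 2, 6]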
......@@ -1065,21 +1065,21 @@ def tolist(x):
print(expectlist) #[0, 1, 2, 3, 4]
"""
# TODO(zhouwei): will remove 0D Tensor.numpy() hack
# TODO(zhouwei): will remove 0-D Tensor.numpy() hack
return x.numpy(False).tolist()
def concat(x, axis=0, name=None):
"""
Concatenates the input along the axis.
Concatenates the input along the axis. It doesn't support 0-D Tensors, because concatenation requires a certain axis and a 0-D Tensor
doesn't have any axis.
Args:
x (list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16,
float32, float64, int32, int64, int8, uint8. All the Tensors in ``x`` must have same data type.
axis (int|Tensor, optional): Specify the axis to operate on the input Tensors.
It's a scalar with data type int or a Tensor with shape [1] and data type int32
or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``,
It should be an integer or a 0-D int Tensor with shape []. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``,
it works the same way as ``axis+R``. Default is 0.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
......@@ -1550,11 +1550,11 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
if x_dim == 0:
if not (isinstance(start_axis, int)) or start_axis not in [0, -1]:
raise ValueError(
"The start_axis should be int, and should be 0 or -1 when the input tensor is a 0D-Tensor"
"The start_axis should be int, and should be 0 or -1 when the input tensor is a 0-D-Tensor"
)
if not (isinstance(stop_axis, int)) or stop_axis not in [0, -1]:
raise ValueError(
"The stop_axis should be int, and should be 0 or -1 when the input tensor is a 0D-Tensor"
"The stop_axis should be int, and should be 0 or -1 when the input tensor is a 0-D-Tensor"
)
else:
if (
......@@ -1913,8 +1913,8 @@ def split(x, num_or_sections, axis=0, name=None):
If ``num_or_sections`` is a list or tuple, the length of it indicates the number of
sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly.
The length of the list must not be larger than the ``x`` 's size of specified ``axis``.
axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type
``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``.
axis (int|Tensor, optional): The axis along which to split, it can be an integer or a ``0-D Tensor``
with shape [] and data type ``int32`` or ``int64``.
If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
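A hedged sketch of the documented ``axis`` form (an illustrative aside, not part of the commit): a 0-D int Tensor works the same way as a plain integer.

import paddle

x = paddle.rand([3, 8])
axis = paddle.to_tensor(1, dtype='int64')        # 0-D Tensor used as the axis
left, right = paddle.split(x, num_or_sections=2, axis=axis)
print(left.shape, right.shape)                   # [3, 4] [3, 4]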
......@@ -2557,7 +2557,7 @@ def unsqueeze(x, axis, name=None):
Args:
x (Tensor): The input Tensor to be unsqueezed. Supported data type: bfloat16, float16, float32, float64, bool, int8, int32, int64.
axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` .
If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
If ``axis`` is a list or tuple, each element of it should be an integer or a 0-D Tensor with shape [].
If ``axis`` is a Tensor, it should be a 1-D Tensor.
If ``axis`` is negative, ``axis = axis + ndim(x) + 1``.
name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None.
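A short sketch of the accepted ``axis`` forms (not part of the original docstring; the shapes in the comments are what these calls are expected to produce):
import paddle

x = paddle.rand([3, 4])
y1 = paddle.unsqueeze(x, axis=0)        # shape [1, 3, 4]
y2 = paddle.unsqueeze(x, axis=[0, 2])   # shape [1, 3, 1, 4]
a = paddle.to_tensor(1)                 # 0-D Tensor with shape []
y3 = paddle.unsqueeze(x, axis=[a])      # shape [3, 1, 4]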
......@@ -3083,8 +3083,8 @@ def chunk(x, chunks, axis=0, name=None):
Args:
x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64.
chunks(int): The number of tensor to be split along the certain axis.
axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type
``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``.
axis (int|Tensor, optional): The axis along which to split. It can be an integer or a ``0-D Tensor``
with shape [] and data type ``int32`` or ``int64``.
If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0.
name (str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
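For example (a minimal sketch, not from the original docstring; shapes in the comments are the expected results):
import paddle

x = paddle.rand([3, 9, 5])
out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1)     # each has shape [3, 3, 5]
axis = paddle.to_tensor(1)                               # 0-D Tensor with shape []
out3, out4, out5 = paddle.chunk(x, chunks=3, axis=axis)  # same result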
......@@ -3523,7 +3523,7 @@ def reshape(x, shape, name=None):
Args:
x (Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool``
shape (list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1.
The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [].
The data type is ``int32`` . If ``shape`` is a list or tuple, each element of it should be an integer or a Tensor with shape [].
If ``shape`` is a Tensor, it should be a 1-D Tensor.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
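A brief sketch of a ``shape`` argument mixing integers and a 0-D Tensor (not from the original docstring; the resulting shapes in the comments are the expected ones):
import paddle

x = paddle.rand([2, 4, 6])
y1 = paddle.reshape(x, shape=[8, 6])     # shape given as plain ints
d = paddle.to_tensor(8)                  # 0-D Tensor with shape []
y2 = paddle.reshape(x, shape=[d, -1])    # shape [8, 6]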
......@@ -3843,11 +3843,15 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
x (Tensor): An N-D ``Tensor``. The data type is ``bool``, ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``.
axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to.
It's optional. If it is not provided, it will be treated as :math:`[0,1,...,len(starts)-1]`.
starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor. It represents starting indices of corresponding axis in ``axes``.
ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of
it should be integers or Tensors with shape [1]. If ``ends`` is an Tensor, it should be an 1-D Tensor . It represents ending indices of corresponding axis in ``axes``.
strides (list|tuple|Tensor): The data type is ``int32`` . If ``strides`` is a list or tuple, the elements of
it should be integers or Tensors with shape [1]. If ``strides`` is an Tensor, it should be an 1-D Tensor . It represents slice step of corresponding axis in ``axes``.
starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be
integers or 0-D Tensors with shape []. If ``starts`` is a Tensor, it should be a 1-D Tensor.
It represents starting indices of corresponding axis in ``axes``.
ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of it should be
integers or 0-D Tensors with shape []. If ``ends`` is a Tensor, it should be a 1-D Tensor.
It represents ending indices of corresponding axis in ``axes``.
strides (list|tuple|Tensor): The data type is ``int32`` . If ``strides`` is a list or tuple, the elements of it should be
integers or 0-D Tensors with shape []. If ``strides`` is a Tensor, it should be a 1-D Tensor.
It represents slice step of corresponding axis in ``axes``.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
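A minimal sketch of both argument styles (not part of the original docstring; the shapes in the comments are the expected results):
import paddle

x = paddle.rand([3, 4, 5, 6])
# starts/ends/strides as plain Python ints
y1 = paddle.strided_slice(x, axes=[1, 2], starts=[0, 1], ends=[4, 5], strides=[1, 2])   # shape [3, 4, 2, 6]
# the same kind of call with 0-D int Tensors (shape []) inside the lists
s, e, st = paddle.to_tensor(0), paddle.to_tensor(4), paddle.to_tensor(2)
y2 = paddle.strided_slice(x, axes=[1], starts=[s], ends=[e], strides=[st])              # shape [3, 2, 5, 6]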
......@@ -4074,7 +4078,7 @@ def tensordot(x, y, axes=2, name=None):
y = paddle.arange(10, dtype=data_type)
z1 = paddle.tensordot(x, y, axes=1)
z2 = paddle.dot(x, y)
# z1 = z2 = [285.]
# z1 = z2 = 285.
# For two 2-d tensor x and y, the case axes=1 is equivalent to matrix multiplication.
......
......@@ -187,7 +187,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
Args:
x (Tensor): Input N-D Tensor of scale operator. Data type can be float32, float64, int8, int16, int32, int64, uint8.
scale (float|Tensor): The scale factor of the input, it should be a float number or a Tensor with shape [1] and data type as float32.
scale (float|Tensor): The scale factor of the input. It should be a float number or a 0-D Tensor with shape [] and data type float32.
bias (float): The bias to be put on the input.
bias_after_scale (bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
act (str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
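For instance (a minimal sketch, not from the original docstring; the default ``bias_after_scale=True`` is assumed):
import paddle

x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
out1 = paddle.scale(x, scale=2.0, bias=1.0)    # 2 * x + 1
s = paddle.to_tensor(2.0)                      # 0-D float32 Tensor with shape []
out2 = paddle.scale(x, scale=s, bias=1.0)      # same result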
......@@ -1337,7 +1337,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
# Each example is followed by the corresponding output tensor.
x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9],
[0.1, 0.2, 0.6, 0.7]])
out1 = paddle.sum(x) # [3.5]
out1 = paddle.sum(x) # 3.5
out2 = paddle.sum(x, axis=0) # [0.3, 0.5, 1.1, 1.6]
out3 = paddle.sum(x, axis=-1) # [1.9, 1.6]
out4 = paddle.sum(x, axis=1, keepdim=True) # [[1.9], [1.6]]
......@@ -1357,7 +1357,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
# Each example is followed by the corresponding output tensor.
x = paddle.to_tensor([[True, True, True, True],
[False, False, False, False]])
out7 = paddle.sum(x) # [4]
out7 = paddle.sum(x) # 4
out8 = paddle.sum(x, axis=0) # [1, 1, 1, 1]
out9 = paddle.sum(x, axis=1) # [4, 0]
"""
......@@ -1493,7 +1493,7 @@ def nansum(x, axis=None, dtype=None, keepdim=False, name=None):
# Each example is followed by the corresponding output tensor.
x = paddle.to_tensor([[float('nan'), 0.3, 0.5, 0.9],
[0.1, 0.2, float('-nan'), 0.7]],dtype="float32")
out1 = paddle.nansum(x) # [2.7]
out1 = paddle.nansum(x) # 2.7
out2 = paddle.nansum(x, axis=0) # [0.1, 0.5, 0.5, 1.6]
out3 = paddle.nansum(x, axis=-1) # [1.7, 1.0]
out4 = paddle.nansum(x, axis=1, keepdim=True) # [[1.7], [1.0]]
......@@ -1553,7 +1553,7 @@ def nanmean(x, axis=None, keepdim=False, name=None):
x = paddle.to_tensor([[float('nan'), 0.3, 0.5, 0.9],
[0.1, 0.2, float('-nan'), 0.7]])
out1 = paddle.nanmean(x)
# [0.44999996]
# 0.44999996
out2 = paddle.nanmean(x, axis=0)
# [0.1, 0.25, 0.5, 0.79999995]
out3 = paddle.nanmean(x, axis=0, keepdim=True)
......@@ -2263,7 +2263,7 @@ def logsumexp(x, axis=None, keepdim=False, name=None):
import paddle
x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]])
out1 = paddle.logsumexp(x) # [3.4691226]
out1 = paddle.logsumexp(x) # 3.4691226
out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602]
"""
......@@ -2375,7 +2375,7 @@ def max(x, axis=None, keepdim=False, name=None):
result1 = paddle.max(x)
result1.backward()
print(result1, x.grad)
#[0.9], [[0., 0., 0., 1.], [0., 0., 0., 0.]]
# 0.9, [[0., 0., 0., 1.], [0., 0., 0., 0.]]
x.clear_grad()
result2 = paddle.max(x, axis=0)
......@@ -2476,7 +2476,7 @@ def min(x, axis=None, keepdim=False, name=None):
result1 = paddle.min(x)
result1.backward()
print(result1, x.grad)
#[0.1], [[0., 0., 0., 0.], [1., 0., 0., 0.]]
# 0.1, [[0., 0., 0., 0.], [1., 0., 0., 0.]]
x.clear_grad()
result2 = paddle.min(x, axis=0)
......@@ -2580,13 +2580,13 @@ def amax(x, axis=None, keepdim=False, name=None):
result1 = paddle.amax(x)
result1.backward()
print(result1, x.grad)
#[0.9], [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]]
# 0.9, [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]]
x.clear_grad()
result1_max = paddle.max(x)
result1_max.backward()
print(result1_max, x.grad)
#[0.9], [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]]
# 0.9, [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]]
###############################
......@@ -2690,13 +2690,13 @@ def amin(x, axis=None, keepdim=False, name=None):
result1 = paddle.amin(x)
result1.backward()
print(result1, x.grad)
#[0.1], [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]]
# 0.1, [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]]
x.clear_grad()
result1_min = paddle.min(x)
result1_min.backward()
print(result1_min, x.grad)
#[0.1], [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]]
# 0.1, [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]]
###############################
......@@ -2907,10 +2907,10 @@ def clip(x, min=None, max=None, name=None):
Args:
x (Tensor): An N-D Tensor with data type float16, float32, float64, int32 or int64.
min (float|int|Tensor, optional): The lower bound with type ``float`` , ``int`` or a ``Tensor``
with shape [1] and type ``int32``, ``float16``, ``float32``, ``float64``.
max (float|int|Tensor, optional): The upper bound with type ``float``, ``int`` or a ``Tensor``
with shape [1] and type ``int32``, ``float16``, ``float32``, ``float64``.
min (float|int|Tensor, optional): The lower bound with type ``float``, ``int`` or a ``0-D Tensor``
with shape [] and type ``int32``, ``float16``, ``float32``, ``float64``.
max (float|int|Tensor, optional): The upper bound with type ``float``, ``int`` or a ``0-D Tensor``
with shape [] and type ``int32``, ``float16``, ``float32``, ``float64``.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
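A short sketch of both bound styles (not from the original docstring):
import paddle

x = paddle.to_tensor([[1.2, 3.5], [4.5, 6.4]])
out1 = paddle.clip(x, min=3.0, max=5.0)
# min/max may also be 0-D Tensors with shape []
out2 = paddle.clip(x, min=paddle.to_tensor(3.0), max=paddle.to_tensor(5.0))   # same result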
Returns:
......@@ -3064,7 +3064,7 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None):
case1 = paddle.randn([2, 3])
case2 = paddle.randn([3, 10, 10])
case3 = paddle.randn([3, 10, 5, 10])
data1 = paddle.trace(case1) # data1.shape = [1]
data1 = paddle.trace(case1) # data1.shape = []
data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3]
data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5]
"""
......@@ -3692,7 +3692,7 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None):
x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9],
[0.1, 0.2, 0.6, 0.7]])
out1 = paddle.prod(x)
# [0.0002268]
# 0.0002268
out2 = paddle.prod(x, -1)
# [0.027 0.0084]
......@@ -3898,8 +3898,8 @@ def all(x, axis=None, keepdim=False, name=None):
print(x)
x = paddle.cast(x, 'bool')
# out1 should be [False]
out1 = paddle.all(x) # [False]
# out1 should be False
out1 = paddle.all(x) # False
print(out1)
# out2 should be [True, False]
......@@ -3972,8 +3972,8 @@ def any(x, axis=None, keepdim=False, name=None):
# [[True, False]
# [True, True]]
# out1 should be [True]
out1 = paddle.any(x) # [True]
# out1 should be True
out1 = paddle.any(x) # True
print(out1)
# out2 should be [True, True]
......@@ -4481,8 +4481,8 @@ def rad2deg(x, name=None):
x2 = paddle.to_tensor(math.pi/2)
result2 = paddle.rad2deg(x2)
print(result2)
# Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [90.])
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# 90.)
x3 = paddle.to_tensor(1)
result3 = paddle.rad2deg(x3)
......@@ -5382,27 +5382,27 @@ def trapezoid(y, x=None, dx=None, axis=-1, name=None):
y = paddle.to_tensor([4, 5, 6], dtype='float32')
print(paddle.trapezoid(y))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [10.])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 10.)
print(paddle.trapezoid(y, dx=2.))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [20.])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 20.)
y = paddle.to_tensor([4, 5, 6], dtype='float32')
x = paddle.to_tensor([1, 2, 3], dtype='float32')
print(paddle.trapezoid(y, x))
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [10.])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 10.)
y = paddle.to_tensor([1, 2, 3], dtype='float64')
x = paddle.to_tensor([8, 6, 4], dtype='float64')
print(paddle.trapezoid(y, x))
# Tensor(shape=[1], dtype=float64, place=Place(cpu), stop_gradient=True,
# [-8.])
# Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True,
# -8.)
y = paddle.arange(6).reshape((2, 3)).astype('float32')
print(paddle.trapezoid(y, axis=0))
......
......@@ -65,7 +65,7 @@ def mean(x, axis=None, keepdim=False, name=None):
[17., 18., 19., 20.],
[21., 22., 23., 24.]]])
out1 = paddle.mean(x)
# [12.5]
# 12.5
out2 = paddle.mean(x, axis=-1)
# [[ 2.5 6.5 10.5]
# [14.5 18.5 22.5]]
......@@ -140,7 +140,7 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None):
x = paddle.to_tensor([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])
out1 = paddle.var(x)
# [2.66666667]
# 2.66666667
out2 = paddle.var(x, axis=1)
# [1. 4.33333333]
"""
......@@ -205,9 +205,9 @@ def std(x, axis=None, unbiased=True, keepdim=False, name=None):
x = paddle.to_tensor([[1.0, 2.0, 3.0], [1.0, 4.0, 5.0]])
out1 = paddle.std(x)
# [1.63299316]
# 1.63299316
out2 = paddle.std(x, unbiased=False)
# [1.49071205]
# 1.49071205
out3 = paddle.std(x, axis=1)
# [1. 2.081666]
......@@ -222,8 +222,7 @@ def std(x, axis=None, unbiased=True, keepdim=False, name=None):
def numel(x, name=None):
"""
Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static graph mode
or a scalar value in imperative mode.
Returns the number of elements for a tensor, which is a 0-D int64 Tensor with shape [].
Args:
x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64.
......@@ -231,7 +230,7 @@ def numel(x, name=None):
For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The number of elements for the input Tensor.
Tensor: The number of elements for the input Tensor, whose shape is [].
Examples:
.. code-block:: python
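# A minimal sketch, not the original docstring example (the original example is
# collapsed in this hunk); the value in the comment is the expected result.
import paddle

x = paddle.full(shape=[4, 5, 7], fill_value=0)
n = paddle.numel(x)   # 0-D int64 Tensor with shape [], value 140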
......@@ -387,8 +386,8 @@ def median(x, axis=None, keepdim=False, name=None):
# [8 , 9 , 10, 11]])
y1 = paddle.median(x)
# Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
# [5.50000000])
# Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 5.50000000)
y2 = paddle.median(x, axis=0)
# Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
......@@ -416,7 +415,7 @@ def median(x, axis=None, keepdim=False, name=None):
-1,
0,
None,
], 'when input 0D, axis can only be [-1, 0] or default None'
], 'when input 0-D, axis can only be [-1, 0] or default None'
is_flatten = True
if axis is None:
......
......@@ -181,8 +181,7 @@ def _format_tensor(var, summary, indent=0, max_width=0, signed=False):
linewidth = DEFAULT_PRINT_OPTIONS.linewidth
if len(var.shape) == 0:
# currently, shape = [], i.e., scaler tensor is not supported.
# If it is supported, it should be formatted like this.
# 0-D Tensor, whose shape = [], should be formatted like this.
return _format_item(var, max_width, signed)
elif len(var.shape) == 1:
item_length = max_width + 2
......@@ -291,7 +290,7 @@ def _format_dense_tensor(tensor, indent):
if tensor.dtype == core.VarDesc.VarType.BF16:
tensor = tensor.astype('float32')
# TODO(zhouwei): will remove 0D Tensor.numpy() hack
# TODO(zhouwei): will remove 0-D Tensor.numpy() hack
np_tensor = tensor.numpy(False)
if len(tensor.shape) == 0:
......
......@@ -562,8 +562,6 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
extra_compile_args[compiler] = []
if IS_WINDOWS:
# TODO(zhouwei): may append compile flags in future
pass
# append link flags
extra_link_args = kwargs.get('extra_link_args', [])
extra_link_args.extend(MSVC_LINK_FLAGS)
......