Unverified commit b6996598, authored by zhouweiwei2014, committed by GitHub

[cherry-pick2.5] [Zero-Dim] Support all/any/min/max/prod/logsumexp/amax/amin/some loss output 0D (#53192)
Parent: f84ac449
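
For context, a minimal sketch of the user-visible behavior this cherry-pick targets (assuming a Paddle build that includes #53192; before this change the shapes below were [1]):

```python
import paddle

# Full reductions now produce 0-D tensors (shape []) instead of shape [1].
x = paddle.rand([3, 5])
print(paddle.max(x).shape)        # []
print(paddle.prod(x).shape)       # []
print(paddle.logsumexp(x).shape)  # []

# Mean-reduced losses follow the same convention.
label = paddle.rand([3, 5])
loss = paddle.nn.functional.l1_loss(x, label)  # default reduction='mean'
print(loss.shape)                 # []
print(float(loss))                # a 0-D tensor converts cleanly to a Python float
```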
...@@ -86,10 +86,10 @@ template <typename T> ...@@ -86,10 +86,10 @@ template <typename T>
nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape,
std::string input, std::string input,
bool with_dynamic_shape = false) { bool with_dynamic_shape = false) {
PADDLE_ENFORCE_GT(shape.size(), PADDLE_ENFORCE_GE(shape.size(),
0UL, 0UL,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"TensorRT's tensor input requires at least 1 " "TensorRT's tensor input requires at least 0 "
"dimensions, but input %s has %d dims.", "dimensions, but input %s has %d dims.",
input, input,
shape.size())); shape.size()));
......
...@@ -58,8 +58,8 @@ class AssertOp : public framework::OperatorBase { ...@@ -58,8 +58,8 @@ class AssertOp : public framework::OperatorBase {
"Input(Condition) of AssertOp is not found.")); "Input(Condition) of AssertOp is not found."));
const phi::DenseTensor &cond = cond_var_ptr->Get<phi::DenseTensor>(); const phi::DenseTensor &cond = cond_var_ptr->Get<phi::DenseTensor>();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
cond.dims(), cond.numel(),
phi::make_ddim({1}), 1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The numel of Input(Condition) of AssertOp must be 1. But now " "The numel of Input(Condition) of AssertOp must be 1. But now "
"the Condition's shape is %s.", "the Condition's shape is %s.",
......
...@@ -98,10 +98,9 @@ class __reduce_meanMaker__ : public ops::ReduceBaseOpMaker { ...@@ -98,10 +98,9 @@ class __reduce_meanMaker__ : public ops::ReduceBaseOpMaker {
virtual std::string GetOpType() const { return "Reduce reduce_mean"; } virtual std::string GetOpType() const { return "Reduce reduce_mean"; }
}; };
DECLARE_INFER_SHAPE_FUNCTOR( DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean,
reduce_mean, ReduceMeanInferShapeFunctor,
ReduceMeanInferShapeFunctor, PD_INFER_META(phi::OriginReduceInferMetaBase));
PD_INFER_META(phi::ReduceIntArrayAxisInferMetaBase));
REGISTER_OPERATOR(reduce_mean, REGISTER_OPERATOR(reduce_mean,
ops::ReduceBaseOp, ops::ReduceBaseOp,
......
...@@ -1132,7 +1132,7 @@ void prod_grad(const Tensor& x, ...@@ -1132,7 +1132,7 @@ void prod_grad(const Tensor& x,
if (!keep_dim) { if (!keep_dim) {
auto axis_ = std::vector<int64_t>(); auto axis_ = std::vector<int64_t>();
if (reduce_all) { if (reduce_all) {
for (int64_t i = 1; i < x_dim_size; i++) { for (int64_t i = 0; i < x_dim_size; i++) {
axis_.push_back(i); axis_.push_back(i);
} }
} else { } else {
...@@ -1187,7 +1187,7 @@ void max_grad(const Tensor& x, ...@@ -1187,7 +1187,7 @@ void max_grad(const Tensor& x,
} else { } else {
auto axis_ = std::vector<int64_t>(); auto axis_ = std::vector<int64_t>();
if (reduce_all) { if (reduce_all) {
for (int64_t i = 1; i < x_dim_size; i++) { for (int64_t i = 0; i < x_dim_size; i++) {
axis_.push_back(i); axis_.push_back(i);
} }
} else { } else {
......
...@@ -820,7 +820,7 @@ ...@@ -820,7 +820,7 @@
args : (Tensor x, IntArray axis={}, bool keepdim=false) args : (Tensor x, IntArray axis={}, bool keepdim=false)
output : Tensor(out) output : Tensor(out)
infer_meta : infer_meta :
func : ReduceIntArrayAxisInferMeta func : OriginReduceInferMeta
kernel : kernel :
func : mean func : mean
backward : mean_grad backward : mean_grad
......
...@@ -2146,7 +2146,7 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, ...@@ -2146,7 +2146,7 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
} }
void MeanAllInferMeta(const MetaTensor& x, MetaTensor* out) { void MeanAllInferMeta(const MetaTensor& x, MetaTensor* out) {
out->set_dims(phi::make_ddim({1})); out->set_dims(phi::make_ddim({}));
out->set_dtype(x.dtype()); out->set_dtype(x.dtype());
out->set_layout(x.layout()); out->set_layout(x.layout());
} }
...@@ -3050,29 +3050,19 @@ DDim ReduceInferDim(const MetaTensor& x, ...@@ -3050,29 +3050,19 @@ DDim ReduceInferDim(const MetaTensor& x,
reduce_all = reduce_all || full_dim; reduce_all = reduce_all || full_dim;
std::vector<int64_t> out_dim_vector; std::vector<int64_t> out_dim_vector;
if (keep_dim) { for (int64_t i = 0; i < x_rank; ++i) {
for (int64_t i = 0; i < x_rank; ++i) { if (reduce_all || dims_set.find(i) != dims_set.end()) {
if (reduce_all || dims_set.find(i) != dims_set.end()) { if (keep_dim) {
out_dim_vector.push_back(1); out_dim_vector.push_back(1);
} else { } else {
out_dim_vector.push_back(x.dims().at(i));
}
}
} else {
for (int64_t i = 0; i < x_rank; ++i) {
if (reduce_all || dims_set.find(i) != dims_set.end()) {
continue; continue;
} else {
out_dim_vector.push_back(x.dims().at(i));
} }
} } else {
out_dim_vector.push_back(x.dims().at(i));
if (x_rank > 0 && out_dim_vector.size() == 0) {
out_dim_vector.push_back(1);
} }
} }
DDim out_dim = phi::make_ddim(out_dim_vector);
DDim out_dim = phi::make_ddim(out_dim_vector);
return out_dim; return out_dim;
} }
...@@ -3086,14 +3076,14 @@ DDim ReduceInferDimForIntArrayAxis(const MetaTensor& x, ...@@ -3086,14 +3076,14 @@ DDim ReduceInferDimForIntArrayAxis(const MetaTensor& x,
if (keep_dim) { if (keep_dim) {
vec_dim = std::vector<int64_t>(x.dims().size(), 1); vec_dim = std::vector<int64_t>(x.dims().size(), 1);
} else { } else {
vec_dim = {1}; vec_dim = {};
} }
} else { } else {
if (keep_dim) { if (keep_dim) {
vec_dim = std::vector<int64_t>(x.dims().size(), -1); vec_dim = std::vector<int64_t>(x.dims().size(), -1);
} else { } else {
auto x_rank = static_cast<size_t>(x.dims().size()); auto x_rank = static_cast<size_t>(x.dims().size());
if (vec_axis.size() >= x_rank) { if (vec_axis.size() > x_rank) {
vec_dim = {-1}; vec_dim = {-1};
} else { } else {
vec_dim = std::vector<int64_t>(x.dims().size() - vec_axis.size(), -1); vec_dim = std::vector<int64_t>(x.dims().size() - vec_axis.size(), -1);
...@@ -3125,22 +3115,6 @@ void ReduceInferMetaBase(const MetaTensor& x, ...@@ -3125,22 +3115,6 @@ void ReduceInferMetaBase(const MetaTensor& x,
out->set_layout(x.layout()); out->set_layout(x.layout());
} }
void ReduceIntArrayAxisInferMetaBase(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
bool reduce_all,
MetaTensor* out,
MetaConfig config) {
if (config.is_runtime || !axis.FromTensor()) {
ReduceInferMetaBase(x, axis.GetData(), keep_dim, reduce_all, out);
} else {
DDim out_dim = ReduceInferDimForIntArrayAxis(x, axis, keep_dim, reduce_all);
out->set_dims(out_dim);
out->set_dtype(x.dtype());
out->set_layout(x.layout());
}
}
void ReduceIntArrayAxisInferMeta(const MetaTensor& x, void ReduceIntArrayAxisInferMeta(const MetaTensor& x,
const IntArray& axis, const IntArray& axis,
bool keep_dim, bool keep_dim,
...@@ -3153,6 +3127,23 @@ void ReduceIntArrayAxisInferMeta(const MetaTensor& x, ...@@ -3153,6 +3127,23 @@ void ReduceIntArrayAxisInferMeta(const MetaTensor& x,
ReduceIntArrayAxisInferMetaBase(x, axis, keep_dim, reduce_all, out, config); ReduceIntArrayAxisInferMetaBase(x, axis, keep_dim, reduce_all, out, config);
} }
void ReduceIntArrayAxisInferMetaBase(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
bool reduce_all,
MetaTensor* out,
MetaConfig config) {
DDim out_dim;
if (config.is_runtime || !axis.FromTensor()) {
out_dim = ReduceInferDim(x, axis.GetData(), keep_dim, reduce_all);
} else {
out_dim = ReduceInferDimForIntArrayAxis(x, axis, keep_dim, reduce_all);
}
out->set_dims(out_dim);
out->set_dtype(x.dtype());
out->set_layout(x.layout());
}
void ReduceScatterInferMeta(const MetaTensor& x, int nranks, MetaTensor* out) { void ReduceScatterInferMeta(const MetaTensor& x, int nranks, MetaTensor* out) {
auto dim = x.dims(); auto dim = x.dims();
if (dim[0] > 0 || dim[0] < -1) { if (dim[0] > 0 || dim[0] < -1) {
...@@ -3951,6 +3942,105 @@ void StridedSliceInferMeta(const MetaTensor& x, ...@@ -3951,6 +3942,105 @@ void StridedSliceInferMeta(const MetaTensor& x,
x, axes, starts, ends, strides, infer_flags, decrease_axis, out, config); x, axes, starts, ends, strides, infer_flags, decrease_axis, out, config);
} }
// TODO(zhouwei): OriginReduceInferDim doesn't support 0D, remove in future
DDim OriginReduceInferDim(const MetaTensor& x,
const std::vector<int64_t>& axis,
bool keep_dim,
bool reduce_all) {
auto x_rank = x.dims().size();
std::vector<int64_t> formated_axis = axis;
for (size_t i = 0; i < axis.size(); ++i) {
if (x_rank == 0) {
PADDLE_ENFORCE_EQ(
axis[i] == 0 || axis[i] == -1,
true,
phi::errors::InvalidArgument(
"When input 0D Tensor, the axis can only be -1, 0, None or []"));
} else {
PADDLE_ENFORCE_LT(axis[i],
x_rank,
errors::InvalidArgument(
"The reduce dim index %d should be in the "
"range [ -dimension(X), dimension(X) ) "
"which dimesion = %d. But received dim index = %d.",
i,
x_rank,
axis[i]));
PADDLE_ENFORCE_GE(axis[i],
-x_rank,
errors::InvalidArgument(
"The reduce dim index %d should be in the "
"range [ -dimension(X), dimension(X) ) "
"which dimesion = %d. But received dim index = %d.",
i,
x_rank,
axis[i]));
}
if (axis[i] < 0) {
formated_axis[i] = axis[i] + x_rank;
}
}
bool full_dim = true;
std::set<int64_t> dims_set(formated_axis.begin(), formated_axis.end());
for (int64_t i = 0; i < x_rank; ++i) {
if (dims_set.find(i) == dims_set.end()) {
full_dim = false;
break;
}
}
reduce_all = reduce_all || full_dim;
std::vector<int64_t> out_dim_vector;
for (int64_t i = 0; i < x_rank; ++i) {
if (reduce_all || dims_set.find(i) != dims_set.end()) {
if (keep_dim) {
out_dim_vector.push_back(1);
} else {
continue;
}
} else {
out_dim_vector.push_back(x.dims().at(i));
}
}
if (x_rank > 0 && out_dim_vector.size() == 0) {
out_dim_vector.push_back(1);
}
DDim out_dim = phi::make_ddim(out_dim_vector);
return out_dim;
}
// TODO(zhouwei): OriginReduceInferDim doesn't support 0D, remove in future
DDim OriginReduceInferDimForIntArrayAxis(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
bool reduce_all) {
std::vector<int64_t> vec_axis = axis.GetData();
std::vector<int64_t> vec_dim;
if (reduce_all) {
if (keep_dim) {
vec_dim = std::vector<int64_t>(x.dims().size(), 1);
} else {
vec_dim = {1};
}
} else {
if (keep_dim) {
vec_dim = std::vector<int64_t>(x.dims().size(), -1);
} else {
auto x_rank = static_cast<size_t>(x.dims().size());
if (vec_axis.size() >= x_rank) {
vec_dim = {-1};
} else {
vec_dim = std::vector<int64_t>(x.dims().size() - vec_axis.size(), -1);
}
}
}
return phi::make_ddim(vec_dim);
}
/* Why not use SumRawInferMeta directly? /* Why not use SumRawInferMeta directly?
Because we need make InferMetaFunction's args follow the design of Because we need make InferMetaFunction's args follow the design of
ops.yaml ops.yaml
...@@ -3977,9 +4067,10 @@ void SumRawInferMeta(const MetaTensor& x, ...@@ -3977,9 +4067,10 @@ void SumRawInferMeta(const MetaTensor& x,
MetaConfig config) { MetaConfig config) {
DDim out_dim; DDim out_dim;
if (config.is_runtime || !axis.FromTensor()) { if (config.is_runtime || !axis.FromTensor()) {
out_dim = ReduceInferDim(x, axis.GetData(), keep_dim, reduce_all); out_dim = OriginReduceInferDim(x, axis.GetData(), keep_dim, reduce_all);
} else { } else {
out_dim = ReduceInferDimForIntArrayAxis(x, axis, keep_dim, reduce_all); out_dim =
OriginReduceInferDimForIntArrayAxis(x, axis, keep_dim, reduce_all);
} }
DataType out_dtype; DataType out_dtype;
...@@ -3998,6 +4089,38 @@ void SumRawInferMeta(const MetaTensor& x, ...@@ -3998,6 +4089,38 @@ void SumRawInferMeta(const MetaTensor& x,
out->set_layout(x.layout()); out->set_layout(x.layout());
} }
// TODO(zhouwei): OriginReduce doesn't support 0D, remove in future
void OriginReduceInferMeta(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
MetaTensor* out,
MetaConfig config) {
bool reduce_all = false;
if (axis.size() == 0) {
reduce_all = true;
}
OriginReduceInferMetaBase(x, axis, keep_dim, reduce_all, out, config);
}
// TODO(zhouwei): OriginReduce doesn't support 0D, remove in future
void OriginReduceInferMetaBase(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
bool reduce_all,
MetaTensor* out,
MetaConfig config) {
DDim out_dim;
if (config.is_runtime || !axis.FromTensor()) {
out_dim = OriginReduceInferDim(x, axis.GetData(), keep_dim, reduce_all);
} else {
out_dim =
OriginReduceInferDimForIntArrayAxis(x, axis, keep_dim, reduce_all);
}
out->set_dims(out_dim);
out->set_dtype(x.dtype());
out->set_layout(x.layout());
}
void SvdInferMeta(const MetaTensor& x, void SvdInferMeta(const MetaTensor& x,
bool full_matrices, bool full_matrices,
MetaTensor* u, MetaTensor* u,
......
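
The ReduceInferDim rework above amounts to a simple output-shape rule: a reduced axis is either kept as 1 (keep_dim=True) or dropped entirely, with no padding back to shape [1]. An illustrative Python sketch of that rule (a simplification for reference, not the phi implementation; `reduce_out_shape` is a hypothetical helper):

```python
def reduce_out_shape(x_shape, axes, keep_dim, reduce_all):
    """Shape rule mirroring the new ReduceInferDim logic (illustrative only)."""
    rank = len(x_shape)
    axes = {a + rank if a < 0 else a for a in axes}   # normalize negative axes
    reduce_all = reduce_all or axes == set(range(rank))
    out = []
    for i, dim in enumerate(x_shape):
        if reduce_all or i in axes:
            if keep_dim:
                out.append(1)
            # else: the reduced dim is dropped; no padding back to [1]
        else:
            out.append(dim)
    return out

assert reduce_out_shape([3, 5], [], False, True) == []       # full reduce -> 0-D
assert reduce_out_shape([3, 5], [0], True, False) == [1, 5]  # keep_dim keeps a 1
assert reduce_out_shape([5], [0], False, False) == []        # 1-D reduced on axis 0 -> 0-D
```

OriginReduceInferDim / OriginReduceInferMeta, by contrast, retain the old padding to [1] for ops not yet migrated to 0-D outputs, hence the TODO comments marking them for removal.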
...@@ -572,6 +572,19 @@ void SumRawInferMeta(const MetaTensor& x, ...@@ -572,6 +572,19 @@ void SumRawInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void OriginReduceInferMeta(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
MetaTensor* out,
MetaConfig config = MetaConfig());
void OriginReduceInferMetaBase(const MetaTensor& x,
const IntArray& axis,
bool keep_dim,
bool reduce_all,
MetaTensor* out,
MetaConfig config = MetaConfig());
void SvdInferMeta(const MetaTensor& x, void SvdInferMeta(const MetaTensor& x,
bool full_matrices, bool full_matrices,
MetaTensor* u, MetaTensor* u,
......
...@@ -32,10 +32,12 @@ void MeanAllGradKernel(const Context& dev_ctx, ...@@ -32,10 +32,12 @@ void MeanAllGradKernel(const Context& dev_ctx,
out_grad.numel())); out_grad.numel()));
dev_ctx.template Alloc<T>(x_grad); dev_ctx.template Alloc<T>(x_grad);
T ig_size = static_cast<T>(x_grad->numel()); T x_numel = static_cast<T>(x_grad->numel());
Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size)); Eigen::DSizes<int, 1> bcast(static_cast<int>(x_numel));
EigenVector<T>::Flatten(*x_grad).device(*dev_ctx.eigen_device()) = auto eigen_x = EigenVector<T>::Flatten(*x_grad);
(EigenVector<T>::From(out_grad) / ig_size).broadcast(bcast); auto eigen_dout = EigenVector<T>::Flatten(out_grad);
eigen_x.device(*dev_ctx.eigen_device()) =
(eigen_dout / x_numel).broadcast(bcast);
} }
} // namespace phi } // namespace phi
......
...@@ -105,19 +105,19 @@ inline DDim GetOutputSqueezeShape(const std::vector<int> squeeze_dims, ...@@ -105,19 +105,19 @@ inline DDim GetOutputSqueezeShape(const std::vector<int> squeeze_dims,
inline DDim GetUnsqueezeShape(const std::vector<int64_t> unsqz_dims, inline DDim GetUnsqueezeShape(const std::vector<int64_t> unsqz_dims,
const DDim& in_dims) { const DDim& in_dims) {
int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size()); int output_rank = in_dims.size() + static_cast<int>(unsqz_dims.size());
int cur_output_size = in_dims.size(); int cur_output_rank = in_dims.size();
std::vector<int64_t> output_shape(output_size, 0); std::vector<int64_t> output_shape(output_rank, 0);
// Validity Check: rank range. // Validity Check: rank range.
PADDLE_ENFORCE_LE( PADDLE_ENFORCE_LE(
output_size, output_rank,
6, 6,
phi::errors::InvalidArgument("The output " phi::errors::InvalidArgument("The output "
"tensor's rank should be less than 6.")); "tensor's rank should be less than 6."));
for (int axis : unsqz_dims) { for (int axis : unsqz_dims) {
int cur = axis < 0 ? axis + cur_output_size + 1 : axis; int cur = axis < 0 ? axis + cur_output_rank + 1 : axis;
// Vaildity Check: the axis bound // Vaildity Check: the axis bound
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
cur, cur,
...@@ -125,12 +125,12 @@ inline DDim GetUnsqueezeShape(const std::vector<int64_t> unsqz_dims, ...@@ -125,12 +125,12 @@ inline DDim GetUnsqueezeShape(const std::vector<int64_t> unsqz_dims,
phi::errors::InvalidArgument("The insert dimension value should " phi::errors::InvalidArgument("The insert dimension value should "
"not be less than 0")); "not be less than 0"));
PADDLE_ENFORCE_LE(cur, PADDLE_ENFORCE_LE(cur,
cur_output_size, cur_output_rank,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"The insert dimension value shoule not be larger " "The insert dimension value shoule not be larger "
"than the dimension size of input tensor")); "than the dimension size of input tensor"));
// Move old axis, and insert new axis // Move old axis, and insert new axis
for (int i = cur_output_size; i >= cur; --i) { for (int i = cur_output_rank; i >= cur; --i) {
if (output_shape[i] == 1) { if (output_shape[i] == 1) {
// Move axis // Move axis
output_shape[i + 1] = 1; output_shape[i + 1] = 1;
...@@ -139,11 +139,11 @@ inline DDim GetUnsqueezeShape(const std::vector<int64_t> unsqz_dims, ...@@ -139,11 +139,11 @@ inline DDim GetUnsqueezeShape(const std::vector<int64_t> unsqz_dims,
} }
output_shape[cur] = 1; output_shape[cur] = 1;
// Add the output size. // Add the output size.
cur_output_size++; cur_output_rank++;
} }
// Make output shape // Make output shape
for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { for (int in_idx = 0, out_idx = 0; out_idx < output_rank; ++out_idx) {
if (output_shape[out_idx] == 0) { if (output_shape[out_idx] == 0) {
output_shape[out_idx] = in_dims[in_idx++]; output_shape[out_idx] = in_dims[in_idx++];
} }
......
...@@ -102,8 +102,10 @@ void ReduceKernel(const Context& dev_ctx, ...@@ -102,8 +102,10 @@ void ReduceKernel(const Context& dev_ctx,
reduction_p->execute(astream, reduction_args); reduction_p->execute(astream, reduction_args);
astream.wait(); astream.wait();
out->set_mem_desc( const auto reshape_dims = out->dims().size() != 0
dst_memory_p->get_desc().reshape(vectorize<int64_t>(out->dims()))); ? vectorize<int64_t>(out->dims())
: std::vector<int64_t>{1};
out->set_mem_desc(dst_memory_p->get_desc().reshape(reshape_dims));
} }
} }
......
...@@ -13,7 +13,8 @@ ...@@ -13,7 +13,8 @@
# limitations under the License # limitations under the License
from collections import OrderedDict from collections import OrderedDict
from functools import reduce
import numpy as np
import paddle import paddle
from paddle.utils.flops import flops from paddle.utils.flops import flops
...@@ -807,7 +808,7 @@ class CommOpCost(OpCost): ...@@ -807,7 +808,7 @@ class CommOpCost(OpCost):
factor = 8 factor = 8
else: else:
raise ValueError(f"Unsupported comm dtype {dtype}") raise ValueError(f"Unsupported comm dtype {dtype}")
comm_count = reduce(lambda x, y: x * y, shape) * factor comm_count = int(np.prod(shape)) * factor
self._comm_count = comm_count self._comm_count = comm_count
return self._comm_count return self._comm_count
......
...@@ -242,7 +242,7 @@ def unscale_method(self, optimizer): ...@@ -242,7 +242,7 @@ def unscale_method(self, optimizer):
paddle.distributed.all_reduce( paddle.distributed.all_reduce(
is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
) )
self._found_inf = is_found_inf.numpy()[0] self._found_inf = int(is_found_inf)
class MixPrecisionScaler: class MixPrecisionScaler:
......
...@@ -179,7 +179,7 @@ def monkey_patch_math_varbase(): ...@@ -179,7 +179,7 @@ def monkey_patch_math_varbase():
@property @property
def _size_(var): def _size_(var):
return np.prod(var.shape) return int(np.prod(var.shape))
@property @property
def _T_(var): def _T_(var):
......
...@@ -286,7 +286,7 @@ class TestDistRunnerBase: ...@@ -286,7 +286,7 @@ class TestDistRunnerBase:
fetch_list=[avg_cost.name], fetch_list=[avg_cost.name],
feed=feeder.feed(get_data()), feed=feeder.feed(get_data()),
) )
out_losses.append(loss[0]) out_losses.append(float(loss))
print_to_err(type(self).__name__, "run step %d finished" % i) print_to_err(type(self).__name__, "run step %d finished" % i)
print_to_err(type(self).__name__, "trainer run finished") print_to_err(type(self).__name__, "trainer run finished")
print_to_err(type(self).__name__, f"dist losses: {out_losses}") print_to_err(type(self).__name__, f"dist losses: {out_losses}")
...@@ -382,7 +382,7 @@ class TestDistRunnerBase: ...@@ -382,7 +382,7 @@ class TestDistRunnerBase:
fetch_list=[avg_cost.name], fetch_list=[avg_cost.name],
feed=feeder.feed(get_data()), feed=feeder.feed(get_data()),
) )
out_losses.append(loss[0]) out_losses.append(float(loss))
print_to_err(type(self).__name__, "run step %d finished" % i) print_to_err(type(self).__name__, "run step %d finished" % i)
print_to_err(type(self).__name__, "trainer run finished") print_to_err(type(self).__name__, "trainer run finished")
...@@ -619,7 +619,7 @@ class TestDistRunnerBase: ...@@ -619,7 +619,7 @@ class TestDistRunnerBase:
(loss,) = exe.run( (loss,) = exe.run(
binary, fetch_list=[avg_cost.name], feed=feeder.feed(get_data()) binary, fetch_list=[avg_cost.name], feed=feeder.feed(get_data())
) )
out_losses.append(loss[0]) out_losses.append(float(loss))
print_to_err(type(self).__name__, "run step %d finished" % i) print_to_err(type(self).__name__, "run step %d finished" % i)
if lr_scheduler is not None: if lr_scheduler is not None:
lr_scheduler.step() lr_scheduler.step()
......
...@@ -31,17 +31,17 @@ class TestFunctionalL1Loss(unittest.TestCase): ...@@ -31,17 +31,17 @@ class TestFunctionalL1Loss(unittest.TestCase):
dy_result = paddle.nn.functional.l1_loss(input, label) dy_result = paddle.nn.functional.l1_loss(input, label)
expected = np.mean(np.abs(self.input_np - self.label_np)) expected = np.mean(np.abs(self.input_np - self.label_np))
np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05)
self.assertTrue(dy_result.shape, [1]) self.assertEqual(dy_result.shape, [])
dy_result = paddle.nn.functional.l1_loss(input, label, reduction='sum') dy_result = paddle.nn.functional.l1_loss(input, label, reduction='sum')
expected = np.sum(np.abs(self.input_np - self.label_np)) expected = np.sum(np.abs(self.input_np - self.label_np))
np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05)
self.assertTrue(dy_result.shape, [1]) self.assertEqual(dy_result.shape, [1])
dy_result = paddle.nn.functional.l1_loss(input, label, reduction='none') dy_result = paddle.nn.functional.l1_loss(input, label, reduction='none')
expected = np.abs(self.input_np - self.label_np) expected = np.abs(self.input_np - self.label_np)
np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05)
self.assertTrue(dy_result.shape, [10, 10, 5]) self.assertEqual(dy_result.shape, [10, 10, 5])
def run_static(self, use_gpu=False): def run_static(self, use_gpu=False):
input = paddle.static.data( input = paddle.static.data(
...@@ -119,19 +119,19 @@ class TestClassL1Loss(unittest.TestCase): ...@@ -119,19 +119,19 @@ class TestClassL1Loss(unittest.TestCase):
dy_result = l1_loss(input, label) dy_result = l1_loss(input, label)
expected = np.mean(np.abs(self.input_np - self.label_np)) expected = np.mean(np.abs(self.input_np - self.label_np))
np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05)
self.assertTrue(dy_result.shape, [1]) self.assertEqual(dy_result.shape, [])
l1_loss = paddle.nn.loss.L1Loss(reduction='sum') l1_loss = paddle.nn.loss.L1Loss(reduction='sum')
dy_result = l1_loss(input, label) dy_result = l1_loss(input, label)
expected = np.sum(np.abs(self.input_np - self.label_np)) expected = np.sum(np.abs(self.input_np - self.label_np))
np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05)
self.assertTrue(dy_result.shape, [1]) self.assertEqual(dy_result.shape, [1])
l1_loss = paddle.nn.loss.L1Loss(reduction='none') l1_loss = paddle.nn.loss.L1Loss(reduction='none')
dy_result = l1_loss(input, label) dy_result = l1_loss(input, label)
expected = np.abs(self.input_np - self.label_np) expected = np.abs(self.input_np - self.label_np)
np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05)
self.assertTrue(dy_result.shape, [10, 10, 5]) self.assertEqual(dy_result.shape, [10, 10, 5])
def run_static(self, use_gpu=False): def run_static(self, use_gpu=False):
input = paddle.static.data( input = paddle.static.data(
......
...@@ -212,7 +212,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ...@@ -212,7 +212,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
adam_test.set_dict(opt_state) adam_test.set_dict(opt_state)
self.assertEqual( self.assertEqual(
adam_test._learning_rate.best_loss, adam_test._learning_rate.best_loss,
adam3._learning_rate.best_loss.numpy()[0], adam3._learning_rate.best_loss,
"best_loss is different before and after set_dict", "best_loss is different before and after set_dict",
) )
self.assertEqual( self.assertEqual(
...@@ -275,7 +275,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ...@@ -275,7 +275,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
t = lr() t = lr()
np.testing.assert_allclose( np.testing.assert_allclose(
t.numpy()[0].item(), right_result[i], rtol=1e-05 t.numpy().item(), right_result[i], rtol=1e-05
) )
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
...@@ -342,7 +342,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ...@@ -342,7 +342,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
right_result = step_decay( right_result = step_decay(
epoch, learning_rate, step_size, decay_rate epoch, learning_rate, step_size, decay_rate
) )
fluid_result = scheduler().numpy()[0] fluid_result = scheduler().numpy().item()
scheduler.epoch() scheduler.epoch()
self.assertAlmostEqual( self.assertAlmostEqual(
right_result, right_result,
...@@ -371,7 +371,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ...@@ -371,7 +371,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
for epoch in range(30): for epoch in range(30):
right_result = lambda_decay(epoch, learning_rate, lr_lambda) right_result = lambda_decay(epoch, learning_rate, lr_lambda)
fluid_result = scheduler().numpy()[0] fluid_result = scheduler().numpy().item()
scheduler.epoch() scheduler.epoch()
self.assertAlmostEqual( self.assertAlmostEqual(
right_result, right_result,
......
...@@ -208,7 +208,7 @@ class TestReduceOnPlateauDecay: ...@@ -208,7 +208,7 @@ class TestReduceOnPlateauDecay:
self.assertEqual( self.assertEqual(
scheduler.cooldown_counter, scheduler1.cooldown_counter scheduler.cooldown_counter, scheduler1.cooldown_counter
) )
self.assertEqual(scheduler.best.numpy()[0], scheduler1.best) self.assertEqual(scheduler.best, scheduler1.best)
self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs) self.assertEqual(scheduler.num_bad_epochs, scheduler1.num_bad_epochs)
self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch) self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch)
self.assertEqual(scheduler.last_lr, scheduler1.last_lr) self.assertEqual(scheduler.last_lr, scheduler1.last_lr)
......
...@@ -197,6 +197,7 @@ class TestReduceAPI(unittest.TestCase): ...@@ -197,6 +197,7 @@ class TestReduceAPI(unittest.TestCase):
out_empty_list = api(x, []) out_empty_list = api(x, [])
self.assertEqual(out_empty_list, out) self.assertEqual(out_empty_list, out)
self.assertEqual(out_empty_list.shape, [])
if x.grad is not None: if x.grad is not None:
self.assertEqual(x.grad.shape, []) self.assertEqual(x.grad.shape, [])
...@@ -218,6 +219,44 @@ class TestReduceAPI(unittest.TestCase): ...@@ -218,6 +219,44 @@ class TestReduceAPI(unittest.TestCase):
self.assertEqual(x.grad.shape, []) self.assertEqual(x.grad.shape, [])
np.testing.assert_allclose(x.grad.numpy(), np.array(3.0)) np.testing.assert_allclose(x.grad.numpy(), np.array(3.0))
if api in [
paddle.sum,
paddle.mean,
paddle.nanmean,
paddle.nansum,
]:
return
# 2) x is ND, reduce to 0D
if api in [paddle.all, paddle.any]:
x = paddle.randint(0, 2, [3, 5]).astype('bool')
else:
x = paddle.rand([3, 5])
x.stop_gradient = False
out = api(x, None)
out.retain_grads()
out.backward()
self.assertEqual(out.shape, [])
if x.grad is not None:
self.assertEqual(out.grad.shape, [])
self.assertEqual(x.grad.shape, [3, 5])
# 3) x is 1D, axis=0, reduce to 0D
if api in [paddle.all, paddle.any]:
x = paddle.randint(0, 2, [5]).astype('bool')
else:
x = paddle.rand([5])
x.stop_gradient = False
out = api(x, 0)
out.retain_grads()
out.backward()
self.assertEqual(out.shape, [])
if x.grad is not None:
self.assertEqual(out.grad.shape, [])
self.assertEqual(x.grad.shape, [5])
paddle.enable_static() paddle.enable_static()
def test_static_reduce(self): def test_static_reduce(self):
...@@ -262,6 +301,53 @@ class TestReduceAPI(unittest.TestCase): ...@@ -262,6 +301,53 @@ class TestReduceAPI(unittest.TestCase):
np.testing.assert_allclose(res[2], np.array(1.0)) np.testing.assert_allclose(res[2], np.array(1.0))
np.testing.assert_allclose(res[3], np.array(1.0)) np.testing.assert_allclose(res[3], np.array(1.0))
if api in [
paddle.sum,
paddle.mean,
paddle.nanmean,
paddle.nansum,
]:
return
# 2) x is ND, reduce to 0D
if api in [paddle.all, paddle.any]:
x = paddle.randint(0, 2, [3, 5]).astype('bool')
else:
x = paddle.rand([3, 5])
x = paddle.rand([3, 5])
x.stop_gradient = False
out = api(x, None)
paddle.static.append_backward(out)
fetch_list = [out]
if block.has_var(x.grad_name):
fetch_list.extend([out.grad_name, x.grad_name])
res = exe.run(main_prog, fetch_list=fetch_list)
self.assertEqual(res[0].shape, ())
if len(res) > 1:
self.assertEqual(res[1].shape, ())
self.assertEqual(res[2].shape, (3, 5))
# 3) x is 1D, axis=0, reduce to 0D
if api in [paddle.all, paddle.any]:
x = paddle.randint(0, 2, [5]).astype('bool')
else:
x = paddle.rand([5])
x.stop_gradient = False
out = api(x, 0)
paddle.static.append_backward(out)
fetch_list = [out]
if block.has_var(x.grad_name):
fetch_list.extend([out.grad_name, x.grad_name])
res = exe.run(main_prog, fetch_list=fetch_list)
self.assertEqual(res[0].shape, ())
if len(res) > 1:
self.assertEqual(res[1].shape, ())
self.assertEqual(res[2].shape, (5,))
paddle.disable_static() paddle.disable_static()
...@@ -1321,8 +1407,8 @@ class TestSundryAPI(unittest.TestCase): ...@@ -1321,8 +1407,8 @@ class TestSundryAPI(unittest.TestCase):
def test_shape(self): def test_shape(self):
out = paddle.shape(self.x) out = paddle.shape(self.x)
self.assertEqual(out.shape, [0])
np.testing.assert_array_equal(out.numpy(), np.array([])) np.testing.assert_array_equal(out.numpy(), np.array([]))
self.assertEqual(out.shape, [0])
def test_equal_scalar(self): def test_equal_scalar(self):
x = paddle.rand([]) x = paddle.rand([])
...@@ -1382,6 +1468,16 @@ class TestSundryAPI(unittest.TestCase): ...@@ -1382,6 +1468,16 @@ class TestSundryAPI(unittest.TestCase):
self.assertEqual(out.grad.shape, []) self.assertEqual(out.grad.shape, [])
self.assertEqual(x.grad.shape, []) self.assertEqual(x.grad.shape, [])
x1 = paddle.uniform([], None, -10, 10)
x1.stop_gradient = False
out1 = paddle.clip(x1, paddle.full([], 5.0), paddle.full([], 5.0))
out1.retain_grads()
out1.backward()
self.assertEqual(out1.shape, [])
self.assertEqual(out1.grad.shape, [])
self.assertEqual(x1.grad.shape, [])
def test_increment(self): def test_increment(self):
x = paddle.rand([]) x = paddle.rand([])
x.stop_gradient = False x.stop_gradient = False
...@@ -1614,6 +1710,11 @@ class TestSundryAPI(unittest.TestCase): ...@@ -1614,6 +1710,11 @@ class TestSundryAPI(unittest.TestCase):
self.assertEqual(out.grad.shape, []) self.assertEqual(out.grad.shape, [])
self.assertEqual(x.grad.shape, []) self.assertEqual(x.grad.shape, [])
def test_scale_(self):
x = paddle.rand([])
out = x.scale_(scale=2.0, bias=1.0)
self.assertEqual(out.shape, [])
def test_floor_divide(self): def test_floor_divide(self):
# 1-d // 0-d # 1-d // 0-d
x = paddle.to_tensor([1, -2, 3], dtype="int64") x = paddle.to_tensor([1, -2, 3], dtype="int64")
...@@ -1946,32 +2047,6 @@ class TestSundryAPI(unittest.TestCase): ...@@ -1946,32 +2047,6 @@ class TestSundryAPI(unittest.TestCase):
# check grad shape with 1D repeats # check grad shape with 1D repeats
self.assertEqual(x.grad.shape, []) self.assertEqual(x.grad.shape, [])
def test_sigmoid_focal_loss(self):
logit = paddle.to_tensor(
[[0.97, 0.91, 0.03], [0.55, 0.43, 0.71]],
dtype='float32',
stop_gradient=False,
)
logit.retain_grads()
label = paddle.to_tensor(
[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32'
)
fg_num_0 = paddle.full([], 2.0)
fg_num_1 = paddle.full([1], 2.0)
out0 = F.sigmoid_focal_loss(logit, label, normalizer=fg_num_0)
out1 = F.sigmoid_focal_loss(logit, label, normalizer=fg_num_1)
out0.retain_grads()
np.testing.assert_array_equal(
out0.numpy(),
out1.numpy(),
)
out0.backward()
self.assertEqual(out0.grad.shape, [1])
self.assertEqual(logit.grad.shape, [2, 3])
def test_allclose(self): def test_allclose(self):
# 1) x is 0D # 1) x is 0D
x = paddle.full([], 0.5) x = paddle.full([], 0.5)
...@@ -2454,6 +2529,7 @@ class TestSundryAPIStatic(unittest.TestCase): ...@@ -2454,6 +2529,7 @@ class TestSundryAPIStatic(unittest.TestCase):
self.assertEqual(res[3].shape, ()) self.assertEqual(res[3].shape, ())
self.assertEqual(res[3], 1.0) self.assertEqual(res[3], 1.0)
@prog_scope()
def test_argmin(self): def test_argmin(self):
# 1) x is 0D # 1) x is 0D
x = paddle.rand([]) x = paddle.rand([])
...@@ -2998,14 +3074,33 @@ class TestSundryAPIStatic(unittest.TestCase): ...@@ -2998,14 +3074,33 @@ class TestSundryAPIStatic(unittest.TestCase):
out = paddle.clip(x, -5, 5) out = paddle.clip(x, -5, 5)
paddle.static.append_backward(out) paddle.static.append_backward(out)
x1 = paddle.uniform([], None, -10, 10)
x1.stop_gradient = False
out1 = paddle.clip(x1, paddle.full([], 5.0), paddle.full([], 5.0))
paddle.static.append_backward(out1)
prog = paddle.static.default_main_program() prog = paddle.static.default_main_program()
res = self.exe.run( res = self.exe.run(
prog, fetch_list=[x, out, x.grad_name, out.grad_name] prog,
fetch_list=[
x,
out,
x.grad_name,
out.grad_name,
x1,
out1,
x1.grad_name,
out1.grad_name,
],
) )
self.assertEqual(res[0].shape, ()) self.assertEqual(res[0].shape, ())
self.assertEqual(res[1].shape, ()) self.assertEqual(res[1].shape, ())
self.assertEqual(res[2].shape, ()) self.assertEqual(res[2].shape, ())
self.assertEqual(res[3].shape, ()) self.assertEqual(res[3].shape, ())
self.assertEqual(res[4].shape, ())
self.assertEqual(res[5].shape, ())
self.assertEqual(res[6].shape, ())
self.assertEqual(res[7].shape, ())
@prog_scope() @prog_scope()
def test_increment(self): def test_increment(self):
...@@ -3340,6 +3435,7 @@ class TestSundryAPIStatic(unittest.TestCase): ...@@ -3340,6 +3435,7 @@ class TestSundryAPIStatic(unittest.TestCase):
self.assertEqual(out2.shape, ()) self.assertEqual(out2.shape, ())
self.assertEqual(out3.shape, ()) self.assertEqual(out3.shape, ())
@prog_scope()
def test_add_n(self): def test_add_n(self):
x1 = paddle.rand([]) x1 = paddle.rand([])
x1.stop_gradient = False x1.stop_gradient = False
...@@ -3962,15 +4058,14 @@ class TestSundryAPIStatic(unittest.TestCase): ...@@ -3962,15 +4058,14 @@ class TestSundryAPIStatic(unittest.TestCase):
np.testing.assert_array_equal(res[0], np.array(2)) np.testing.assert_array_equal(res[0], np.array(2))
@prog_scope() @prog_scope()
def _test_shape(self): def test_shape(self):
x = paddle.full([], 0.5) x = paddle.full([], 0.5)
out = paddle.shape(x) out = paddle.shape(x)
prog = paddle.static.default_main_program() prog = paddle.static.default_main_program()
res = self.exe.run(prog, fetch_list=[out]) res = self.exe.run(prog, fetch_list=[out])
# 0-Size should be [ np.array([]) ], its [None] now
self.assertEqual(res[0].shape, (0))
np.testing.assert_array_equal(res[0], np.array([])) np.testing.assert_array_equal(res[0], np.array([]))
self.assertEqual(res[0].shape, (0,))
def test_broadcast_tensors(self): def test_broadcast_tensors(self):
# 1) x is 0D, y is 0D # 1) x is 0D, y is 0D
...@@ -4725,5 +4820,75 @@ class TestDistribution(unittest.TestCase): ...@@ -4725,5 +4820,75 @@ class TestDistribution(unittest.TestCase):
# self.assertEqual(d.entropy().shape, []) # self.assertEqual(d.entropy().shape, [])
class TestLossAPI(unittest.TestCase):
def test_sigmoid_focal_loss(self):
logit = paddle.to_tensor(
[[0.97, 0.91, 0.03], [0.55, 0.43, 0.71]],
dtype='float32',
stop_gradient=False,
)
logit.retain_grads()
label = paddle.to_tensor(
[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32'
)
fg_num_0 = paddle.full([], 2.0)
fg_num_1 = paddle.full([1], 2.0)
out0 = F.sigmoid_focal_loss(
logit, label, normalizer=fg_num_0, reduction='mean'
)
out1 = F.sigmoid_focal_loss(
logit, label, normalizer=fg_num_1, reduction='mean'
)
out0.retain_grads()
np.testing.assert_array_equal(
out0.numpy(),
out1.numpy(),
)
out0.backward()
self.assertEqual(out0.shape, [])
self.assertEqual(out1.shape, [])
self.assertEqual(out0.grad.shape, [])
self.assertEqual(logit.grad.shape, [2, 3])
class TestLossAPIStatic(unittest.TestCase):
def setUp(self):
paddle.enable_static()
self.exe = paddle.static.Executor()
@prog_scope()
def test_sigmoid_focal_loss(self):
logit = paddle.rand([2, 3])
logit.stop_gradient = False
label = paddle.randint(0, 1, [2, 3]).astype('float32')
label.stop_gradient = False
fg_num_0 = paddle.full([], 2.0)
fg_num_1 = paddle.full([1], 2.0)
out0 = F.sigmoid_focal_loss(
logit, label, normalizer=fg_num_0, reduction='mean'
)
out1 = F.sigmoid_focal_loss(
logit, label, normalizer=fg_num_1, reduction='mean'
)
paddle.static.append_backward(out0.sum())
prog = paddle.static.default_main_program()
res = self.exe.run(
prog, fetch_list=[out0, out1, out0.grad_name, logit.grad_name]
)
np.testing.assert_allclose(res[0], res[1])
# because static use paddle.mean
# self.assertEqual(res[0].shape, ())
# self.assertEqual(res[1].shape, ())
# self.assertEqual(res[2].shape, ())
self.assertEqual(res[3].shape, (2, 3))
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -81,8 +81,13 @@ class ProgressBar: ...@@ -81,8 +81,13 @@ class ProgressBar:
for i, (k, val) in enumerate(values): for i, (k, val) in enumerate(values):
if k == "loss": if k == "loss":
val = val if isinstance(val, (list, np.ndarray)) else [val] if isinstance(val, list):
if isinstance(val[0], np.uint16): scalar_val = val[0]
elif isinstance(val, np.ndarray):
scalar_val = val.item()
else:
scalar_val = val
if isinstance(scalar_val, np.uint16):
values[i] = ("loss", list(convert_uint16_to_float(val))) values[i] = ("loss", list(convert_uint16_to_float(val)))
if current_num: if current_num:
......
...@@ -698,7 +698,7 @@ class ClipGradByGlobalNorm(ClipGradBase): ...@@ -698,7 +698,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var = paddle.add_n(global_norm_var) global_norm_var = paddle.add_n(global_norm_var)
global_norm_var = paddle.sqrt(global_norm_var) global_norm_var = paddle.sqrt(global_norm_var)
max_global_norm = paddle.full( max_global_norm = paddle.full(
shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm shape=[], dtype=global_norm_var.dtype, fill_value=self.clip_norm
) )
need_clip = False need_clip = False
......
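
A small illustration of the shape=[] convention adopted above (a sketch; the fill_value of 1.0 is arbitrary and stands in for clip_norm):

```python
import paddle

# A 0-D tensor created with shape=[] behaves like a scalar threshold.
max_global_norm = paddle.full(shape=[], dtype='float32', fill_value=1.0)
print(max_global_norm.shape)   # []
print(float(max_global_norm))  # 1.0
```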
...@@ -178,7 +178,7 @@ class FakeQuantActLSQPlus(Layer): ...@@ -178,7 +178,7 @@ class FakeQuantActLSQPlus(Layer):
s_attr = ParamAttr( s_attr = ParamAttr(
name=self._scale_name, initializer=Constant(1.0), trainable=True name=self._scale_name, initializer=Constant(1.0), trainable=True
) )
self.s = self.create_parameter(shape=[1], attr=s_attr, dtype='float32') self.s = self.create_parameter(shape=[], attr=s_attr, dtype='float32')
self.s.stop_gradient = False self.s.stop_gradient = False
if not self.symmetric: if not self.symmetric:
...@@ -189,7 +189,7 @@ class FakeQuantActLSQPlus(Layer): ...@@ -189,7 +189,7 @@ class FakeQuantActLSQPlus(Layer):
name=self._beta_name, initializer=Constant(0.0), trainable=True name=self._beta_name, initializer=Constant(0.0), trainable=True
) )
self.beta = self.create_parameter( self.beta = self.create_parameter(
shape=[1], attr=beta_attr, dtype='float32' shape=[], attr=beta_attr, dtype='float32'
) )
self.beta.stop_gradient = False self.beta.stop_gradient = False
......
...@@ -256,7 +256,10 @@ def jac(grad_fn, f, inputs): ...@@ -256,7 +256,10 @@ def jac(grad_fn, f, inputs):
_vs = vs.copy() _vs = vs.copy()
_vs[i] = _v _vs[i] = _v
_, grads = grad_fn(f, inputs, _vs) _, grads = grad_fn(f, inputs, _vs)
d_outs = paddle.concat([d_out.flatten() for d_out in grads]) if isinstance(grads, typing.Sequence):
d_outs = paddle.concat([d_out.flatten() for d_out in grads])
else:
d_outs = grads.flatten()
JJ_cols.append(d_outs) JJ_cols.append(d_outs)
# JJ is the fully unrolled jacobian # JJ is the fully unrolled jacobian
JJ = paddle.stack(JJ_cols) JJ = paddle.stack(JJ_cols)
......
...@@ -26,10 +26,7 @@ from paddle.incubate.autograd.utils import as_tensors ...@@ -26,10 +26,7 @@ from paddle.incubate.autograd.utils import as_tensors
# Finite Difference Utils # Finite Difference Utils
########################################################## ##########################################################
def _product(t): def _product(t):
if isinstance(t, int): return int(np.product(t))
return t
else:
return np.product(t)
def _get_item(t, idx): def _get_item(t, idx):
......
...@@ -407,7 +407,7 @@ class BaseModel(paddle.nn.Layer): ...@@ -407,7 +407,7 @@ class BaseModel(paddle.nn.Layer):
parent_ids = [] parent_ids = []
for step_idx in range(paddle.to_tensor(self.beam_max_step_num)): for step_idx in range(paddle.to_tensor(self.beam_max_step_num)):
if paddle.sum(1 - beam_finished).numpy()[0] == 0: if paddle.sum(1 - beam_finished) == 0:
break break
step_input = self._merge_batch_beams(step_input) step_input = self._merge_batch_beams(step_input)
new_dec_hidden, new_dec_cell = [], [] new_dec_hidden, new_dec_cell = [], []
......
...@@ -158,7 +158,7 @@ class TestConvertShapeCompare(unittest.TestCase): ...@@ -158,7 +158,7 @@ class TestConvertShapeCompare(unittest.TestCase):
fetch_list=[eq_out, not_eq_out, long_eq_out], fetch_list=[eq_out, not_eq_out, long_eq_out],
) )
np.testing.assert_array_equal( np.testing.assert_array_equal(
np.array(x_y_eq_out), np.array([[True], [False], [False]]) np.array(x_y_eq_out), np.array([True, False, False])
) )
set_a_zero = np.ones([3, 2]).astype(np.float32) set_a_zero = np.ones([3, 2]).astype(np.float32)
...@@ -168,7 +168,7 @@ class TestConvertShapeCompare(unittest.TestCase): ...@@ -168,7 +168,7 @@ class TestConvertShapeCompare(unittest.TestCase):
fetch_list=[eq_out, not_eq_out, long_eq_out], fetch_list=[eq_out, not_eq_out, long_eq_out],
) )
np.testing.assert_array_equal( np.testing.assert_array_equal(
np.array(x_y_not_eq_out), np.array([[False], [True], [True]]) np.array(x_y_not_eq_out), np.array([False, True, True])
) )
paddle.disable_static() paddle.disable_static()
......
...@@ -28,7 +28,7 @@ from paddle.static import InputSpec ...@@ -28,7 +28,7 @@ from paddle.static import InputSpec
def for_in_range(x): def for_in_range(x):
z = paddle.tensor.fill_constant([1], 'int32', 0) z = paddle.tensor.fill_constant([1], 'int32', 0)
x = fluid.dygraph.to_variable(x) x = fluid.dygraph.to_variable(x)
for i in range(x.numpy()[0]): for i in range(x.numpy().item()):
z = z + i z = z + i
return z return z
......
...@@ -573,7 +573,7 @@ class TestLACModel(unittest.TestCase): ...@@ -573,7 +573,7 @@ class TestLACModel(unittest.TestCase):
words, targets, length = batch words, targets, length = batch
start_time = time.time() start_time = time.time()
avg_cost, crf_decode = model(words, targets, length) avg_cost, crf_decode = model(words, targets, length)
loss_data.append(avg_cost.numpy()[0]) loss_data.append(float(avg_cost))
# backward and optimization # backward and optimization
avg_cost.backward() avg_cost.backward()
......
...@@ -100,7 +100,7 @@ class TestPureFP16(TestMNIST): ...@@ -100,7 +100,7 @@ class TestPureFP16(TestMNIST):
scaled.backward() scaled.backward()
scaler.minimize(optimizer, scaled) scaler.minimize(optimizer, scaled)
loss_data.append(avg_loss.numpy()[0]) loss_data.append(float(avg_loss))
# save checkpoint # save checkpoint
mnist.clear_gradients() mnist.clear_gradients()
if batch_id % 2 == 0: if batch_id % 2 == 0:
......
...@@ -176,7 +176,7 @@ def train(args, place, to_static): ...@@ -176,7 +176,7 @@ def train(args, place, to_static):
state, reward, done, _ = env.step(action) state, reward, done, _ = env.step(action)
# log loss_probs # log loss_probs
loss_data.append(loss.numpy()[0]) loss_data.append(float(loss))
policy.rewards.append(reward) policy.rewards.append(reward)
ep_reward += reward ep_reward += reward
...@@ -191,7 +191,7 @@ def train(args, place, to_static): ...@@ -191,7 +191,7 @@ def train(args, place, to_static):
if i_episode % args.log_interval == 0: if i_episode % args.log_interval == 0:
print( print(
'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.format( 'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.format(
i_episode, ep_reward, running_reward, loss.numpy()[0] i_episode, ep_reward, running_reward, float(loss)
) )
) )
......
...@@ -86,7 +86,7 @@ def train(to_static, build_strategy=None): ...@@ -86,7 +86,7 @@ def train(to_static, build_strategy=None):
scaler.minimize(optimizer, scaled) scaler.minimize(optimizer, scaled)
resnet.clear_gradients() resnet.clear_gradients()
loss_data.append(avg_loss.numpy()[0]) loss_data.append(float(avg_loss))
total_loss += avg_loss total_loss += avg_loss
total_acc1 += acc_top1 total_acc1 += acc_top1
total_acc5 += acc_top5 total_acc5 += acc_top5
......
...@@ -342,7 +342,7 @@ def train(args, to_static): ...@@ -342,7 +342,7 @@ def train(args, to_static):
model.train() model.train()
avg_cost, prediction, acc = model(doc, label) avg_cost, prediction, acc = model(doc, label)
loss_data.append(avg_cost.numpy()[0]) loss_data.append(float(avg_cost))
avg_cost.backward() avg_cost.backward()
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
...@@ -358,7 +358,7 @@ def train(args, to_static): ...@@ -358,7 +358,7 @@ def train(args, to_static):
"step: %d, ave loss: %f, speed: %f steps/s" "step: %d, ave loss: %f, speed: %f steps/s"
% ( % (
batch_id, batch_id,
avg_cost.numpy()[0], float(avg_cost),
args.log_step / used_time, args.log_step / used_time,
) )
) )
......
...@@ -261,7 +261,7 @@ def train_dygraph(args, batch_generator): ...@@ -261,7 +261,7 @@ def train_dygraph(args, batch_generator):
transformer.clear_gradients() transformer.clear_gradients()
if step_idx % args.print_step == 0: if step_idx % args.print_step == 0:
total_avg_cost = avg_cost.numpy() * trainer_count total_avg_cost = avg_cost.numpy() * trainer_count
avg_loss.append(total_avg_cost[0]) avg_loss.append(float(total_avg_cost))
if step_idx == 0: if step_idx == 0:
logging.info( logging.info(
"step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
......