Unverified commit 9ffc760f, authored by H heyanru, committed by GitHub

[Fluid Clean] remove paddle.fluid.layers.nn.reduce_mean (#48196)

Parent b9421dc1
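
This cleanup replaces every call to the deprecated fluid.layers.reduce_mean with paddle.mean; per the removed wrapper further down in this diff, the input/dim/keep_dim arguments map to x/axis/keepdim. A minimal migration sketch (the tensor below is illustrative and not part of the diff):

import paddle

cost = paddle.rand([8, 4])
# before: loss = fluid.layers.reduce_mean(cost, dim=-1, keep_dim=True)
loss = paddle.mean(cost, axis=-1, keepdim=True)  # shape [8, 1]
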
......@@ -64,7 +64,7 @@ class AdaRoundLoss:
square_cost = fluid.layers.square_error_cost(
ada_quantized_output, orig_output
)
recon_loss = fluid.layers.reduce_mean(paddle.sum(square_cost, axis=-1))
recon_loss = paddle.mean(paddle.sum(square_cost, axis=-1))
return recon_loss
def compute_round_loss(self, alpha_v, warm_start, beta):
......
......@@ -53,7 +53,7 @@ class TestMovingAverageAbsMaxScaleOp(unittest.TestCase):
cross_entropy = fluid.layers.softmax_with_cross_entropy(
fc_tmp, label
)
loss = fluid.layers.reduce_mean(cross_entropy)
loss = paddle.mean(cross_entropy)
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
......
......@@ -122,7 +122,7 @@ class TestCorrelationOp(unittest.TestCase):
stride2=1,
)
loss = fluid.layers.reduce_mean(out)
loss = paddle.mean(out)
optimizer = fluid.optimizer.Momentum(0.0001, 0.9)
optimizer.minimize(loss)
......
......@@ -71,7 +71,6 @@ __all__ = [
'softmax',
'pool2d',
'batch_norm',
'reduce_mean',
'reduce_all',
'reduce_any',
'dropout',
......@@ -2506,63 +2505,6 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
return out
@deprecated(since="2.0.0", update_to="paddle.mean")
def reduce_mean(input, dim=None, keep_dim=False, name=None):
"""
Computes the mean of the input tensor's elements along the given dimension.
Args:
input (Variable): The input variable which is a Tensor, the data type is float32,
float64, int32, int64.
dim (list|int, optional): The dimension along which the mean is computed. If
`None`, compute the mean over all elements of :attr:`input`
and return a variable with a single element, otherwise it
must be in the range :math:`[-rank(input), rank(input))`. If
:math:`dim[i] < 0`, the dimension to reduce is
:math:`rank(input) + dim[i]`.
keep_dim (bool, optional): Whether to keep the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. Default
value is False.
name (str, optional): The default value is None. Normally there is no need for
the user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, the result of averaging over the specified dim of the input tensor;
its data type is the same as the input Tensor's.
Raises:
TypeError: if the output data type is different from the input data type.
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
paddle.enable_static()
# x is a Tensor variable with following elements:
# [[0.2, 0.3, 0.5, 0.9]
# [0.1, 0.2, 0.6, 0.7]]
# Each example is followed by the corresponding output tensor.
x = fluid.data(name='x', shape=[2, 4], dtype='float32')
fluid.layers.reduce_mean(x) # [0.4375]
fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8]
fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4]
fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]]
# y is a Tensor variable with shape [2, 2, 2] and elements as below:
# [[[1.0, 2.0], [3.0, 4.0]],
# [[5.0, 6.0], [7.0, 8.0]]]
# Each example is followed by the corresponding output tensor.
y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32')
fluid.layers.reduce_mean(y, dim=[1, 2]) # [2.5, 6.5]
fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0]
"""
return paddle.mean(x=input, axis=dim, keepdim=keep_dim, name=name)
def reduce_all(input, dim=None, keep_dim=False, name=None):
"""
......
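
The examples in the removed reduce_mean docstring translate one-for-one to paddle.mean; a sketch under the same static-graph setup, assuming the feed values listed in the docstring comments above:

import paddle

paddle.enable_static()
x = paddle.static.data(name='x', shape=[2, 4], dtype='float32')
# with x holding [[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6, 0.7]]:
paddle.mean(x)                        # [0.4375]
paddle.mean(x, axis=0)                # [0.15, 0.25, 0.55, 0.8]
paddle.mean(x, axis=-1)               # [0.475, 0.4]
paddle.mean(x, axis=1, keepdim=True)  # [[0.475], [0.4]]
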
......@@ -17,6 +17,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
from paddle.fluid import unique_name
......@@ -71,7 +72,7 @@ class AutoCheckpointBase(unittest.TestCase):
cross_entropy = fluid.layers.softmax_with_cross_entropy(
fc_tmp, label
)
loss = fluid.layers.reduce_mean(cross_entropy)
loss = paddle.mean(cross_entropy)
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
if minimize:
sgd.minimize(loss)
......
......@@ -39,7 +39,7 @@ def make_program_lookup_table_v1_mp_dp():
dtype="float32",
is_sparse=False,
)
loss = paddle.fluid.layers.reduce_mean(emb_out)
loss = paddle.mean(emb_out)
auto.shard_tensor(
src_ids,
......
......@@ -57,7 +57,7 @@ def dyfunc_with_if_else2(x, col=100):
# `x` is a Tensor, `col` is not a Tensor, and `col` is the return value of `true_fn` after transformation.
# col = -1
col = fluid.layers.fill_constant(shape=[1], value=-1, dtype="int64")
if fluid.layers.reduce_mean(x).numpy()[0] > x.numpy()[row][col]:
if paddle.mean(x).numpy()[0] > x.numpy()[row][col]:
y = fluid.layers.relu(x)
else:
x_pow = paddle.pow(x, 2)
......
......@@ -303,7 +303,7 @@ class BaseModel(fluid.dygraph.Layer):
tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32'
)
loss = loss * tar_mask
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss
......@@ -837,7 +837,7 @@ class AttentionModel(fluid.dygraph.Layer):
tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32'
)
loss = loss * tar_mask
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.sum(loss)
loss = paddle.mean(loss, axis=[0])
loss = fluid.layers.reduce_sum(loss)
return loss
......@@ -114,7 +114,7 @@ class ReduceMeanLayer:
"""
operation
"""
mean = fluid.layers.reduce_mean(input)
mean = paddle.mean(input)
return mean
......
......@@ -331,11 +331,11 @@ def bmn_loss_func(
epsilon = 0.000001
# temp = paddle.log(pred_score + epsilon)
loss_pos = paddle.multiply(paddle.log(pred_score + epsilon), pmask)
loss_pos = coef_1 * fluid.layers.reduce_mean(loss_pos)
loss_pos = coef_1 * paddle.mean(loss_pos)
loss_neg = paddle.multiply(
paddle.log(1.0 - pred_score + epsilon), (1.0 - pmask)
)
loss_neg = coef_0 * fluid.layers.reduce_mean(loss_neg)
loss_neg = coef_0 * paddle.mean(loss_neg)
loss = -1 * (loss_pos + loss_neg)
return loss
......
......@@ -93,8 +93,8 @@ class Cycle_Gan(fluid.dygraph.Layer):
diff_A = paddle.abs(paddle.subtract(x=input_A, y=cyc_A))
diff_B = paddle.abs(paddle.subtract(x=input_B, y=cyc_B))
cyc_A_loss = fluid.layers.reduce_mean(diff_A) * lambda_A
cyc_B_loss = fluid.layers.reduce_mean(diff_B) * lambda_B
cyc_A_loss = paddle.mean(diff_A) * lambda_A
cyc_B_loss = paddle.mean(diff_B) * lambda_B
cyc_loss = cyc_A_loss + cyc_B_loss
fake_rec_A = self.build_gen_discriminator_a(fake_B)
......@@ -105,8 +105,8 @@ class Cycle_Gan(fluid.dygraph.Layer):
G = g_A_loss + g_B_loss
idt_A = self.build_generator_resnet_9blocks_a(input_B)
idt_loss_A = (
fluid.layers.reduce_mean(
paddle.abs(paddle.subtract(x=input_B, y=idt_A))
paddle.mean(
paddle.abs(fluid.layers.elementwise_sub(x=input_B, y=idt_A))
)
* lambda_B
* lambda_identity
......@@ -114,8 +114,8 @@ class Cycle_Gan(fluid.dygraph.Layer):
idt_B = self.build_generator_resnet_9blocks_b(input_A)
idt_loss_B = (
fluid.layers.reduce_mean(
paddle.abs(paddle.subtract(x=input_A, y=idt_B))
paddle.mean(
paddle.abs(fluid.layers.elementwise_sub(x=input_A, y=idt_B))
)
* lambda_A
* lambda_identity
......@@ -648,7 +648,7 @@ def train(args, to_static):
d_loss_A = (
paddle.square(fake_pool_rec_B) + paddle.square(rec_B - 1)
) / 2.0
d_loss_A = fluid.layers.reduce_mean(d_loss_A)
d_loss_A = paddle.mean(d_loss_A)
d_loss_A.backward()
optimizer2.minimize(d_loss_A)
......@@ -661,7 +661,7 @@ def train(args, to_static):
d_loss_B = (
paddle.square(fake_pool_rec_A) + paddle.square(rec_A - 1)
) / 2.0
d_loss_B = fluid.layers.reduce_mean(d_loss_B)
d_loss_B = paddle.mean(d_loss_B)
d_loss_B.backward()
optimizer3.minimize(d_loss_B)
......
......@@ -220,7 +220,7 @@ class PtbModel(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss, last_hidden, last_cell
......
......@@ -265,7 +265,7 @@ class SkipGram(fluid.dygraph.Layer):
loss = paddle.nn.functional.binary_cross_entropy_with_logits(
word_sim, label
)
loss = fluid.layers.reduce_mean(loss)
loss = paddle.mean(loss)
return pred, loss
......
......@@ -325,7 +325,7 @@ class YOLOv3(fluid.dygraph.Layer):
downsample_ratio=self.downsample,
use_label_smooth=cfg.label_smooth,
)
self.losses.append(fluid.layers.reduce_mean(loss))
self.losses.append(paddle.mean(loss))
else:
mask_anchors = []
......
......@@ -28,7 +28,7 @@ class TestMean(IPUOpTest):
self.set_test_op()
def set_test_op(self):
self.op = paddle.fluid.layers.reduce_mean
self.op = paddle.mean
def set_feed_attr(self):
self.feed_shape = [x.shape for x in self.feed_fp32.values()]
......
......@@ -144,7 +144,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
test_trt_pool3d_op
PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45)
endif()
set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60)
set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60)
set_tests_properties(test_trt_fc_fuse_quant_dequant_pass PROPERTIES TIMEOUT
100)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig, PassVersionChecker
class TRTReduceMeanTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, -1, -1], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(
data, dim=[2, -1], keep_dim=True
)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([3, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanTest.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False
)
self.fetch_list = [out]
self.dynamic_shape_params = TRTReduceMeanTest.DynamicShapeParam(
{'data': [1, 3, 16, 16]},
{'data': [3, 3, 56, 56]},
{'data': [3, 3, 56, 56]},
False,
)
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu, flatten=True)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanAllNoBatchTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, -1, -1], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([3, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanAllNoBatchTest.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False
)
self.fetch_list = [out]
self.dynamic_shape_params = (
TRTReduceMeanAllNoBatchTest.DynamicShapeParam(
{'data': [1, 3, 16, 16]},
{'data': [3, 3, 56, 56]},
{'data': [3, 3, 56, 56]},
False,
)
)
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu, flatten=True)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanTestFP16(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, -1, -1], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(
data, dim=[2, -1], keep_dim=True
)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([3, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanTestFP16.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False
)
self.fetch_list = [out]
self.dynamic_shape_params = TRTReduceMeanTestFP16.DynamicShapeParam(
{'data': [1, 3, 16, 16]},
{'data': [3, 3, 56, 56]},
{'data': [3, 3, 56, 56]},
False,
)
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu, flatten=True)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanAllTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 56, 56], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([3, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanAllTest.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False
)
self.fetch_list = [out]
self.dynamic_shape_params = TRTReduceMeanAllTest.DynamicShapeParam(
{'data': [1, 3, 56, 56]},
{'data': [3, 3, 56, 56]},
{'data': [3, 3, 56, 56]},
False,
)
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu, flatten=True)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanTestStatic(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[3, 3, 56, 56], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(
data, dim=[2, -1], keep_dim=True
)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([3, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanTestStatic.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False
)
self.fetch_list = [out]
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu, flatten=True)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanStaticAllTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[4, 3, 56, 56], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([4, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanStaticAllTest.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False
)
self.fetch_list = [out]
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(use_gpu, flatten=True)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanStaticFP16(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[4, 3, 56, 56], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([4, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanStaticFP16.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False
)
self.fetch_list = [out]
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, flatten=True, atol=1e-3, rtol=1e-3
)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
class TRTReduceMeanFP16Static(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[4, 3, 56, 56], dtype="float32"
)
reduce_mean = fluid.layers.reduce_mean(data, keep_dim=True)
out = fluid.layers.batch_norm(reduce_mean, is_test=True)
self.feeds = {
"data": np.random.random([4, 3, 56, 56]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TRTReduceMeanFP16Static.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Half, True, False
)
self.fetch_list = [out]
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
self.check_output_with_option(
use_gpu, flatten=True, atol=1e-3, rtol=1e-3
)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
)
if __name__ == "__main__":
unittest.main()
......@@ -264,7 +264,7 @@ class TestNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
adam = fluid.optimizer.Adam(learning_rate=0.01)
adam.minimize(loss)
......
......@@ -215,7 +215,7 @@ class TestNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
adam = paddle.optimizer.AdamW(learning_rate=0.01, weight_decay=0.02)
adam.minimize(loss)
......
......@@ -344,7 +344,7 @@ class TestElementwiseMaxNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -190,7 +190,7 @@ class TestElementwiseMinOpNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -113,7 +113,7 @@ class TestGeluNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -107,7 +107,7 @@ class TestLeakyReluNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -126,7 +126,7 @@ class TestRelu6Net(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -127,7 +127,7 @@ class TestReluNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -127,7 +127,7 @@ class TestPowNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2)
cost = fluid.layers.softmax_with_cross_entropy(prediction, label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -108,7 +108,7 @@ class TestTanhNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -264,7 +264,7 @@ class TestNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
adam = fluid.optimizer.Adam(learning_rate=0.01)
adam.minimize(loss)
......@@ -349,7 +349,7 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
beta1_init = 0.9
beta2_init = 0.999
epsilon_init = 1e-8
......
......@@ -215,7 +215,7 @@ class TestNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
adam = paddle.optimizer.AdamW(learning_rate=0.01, weight_decay=0.02)
adam.minimize(loss)
......
......@@ -105,7 +105,7 @@ class TestCosNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -139,7 +139,7 @@ class TestElementwiseDivNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -303,7 +303,7 @@ class TestElementwiseMaxNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -190,7 +190,7 @@ class TestElementwiseMinOpNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -314,7 +314,7 @@ class TestElementwisePowNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -195,7 +195,7 @@ class TestSubtractNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -134,7 +134,7 @@ class TestGatherGrad(unittest.TestCase):
a.stop_gradient = False
b = paddle.gather(a, index)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -113,7 +113,7 @@ class TestGeluNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -107,7 +107,7 @@ class TestLeakyReluNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -105,7 +105,7 @@ class TestLogNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -248,7 +248,7 @@ class TestMulNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......@@ -325,7 +325,7 @@ class TestMulNet3_2(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......@@ -405,7 +405,7 @@ class TestMulNet3_2_xc2(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......@@ -486,7 +486,7 @@ class TestMulNet4_2(unittest.TestCase):
prediction = fluid.layers.fc(input=result, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -105,7 +105,7 @@ class TestPowNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -113,7 +113,7 @@ class TestReduceSumNet(unittest.TestCase):
prediction = fluid.layers.fc(input=z_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -126,7 +126,7 @@ class TestRelu6Net(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -119,7 +119,7 @@ class TestReluNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -53,7 +53,7 @@ class TestNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
rmsprop = fluid.optimizer.RMSProp(learning_rate=0.01)
rmsprop.minimize(loss)
......@@ -116,7 +116,7 @@ class TestCenteredNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
rmsprop = fluid.optimizer.RMSProp(learning_rate=0.01, centered=True)
rmsprop.minimize(loss)
......
......@@ -78,7 +78,7 @@ class TestNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -125,7 +125,7 @@ class TestPowNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2)
cost = fluid.layers.softmax_with_cross_entropy(prediction, label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -108,7 +108,7 @@ class TestSqrtNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -105,7 +105,7 @@ class TestSquareNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -108,7 +108,7 @@ class TestTanhNet(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
......
......@@ -72,7 +72,7 @@ class SimpleNet(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss
......
......@@ -51,7 +51,7 @@ def squeeze_excitation(input, num_channels, reduction_ratio):
conv = input
shape = conv.shape
reshape = paddle.reshape(x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
pool = paddle.mean(x=reshape, axis=2)
squeeze = fluid.layers.fc(
input=pool, size=num_channels // reduction_ratio, act='relu'
......@@ -162,7 +162,7 @@ def SE_ResNeXt50Small(use_feed):
shape = conv.shape
reshape = paddle.reshape(x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
pool = fluid.layers.reduce_mean(input=reshape, dim=2)
pool = paddle.mean(x=reshape, axis=2)
dropout = (
pool
if remove_dropout
......
......@@ -614,7 +614,7 @@ class TestAdamOpV2(unittest.TestCase):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = fluid.layers.reduce_mean(conv)
loss = paddle.mean(conv)
beta1 = fluid.layers.create_global_var(
shape=[1], value=0.85, dtype='float32', persistable=True
......@@ -807,7 +807,7 @@ class TestAdamOptimizer(unittest.TestCase):
)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
beta1_init = 0.9
beta2_init = 0.999
epsilon_init = 1e-8
......@@ -965,7 +965,7 @@ class TestAdamOptimizer(unittest.TestCase):
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
loss = paddle.mean(cost)
adam = fluid.optimizer.Adam(use_global_beta_pow=True)
adam.minimize(loss)
self.assertRaises(Exception, adam._get_global_accumulator, 'tmp')
......
......@@ -12,11 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import numpy as np
import unittest
def infinite_reader():
......@@ -33,7 +32,7 @@ class TestDataLoaderEarlyReset(unittest.TestCase):
def build_network(self):
y = fluid.layers.fc(self.x, size=10)
loss = fluid.layers.reduce_mean(y)
loss = paddle.mean(y)
optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
optimizer.minimize(loss)
......
......@@ -17,6 +17,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
......@@ -48,7 +49,7 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
)
fc = fluid.layers.fc(input_data, size=10)
loss = fluid.layers.reduce_mean(fc)
loss = paddle.mean(fc)
loader.set_batch_generator(
create_reader(self.shape, self.batch_num),
......
......@@ -17,6 +17,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.reader import keep_data_loader_order
......@@ -54,7 +55,7 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
)
fc = fluid.layers.fc(input_data, size=10)
loss = fluid.layers.reduce_mean(fc)
loss = paddle.mean(fc)
loader.set_batch_generator(
create_reader(self.shape, self.batch_num),
......
......@@ -55,7 +55,7 @@ class SparseLoadOp(unittest.TestCase):
),
),
)
loss = fluid.layers.reduce_mean(fc1)
loss = paddle.mean(fc1)
return loss
def save_origin_model(self, emb_array, fc_array):
......
......@@ -52,7 +52,7 @@ class TestSparseLoadProgram(unittest.TestCase):
)
fc1 = fluid.layers.fc(input=emb, size=128, act="relu")
fc2 = fluid.layers.fc(input=fc1, size=64, act="relu")
loss = fluid.layers.reduce_mean(fc2)
loss = paddle.mean(fc2)
return scope, train_program, startup_program, loss
......
......@@ -437,7 +437,7 @@ class TestFakeInit(TranspilerTest):
paddle.sum(true_xent, axis=1),
paddle.sum(neg_xent, axis=1),
)
avg_cost = fluid.layers.reduce_mean(cost)
avg_cost = paddle.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
......
......@@ -57,7 +57,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
layers.assign(length_cond, cond)
out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0]
loss = layers.reduce_mean(out)
loss = paddle.mean(out)
opt = fluid.optimizer.Adam(0.01)
opt.minimize(loss)
exe = fluid.Executor(place)
......
......@@ -468,7 +468,7 @@ def lm_model(
)
loss = paddle.reshape(loss, shape=[-1, num_steps])
loss = layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
loss.persistable = True
......
......@@ -412,7 +412,7 @@ class TestDygraphGradientClip(unittest.TestCase):
[16, 5], min=-10, max=10
).astype('float32')
out = linear(fluid.dygraph.to_variable(inputs))
loss = fluid.layers.reduce_mean(out)
loss = paddle.mean(out)
loss.backward()
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.0,
......@@ -557,7 +557,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase):
).astype('float32')
with paddle.amp.auto_cast(level='O2'):
out = model(fluid.dygraph.to_variable(inputs))
loss = fluid.layers.reduce_mean(out)
loss = paddle.mean(out)
scaled = scaler.scale(loss)
scaled.backward()
scaler.unscale_(sgd_optimizer)
......@@ -605,7 +605,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase):
).astype('float32')
linear = paddle.nn.Linear(5, 5)
out = linear(fluid.dygraph.to_variable(inputs))
loss = fluid.layers.reduce_mean(out)
loss = paddle.mean(out)
loss.backward()
# before clip
params_grads = []
......
......@@ -321,7 +321,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
path_code=path_code,
)
avg_cost = fluid.layers.reduce_mean(cost)
avg_cost = paddle.mean(cost)
return avg_cost, data_list
......
......@@ -45,7 +45,7 @@ class AutoPruneLayer0(fluid.Layer):
a = self.linear1(x)
b = self.linear2(y)
c = fluid.layers.mul(a, b)
d = fluid.layers.reduce_mean(c)
d = paddle.mean(c)
return d
......@@ -74,7 +74,7 @@ class AutoPruneLayer1(fluid.Layer):
b = self.linear2(y)
b.stop_gradient = True
c = fluid.layers.mul(a, b)
d = fluid.layers.reduce_mean(c)
d = paddle.mean(c)
return d
......@@ -124,15 +124,15 @@ class MyLayer(fluid.Layer):
def forward(self, x):
# this method involves only the linear layers
loss = fluid.layers.reduce_mean(self.linear_0(x) + self.linear_1(x))
loss = paddle.mean(self.linear_0(x) + self.linear_1(x))
return loss
def linear0(self, x):
loss = fluid.layers.reduce_mean(self.linear_0(x))
loss = paddle.mean(self.linear_0(x))
return loss
def embed_linear0(self, x):
loss = fluid.layers.reduce_mean(self.linear_0(self.embed0(x)))
loss = paddle.mean(self.linear_0(self.embed0(x)))
return loss
......@@ -147,18 +147,18 @@ class MyLayer2(fluid.Layer):
def forward(self, indices):
# mind the difference with MyLayer
# In this example, the forward method involves all params
loss = fluid.layers.reduce_mean(
loss = paddle.mean(
self.linear_0(self.embed0(indices))
+ self.linear_1(self.embed1(indices))
)
return loss
def linear0(self, x):
loss = fluid.layers.reduce_mean(self.linear_0(x))
loss = paddle.mean(self.linear_0(x))
return loss
def embed_linear0(self, x):
loss = fluid.layers.reduce_mean(self.linear_0(self.embed0(x)))
loss = paddle.mean(self.linear_0(self.embed0(x)))
return loss
......
......@@ -50,7 +50,7 @@ class TestImperativeContainerParameterList(unittest.TestCase):
self.assertEqual(len(model.params), num_stacked_param)
res = model(x)
self.assertListEqual(res.shape, [5, 2])
loss = fluid.layers.reduce_mean(res)
loss = paddle.mean(res)
loss.backward()
model.params[num_stacked_param - 1] = fluid.layers.create_parameter(
......@@ -64,7 +64,7 @@ class TestImperativeContainerParameterList(unittest.TestCase):
self.assertEqual(len(model.params), num_stacked_param + 1)
res = model(x)
self.assertListEqual(res.shape, [5, 4])
loss = fluid.layers.reduce_mean(res)
loss = paddle.mean(res)
loss.backward()
def test_paramter_list(self):
......
......@@ -16,6 +16,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard
from paddle.nn import Linear
......@@ -32,7 +33,7 @@ class TestImperativeContainerSequential(unittest.TestCase):
model1[1] = Linear(1, 3)
res1 = model1(data)
self.assertListEqual(res1.shape, [5, 3])
loss1 = fluid.layers.reduce_mean(res1)
loss1 = paddle.mean(res1)
loss1.backward()
l1 = Linear(10, 1)
......@@ -53,7 +54,7 @@ class TestImperativeContainerSequential(unittest.TestCase):
res2 = model2(data)
self.assertListEqual(res2.shape, [5, 4])
loss2 = fluid.layers.reduce_mean(res2)
loss2 = paddle.mean(res2)
loss2.backward()
def test_sequential(self):
......@@ -71,7 +72,7 @@ class TestImperativeContainerSequential(unittest.TestCase):
model1[1] = Linear(1, 3)
res1 = model1(data)
self.assertListEqual(res1.shape, [5, 3])
loss1 = fluid.layers.reduce_mean(res1)
loss1 = paddle.mean(res1)
loss1.backward()
l1 = Linear(10, 1)
......@@ -92,7 +93,7 @@ class TestImperativeContainerSequential(unittest.TestCase):
res2 = model2(data)
self.assertListEqual(res2.shape, [5, 4])
loss2 = fluid.layers.reduce_mean(res2)
loss2 = paddle.mean(res2)
loss2.backward()
def test_sequential_list_params(self):
......
......@@ -322,7 +322,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y1 + y2
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y1, z, w
(dx_actual,) = self.grad(
......@@ -440,7 +440,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y + 1
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y, z, w
(dx_actual,) = self.grad([w_mean], [x], create_graph=True)
......@@ -454,7 +454,7 @@ class TestDygraphDoubleGrad(TestCase):
).astype('float32')
np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05)
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss = paddle.mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
......@@ -494,7 +494,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y1 + y2
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y1, z, w
(dx_actual,) = self.grad(
......@@ -517,7 +517,7 @@ class TestDygraphDoubleGrad(TestCase):
).astype('float32')
np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05)
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss = paddle.mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
......@@ -544,7 +544,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y + 1
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y, z, w
(dx_actual,) = self.grad([w_mean], [x], create_graph=False)
......@@ -558,7 +558,7 @@ class TestDygraphDoubleGrad(TestCase):
np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05)
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss = paddle.mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
......@@ -644,7 +644,7 @@ class TestRaiseNoDoubleGradOp(TestCase):
outputs=[y], inputs=[x], create_graph=True, retain_graph=True
)[0]
loss = fluid.layers.reduce_mean(dx)
loss = paddle.mean(dx)
loss.backward()
def test_raise(self):
......
......@@ -79,7 +79,7 @@ class TestDygraphGAN(unittest.TestCase):
)
d_real = discriminator(img)
d_loss_real = fluid.layers.reduce_mean(
d_loss_real = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_real,
label=fluid.layers.fill_constant(
......@@ -89,7 +89,7 @@ class TestDygraphGAN(unittest.TestCase):
)
d_fake = discriminator(generator(noise))
d_loss_fake = fluid.layers.reduce_mean(
d_loss_fake = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_fake,
label=fluid.layers.fill_constant(
......@@ -112,7 +112,7 @@ class TestDygraphGAN(unittest.TestCase):
)
d_fake = discriminator(generator(noise))
g_loss = fluid.layers.reduce_mean(
g_loss = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_fake,
label=fluid.layers.fill_constant(
......@@ -164,7 +164,7 @@ class TestDygraphGAN(unittest.TestCase):
)
d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
d_loss_real = fluid.layers.reduce_mean(
d_loss_real = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_real, label=to_variable(np.ones([2, 1], np.float32))
)
......@@ -173,7 +173,7 @@ class TestDygraphGAN(unittest.TestCase):
d_fake = discriminator(
generator(to_variable(np.ones([2, 2], np.float32)))
)
d_loss_fake = fluid.layers.reduce_mean(
d_loss_fake = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_fake,
label=to_variable(np.zeros([2, 1], np.float32)),
......@@ -189,7 +189,7 @@ class TestDygraphGAN(unittest.TestCase):
d_fake = discriminator(
generator(to_variable(np.ones([2, 2], np.float32)))
)
g_loss = fluid.layers.reduce_mean(
g_loss = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_fake, label=to_variable(np.ones([2, 1], np.float32))
)
......@@ -219,7 +219,7 @@ class TestDygraphGAN(unittest.TestCase):
)
d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
d_loss_real2 = fluid.layers.reduce_mean(
d_loss_real2 = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_real2,
label=to_variable(np.ones([2, 1], np.float32)),
......@@ -229,7 +229,7 @@ class TestDygraphGAN(unittest.TestCase):
d_fake2 = discriminator2(
generator2(to_variable(np.ones([2, 2], np.float32)))
)
d_loss_fake2 = fluid.layers.reduce_mean(
d_loss_fake2 = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_fake2,
label=to_variable(np.zeros([2, 1], np.float32)),
......@@ -245,7 +245,7 @@ class TestDygraphGAN(unittest.TestCase):
d_fake2 = discriminator2(
generator2(to_variable(np.ones([2, 2], np.float32)))
)
g_loss2 = fluid.layers.reduce_mean(
g_loss2 = paddle.mean(
paddle.nn.functional.binary_cross_entropy_with_logits(
logit=d_fake2,
label=to_variable(np.ones([2, 1], np.float32)),
......
......@@ -73,7 +73,7 @@ class SimpleNet(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss
......
......@@ -141,7 +141,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
img = paddle.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
avg_loss = paddle.mean(cost)
dy_out = avg_loss.numpy()
if batch_id == 0:
......@@ -180,7 +180,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = paddle.reshape(img, shape=[batch_size, 784])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
avg_loss = paddle.mean(cost)
optimizer.minimize(avg_loss)
# initialize params and fetch them
......@@ -478,7 +478,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
adam = fluid.optimizer.Adam(
0.001, parameter_list=linear.parameters()
......@@ -509,7 +509,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
......@@ -545,7 +545,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
base_lr = 1.0
adam = fluid.optimizer.Adam(
......@@ -584,7 +584,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())
......@@ -965,7 +965,7 @@ class TestImperativeOptimizerList(unittest.TestCase):
y = linear_1(in_data)
y = linear_2(y)
loss = fluid.layers.reduce_mean(y)
loss = paddle.mean(y)
loss.backward()
sgd.minimize(loss)
......
......@@ -139,7 +139,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
img = paddle.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
avg_loss = paddle.mean(cost)
dy_out = avg_loss.numpy()
if batch_id == 0:
......@@ -189,7 +189,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = paddle.reshape(img, shape=[batch_size, 784])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
avg_loss = paddle.mean(cost)
optimizer.minimize(avg_loss)
# initialize params and fetch them
......@@ -616,7 +616,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters())
......@@ -645,7 +645,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
bd = [2, 4, 6, 8]
value = [0.2, 0.4, 0.6, 0.8, 1.0]
......@@ -677,7 +677,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
a = fluid.dygraph.to_variable(a)
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
base_lr = 1.0
scheduler = paddle.optimizer.lr.NaturalExpDecay(1.0, gamma=0.5)
......@@ -709,7 +709,7 @@ class TestOptimizerLearningRate(unittest.TestCase):
b = linear(a)
loss = fluid.layers.reduce_mean(b)
loss = paddle.mean(b)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())
......@@ -1085,7 +1085,7 @@ class TestImperativeOptimizerList(unittest.TestCase):
y = linear_1(in_data)
y = linear_2(y)
loss = fluid.layers.reduce_mean(y)
loss = paddle.mean(y)
loss.backward()
sgd.minimize(loss)
......
......@@ -31,7 +31,7 @@ class TestImperativePartitialBackward(unittest.TestCase):
y = linear1(x[:, :2])
z = linear2(x[:, 2:])
loss = fluid.layers.reduce_mean(y)
loss = paddle.mean(y)
loss.backward()
for param in linear1.parameters():
......
......@@ -232,7 +232,7 @@ class PtbModel(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss, last_hidden, last_cell
......
......@@ -228,7 +228,7 @@ class PtbModel(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss, last_hidden, last_cell
......
......@@ -229,7 +229,7 @@ class PtbModel(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss, last_hidden, last_cell
......
......@@ -82,7 +82,7 @@ class SimpleNet(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss
......
......@@ -445,9 +445,7 @@ def get_generator_loss(
):
fake_img = generator(image_real, label_trg)
rec_img = generator(fake_img, label_org)
g_loss_rec = fluid.layers.reduce_mean(
paddle.abs(paddle.subtract(image_real, rec_img))
)
g_loss_rec = paddle.mean(paddle.abs(paddle.subtract(image_real, rec_img)))
pred_fake, cls_fake = discriminator(fake_img)
......
......@@ -72,7 +72,7 @@ class TestTracedLayerRecordNonPersistableInput(unittest.TestCase):
static_out = traced_layer([in_x])[0]
np.testing.assert_array_equal(dygraph_out_numpy, static_out)
loss = fluid.layers.reduce_mean(dygraph_out)
loss = paddle.mean(dygraph_out)
loss.backward()
optimizer.minimize(loss)
......
......@@ -151,7 +151,7 @@ class TestLearningRateDecayDygraph(unittest.TestCase):
for epoch in range(10):
out = linear(input)
loss = fluid.layers.reduce_mean(out)
loss = paddle.mean(out)
loss.backward()
adam1.minimize(loss)
adam2.minimize(loss)
......
......@@ -383,7 +383,7 @@ class TestMeanAPI(unittest.TestCase):
def test_fluid_api(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
x = fluid.data("x", shape=[10, 10], dtype="float32")
out = fluid.layers.reduce_mean(input=x, dim=1)
out = paddle.mean(x=x, axis=1)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
x_np = np.random.rand(10, 10).astype(np.float32)
......@@ -393,7 +393,7 @@ class TestMeanAPI(unittest.TestCase):
with fluid.dygraph.guard():
x_np = np.random.rand(10, 10).astype(np.float32)
x = fluid.dygraph.to_variable(x_np)
out = fluid.layers.reduce_mean(input=x, dim=1)
out = paddle.mean(x=x, axis=1)
np.testing.assert_allclose(
out.numpy(), np.mean(x_np, axis=1), rtol=1e-05
)
......
......@@ -16,6 +16,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
......@@ -29,7 +30,7 @@ class TestMemoryReuseExcludeFeedVar(unittest.TestCase):
name='image', shape=self.image_shape, dtype='float32'
)
relu_image = fluid.layers.relu(image)
loss = fluid.layers.reduce_mean(relu_image)
loss = paddle.mean(relu_image)
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True
......
......@@ -16,6 +16,7 @@ import sys
import time
import unittest
import paddle
import numpy as np
from test_multiprocess_dataloader_static import (
BATCH_SIZE,
......@@ -100,7 +101,7 @@ class TestDygraphDataLoader(unittest.TestCase):
for image, label in dataloader():
out = fc_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss = paddle.mean(loss)
avg_loss.backward()
optimizer.minimize(avg_loss)
fc_net.clear_gradients()
......@@ -170,7 +171,7 @@ class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader):
for image, label in dataloader():
out = fc_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss = paddle.mean(loss)
avg_loss.backward()
optimizer.minimize(avg_loss)
fc_net.clear_gradients()
......
......@@ -100,7 +100,7 @@ class TestDygraphDataLoader(unittest.TestCase):
for image, label in dataloader():
out = fc_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss = paddle.mean(loss)
avg_loss.backward()
optimizer.minimize(avg_loss)
fc_net.clear_gradients()
......@@ -168,7 +168,7 @@ class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader):
for image, label in dataloader():
out = fc_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss = paddle.mean(loss)
avg_loss.backward()
optimizer.minimize(avg_loss)
fc_net.clear_gradients()
......
......@@ -18,6 +18,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.io import DataLoader, IterableDataset
......@@ -78,7 +79,7 @@ def simple_fc_net_static():
param_attr=param_attr,
bias_attr=bias_attr,
)
loss = fluid.layers.reduce_mean(
loss = paddle.mean(
fluid.layers.cross_entropy(input=predict_label, label=label)
)
......
......@@ -79,7 +79,7 @@ def simple_fc_net_static():
param_attr=param_attr,
bias_attr=bias_attr,
)
loss = fluid.layers.reduce_mean(
loss = paddle.mean(
fluid.layers.cross_entropy(input=predict_label, label=label)
)
......
......@@ -75,7 +75,7 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.reduce_mean(x, dim=0)
y = paddle.mean(x, axis=0)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
......
......@@ -1169,7 +1169,7 @@ class TestRecomputeOptimizer(unittest.TestCase):
input=[drop_res], size=2, act='softmax'
)
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
sum_cost = paddle.mean(cost)
return drop_res, prediction, sum_cost
main_program = Program()
......@@ -1226,7 +1226,7 @@ class TestRecomputeOptimizerCUDA(unittest.TestCase):
input=[drop_res], size=2, act='softmax'
)
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
sum_cost = fluid.layers.reduce_mean(cost)
sum_cost = paddle.mean(cost)
return drop_res, prediction, sum_cost
main_program = Program()
......
......@@ -239,7 +239,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y + 1
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y, z, w
(dx_actual,) = self.grad([w_mean], [x], create_graph=True)
......@@ -256,7 +256,7 @@ class TestDygraphDoubleGrad(TestCase):
if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss = paddle.mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
......@@ -286,7 +286,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y1 + y2
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y1, z, w
(dx_actual,) = self.grad(
......@@ -308,7 +308,7 @@ class TestDygraphDoubleGrad(TestCase):
if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss = paddle.mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
......@@ -337,7 +337,7 @@ class TestDygraphDoubleGrad(TestCase):
z = y + 1
w = z * z
w_mean = fluid.layers.reduce_mean(w)
w_mean = paddle.mean(w)
del y, z, w
(dx_actual,) = self.grad([w_mean], [x], create_graph=False)
......@@ -354,7 +354,7 @@ class TestDygraphDoubleGrad(TestCase):
if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss = paddle.mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
......
......@@ -213,7 +213,7 @@ class TestSaveLoadAny(unittest.TestCase):
)
z = paddle.static.nn.fc(x, 10)
z = paddle.static.nn.fc(z, 10, bias_attr=False)
loss = fluid.layers.reduce_mean(z)
loss = paddle.mean(z)
opt = Adam(learning_rate=1e-3)
opt.minimize(loss)
place = paddle.CPUPlace()
......@@ -382,7 +382,7 @@ class TestSaveLoadAny(unittest.TestCase):
name="x", shape=[None, IMAGE_SIZE], dtype='float32'
)
z = paddle.static.nn.fc(x, 128)
loss = fluid.layers.reduce_mean(z)
loss = paddle.mean(z)
place = (
fluid.CPUPlace()
if not paddle.fluid.core.is_compiled_with_cuda()
......@@ -640,7 +640,7 @@ class TestSaveLoadAny(unittest.TestCase):
)
z = paddle.static.nn.fc(x, 10, bias_attr=False)
z = paddle.static.nn.fc(z, 128, bias_attr=False)
loss = fluid.layers.reduce_mean(z)
loss = paddle.mean(z)
place = (
fluid.CPUPlace()
if not paddle.fluid.core.is_compiled_with_cuda()
......@@ -915,7 +915,7 @@ class TestSaveLoadToMemory(unittest.TestCase):
)
z = paddle.static.nn.fc(x, 10, bias_attr=False)
z = paddle.static.nn.fc(z, 128, bias_attr=False)
loss = fluid.layers.reduce_mean(z)
loss = paddle.mean(z)
place = (
fluid.CPUPlace()
if not paddle.fluid.core.is_compiled_with_cuda()
......
......@@ -79,7 +79,7 @@ class TestSaveLoadBinaryFormat(unittest.TestCase):
)
z = paddle.static.nn.fc(x, 10, bias_attr=False)
z = paddle.static.nn.fc(z, 128, bias_attr=False)
loss = fluid.layers.reduce_mean(z)
loss = paddle.mean(z)
place = (
fluid.CPUPlace()
if not paddle.fluid.core.is_compiled_with_cuda()
......
......@@ -31,7 +31,7 @@ class TestParallelExecutorFetchIsolatedVarBase(unittest.TestCase):
x = fluid.data(name='x', shape=[-1, 10], dtype='float32')
y = fluid.data(name='y', shape=[-1, 10], dtype='float32')
fc = fluid.layers.fc(x, size=30, bias_attr=False)
loss = fluid.layers.reduce_mean(fc)
loss = paddle.mean(fc)
if is_training:
adam = fluid.optimizer.Adam(learning_rate=1e-3)
adam.minimize(loss)
......
......@@ -16,6 +16,7 @@ import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
......@@ -183,7 +184,7 @@ class TestInferencePartialFeedUsingDataLoader(unittest.TestCase):
feed_list=[x], capacity=16, iterable=iterable, drop_last=drop_last
)
y = fluid.layers.fc(x, size=10)
loss = fluid.layers.reduce_mean(y)
loss = paddle.mean(y)
exe = fluid.Executor(places[0])
exe.run(fluid.default_startup_program())
......
......@@ -640,7 +640,7 @@ def def_seq2seq_model(
target_length, maxlen=max_tar_seq_len, dtype="float32"
)
loss = loss * tar_mask
loss = layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
# optimizer
......
......@@ -319,7 +319,7 @@ class PolicyGradient:
cost = (
(paddle.sum(cost) / paddle.sum(length))
if length is not None
else layers.reduce_mean(cost)
else paddle.mean(cost)
)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
......@@ -405,7 +405,7 @@ class MLE:
max_seq_len = layers.shape(probs)[1]
mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
loss = loss * mask
loss = layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(loss)
......
......@@ -241,7 +241,7 @@ class PtbModel(fluid.Layer):
logits=projection, label=label, soft_label=False
)
loss = paddle.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = paddle.mean(loss, axis=[0])
loss = paddle.sum(loss)
return loss, last_hidden, last_cell
......
......@@ -223,7 +223,7 @@ class TestTracedLayerErrMsg(unittest.TestCase):
).astype('float32')
)
dygraph_out = layer(in_x)
loss = fluid.layers.reduce_mean(dygraph_out)
loss = paddle.mean(dygraph_out)
loss.backward()
optimizer.minimize(loss)
return layer
......
......@@ -522,24 +522,16 @@ class MSELoss(Layer):
r"""
**Mean Square Error Loss**
Computes the mean square error (squared L2 norm) of given input and label.
If :attr:`reduction` is set to ``'none'``, loss is calculated as:
.. math::
Out = (input - label)^2
If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
.. math::
Out = \operatorname{mean}((input - label)^2)
If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
.. math::
Out = \operatorname{sum}((input - label)^2)
where `input` and `label` are `float32` tensors of same shape.
Parameters:
reduction (string, optional): The reduction method for the output,
could be 'none' | 'mean' | 'sum'.
......@@ -547,17 +539,13 @@ class MSELoss(Layer):
If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
Shape:
input (Tensor): Input tensor, the data type is float32 or float64
label (Tensor): Label tensor, the data type is float32 or float64
output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
Examples:
.. code-block:: python
import paddle
mse_loss = paddle.nn.loss.MSELoss()
input = paddle.to_tensor([1.5])
label = paddle.to_tensor([1.7])
......@@ -596,7 +584,7 @@ class MSELoss(Layer):
square_out = paddle.sum(square_out)
return square_out
return getattr(fluid.layers, reduce_op)(square_out)
return paddle.mean(square_out)
class L1Loss(Layer):
......
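
For reference, a short sketch of the three reduction modes described in the MSELoss docstring above (input values are illustrative, not taken from the diff; printed results are approximate):

import paddle

input = paddle.to_tensor([1.5, 2.5])
label = paddle.to_tensor([1.7, 2.0])
paddle.nn.MSELoss(reduction='none')(input, label)  # ~[0.04, 0.25], elementwise (input - label)^2
paddle.nn.MSELoss(reduction='mean')(input, label)  # ~0.145, mean of the squared errors
paddle.nn.MSELoss(reduction='sum')(input, label)   # ~0.29, sum of the squared errors
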