diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index 33922b7f35d9c470fe15e7b5ddf9b7d1908aa99c..bd05cbe8797184ca37b3afb7c401ebde2e87ba33 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -160,7 +160,7 @@ class HybridParallelClipGrad:
         )
         clip_var = layers.elementwise_div(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var_fp32, y=max_global_norm),
+            y=paddle.maximum(x=global_norm_var_fp32, y=max_global_norm),
         )
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
         for p, g in params_grads:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index b3c4231b36f3e84b61e606af65a899c9b34d1b10..1cd0b23488ed7e292e3edbb9a10eb5a99316ecdc 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -468,7 +468,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
             next_local_steps = layers.elementwise_min(
                 next_local_steps, max_local_steps
             )
-            next_local_steps = layers.elementwise_max(
+            next_local_steps = paddle.maximum(
                 next_local_steps, min_local_steps
             )
             layers.assign(next_local_steps, k_steps)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 6832e9a7caa2125852cbdf0a56a6d64e5e294982..2976ef88e5983466e8a187671421d0f2745e357e 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -141,7 +141,7 @@ class GroupShardedClipGrad:
 
         clip_var = layers.elementwise_div(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
 
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 07cf159c3e66fd2ce8f4d4aae45bcc9c4e543433..4cee382339538cb9f748bf200ee30c8f8d46e5ad 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -138,7 +138,7 @@ class ShardingClipGrad:
 
         clip_var = layers.elementwise_div(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
 
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 68a2f8a0deea36274a1ab2db2796cdcb122fe706..19c8629fa9ad8cef8b254da62584044a815ce59c 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -550,7 +550,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
             need_clip = True
             clip_var = layers.elementwise_div(
                 x=max_global_norm,
-                y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
@@ -654,9 +654,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
             )
             scale_var = layers.elementwise_div(
                 x=max_global_norm,
-                y=layers.elementwise_max(
-                    x=max_global_norm, y=global_norm_var
-                ),
+                y=paddle.maximum(x=max_global_norm, y=global_norm_var),
             )
             param_new_grad_name_dict = dict()
             for p, g in params_grads:
@@ -733,7 +731,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
-                y=layers.elementwise_max(x=clip_var, y=group_norm_var),
+                y=paddle.maximum(x=clip_var, y=group_norm_var),
             )
             assert group_scale_var.shape == (1,)
             self.context[group_scale_name] = group_scale_var
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index 3afe92cbc62342aa9e34d96fdcb53e22afbc413b..0204542d6ec2beb456357dbaf6206b4bceb62b94 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -15,6 +15,7 @@
 import math
 import warnings
 
+import paddle
 from .. import unique_name
 from ..framework import Variable
 from ..data_feeder import check_type
@@ -977,11 +978,9 @@ class ReduceLROnPlateau(LearningRateDecay):
             self.num_bad_epochs += 1
 
         if self.num_bad_epochs > self.patience:
-            from .. import layers
-
             self.cooldown_counter = self.cooldown
             self.num_bad_epochs = 0
-            new_lr = layers.elementwise_max(
+            new_lr = paddle.maximum(
                 self.learning_rate * self.decay_rate, self.min_lr
             )
             if self.learning_rate - new_lr > self.eps:
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index f9d926ad1c73356b846005076abb61c98b8eb00e..cc6371cc7cc676f6cca306597b791452ae4b2cea 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -74,7 +74,7 @@ def get_loss(cos_q_pt, cos_q_nt):
         cos_q_pt,
     )
     loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-    loss_op3 = fluid.layers.elementwise_max(
+    loss_op3 = paddle.maximum(
         fluid.layers.fill_constant_batch_size_like(
             input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
         ),
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
index 0bb08405141c3fc5b328bf31d828b03e23d4f877..facc72faf873694681fcdb47160f699877711edb 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
+
 import paddle.fluid as fluid
 import paddle.fluid.param_attr as attr
 
@@ -151,7 +153,7 @@ class ElementwiseMaxLayer:
         """
         operation
         """
-        max = fluid.layers.elementwise_max(x, y)
+        max = paddle.maximum(x, y)
         return max
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
index 8d5ac58d62aaa1d7d38a79bbe2d5d249a3ac5099..42a96cc66f41d30c76e39155fdf0833d8bdde28f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
index b0d8df316a8ab1216e99e85c0983783c8565a0c4..4fc0e2eb5a0c3b2db3510171464905764b5f157b 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
index b4c10116a55c3f3d05aea497296ef97fb81e33ac..f38dbf9e563347c334326f519618cd6f2f1211b0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
index 47cbaefd68d7b98e4f071d2a93ea5b56226207c7..1c8c3b4f879e0509ee83ff801dff0466290fc1b5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
@@ -60,7 +60,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index a4cdcb32bd4120f0f82b7c394838db711e7dbd7c..335577818259cb7cc2d36f8f8808231e306aac16 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
index 2eb62770184415dc40d2f709a2928269298f5c57..abd0ff1c858c12fb9087544e8ea087139fc489be 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index 0cb4ee6e3af2f6ea19976a5d4602b453da30a282..cfc806d372b0115665ce4c24e337c94f7fa1eceb 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index fbb640fc8bbc84d8845fcab8ac0a4b1030a2787d..f88ca8fcb1de32dcef87fa9672725246c0e59d5f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
index e0b73b4344c68431b1a8466670df680ba6d956a2..6cfae26323e5cd2139d40d5e510831adf0cac157 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index ba29e7430a942d87646e4ec18a099367b9b84c25..9f9e98bfca1c714be0154efe88788fe7f7a77813 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -652,14 +652,12 @@ class TestLayer(LayerTest):
             min_eager_ret = layers.elementwise_min(
                 to_variable(n), to_variable(n2)
             )
-            max_eager_ret = layers.elementwise_max(
-                to_variable(n), to_variable(n2)
-            )
+            max_eager_ret = paddle.maximum(to_variable(n), to_variable(n2))
             min_eager_ret_value = min_eager_ret.numpy()
             max_eager_ret_value = max_eager_ret.numpy()
 
             min_ret = layers.elementwise_min(to_variable(n), to_variable(n2))
-            max_ret = layers.elementwise_max(to_variable(n), to_variable(n2))
+            max_ret = paddle.maximum(to_variable(n), to_variable(n2))
             min_ret_value = min_ret.numpy()
             max_ret_value = max_ret.numpy()
 
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index 9a3a0dfc0f70761509b35427902a6ee070e436b0..17372ea4f175da586a341d435e600b109ae0f0e8 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
+
 import paddle.distributed as dist
 from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
 from paddle.fluid.dygraph import base as imperative_base
@@ -213,7 +215,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
             )
             clip_var = layers.elementwise_div(
                 x=max_global_norm,
-                y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
             for p, g in params_grads:
                 if g is None:
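
Note: every hunk above is the same one-for-one substitution — the deprecated fluid.layers.elementwise_max(x, y) becomes its Paddle 2.x equivalent paddle.maximum(x, y), with an `import paddle` added where the module did not already have one. The sketch below illustrates the gradient-clipping pattern that most of these hunks touch. It is a minimal standalone example, not code from the patch: the tensor values are made up, and paddle.divide stands in for the layers.elementwise_div call that the diff leaves in place.

import paddle

# Illustrative values: a global gradient norm above the clip threshold.
global_norm_var = paddle.to_tensor([7.5], dtype='float32')
max_global_norm = paddle.to_tensor([5.0], dtype='float32')

# clip_var = max_global_norm / max(global_norm, max_global_norm) is the
# scale later applied to every gradient; paddle.maximum broadcasts
# element-wise, matching the behavior of the old elementwise_max op.
clip_var = paddle.divide(
    max_global_norm,
    paddle.maximum(global_norm_var, max_global_norm),
)
print(clip_var.numpy())  # [0.6666667] -> gradients scaled down to the threshold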