From 42c8d51a94c4149e72e162b8749637cc5218d270 Mon Sep 17 00:00:00 2001
From: HongyuJia
Date: Mon, 14 Nov 2022 17:22:50 +0800
Subject: [PATCH] clean fluid elementwise_max (#47866)

---
 .../dygraph_optimizer/hybrid_parallel_optimizer.py        | 2 +-
 .../fleet/meta_optimizers/localsgd_optimizer.py           | 2 +-
 .../fleet/meta_parallel/sharding/group_sharded_utils.py   | 2 +-
 .../fleet/meta_parallel/sharding/sharding_utils.py        | 2 +-
 python/paddle/fluid/clip.py                               | 8 +++-----
 python/paddle/fluid/dygraph/learning_rate_scheduler.py    | 5 ++---
 .../paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py | 2 +-
 .../unittests/dygraph_to_static/simnet_dygraph_model.py   | 4 +++-
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps11.py  | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps12.py  | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps13.py  | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps2.py   | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps3.py   | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps4.py   | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps5.py   | 2 +-
 .../paddle/fluid/tests/unittests/test_dist_fleet_ps6.py   | 2 +-
 python/paddle/fluid/tests/unittests/test_layers.py        | 6 ++----
 .../paddle/incubate/distributed/models/moe/grad_clip.py   | 4 +++-
 19 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index 33922b7f35..bd05cbe879 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -160,7 +160,7 @@ class HybridParallelClipGrad:
         )
         clip_var = layers.elementwise_div(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var_fp32, y=max_global_norm),
+            y=paddle.maximum(x=global_norm_var_fp32, y=max_global_norm),
         )
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
         for p, g in params_grads:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index b3c4231b36..1cd0b23488 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -468,7 +468,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
             next_local_steps = layers.elementwise_min(
                 next_local_steps, max_local_steps
             )
-            next_local_steps = layers.elementwise_max(
+            next_local_steps = paddle.maximum(
                 next_local_steps, min_local_steps
             )
             layers.assign(next_local_steps, k_steps)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 6832e9a7ca..2976ef88e5 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -141,7 +141,7 @@ class GroupShardedClipGrad:
 
         clip_var = layers.elementwise_div(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
 
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 07cf159c3e..4cee382339 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -138,7 +138,7 @@ class ShardingClipGrad:
 
         clip_var = layers.elementwise_div(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
         clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
 
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 68a2f8a0de..19c8629fa9 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -550,7 +550,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
             need_clip = True
             clip_var = layers.elementwise_div(
                 x=max_global_norm,
-                y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
@@ -654,9 +654,7 @@
             )
             scale_var = layers.elementwise_div(
                 x=max_global_norm,
-                y=layers.elementwise_max(
-                    x=max_global_norm, y=global_norm_var
-                ),
+                y=paddle.maximum(x=max_global_norm, y=global_norm_var),
             )
             param_new_grad_name_dict = dict()
             for p, g in params_grads:
@@ -733,7 +731,7 @@
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
-                y=layers.elementwise_max(x=clip_var, y=group_norm_var),
+                y=paddle.maximum(x=clip_var, y=group_norm_var),
             )
             assert group_scale_var.shape == (1,)
             self.context[group_scale_name] = group_scale_var
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index 3afe92cbc6..0204542d6e 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -15,6 +15,7 @@
 import math
 import warnings
 
+import paddle
 from .. import unique_name
 from ..framework import Variable
 from ..data_feeder import check_type
@@ -977,11 +978,9 @@ class ReduceLROnPlateau(LearningRateDecay):
                 self.num_bad_epochs += 1
 
             if self.num_bad_epochs > self.patience:
-                from .. import layers
-
                 self.cooldown_counter = self.cooldown
                 self.num_bad_epochs = 0
-                new_lr = layers.elementwise_max(
+                new_lr = paddle.maximum(
                     self.learning_rate * self.decay_rate, self.min_lr
                 )
                 if self.learning_rate - new_lr > self.eps:
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index f9d926ad1c..cc6371cc7c 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -74,7 +74,7 @@ def get_loss(cos_q_pt, cos_q_nt):
         cos_q_pt,
     )
     loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-    loss_op3 = fluid.layers.elementwise_max(
+    loss_op3 = paddle.maximum(
         fluid.layers.fill_constant_batch_size_like(
             input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
         ),
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
index 0bb0840514..facc72faf8 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
+
 import paddle.fluid as fluid
 import paddle.fluid.param_attr as attr
 
@@ -151,7 +153,7 @@ class ElementwiseMaxLayer:
         """
         operation
         """
-        max = fluid.layers.elementwise_max(x, y)
+        max = paddle.maximum(x, y)
         return max
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
index 8d5ac58d62..42a96cc66f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
index b0d8df316a..4fc0e2eb5a 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
index b4c10116a5..f38dbf9e56 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
index 47cbaefd68..1c8c3b4f87 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py
@@ -60,7 +60,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index a4cdcb32bd..3355778182 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
index 2eb6277018..abd0ff1c85 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index 0cb4ee6e3a..cfc806d372 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index fbb640fc8b..f88ca8fcb1 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
            )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
index e0b73b4344..6cfae26323 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
                 cos_q_pt,
             )
             loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
-            loss_op3 = fluid.layers.elementwise_max(
+            loss_op3 = paddle.maximum(
                 fluid.layers.fill_constant_batch_size_like(
                     input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
                 ),
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index ba29e7430a..9f9e98bfca 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -652,14 +652,12 @@ class TestLayer(LayerTest):
             min_eager_ret = layers.elementwise_min(
                 to_variable(n), to_variable(n2)
             )
-            max_eager_ret = layers.elementwise_max(
-                to_variable(n), to_variable(n2)
-            )
+            max_eager_ret = paddle.maximum(to_variable(n), to_variable(n2))
             min_eager_ret_value = min_eager_ret.numpy()
             max_eager_ret_value = max_eager_ret.numpy()
 
             min_ret = layers.elementwise_min(to_variable(n), to_variable(n2))
-            max_ret = layers.elementwise_max(to_variable(n), to_variable(n2))
+            max_ret = paddle.maximum(to_variable(n), to_variable(n2))
             min_ret_value = min_ret.numpy()
             max_ret_value = max_ret.numpy()
 
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index 9a3a0dfc0f..17372ea4f1 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
+
 import paddle.distributed as dist
 from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
 from paddle.fluid.dygraph import base as imperative_base
@@ -213,7 +215,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
             )
             clip_var = layers.elementwise_div(
                 x=max_global_norm,
-                y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
+                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
             for p, g in params_grads:
                 if g is None:
--
GitLab
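
Note on the migration: every hunk above performs the same mechanical
substitution, replacing the legacy fluid.layers.elementwise_max call with
paddle.maximum from the Paddle 2.x API. A minimal sketch of the change,
assuming Paddle 2.x is installed; the tensor values below are illustrative,
not taken from the patch:

    import paddle

    x = paddle.to_tensor([1.0, 5.0, 3.0])
    y = paddle.to_tensor([4.0, 2.0, 6.0])

    # Legacy fluid 1.x API, cleaned up by this patch:
    #   out = fluid.layers.elementwise_max(x, y)
    # Paddle 2.x replacement: elementwise maximum with broadcasting.
    out = paddle.maximum(x, y)
    print(out)  # values: [4., 5., 6.]

paddle.maximum accepts the same positional or keyword (x=..., y=...)
arguments used at these call sites, which is why each hunk only needs to
swap the function name.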