From 048e0c558a9bc116b6cad98f1645fef3515419e8 Mon Sep 17 00:00:00 2001
From: HongyuJia
Date: Tue, 29 Nov 2022 20:38:45 +0800
Subject: [PATCH] clean elem_arithmetic not test.py (#48460)

---
 .../sharding/group_sharded_utils.py           |  2 +-
 .../meta_parallel/sharding/sharding_utils.py  |  2 +-
 python/paddle/distribution/normal.py          | 15 +++----
 python/paddle/distribution/uniform.py         | 11 ++----
 python/paddle/fluid/clip.py                   | 12 +++---
 .../extend_optimizer_with_weight_decay.py     |  4 +-
 .../paddle/fluid/contrib/layers/rnn_impl.py   | 18 ++++-----
 python/paddle/fluid/dygraph/rnn.py            | 39 +++++++++----------
 python/paddle/fluid/layer_helper_base.py      |  4 +-
 python/paddle/fluid/layers/rnn.py             |  7 ++--
 python/paddle/fluid/nets.py                   |  2 +-
 python/paddle/fluid/optimizer.py              |  8 ++--
 .../distributed/models/moe/grad_clip.py       |  4 +-
 13 files changed, 55 insertions(+), 73 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 39d88fef67d..c12381c894e 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )

-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 89978cceff7..22f2eb8f1b8 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -135,7 +135,7 @@ class ShardingClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )

-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py
index 2d4b0bed980..f28b92ec86b 100644
--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
 from paddle.fluid.framework import _non_static_mode
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
                 zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             )
             output = normal_random_tmp * (zero_tmp_reshape + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             return output
         else:
             output_shape = shape + batch_shape
             output = nn.gaussian_random(
                 output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
         zero_tmp = tensor.fill_constant_batch_size_like(
             self.loc + self.scale, batch_shape, self.dtype, 0.0
         )
-        return elementwise_add(
+        return paddle.add(
             0.5 + zero_tmp,
             0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
             name=name,
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return elementwise_sub(
+        return paddle.subtract(
             -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
             log_scale + math.log(math.sqrt(2.0 * math.pi)),
             name=name,
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
         value = self._check_values_dtype_in_probs(self.loc, value)
         var = self.scale * self.scale
-        return elementwise_div(
+        return paddle.divide(
             paddle.exp(
                 -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
             ),
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
         var_ratio = var_ratio * var_ratio
         t1 = (self.loc - other.loc) / other.scale
         t1 = t1 * t1
-        return elementwise_add(
+        return paddle.add(
             0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
         )
diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py
index 9571cdb08c2..f242dc3db0d 100644
--- a/python/paddle/distribution/uniform.py
+++ b/python/paddle/distribution/uniform.py
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
     in_dygraph_mode,
 )
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
             output = uniform_random_tmp_reshape * (
                 zero_tmp_reshape + self.high - self.low
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             return output
         else:
             output_shape = shape + batch_shape
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
                 tensor.zeros(output_shape, dtype=self.dtype)
                 + (self.high - self.low)
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
             ub_bool = value < self.high
             lb = tensor.cast(lb_bool, dtype=value.dtype)
             ub = tensor.cast(ub_bool, dtype=value.dtype)
-            return elementwise_sub(
+            return paddle.subtract(
                 nn.log(lb * ub), nn.log(self.high - self.low), name=name
             )
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
             ub_bool = value < self.high
             lb = tensor.cast(lb_bool, dtype=value.dtype)
             ub = tensor.cast(ub_bool, dtype=value.dtype)
-            return elementwise_div((lb * ub), (self.high - self.low), name=name)
+            return paddle.divide((lb * ub), (self.high - self.low), name=name)

     def entropy(self):
         r"""Shannon entropy in nats.
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 7162313ddae..525c3360f5e 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
         need_clip = False
         if not self.auto_skip_clip:  # always apply clip
             need_clip = True
-            clip_var = layers.elementwise_div(
+            clip_var = paddle.divide(
                 x=max_global_norm,
                 y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
             need_clip = True
-            clip_var = layers.elementwise_div(
-                x=max_global_norm, y=global_norm_var
-            )
+            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

         for p, g in params_grads:
             if g is None:
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                     if clip_var.dtype != g.dtype
                     else clip_var
                 )
-                new_grad = layers.elementwise_mul(g, clip_input)
+                new_grad = paddle.multiply(g, clip_input)
                 params_and_grads.append((p, new_grad))
             else:
                 params_and_grads.append((p, g))
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        scale_var = layers.elementwise_div(
+        scale_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=max_global_norm, y=global_norm_var),
         )
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         group_norm_var = layers.sums(input=self.context[self.group_name])
         group_norm_var = paddle.sqrt(x=group_norm_var)
         clip_var = self.context[self.group_name + "_clip"]
-        group_scale_var = layers.elementwise_div(
+        group_scale_var = paddle.divide(
             x=clip_var,
             y=paddle.maximum(x=clip_var, y=group_norm_var),
         )
diff --git a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
index 53a010c23ce..3a40c5ac80a 100644
--- a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
+++ b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
             with param.block.program._optimized_guard(
                 [param, grad]
             ), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param
-                )
+                updated_param = paddle.subtract(x=param, y=scaled_param)
                 paddle.fluid.layers.assign(input=updated_param, output=param)

         optimize_ops = self.apply_optimize(
diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py
index df6a38852ff..5f9a4d2827f 100644
--- a/python/paddle/fluid/contrib/layers/rnn_impl.py
+++ b/python/paddle/fluid/contrib/layers/rnn_impl.py
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):

         gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)

-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = paddle.add(gate_input, self._gate_bias)

         gate_input = self._gate_activation(gate_input)
         r, u = layers.split(gate_input, num_or_sections=2, dim=1)
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
         candidate = layers.matmul(
             layers.concat([input, r_hidden], 1), self._candidate_weight
         )
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        candidate = paddle.add(candidate, self._candidate_bias)

         c = self._activation(candidate)
         new_hidden = u * pre_hidden + (1 - u) * c
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
         gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)

-        gate_input = layers.elementwise_add(gate_input, self._bias)
+        gate_input = paddle.add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
+        new_cell = paddle.add(
+            paddle.multiply(
                 pre_cell,
-                paddle.nn.functional.sigmoid(
-                    layers.elementwise_add(f, self._forget_bias)
-                ),
-            ),
-            layers.elementwise_mul(
-                paddle.nn.functional.sigmoid(i), paddle.tanh(j)
+                paddle.nn.functional.sigmoid(paddle.add(f, self._forget_bias)),
             ),
+            paddle.multiply(paddle.nn.functional.sigmoid(i), paddle.tanh(j)),
         )

         new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)
diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py
index fa88dc44bbd..986d1c562b4 100644
--- a/python/paddle/fluid/dygraph/rnn.py
+++ b/python/paddle/fluid/dygraph/rnn.py
@@ -18,7 +18,6 @@ from ..layers import (
     concat,
     fill_constant,
     matmul,
-    elementwise_add,
     elementwise_mul,
     split,
 )
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=4, dim=1)
             chunked_hgates = split(hgates, num_or_sections=4, dim=1)

-            ingate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            ingate = paddle.add(chunked_igates[0], chunked_hgates[0])
             ingate = self._gate_activation(ingate)

-            forgetgate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            forgetgate = paddle.add(chunked_igates[1], chunked_hgates[1])
             forgetgate = self._gate_activation(forgetgate)

-            cellgate = elementwise_add(chunked_igates[2], chunked_hgates[2])
+            cellgate = paddle.add(chunked_igates[2], chunked_hgates[2])
             cellgate = self._activation(cellgate)

-            outgate = elementwise_add(chunked_igates[3], chunked_hgates[3])
+            outgate = paddle.add(chunked_igates[3], chunked_hgates[3])
             outgate = self._gate_activation(outgate)

             new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
             concat_input_hidden = concat([input, pre_hidden], 1)
             gate_input = matmul(x=concat_input_hidden, y=self._weight)

-            gate_input = elementwise_add(gate_input, self._bias)
+            gate_input = paddle.add(gate_input, self._bias)
             i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = elementwise_add(
-                elementwise_mul(
+            new_cell = paddle.add(
+                paddle.multiply(
                     pre_cell,
-                    self._gate_activation(
-                        elementwise_add(f, self._forget_bias)
-                    ),
+                    self._gate_activation(paddle.add(f, self._forget_bias)),
                 ),
-                elementwise_mul(
+                paddle.multiply(
                     paddle.nn.functional.sigmoid(i), paddle.tanh(j)
                 ),
             )
@@ -466,21 +463,21 @@ class GRUCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=3, dim=1)
             chunked_hgates = split(hgates, num_or_sections=3, dim=1)

-            reset_gate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            reset_gate = paddle.add(chunked_igates[0], chunked_hgates[0])
             reset_gate = self._gate_activation(reset_gate)

-            input_gate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            input_gate = paddle.add(chunked_igates[1], chunked_hgates[1])
             input_gate = self._gate_activation(input_gate)

             _temp = reset_gate * chunked_hgates[2]
-            new_gate = elementwise_add(chunked_igates[2], _temp)
+            new_gate = paddle.add(chunked_igates[2], _temp)
             new_gate = self._activation(new_gate)

             new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
@@ -491,7 +488,7 @@ class GRUCell(Layer):

             gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)

-            gate_input = elementwise_add(gate_input, self._gate_bias)
+            gate_input = paddle.add(gate_input, self._gate_bias)

             gate_input = self._gate_activation(gate_input)
             r, u = split(gate_input, num_or_sections=2, dim=1)
@@ -500,7 +497,7 @@ class GRUCell(Layer):
             candidate = matmul(
                 concat([input, r_hidden], 1), self._candidate_weight
             )
-            candidate = elementwise_add(candidate, self._candidate_bias)
+            candidate = paddle.add(candidate, self._candidate_bias)

             c = self._activation(candidate)
             new_hidden = u * pre_hidden + (1 - u) * c
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 39eb4a09474..ebdc1e60ab0 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -115,7 +115,7 @@ class LayerHelperBase:
         )

     def _create_weight_normalize(self, attr, shape, dtype):
-        from .layers import elementwise_mul, elementwise_div
+        from .layers import elementwise_mul

         # Remove these ops when LayerHelper and layers support indicating
         # program and block.
@@ -266,7 +266,7 @@ class LayerHelperBase:
             norm = __norm_except_dim(
                 v, dim=dim, block=self.main_program.current_block()
             )
-            scale = elementwise_div(
+            scale = paddle.divide(
                 x=g, y=norm
             )  # The shapes of g and norm are the same.
             # Currently, elementwise_mul only support broadcast when the shape
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index a3bfd80de6d..c49d6d4d528 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
         )
         # TODO: use where_op
         finished = tensor.cast(finished, dtype=probs.dtype)
-        probs = nn.elementwise_mul(
+        probs = paddle.multiply(
             paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]),
             self.noend_mask_tensor,
-            axis=-1,
         ) - nn.elementwise_mul(probs, (finished - 1), axis=0)
         return probs
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
         # To confirm states.finished/finished be consistent with
         # next_finished.
         tensor.assign(next_finished, finished)
-        next_sequence_lengths = nn.elementwise_add(
+        next_sequence_lengths = paddle.add(
             sequence_lengths,
             tensor.cast(
                 paddle.logical_not(finished), sequence_lengths.dtype
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
             # Otherwise, perform logical OR which would not change the already
             # finished.
             next_finished = paddle.logical_or(next_finished, global_finished)
-            next_sequence_lengths = nn.elementwise_add(
+            next_sequence_lengths = paddle.add(
                 sequence_lengths,
                 tensor.cast(
                     paddle.logical_not(global_finished),
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 5cd8380eba5..3d4f187e18f 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
     )
     a, b = layers.split(input, num_or_sections=2, dim=dim)
     act_b = paddle.nn.functional.sigmoid(x=b)
-    out = layers.elementwise_mul(x=a, y=act_b)
+    out = paddle.multiply(x=a, y=act_b)
     return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 84c3f226ca9..c7a817e1d75 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
         for param_name in params:
             fast_var = main_block.var(param_name)
             slow_var = param_to_slow[param_name]
-            tmp_var = layers.elementwise_add(
-                layers.elementwise_mul(fast_var, alpha),
-                layers.elementwise_mul(
-                    slow_var, layers.elementwise_sub(one_var, alpha)
+            tmp_var = paddle.add(
+                paddle.multiply(fast_var, alpha),
+                paddle.multiply(
+                    slow_var, paddle.subtract(one_var, alpha)
                 ),
             )
             layers.assign(input=tmp_var, output=slow_var)
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index c3faa7bd202..ca4922700b8 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                 if g.dtype == core.VarDesc.VarType.FP16
                 else clip_var
             )
-            new_grad = layers.elementwise_mul(x=g, y=clip_input)
+            new_grad = paddle.multiply(x=g, y=clip_input)
             params_and_grads.append((p, new_grad))

         return params_and_grads
--
GitLab
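
Reviewer note (not part of the patch): every hunk above applies the same mechanical substitution, replacing the deprecated paddle.fluid.layers.elementwise_* ops with their public Paddle 2.x equivalents on unchanged operands. A minimal sketch of the mapping, assuming Paddle 2.x in dynamic-graph mode; the tensors below are hypothetical examples, not values taken from the patch:

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    y = paddle.to_tensor([4.0, 5.0, 6.0])

    # legacy fluid API                 replacement used in this patch
    # layers.elementwise_add(x, y) ->  paddle.add(x, y)
    # layers.elementwise_sub(x, y) ->  paddle.subtract(x, y)
    # layers.elementwise_mul(x, y) ->  paddle.multiply(x, y)
    # layers.elementwise_div(x, y) ->  paddle.divide(x, y)
    out = paddle.add(x, paddle.multiply(y, paddle.divide(x, y)))  # elementwise throughout

One behavioral detail visible in the BeamSearchDecoder hunk: the legacy ops accepted an axis argument for dimension-aligned broadcasting, while the paddle.* ops rely on standard NumPy-style broadcasting, which is presumably why the axis=-1 argument is simply dropped there.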