clean elem_arithmetic not test.py (#48460)

048e0c55 · HongyuJia · GitHub · 41f15537 · 048e0c55 · 048e0c55
13 changed files
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
        )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
        )

--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -135,7 +135,7 @@ class ShardingClipGrad:
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
        )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
        )

--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
 from paddle.fluid.framework import _non_static_mode
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
    nn,
    tensor,
 )
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
                zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
            )
            output = normal_random_tmp * (zero_tmp_reshape + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
            return output
        else:
            output_shape = shape + batch_shape
            output = nn.gaussian_random(
                output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
            ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
            if self.all_arg_is_float:
                return paddle.reshape(output, shape, name=name)
            else:
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
        zero_tmp = tensor.fill_constant_batch_size_like(
            self.loc + self.scale, batch_shape, self.dtype, 0.0
        )
-        return elementwise_add(
+        return paddle.add(
            0.5 + zero_tmp,
            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
            name=name,
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
        var = self.scale * self.scale
        log_scale = nn.log(self.scale)
-        return elementwise_sub(
+        return paddle.subtract(
            -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
            log_scale + math.log(math.sqrt(2.0 * math.pi)),
            name=name,
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
        value = self._check_values_dtype_in_probs(self.loc, value)
        var = self.scale * self.scale
-        return elementwise_div(
+        return paddle.divide(
            paddle.exp(
                -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
            ),
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
        var_ratio = var_ratio * var_ratio
        t1 = (self.loc - other.loc) / other.scale
        t1 = t1 * t1
-        return elementwise_add(
+        return paddle.add(
            0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
        )
--- a/python/paddle/distribution/uniform.py
+++ b/python/paddle/distribution/uniform.py
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
    in_dygraph_mode,
 )
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
    nn,
    tensor,
 )
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
            output = uniform_random_tmp_reshape * (
                zero_tmp_reshape + self.high - self.low
            )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
            return output
        else:
            output_shape = shape + batch_shape
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
                tensor.zeros(output_shape, dtype=self.dtype)
                + (self.high - self.low)
            )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
            if self.all_arg_is_float:
                return paddle.reshape(output, shape, name=name)
            else:
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
        ub_bool = value < self.high
        lb = tensor.cast(lb_bool, dtype=value.dtype)
        ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_sub(
+        return paddle.subtract(
            nn.log(lb * ub), nn.log(self.high - self.low), name=name
        )
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
        ub_bool = value < self.high
        lb = tensor.cast(lb_bool, dtype=value.dtype)
        ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_div((lb * ub), (self.high - self.low), name=name)
+        return paddle.divide((lb * ub), (self.high - self.low), name=name)
    def entropy(self):
        r"""Shannon entropy in nats.

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
        need_clip = False
        if not self.auto_skip_clip:  # always apply clip
            need_clip = True
-            clip_var = layers.elementwise_div(
+            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            # only when global_norm_var > max_global_norm, grad need clip
            need_clip = True
-            clip_var = layers.elementwise_div(
+            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)
-                x=max_global_norm, y=global_norm_var
-            )
        for p, g in params_grads:
            if g is None:
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
-                new_grad = layers.elementwise_mul(g, clip_input)
+                new_grad = paddle.multiply(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                max_global_norm = layers.fill_constant(
                    shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
                )
-                scale_var = layers.elementwise_div(
+                scale_var = paddle.divide(
                    x=max_global_norm,
                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
                )
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = paddle.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
-            group_scale_var = layers.elementwise_div(
+            group_scale_var = paddle.divide(
                x=clip_var,
                y=paddle.maximum(x=clip_var, y=group_norm_var),
            )

--- a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
+++ b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
            with param.block.program._optimized_guard(
                [param, grad]
            ), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
+                updated_param = paddle.subtract(x=param, y=scaled_param)
-                    x=param, y=scaled_param
-                )
                paddle.fluid.layers.assign(input=updated_param, output=param)
        optimize_ops = self.apply_optimize(

--- a/python/paddle/fluid/contrib/layers/rnn_impl.py
+++ b/python/paddle/fluid/contrib/layers/rnn_impl.py
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):
        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = paddle.add(gate_input, self._gate_bias)
        gate_input = self._gate_activation(gate_input)
        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
        candidate = layers.matmul(
            layers.concat([input, r_hidden], 1), self._candidate_weight
        )
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        candidate = paddle.add(candidate, self._candidate_bias)
        c = self._activation(candidate)
        new_hidden = u * pre_hidden + (1 - u) * c
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
        concat_input_hidden = layers.concat([input, pre_hidden], 1)
        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
-        gate_input = layers.elementwise_add(gate_input, self._bias)
+        gate_input = paddle.add(gate_input, self._bias)
        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
+        new_cell = paddle.add(
-            layers.elementwise_mul(
+            paddle.multiply(
                pre_cell,
-                paddle.nn.functional.sigmoid(
+                paddle.nn.functional.sigmoid(paddle.add(f, self._forget_bias)),
-                    layers.elementwise_add(f, self._forget_bias)
-                ),
-            ),
-            layers.elementwise_mul(
-                paddle.nn.functional.sigmoid(i), paddle.tanh(j)
            ),
+            paddle.multiply(paddle.nn.functional.sigmoid(i), paddle.tanh(j)),
        )
        new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)

--- a/python/paddle/fluid/dygraph/rnn.py
+++ b/python/paddle/fluid/dygraph/rnn.py
@@ -18,7 +18,6 @@ from ..layers import (
    concat,
    fill_constant,
    matmul,
-    elementwise_add,
    elementwise_mul,
    split,
 )
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
        if self._use_cudnn_impl:
            igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
            hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)
            chunked_igates = split(igates, num_or_sections=4, dim=1)
            chunked_hgates = split(hgates, num_or_sections=4, dim=1)
-            ingate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            ingate = paddle.add(chunked_igates[0], chunked_hgates[0])
            ingate = self._gate_activation(ingate)
-            forgetgate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            forgetgate = paddle.add(chunked_igates[1], chunked_hgates[1])
            forgetgate = self._gate_activation(forgetgate)
-            cellgate = elementwise_add(chunked_igates[2], chunked_hgates[2])
+            cellgate = paddle.add(chunked_igates[2], chunked_hgates[2])
            cellgate = self._activation(cellgate)
-            outgate = elementwise_add(chunked_igates[3], chunked_hgates[3])
+            outgate = paddle.add(chunked_igates[3], chunked_hgates[3])
            outgate = self._gate_activation(outgate)
            new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
            concat_input_hidden = concat([input, pre_hidden], 1)
            gate_input = matmul(x=concat_input_hidden, y=self._weight)
-            gate_input = elementwise_add(gate_input, self._bias)
+            gate_input = paddle.add(gate_input, self._bias)
            i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = elementwise_add(
+            new_cell = paddle.add(
-                elementwise_mul(
+                paddle.multiply(
                    pre_cell,
-                    self._gate_activation(
+                    self._gate_activation(paddle.add(f, self._forget_bias)),
-                        elementwise_add(f, self._forget_bias)
-                    ),
                ),
-                elementwise_mul(
+                paddle.multiply(
                    paddle.nn.functional.sigmoid(i), paddle.tanh(j)
                ),
            )
@@ -466,21 +463,21 @@ class GRUCell(Layer):
        if self._use_cudnn_impl:
            igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
            hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)
            chunked_igates = split(igates, num_or_sections=3, dim=1)
            chunked_hgates = split(hgates, num_or_sections=3, dim=1)
-            reset_gate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            reset_gate = paddle.add(chunked_igates[0], chunked_hgates[0])
            reset_gate = self._gate_activation(reset_gate)
-            input_gate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            input_gate = paddle.add(chunked_igates[1], chunked_hgates[1])
            input_gate = self._gate_activation(input_gate)
            _temp = reset_gate * chunked_hgates[2]
-            new_gate = elementwise_add(chunked_igates[2], _temp)
+            new_gate = paddle.add(chunked_igates[2], _temp)
            new_gate = self._activation(new_gate)
            new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
@@ -491,7 +488,7 @@ class GRUCell(Layer):
            gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)
-            gate_input = elementwise_add(gate_input, self._gate_bias)
+            gate_input = paddle.add(gate_input, self._gate_bias)
            gate_input = self._gate_activation(gate_input)
            r, u = split(gate_input, num_or_sections=2, dim=1)
@@ -500,7 +497,7 @@ class GRUCell(Layer):
            candidate = matmul(
                concat([input, r_hidden], 1), self._candidate_weight
            )
-            candidate = elementwise_add(candidate, self._candidate_bias)
+            candidate = paddle.add(candidate, self._candidate_bias)
            c = self._activation(candidate)
            new_hidden = u * pre_hidden + (1 - u) * c

--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -115,7 +115,7 @@ class LayerHelperBase:
            )
    def _create_weight_normalize(self, attr, shape, dtype):
-        from .layers import elementwise_mul, elementwise_div
+        from .layers import elementwise_mul
        # Remove these ops when LayerHelper and layers support indicating
        # program and block.
@@ -266,7 +266,7 @@ class LayerHelperBase:
            norm = __norm_except_dim(
                v, dim=dim, block=self.main_program.current_block()
            )
-            scale = elementwise_div(
+            scale = paddle.divide(
                x=g, y=norm
            )  # The shapes of g and norm are the same.
            # Currently, elementwise_mul only support broadcast when the shape

--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
        )
        # TODO: use where_op
        finished = tensor.cast(finished, dtype=probs.dtype)
-        probs = nn.elementwise_mul(
+        probs = paddle.multiply(
            paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]),
            self.noend_mask_tensor,
-            axis=-1,
        ) - nn.elementwise_mul(probs, (finished - 1), axis=0)
        return probs
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
            # To confirm states.finished/finished be consistent with
            # next_finished.
            tensor.assign(next_finished, finished)
-            next_sequence_lengths = nn.elementwise_add(
+            next_sequence_lengths = paddle.add(
                sequence_lengths,
                tensor.cast(
                    paddle.logical_not(finished), sequence_lengths.dtype
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
            # Otherwise, perform logical OR which would not change the already
            # finished.
            next_finished = paddle.logical_or(next_finished, global_finished)
-            next_sequence_lengths = nn.elementwise_add(
+            next_sequence_lengths = paddle.add(
                sequence_lengths,
                tensor.cast(
                    paddle.logical_not(global_finished),

--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
    )
    a, b = layers.split(input, num_or_sections=2, dim=dim)
    act_b = paddle.nn.functional.sigmoid(x=b)
-    out = layers.elementwise_mul(x=a, y=act_b)
+    out = paddle.multiply(x=a, y=act_b)
    return out

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
                    for param_name in params:
                        fast_var = main_block.var(param_name)
                        slow_var = param_to_slow[param_name]
-                        tmp_var = layers.elementwise_add(
+                        tmp_var = paddle.add(
-                            layers.elementwise_mul(fast_var, alpha),
+                            paddle.multiply(fast_var, alpha),
-                            layers.elementwise_mul(
+                            paddle.multiply(
-                                slow_var, layers.elementwise_sub(one_var, alpha)
+                                slow_var, paddle.subtract(one_var, alpha)
                            ),
                        )
                        layers.assign(input=tmp_var, output=slow_var)

--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
        )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
        )
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                if g.dtype == core.VarDesc.VarType.FP16
                else clip_var
            )
-            new_grad = layers.elementwise_mul(x=g, y=clip_input)
+            new_grad = paddle.multiply(x=g, y=clip_input)
            params_and_grads.append((p, new_grad))
        return params_and_grads