From 048e0c558a9bc116b6cad98f1645fef3515419e8 Mon Sep 17 00:00:00 2001
From: HongyuJia
Date: Tue, 29 Nov 2022 20:38:45 +0800
Subject: [PATCH] clean elem_arithmetic not test.py (#48460)

---
 .../sharding/group_sharded_utils.py           |  2 +-
 .../meta_parallel/sharding/sharding_utils.py  |  2 +-
 python/paddle/distribution/normal.py          | 15 +++----
 python/paddle/distribution/uniform.py         | 11 ++----
 python/paddle/fluid/clip.py                   | 12 +++---
 .../extend_optimizer_with_weight_decay.py     |  4 +-
 .../paddle/fluid/contrib/layers/rnn_impl.py   | 18 ++++-----
 python/paddle/fluid/dygraph/rnn.py            | 39 +++++++++----------
 python/paddle/fluid/layer_helper_base.py      |  4 +-
 python/paddle/fluid/layers/rnn.py             |  7 ++--
 python/paddle/fluid/nets.py                   |  2 +-
 python/paddle/fluid/optimizer.py              |  8 ++--
 .../distributed/models/moe/grad_clip.py       |  4 +-
 13 files changed, 55 insertions(+), 73 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 39d88fef67d..c12381c894e 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )

-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 89978cceff7..22f2eb8f1b8 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -135,7 +135,7 @@ class ShardingClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )

-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py
index 2d4b0bed980..f28b92ec86b 100644
--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
 from paddle.fluid.framework import _non_static_mode
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
                 zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             )
             output = normal_random_tmp * (zero_tmp_reshape + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             return output
         else:
             output_shape = shape + batch_shape
             output = nn.gaussian_random(
                 output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
         zero_tmp = tensor.fill_constant_batch_size_like(
             self.loc + self.scale, batch_shape, self.dtype, 0.0
         )
-        return elementwise_add(
+        return paddle.add(
             0.5 + zero_tmp,
             0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
             name=name,
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return elementwise_sub(
+        return paddle.subtract(
             -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
             log_scale + math.log(math.sqrt(2.0 * math.pi)),
             name=name,
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
         value = self._check_values_dtype_in_probs(self.loc, value)
         var = self.scale * self.scale
-        return elementwise_div(
+        return paddle.divide(
             paddle.exp(
                 -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
             ),
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
         var_ratio = var_ratio * var_ratio
         t1 = (self.loc - other.loc) / other.scale
         t1 = t1 * t1
-        return elementwise_add(
+        return paddle.add(
             0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
         )
diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py
index 9571cdb08c2..f242dc3db0d 100644
--- a/python/paddle/distribution/uniform.py
+++ b/python/paddle/distribution/uniform.py
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
     in_dygraph_mode,
 )
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
             output = uniform_random_tmp_reshape * (
                 zero_tmp_reshape + self.high - self.low
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             return output
         else:
             output_shape = shape + batch_shape
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
                 tensor.zeros(output_shape, dtype=self.dtype)
                 + (self.high - self.low)
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
             ub_bool = value < self.high
             lb = tensor.cast(lb_bool, dtype=value.dtype)
             ub = tensor.cast(ub_bool, dtype=value.dtype)
-            return elementwise_sub(
+            return paddle.subtract(
                 nn.log(lb * ub), nn.log(self.high - self.low), name=name
             )
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
             ub_bool = value < self.high
             lb = tensor.cast(lb_bool, dtype=value.dtype)
             ub = tensor.cast(ub_bool, dtype=value.dtype)
-            return elementwise_div((lb * ub), (self.high - self.low), name=name)
+            return paddle.divide((lb * ub), (self.high - self.low), name=name)

     def entropy(self):
         r"""Shannon entropy in nats.
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 7162313ddae..525c3360f5e 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
         need_clip = False
         if not self.auto_skip_clip:  # always apply clip
             need_clip = True
-            clip_var = layers.elementwise_div(
+            clip_var = paddle.divide(
                 x=max_global_norm,
                 y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
             need_clip = True
-            clip_var = layers.elementwise_div(
-                x=max_global_norm, y=global_norm_var
-            )
+            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

         for p, g in params_grads:
             if g is None:
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                     if clip_var.dtype != g.dtype
                     else clip_var
                 )
-                new_grad = layers.elementwise_mul(g, clip_input)
+                new_grad = paddle.multiply(g, clip_input)
                 params_and_grads.append((p, new_grad))
             else:
                 params_and_grads.append((p, g))
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        scale_var = layers.elementwise_div(
+        scale_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=max_global_norm, y=global_norm_var),
         )
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         group_norm_var = layers.sums(input=self.context[self.group_name])
         group_norm_var = paddle.sqrt(x=group_norm_var)
         clip_var = self.context[self.group_name + "_clip"]
-        group_scale_var = layers.elementwise_div(
+        group_scale_var = paddle.divide(
             x=clip_var,
             y=paddle.maximum(x=clip_var, y=group_norm_var),
         )
diff --git a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
index 53a010c23ce..3a40c5ac80a 100644
--- a/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
+++ b/python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
             with param.block.program._optimized_guard(
                 [param, grad]
             ), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param
-                )
+                updated_param = paddle.subtract(x=param, y=scaled_param)
                 paddle.fluid.layers.assign(input=updated_param, output=param)

         optimize_ops = self.apply_optimize(
diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py
index df6a38852ff..5f9a4d2827f 100644
--- a/python/paddle/fluid/contrib/layers/rnn_impl.py
+++ b/python/paddle/fluid/contrib/layers/rnn_impl.py
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):

         gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)

-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = paddle.add(gate_input, self._gate_bias)

         gate_input = self._gate_activation(gate_input)
         r, u = layers.split(gate_input, num_or_sections=2, dim=1)
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
         candidate = layers.matmul(
             layers.concat([input, r_hidden], 1), self._candidate_weight
         )
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        candidate = paddle.add(candidate, self._candidate_bias)

         c = self._activation(candidate)
         new_hidden = u * pre_hidden + (1 - u) * c
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
         gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)

-        gate_input = layers.elementwise_add(gate_input, self._bias)
+        gate_input = paddle.add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
+        new_cell = paddle.add(
+            paddle.multiply(
                 pre_cell,
-                paddle.nn.functional.sigmoid(
-                    layers.elementwise_add(f, self._forget_bias)
-                ),
-            ),
-            layers.elementwise_mul(
-                paddle.nn.functional.sigmoid(i), paddle.tanh(j)
+                paddle.nn.functional.sigmoid(paddle.add(f, self._forget_bias)),
             ),
+            paddle.multiply(paddle.nn.functional.sigmoid(i), paddle.tanh(j)),
         )

         new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)
diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py
index fa88dc44bbd..986d1c562b4 100644
--- a/python/paddle/fluid/dygraph/rnn.py
+++ b/python/paddle/fluid/dygraph/rnn.py
@@ -18,7 +18,6 @@ from ..layers import (
     concat,
     fill_constant,
     matmul,
-    elementwise_add,
     elementwise_mul,
     split,
 )
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=4, dim=1)
             chunked_hgates = split(hgates, num_or_sections=4, dim=1)

-            ingate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            ingate = paddle.add(chunked_igates[0], chunked_hgates[0])
             ingate = self._gate_activation(ingate)

-            forgetgate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            forgetgate = paddle.add(chunked_igates[1], chunked_hgates[1])
             forgetgate = self._gate_activation(forgetgate)

-            cellgate = elementwise_add(chunked_igates[2], chunked_hgates[2])
+            cellgate = paddle.add(chunked_igates[2], chunked_hgates[2])
             cellgate = self._activation(cellgate)

-            outgate = elementwise_add(chunked_igates[3], chunked_hgates[3])
+            outgate = paddle.add(chunked_igates[3], chunked_hgates[3])
             outgate = self._gate_activation(outgate)

             new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
             concat_input_hidden = concat([input, pre_hidden], 1)
             gate_input = matmul(x=concat_input_hidden, y=self._weight)

-            gate_input = elementwise_add(gate_input, self._bias)
+            gate_input = paddle.add(gate_input, self._bias)
             i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = elementwise_add(
-                elementwise_mul(
+            new_cell = paddle.add(
+                paddle.multiply(
                     pre_cell,
-                    self._gate_activation(
-                        elementwise_add(f, self._forget_bias)
-                    ),
+                    self._gate_activation(paddle.add(f, self._forget_bias)),
                 ),
-                elementwise_mul(
+                paddle.multiply(
                     paddle.nn.functional.sigmoid(i), paddle.tanh(j)
                 ),
             )
@@ -466,21 +463,21 @@ class GRUCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=3, dim=1)
             chunked_hgates = split(hgates, num_or_sections=3, dim=1)

-            reset_gate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            reset_gate = paddle.add(chunked_igates[0], chunked_hgates[0])
             reset_gate = self._gate_activation(reset_gate)

-            input_gate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            input_gate = paddle.add(chunked_igates[1], chunked_hgates[1])
             input_gate = self._gate_activation(input_gate)

             _temp = reset_gate * chunked_hgates[2]
-            new_gate = elementwise_add(chunked_igates[2], _temp)
+            new_gate = paddle.add(chunked_igates[2], _temp)
             new_gate = self._activation(new_gate)

             new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
@@ -491,7 +488,7 @@ class GRUCell(Layer):

             gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)

-            gate_input = elementwise_add(gate_input, self._gate_bias)
+            gate_input = paddle.add(gate_input, self._gate_bias)

             gate_input = self._gate_activation(gate_input)
             r, u = split(gate_input, num_or_sections=2, dim=1)
@@ -500,7 +497,7 @@ class GRUCell(Layer):
             candidate = matmul(
                 concat([input, r_hidden], 1), self._candidate_weight
             )
-            candidate = elementwise_add(candidate, self._candidate_bias)
+            candidate = paddle.add(candidate, self._candidate_bias)

             c = self._activation(candidate)
             new_hidden = u * pre_hidden + (1 - u) * c
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 39eb4a09474..ebdc1e60ab0 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -115,7 +115,7 @@ class LayerHelperBase:
         )

     def _create_weight_normalize(self, attr, shape, dtype):
-        from .layers import elementwise_mul, elementwise_div
+        from .layers import elementwise_mul

         # Remove these ops when LayerHelper and layers support indicating
         # program and block.
@@ -266,7 +266,7 @@ class LayerHelperBase:
             norm = __norm_except_dim(
                 v, dim=dim, block=self.main_program.current_block()
             )
-            scale = elementwise_div(
+            scale = paddle.divide(
                 x=g, y=norm
             )  # The shapes of g and norm are the same.
             # Currently, elementwise_mul only support broadcast when the shape
diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index a3bfd80de6d..c49d6d4d528 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
         )
         # TODO: use where_op
         finished = tensor.cast(finished, dtype=probs.dtype)
-        probs = nn.elementwise_mul(
+        probs = paddle.multiply(
             paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]),
             self.noend_mask_tensor,
-            axis=-1,
         ) - nn.elementwise_mul(probs, (finished - 1), axis=0)
         return probs
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
         # To confirm states.finished/finished be consistent with
         # next_finished.
         tensor.assign(next_finished, finished)
-        next_sequence_lengths = nn.elementwise_add(
+        next_sequence_lengths = paddle.add(
             sequence_lengths,
             tensor.cast(
                 paddle.logical_not(finished), sequence_lengths.dtype
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
             # Otherwise, perform logical OR which would not change the already
             # finished.
             next_finished = paddle.logical_or(next_finished, global_finished)
-            next_sequence_lengths = nn.elementwise_add(
+            next_sequence_lengths = paddle.add(
                 sequence_lengths,
                 tensor.cast(
                     paddle.logical_not(global_finished),
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 5cd8380eba5..3d4f187e18f 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
     )
     a, b = layers.split(input, num_or_sections=2, dim=dim)
     act_b = paddle.nn.functional.sigmoid(x=b)
-    out = layers.elementwise_mul(x=a, y=act_b)
+    out = paddle.multiply(x=a, y=act_b)
     return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 84c3f226ca9..c7a817e1d75 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
         for param_name in params:
             fast_var = main_block.var(param_name)
             slow_var = param_to_slow[param_name]
-            tmp_var = layers.elementwise_add(
-                layers.elementwise_mul(fast_var, alpha),
-                layers.elementwise_mul(
-                    slow_var, layers.elementwise_sub(one_var, alpha)
+            tmp_var = paddle.add(
+                paddle.multiply(fast_var, alpha),
+                paddle.multiply(
+                    slow_var, paddle.subtract(one_var, alpha)
                 ),
             )
             layers.assign(input=tmp_var, output=slow_var)
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index c3faa7bd202..ca4922700b8 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                 if g.dtype == core.VarDesc.VarType.FP16
                 else clip_var
             )
-            new_grad = layers.elementwise_mul(x=g, y=clip_input)
+            new_grad = paddle.multiply(x=g, y=clip_input)
             params_and_grads.append((p, new_grad))

         return params_and_grads
--
GitLab
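
Reviewer note (not part of the patch): every hunk above applies the same mechanical substitution, replacing the deprecated paddle.fluid.layers.elementwise_* ops with their public Paddle 2.x equivalents on unchanged operands. A minimal sketch of the mapping, assuming Paddle 2.x in dynamic-graph mode; the tensors below are hypothetical examples, not values taken from the patch:

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    y = paddle.to_tensor([4.0, 5.0, 6.0])

    # legacy fluid API                 replacement used in this patch
    # layers.elementwise_add(x, y) ->  paddle.add(x, y)
    # layers.elementwise_sub(x, y) ->  paddle.subtract(x, y)
    # layers.elementwise_mul(x, y) ->  paddle.multiply(x, y)
    # layers.elementwise_div(x, y) ->  paddle.divide(x, y)
    out = paddle.add(x, paddle.multiply(y, paddle.divide(x, y)))  # elementwise throughout

One behavioral detail visible in the BeamSearchDecoder hunk: the legacy ops accepted an axis argument for dimension-aligned broadcasting, while the paddle.* ops rely on standard NumPy-style broadcasting, which is presumably why the axis=-1 argument is simply dropped there.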