Unverified commit 048e0c55, authored by HongyuJia, committed by GitHub

clean elem_arithmetic not test.py (#48460)

Parent: 41f15537
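This commit replaces the remaining non-test call sites of the deprecated `paddle.fluid.layers.elementwise_*` arithmetic ops with their Paddle 2.x equivalents. A minimal sketch of the mapping, assuming a Paddle 2.x install (the tensors are made up for illustration):

```python
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
y = paddle.to_tensor([4.0, 5.0, 6.0])

# fluid.layers.elementwise_add(x, y)  ->  paddle.add(x, y)
out_add = paddle.add(x, y)
# fluid.layers.elementwise_sub(x, y)  ->  paddle.subtract(x, y)
out_sub = paddle.subtract(x, y)
# fluid.layers.elementwise_mul(x, y)  ->  paddle.multiply(x, y)
out_mul = paddle.multiply(x, y)
# fluid.layers.elementwise_div(x, y)  ->  paddle.divide(x, y)
out_div = paddle.divide(x, y)
```

The per-file hunks below apply exactly this substitution; where an old call spanned several lines, the replacement is also collapsed onto one line.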
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
...
@@ -135,7 +135,7 @@ class ShardingClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
...
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
 from paddle.fluid.framework import _non_static_mode
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
                 zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             )
             output = normal_random_tmp * (zero_tmp_reshape + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             return output
         else:
             output_shape = shape + batch_shape
             output = nn.gaussian_random(
                 output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
         zero_tmp = tensor.fill_constant_batch_size_like(
             self.loc + self.scale, batch_shape, self.dtype, 0.0
         )
-        return elementwise_add(
+        return paddle.add(
             0.5 + zero_tmp,
             0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
             name=name,
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return elementwise_sub(
+        return paddle.subtract(
             -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
             log_scale + math.log(math.sqrt(2.0 * math.pi)),
             name=name,
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
         value = self._check_values_dtype_in_probs(self.loc, value)
         var = self.scale * self.scale
-        return elementwise_div(
+        return paddle.divide(
             paddle.exp(
                 -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
             ),
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
         var_ratio = var_ratio * var_ratio
         t1 = (self.loc - other.loc) / other.scale
         t1 = t1 * t1
-        return elementwise_add(
+        return paddle.add(
             0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
         )
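For reference, the `Normal.log_prob` arithmetic changed above can be reproduced directly with the new ops. A standalone sketch (the distribution parameters and the value are made up, not from the commit):

```python
import math

import paddle

loc = paddle.to_tensor([0.0])
scale = paddle.to_tensor([1.0])
value = paddle.to_tensor([0.5])

var = scale * scale
log_scale = paddle.log(scale)
# log N(value; loc, scale) = -(value - loc)^2 / (2 * var) - log(scale) - log(sqrt(2 * pi))
log_prob = paddle.subtract(
    -1.0 * ((value - loc) * (value - loc)) / (2.0 * var),
    log_scale + math.log(math.sqrt(2.0 * math.pi)),
)
print(float(log_prob))  # ~ -1.0439 for the standard normal at 0.5
```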
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
     in_dygraph_mode,
 )
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
             output = uniform_random_tmp_reshape * (
                 zero_tmp_reshape + self.high - self.low
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             return output
         else:
             output_shape = shape + batch_shape
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
                 tensor.zeros(output_shape, dtype=self.dtype)
                 + (self.high - self.low)
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
         ub_bool = value < self.high
         lb = tensor.cast(lb_bool, dtype=value.dtype)
         ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_sub(
+        return paddle.subtract(
             nn.log(lb * ub), nn.log(self.high - self.low), name=name
         )
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
         ub_bool = value < self.high
         lb = tensor.cast(lb_bool, dtype=value.dtype)
         ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_div((lb * ub), (self.high - self.low), name=name)
+        return paddle.divide((lb * ub), (self.high - self.low), name=name)

     def entropy(self):
         r"""Shannon entropy in nats.
...
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
         need_clip = False
         if not self.auto_skip_clip:  # always apply clip
             need_clip = True
-            clip_var = layers.elementwise_div(
+            clip_var = paddle.divide(
                 x=max_global_norm,
                 y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
             need_clip = True
-            clip_var = layers.elementwise_div(
-                x=max_global_norm, y=global_norm_var
-            )
+            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

         for p, g in params_grads:
             if g is None:
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                     if clip_var.dtype != g.dtype
                     else clip_var
                 )
-                new_grad = layers.elementwise_mul(g, clip_input)
+                new_grad = paddle.multiply(g, clip_input)
                 params_and_grads.append((p, new_grad))
             else:
                 params_and_grads.append((p, g))
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
             max_global_norm = layers.fill_constant(
                 shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
             )
-            scale_var = layers.elementwise_div(
+            scale_var = paddle.divide(
                 x=max_global_norm,
                 y=paddle.maximum(x=max_global_norm, y=global_norm_var),
             )
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         group_norm_var = layers.sums(input=self.context[self.group_name])
         group_norm_var = paddle.sqrt(x=group_norm_var)
         clip_var = self.context[self.group_name + "_clip"]
-        group_scale_var = layers.elementwise_div(
+        group_scale_var = paddle.divide(
             x=clip_var,
             y=paddle.maximum(x=clip_var, y=group_norm_var),
         )
...
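Every clipping hunk above computes the same scale factor. A standalone numeric sketch of that math with the new ops (the norm and gradient values are made up, not from the commit):

```python
import paddle

global_norm = paddle.to_tensor([7.5])
max_norm = paddle.to_tensor([5.0])

# scale = max_norm / max(global_norm, max_norm) is at most 1.0,
# so gradients are only ever scaled down, never up.
scale = paddle.divide(x=max_norm, y=paddle.maximum(x=global_norm, y=max_norm))

grad = paddle.to_tensor([3.0, -4.0])
clipped = paddle.multiply(grad, scale)  # the [1]-shaped scale broadcasts over the gradient
print(scale.numpy(), clipped.numpy())   # ~[0.6667], ~[2.0, -2.6667]
```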
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
             with param.block.program._optimized_guard(
                 [param, grad]
             ), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param
-                )
+                updated_param = paddle.subtract(x=param, y=scaled_param)
                 paddle.fluid.layers.assign(input=updated_param, output=param)

         optimize_ops = self.apply_optimize(
...
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):
         gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = paddle.add(gate_input, self._gate_bias)
         gate_input = self._gate_activation(gate_input)
         r, u = layers.split(gate_input, num_or_sections=2, dim=1)
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
         candidate = layers.matmul(
             layers.concat([input, r_hidden], 1), self._candidate_weight
         )
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        candidate = paddle.add(candidate, self._candidate_bias)
         c = self._activation(candidate)
         new_hidden = u * pre_hidden + (1 - u) * c
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
         gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
-        gate_input = layers.elementwise_add(gate_input, self._bias)
+        gate_input = paddle.add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
+        new_cell = paddle.add(
+            paddle.multiply(
                 pre_cell,
-                paddle.nn.functional.sigmoid(
-                    layers.elementwise_add(f, self._forget_bias)
-                ),
-            ),
-            layers.elementwise_mul(
-                paddle.nn.functional.sigmoid(i), paddle.tanh(j)
+                paddle.nn.functional.sigmoid(paddle.add(f, self._forget_bias)),
             ),
+            paddle.multiply(paddle.nn.functional.sigmoid(i), paddle.tanh(j)),
         )
         new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)
...
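The `BasicLSTMUnit` hunk above rewrites the cell-state update `c_t = sigmoid(f + forget_bias) * c_{t-1} + sigmoid(i) * tanh(j)` in terms of the new ops. A self-contained sketch under made-up shapes (these are not the layer's real weights):

```python
import paddle
import paddle.nn.functional as F

batch, hidden = 2, 4
pre_cell = paddle.randn([batch, hidden])
# i, j, f, o normally come from splitting the gate projection into four chunks.
i, j, f, o = paddle.split(paddle.randn([batch, 4 * hidden]), num_or_sections=4, axis=-1)
forget_bias = paddle.to_tensor(1.0)

new_cell = paddle.add(
    paddle.multiply(pre_cell, F.sigmoid(paddle.add(f, forget_bias))),
    paddle.multiply(F.sigmoid(i), paddle.tanh(j)),
)
new_hidden = paddle.tanh(new_cell) * F.sigmoid(o)
```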
@@ -18,7 +18,6 @@ from ..layers import (
     concat,
     fill_constant,
     matmul,
-    elementwise_add,
     elementwise_mul,
     split,
 )
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=4, dim=1)
             chunked_hgates = split(hgates, num_or_sections=4, dim=1)

-            ingate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            ingate = paddle.add(chunked_igates[0], chunked_hgates[0])
             ingate = self._gate_activation(ingate)

-            forgetgate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            forgetgate = paddle.add(chunked_igates[1], chunked_hgates[1])
             forgetgate = self._gate_activation(forgetgate)

-            cellgate = elementwise_add(chunked_igates[2], chunked_hgates[2])
+            cellgate = paddle.add(chunked_igates[2], chunked_hgates[2])
             cellgate = self._activation(cellgate)

-            outgate = elementwise_add(chunked_igates[3], chunked_hgates[3])
+            outgate = paddle.add(chunked_igates[3], chunked_hgates[3])
             outgate = self._gate_activation(outgate)

             new_cell = (forgetgate * pre_cell) + (ingate * cellgate)
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
             concat_input_hidden = concat([input, pre_hidden], 1)
             gate_input = matmul(x=concat_input_hidden, y=self._weight)
-            gate_input = elementwise_add(gate_input, self._bias)
+            gate_input = paddle.add(gate_input, self._bias)
             i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = elementwise_add(
-                elementwise_mul(
+            new_cell = paddle.add(
+                paddle.multiply(
                     pre_cell,
-                    self._gate_activation(
-                        elementwise_add(f, self._forget_bias)
-                    ),
+                    self._gate_activation(paddle.add(f, self._forget_bias)),
                 ),
-                elementwise_mul(
+                paddle.multiply(
                     paddle.nn.functional.sigmoid(i), paddle.tanh(j)
                 ),
             )
@@ -466,21 +463,21 @@ class GRUCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=3, dim=1)
             chunked_hgates = split(hgates, num_or_sections=3, dim=1)

-            reset_gate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            reset_gate = paddle.add(chunked_igates[0], chunked_hgates[0])
             reset_gate = self._gate_activation(reset_gate)

-            input_gate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            input_gate = paddle.add(chunked_igates[1], chunked_hgates[1])
             input_gate = self._gate_activation(input_gate)

             _temp = reset_gate * chunked_hgates[2]
-            new_gate = elementwise_add(chunked_igates[2], _temp)
+            new_gate = paddle.add(chunked_igates[2], _temp)
             new_gate = self._activation(new_gate)

             new_hidden = (pre_hidden - new_gate) * input_gate + new_gate
@@ -491,7 +488,7 @@ class GRUCell(Layer):
             gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)
-            gate_input = elementwise_add(gate_input, self._gate_bias)
+            gate_input = paddle.add(gate_input, self._gate_bias)
             gate_input = self._gate_activation(gate_input)
             r, u = split(gate_input, num_or_sections=2, dim=1)
@@ -500,7 +497,7 @@ class GRUCell(Layer):
             candidate = matmul(
                 concat([input, r_hidden], 1), self._candidate_weight
             )
-            candidate = elementwise_add(candidate, self._candidate_bias)
+            candidate = paddle.add(candidate, self._candidate_bias)
             c = self._activation(candidate)
             new_hidden = u * pre_hidden + (1 - u) * c
...
@@ -115,7 +115,7 @@ class LayerHelperBase:
         )

     def _create_weight_normalize(self, attr, shape, dtype):
-        from .layers import elementwise_mul, elementwise_div
+        from .layers import elementwise_mul

         # Remove these ops when LayerHelper and layers support indicating
         # program and block.
@@ -266,7 +266,7 @@ class LayerHelperBase:
         norm = __norm_except_dim(
             v, dim=dim, block=self.main_program.current_block()
         )
-        scale = elementwise_div(
+        scale = paddle.divide(
             x=g, y=norm
         )  # The shapes of g and norm are the same.
         # Currently, elementwise_mul only support broadcast when the shape
...
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
         )
         # TODO: use where_op
         finished = tensor.cast(finished, dtype=probs.dtype)
-        probs = nn.elementwise_mul(
+        probs = paddle.multiply(
             paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]),
             self.noend_mask_tensor,
-            axis=-1,
         ) - nn.elementwise_mul(probs, (finished - 1), axis=0)
         return probs
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
             # To confirm states.finished/finished be consistent with
             # next_finished.
             tensor.assign(next_finished, finished)
-            next_sequence_lengths = nn.elementwise_add(
+            next_sequence_lengths = paddle.add(
                 sequence_lengths,
                 tensor.cast(
                     paddle.logical_not(finished), sequence_lengths.dtype
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
         # Otherwise, perform logical OR which would not change the already
         # finished.
         next_finished = paddle.logical_or(next_finished, global_finished)
-        next_sequence_lengths = nn.elementwise_add(
+        next_sequence_lengths = paddle.add(
             sequence_lengths,
             tensor.cast(
                 paddle.logical_not(global_finished),
...
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
     )
     a, b = layers.split(input, num_or_sections=2, dim=dim)
     act_b = paddle.nn.functional.sigmoid(x=b)
-    out = layers.elementwise_mul(x=a, y=act_b)
+    out = paddle.multiply(x=a, y=act_b)
     return out
...
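The `glu` change above is the gated linear unit `out = a * sigmoid(b)` written with `paddle.multiply`. A minimal sketch (the input shape is made up):

```python
import paddle

x = paddle.randn([4, 8])
a, b = paddle.split(x, num_or_sections=2, axis=-1)
out = paddle.multiply(x=a, y=paddle.nn.functional.sigmoid(b))  # shape [4, 4]
```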
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
             for param_name in params:
                 fast_var = main_block.var(param_name)
                 slow_var = param_to_slow[param_name]
-                tmp_var = layers.elementwise_add(
-                    layers.elementwise_mul(fast_var, alpha),
-                    layers.elementwise_mul(
-                        slow_var, layers.elementwise_sub(one_var, alpha)
+                tmp_var = paddle.add(
+                    paddle.multiply(fast_var, alpha),
+                    paddle.multiply(
+                        slow_var, paddle.subtract(one_var, alpha)
                     ),
                 )
                 layers.assign(input=tmp_var, output=slow_var)
...
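The `LookaheadOptimizer` hunk above is the slow-weight interpolation `slow = alpha * fast + (1 - alpha) * slow`, now expressed with the new ops. A standalone numeric sketch (alpha and the parameter values are made up):

```python
import paddle

alpha = paddle.to_tensor(0.5)
one_var = paddle.to_tensor(1.0)
fast_var = paddle.to_tensor([1.0, 3.0])
slow_var = paddle.to_tensor([0.0, 1.0])

tmp_var = paddle.add(
    paddle.multiply(fast_var, alpha),
    paddle.multiply(slow_var, paddle.subtract(one_var, alpha)),
)
print(tmp_var.numpy())  # [0.5, 2.0]
```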
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                 if g.dtype == core.VarDesc.VarType.FP16
                 else clip_var
             )
-            new_grad = layers.elementwise_mul(x=g, y=clip_input)
+            new_grad = paddle.multiply(x=g, y=clip_input)
             params_and_grads.append((p, new_grad))
         return params_and_grads
...