BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 048e0c55 (unverified)
Authored Nov 29, 2022 by HongyuJia; committed via GitHub on Nov 29, 2022

clean elem_arithmetic not test.py (#48460)

Parent: 41f15537
Showing 13 changed files with 55 additions and 73 deletions (+55 / -73).
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py (+1, -1)
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py (+1, -1)
python/paddle/distribution/normal.py (+6, -9)
python/paddle/distribution/uniform.py (+4, -7)
python/paddle/fluid/clip.py (+5, -7)
python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py (+1, -3)
python/paddle/fluid/contrib/layers/rnn_impl.py (+7, -11)
python/paddle/fluid/dygraph/rnn.py (+18, -21)
python/paddle/fluid/layer_helper_base.py (+2, -2)
python/paddle/fluid/layers/rnn.py (+3, -4)
python/paddle/fluid/nets.py (+1, -1)
python/paddle/fluid/optimizer.py (+4, -4)
python/paddle/incubate/distributed/models/moe/grad_clip.py (+2, -2)
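Every hunk below makes the same substitution: the deprecated paddle.fluid.layers element-wise wrappers are replaced with the equivalent paddle tensor ops. A minimal standalone sketch of the mapping, assuming paddle >= 2.0 in dynamic-graph mode (the tensors here are illustrative, not taken from this diff):

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
y = paddle.to_tensor([4.0, 5.0, 6.0])

# old API (paddle.fluid.layers)   ->  new API (paddle.*)
# layers.elementwise_add(x, y)    ->  paddle.add(x, y)
# layers.elementwise_sub(x, y)    ->  paddle.subtract(x, y)
# layers.elementwise_mul(x, y)    ->  paddle.multiply(x, y)
# layers.elementwise_div(x, y)    ->  paddle.divide(x, y)
out = paddle.multiply(paddle.add(x, y), paddle.subtract(x, y))
out = paddle.divide(out, y)
print(out.numpy())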
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py

@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )

python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py

@@ -135,7 +135,7 @@ class ShardingClipGrad:
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )

python/paddle/distribution/normal.py

@@ -22,9 +22,6 @@ from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
 from paddle.fluid.framework import _non_static_mode
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )

@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
                 zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             )
             output = normal_random_tmp * (zero_tmp_reshape + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             return output
         else:
             output_shape = shape + batch_shape
             output = nn.gaussian_random(
                 output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
             ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
-            output = elementwise_add(output, self.loc, name=name)
+            output = paddle.add(output, self.loc, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:

@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
         zero_tmp = tensor.fill_constant_batch_size_like(
             self.loc + self.scale, batch_shape, self.dtype, 0.0
         )
-        return elementwise_add(
+        return paddle.add(
             0.5 + zero_tmp,
             0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
             name=name,

@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return elementwise_sub(
+        return paddle.subtract(
             -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
             log_scale + math.log(math.sqrt(2.0 * math.pi)),
             name=name,

@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
         value = self._check_values_dtype_in_probs(self.loc, value)
         var = self.scale * self.scale
-        return elementwise_div(
+        return paddle.divide(
             paddle.exp(
                 -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
             ),

@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
         var_ratio = var_ratio * var_ratio
         t1 = (self.loc - other.loc) / other.scale
         t1 = t1 * t1
-        return elementwise_add(
+        return paddle.add(
             0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
         )

python/paddle/distribution/uniform.py

@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
     in_dygraph_mode,
 )
 from paddle.fluid.layers import (
-    elementwise_add,
-    elementwise_div,
-    elementwise_sub,
     nn,
     tensor,
 )

@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
             output = uniform_random_tmp_reshape * (
                 zero_tmp_reshape + self.high - self.low
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             return output
         else:
             output_shape = shape + batch_shape

@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
                 tensor.zeros(output_shape, dtype=self.dtype)
                 + (self.high - self.low)
             )
-            output = elementwise_add(output, self.low, name=name)
+            output = paddle.add(output, self.low, name=name)
             if self.all_arg_is_float:
                 return paddle.reshape(output, shape, name=name)
             else:

@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
         ub_bool = value < self.high
         lb = tensor.cast(lb_bool, dtype=value.dtype)
         ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_sub(
+        return paddle.subtract(
             nn.log(lb * ub), nn.log(self.high - self.low), name=name
         )

@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
         ub_bool = value < self.high
         lb = tensor.cast(lb_bool, dtype=value.dtype)
         ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_div((lb * ub), (self.high - self.low), name=name)
+        return paddle.divide((lb * ub), (self.high - self.low), name=name)

     def entropy(self):
         r"""Shannon entropy in nats.

python/paddle/fluid/clip.py

@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
         need_clip = False
         if not self.auto_skip_clip:  # always apply clip
             need_clip = True
-            clip_var = layers.elementwise_div(
+            clip_var = paddle.divide(
                 x=max_global_norm,
                 y=paddle.maximum(x=global_norm_var, y=max_global_norm),
             )
         elif global_norm_var > max_global_norm:
             # only when global_norm_var > max_global_norm, grad need clip
             need_clip = True
-            clip_var = layers.elementwise_div(
-                x=max_global_norm, y=global_norm_var
-            )
+            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

         for p, g in params_grads:
             if g is None:

@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                     if clip_var.dtype != g.dtype
                     else clip_var
                 )
-                new_grad = layers.elementwise_mul(g, clip_input)
+                new_grad = paddle.multiply(g, clip_input)
                 params_and_grads.append((p, new_grad))
             else:
                 params_and_grads.append((p, g))

@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        scale_var = layers.elementwise_div(
+        scale_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=max_global_norm, y=global_norm_var),
         )

@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
         group_norm_var = layers.sums(input=self.context[self.group_name])
         group_norm_var = paddle.sqrt(x=group_norm_var)
         clip_var = self.context[self.group_name + "_clip"]
-        group_scale_var = layers.elementwise_div(
+        group_scale_var = paddle.divide(
             x=clip_var,
             y=paddle.maximum(x=clip_var, y=group_norm_var),
         )

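For context, the divide/maximum pattern that recurs in the clipping code computes scale = clip_norm / max(global_norm, clip_norm), so gradients are shrunk only when the global norm exceeds the threshold. A small standalone sketch using the new API (the values are made up, not taken from this diff):

import paddle

clip_norm = paddle.to_tensor([1.0])     # threshold
global_norm = paddle.to_tensor([4.0])   # pretend global gradient norm

# scale = clip_norm / max(global_norm, clip_norm)
scale = paddle.divide(
    x=clip_norm, y=paddle.maximum(x=global_norm, y=clip_norm)
)
print(scale.numpy())  # [0.25] -> each gradient would be multiplied by 0.25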
python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py

@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
             with param.block.program._optimized_guard(
                 [param, grad]
             ), framework.name_scope('weight decay'):
-                updated_param = paddle.fluid.layers.elementwise_sub(
-                    x=param, y=scaled_param
-                )
+                updated_param = paddle.subtract(x=param, y=scaled_param)
                 paddle.fluid.layers.assign(input=updated_param, output=param)

         optimize_ops = self.apply_optimize(

python/paddle/fluid/contrib/layers/rnn_impl.py

@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):
         gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)

-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = paddle.add(gate_input, self._gate_bias)
         gate_input = self._gate_activation(gate_input)
         r, u = layers.split(gate_input, num_or_sections=2, dim=1)

@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
         candidate = layers.matmul(
             layers.concat([input, r_hidden], 1), self._candidate_weight
         )
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        candidate = paddle.add(candidate, self._candidate_bias)
         c = self._activation(candidate)

         new_hidden = u * pre_hidden + (1 - u) * c

@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
         concat_input_hidden = layers.concat([input, pre_hidden], 1)
         gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)

-        gate_input = layers.elementwise_add(gate_input, self._bias)
+        gate_input = paddle.add(gate_input, self._bias)
         i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
-        new_cell = layers.elementwise_add(
-            layers.elementwise_mul(
+        new_cell = paddle.add(
+            paddle.multiply(
                 pre_cell,
-                paddle.nn.functional.sigmoid(
-                    layers.elementwise_add(f, self._forget_bias)
-                ),
+                paddle.nn.functional.sigmoid(paddle.add(f, self._forget_bias)),
             ),
-            layers.elementwise_mul(
-                paddle.nn.functional.sigmoid(i), paddle.tanh(j)
-            ),
+            paddle.multiply(paddle.nn.functional.sigmoid(i), paddle.tanh(j)),
         )

         new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)

python/paddle/fluid/dygraph/rnn.py

@@ -18,7 +18,6 @@ from ..layers import (
     concat,
     fill_constant,
     matmul,
-    elementwise_add,
     elementwise_mul,
     split,
 )

@@ -217,23 +216,23 @@ class LSTMCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=4, dim=1)
             chunked_hgates = split(hgates, num_or_sections=4, dim=1)

-            ingate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            ingate = paddle.add(chunked_igates[0], chunked_hgates[0])
             ingate = self._gate_activation(ingate)

-            forgetgate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            forgetgate = paddle.add(chunked_igates[1], chunked_hgates[1])
             forgetgate = self._gate_activation(forgetgate)

-            cellgate = elementwise_add(chunked_igates[2], chunked_hgates[2])
+            cellgate = paddle.add(chunked_igates[2], chunked_hgates[2])
             cellgate = self._activation(cellgate)

-            outgate = elementwise_add(chunked_igates[3], chunked_hgates[3])
+            outgate = paddle.add(chunked_igates[3], chunked_hgates[3])
             outgate = self._gate_activation(outgate)

             new_cell = (forgetgate * pre_cell) + (ingate * cellgate)

@@ -244,16 +243,14 @@ class LSTMCell(Layer):
             concat_input_hidden = concat([input, pre_hidden], 1)
             gate_input = matmul(x=concat_input_hidden, y=self._weight)

-            gate_input = elementwise_add(gate_input, self._bias)
+            gate_input = paddle.add(gate_input, self._bias)
             i, j, f, o = split(gate_input, num_or_sections=4, dim=-1)
-            new_cell = elementwise_add(
-                elementwise_mul(
+            new_cell = paddle.add(
+                paddle.multiply(
                     pre_cell,
-                    self._gate_activation(
-                        elementwise_add(f, self._forget_bias)
-                    ),
+                    self._gate_activation(paddle.add(f, self._forget_bias)),
                 ),
-                elementwise_mul(
+                paddle.multiply(
                     paddle.nn.functional.sigmoid(i), paddle.tanh(j)
                 ),
             )

@@ -466,21 +463,21 @@ class GRUCell(Layer):
         if self._use_cudnn_impl:
             igates = matmul(input, y=self._weight_ih, transpose_y=True)
-            igates = elementwise_add(igates, self._bias_ih)
+            igates = paddle.add(igates, self._bias_ih)
             hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True)
-            hgates = elementwise_add(hgates, self._bias_hh)
+            hgates = paddle.add(hgates, self._bias_hh)

             chunked_igates = split(igates, num_or_sections=3, dim=1)
             chunked_hgates = split(hgates, num_or_sections=3, dim=1)

-            reset_gate = elementwise_add(chunked_igates[0], chunked_hgates[0])
+            reset_gate = paddle.add(chunked_igates[0], chunked_hgates[0])
             reset_gate = self._gate_activation(reset_gate)

-            input_gate = elementwise_add(chunked_igates[1], chunked_hgates[1])
+            input_gate = paddle.add(chunked_igates[1], chunked_hgates[1])
             input_gate = self._gate_activation(input_gate)

             _temp = reset_gate * chunked_hgates[2]
-            new_gate = elementwise_add(chunked_igates[2], _temp)
+            new_gate = paddle.add(chunked_igates[2], _temp)
             new_gate = self._activation(new_gate)

             new_hidden = (pre_hidden - new_gate) * input_gate + new_gate

@@ -491,7 +488,7 @@ class GRUCell(Layer):
             gate_input = matmul(x=concat_input_hidden, y=self._gate_weight)

-            gate_input = elementwise_add(gate_input, self._gate_bias)
+            gate_input = paddle.add(gate_input, self._gate_bias)
             gate_input = self._gate_activation(gate_input)
             r, u = split(gate_input, num_or_sections=2, dim=1)

@@ -500,7 +497,7 @@ class GRUCell(Layer):
             candidate = matmul(
                 concat([input, r_hidden], 1), self._candidate_weight
             )
-            candidate = elementwise_add(candidate, self._candidate_bias)
+            candidate = paddle.add(candidate, self._candidate_bias)
             c = self._activation(candidate)
             new_hidden = u * pre_hidden + (1 - u) * c

python/paddle/fluid/layer_helper_base.py

@@ -115,7 +115,7 @@ class LayerHelperBase:
         )

     def _create_weight_normalize(self, attr, shape, dtype):
-        from .layers import elementwise_mul, elementwise_div
+        from .layers import elementwise_mul

         # Remove these ops when LayerHelper and layers support indicating
         # program and block.

@@ -266,7 +266,7 @@ class LayerHelperBase:
             norm = __norm_except_dim(
                 v, dim=dim, block=self.main_program.current_block()
             )
-            scale = elementwise_div(
+            scale = paddle.divide(
                 x=g, y=norm
             )  # The shapes of g and norm are the same.
             # Currently, elementwise_mul only support broadcast when the shape

python/paddle/fluid/layers/rnn.py

@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
         )
         # TODO: use where_op
         finished = tensor.cast(finished, dtype=probs.dtype)
-        probs = nn.elementwise_mul(
+        probs = paddle.multiply(
             paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]),
             self.noend_mask_tensor,
-            axis=-1,
         ) - nn.elementwise_mul(probs, (finished - 1), axis=0)
         return probs

@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
         # To confirm states.finished/finished be consistent with
         # next_finished.
         tensor.assign(next_finished, finished)
-        next_sequence_lengths = nn.elementwise_add(
+        next_sequence_lengths = paddle.add(
             sequence_lengths,
             tensor.cast(
                 paddle.logical_not(finished), sequence_lengths.dtype

@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
         # Otherwise, perform logical OR which would not change the already
         # finished.
         next_finished = paddle.logical_or(next_finished, global_finished)
-        next_sequence_lengths = nn.elementwise_add(
+        next_sequence_lengths = paddle.add(
             sequence_lengths,
             tensor.cast(
                 paddle.logical_not(global_finished),

python/paddle/fluid/nets.py

@@ -390,7 +390,7 @@ def glu(input, dim=-1):
     )
     a, b = layers.split(input, num_or_sections=2, dim=dim)
     act_b = paddle.nn.functional.sigmoid(x=b)
-    out = layers.elementwise_mul(x=a, y=act_b)
+    out = paddle.multiply(x=a, y=act_b)
     return out

python/paddle/fluid/optimizer.py

@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
             for param_name in params:
                 fast_var = main_block.var(param_name)
                 slow_var = param_to_slow[param_name]
-                tmp_var = layers.elementwise_add(
-                    layers.elementwise_mul(fast_var, alpha),
-                    layers.elementwise_mul(
-                        slow_var, layers.elementwise_sub(one_var, alpha)
+                tmp_var = paddle.add(
+                    paddle.multiply(fast_var, alpha),
+                    paddle.multiply(
+                        slow_var, paddle.subtract(one_var, alpha)
                     ),
                 )
                 layers.assign(input=tmp_var, output=slow_var)

python/paddle/incubate/distributed/models/moe/grad_clip.py

@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
         )
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
             y=paddle.maximum(x=global_norm_var, y=max_global_norm),
         )

@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                 if g.dtype == core.VarDesc.VarType.FP16
                 else clip_var
             )
-            new_grad = layers.elementwise_mul(x=g, y=clip_input)
+            new_grad = paddle.multiply(x=g, y=clip_input)
             params_and_grads.append((p, new_grad))
         return params_and_grads