Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
048e0c55
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
048e0c55
编写于
11月 29, 2022
作者:
H
HongyuJia
提交者:
GitHub
11月 29, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
clean elem_arithmetic not test.py (#48460)
上级
41f15537
变更
13
显示空白变更内容
内联
并排
Showing
13 changed file
with
55 addition
and
73 deletion
+55
-73
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
...buted/fleet/meta_parallel/sharding/group_sharded_utils.py
+1
-1
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
...istributed/fleet/meta_parallel/sharding/sharding_utils.py
+1
-1
python/paddle/distribution/normal.py
python/paddle/distribution/normal.py
+6
-9
python/paddle/distribution/uniform.py
python/paddle/distribution/uniform.py
+4
-7
python/paddle/fluid/clip.py
python/paddle/fluid/clip.py
+5
-7
python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
...ib/extend_optimizer/extend_optimizer_with_weight_decay.py
+1
-3
python/paddle/fluid/contrib/layers/rnn_impl.py
python/paddle/fluid/contrib/layers/rnn_impl.py
+7
-11
python/paddle/fluid/dygraph/rnn.py
python/paddle/fluid/dygraph/rnn.py
+18
-21
python/paddle/fluid/layer_helper_base.py
python/paddle/fluid/layer_helper_base.py
+2
-2
python/paddle/fluid/layers/rnn.py
python/paddle/fluid/layers/rnn.py
+3
-4
python/paddle/fluid/nets.py
python/paddle/fluid/nets.py
+1
-1
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+4
-4
python/paddle/incubate/distributed/models/moe/grad_clip.py
python/paddle/incubate/distributed/models/moe/grad_clip.py
+2
-2
未找到文件。
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
浏览文件 @
048e0c55
...
...
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
...
...
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
浏览文件 @
048e0c55
...
...
@@ -135,7 +135,7 @@ class ShardingClipGrad:
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
...
...
python/paddle/distribution/normal.py
浏览文件 @
048e0c55
...
...
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
from
paddle.fluid.data_feeder
import
check_type
,
convert_dtype
from
paddle.fluid.framework
import
_non_static_mode
from
paddle.fluid.layers
import
(
elementwise_add
,
elementwise_div
,
elementwise_sub
,
nn
,
tensor
,
)
...
...
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
zero_tmp_shape
,
mean
=
0.0
,
std
=
1.0
,
seed
=
seed
,
dtype
=
self
.
dtype
)
output
=
normal_random_tmp
*
(
zero_tmp_reshape
+
self
.
scale
)
output
=
elementwise_
add
(
output
,
self
.
loc
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
loc
,
name
=
name
)
return
output
else
:
output_shape
=
shape
+
batch_shape
output
=
nn
.
gaussian_random
(
output_shape
,
mean
=
0.0
,
std
=
1.0
,
seed
=
seed
,
dtype
=
self
.
dtype
)
*
(
tensor
.
zeros
(
output_shape
,
dtype
=
self
.
dtype
)
+
self
.
scale
)
output
=
elementwise_
add
(
output
,
self
.
loc
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
loc
,
name
=
name
)
if
self
.
all_arg_is_float
:
return
paddle
.
reshape
(
output
,
shape
,
name
=
name
)
else
:
...
...
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
zero_tmp
=
tensor
.
fill_constant_batch_size_like
(
self
.
loc
+
self
.
scale
,
batch_shape
,
self
.
dtype
,
0.0
)
return
elementwise_
add
(
return
paddle
.
add
(
0.5
+
zero_tmp
,
0.5
*
math
.
log
(
2
*
math
.
pi
)
+
nn
.
log
((
self
.
scale
+
zero_tmp
)),
name
=
name
,
...
...
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
var
=
self
.
scale
*
self
.
scale
log_scale
=
nn
.
log
(
self
.
scale
)
return
elementwise_sub
(
return
paddle
.
subtract
(
-
1.0
*
((
value
-
self
.
loc
)
*
(
value
-
self
.
loc
))
/
(
2.0
*
var
),
log_scale
+
math
.
log
(
math
.
sqrt
(
2.0
*
math
.
pi
)),
name
=
name
,
...
...
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
value
=
self
.
_check_values_dtype_in_probs
(
self
.
loc
,
value
)
var
=
self
.
scale
*
self
.
scale
return
elementwise_div
(
return
paddle
.
divide
(
paddle
.
exp
(
-
1.0
*
((
value
-
self
.
loc
)
*
(
value
-
self
.
loc
))
/
(
2.0
*
var
)
),
...
...
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
var_ratio
=
var_ratio
*
var_ratio
t1
=
(
self
.
loc
-
other
.
loc
)
/
other
.
scale
t1
=
t1
*
t1
return
elementwise_
add
(
return
paddle
.
add
(
0.5
*
var_ratio
,
0.5
*
(
t1
-
1.0
-
nn
.
log
(
var_ratio
)),
name
=
name
)
python/paddle/distribution/uniform.py
浏览文件 @
048e0c55
...
...
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
in_dygraph_mode
,
)
from
paddle.fluid.layers
import
(
elementwise_add
,
elementwise_div
,
elementwise_sub
,
nn
,
tensor
,
)
...
...
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
output
=
uniform_random_tmp_reshape
*
(
zero_tmp_reshape
+
self
.
high
-
self
.
low
)
output
=
elementwise_
add
(
output
,
self
.
low
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
low
,
name
=
name
)
return
output
else
:
output_shape
=
shape
+
batch_shape
...
...
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
tensor
.
zeros
(
output_shape
,
dtype
=
self
.
dtype
)
+
(
self
.
high
-
self
.
low
)
)
output
=
elementwise_
add
(
output
,
self
.
low
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
low
,
name
=
name
)
if
self
.
all_arg_is_float
:
return
paddle
.
reshape
(
output
,
shape
,
name
=
name
)
else
:
...
...
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
ub_bool
=
value
<
self
.
high
lb
=
tensor
.
cast
(
lb_bool
,
dtype
=
value
.
dtype
)
ub
=
tensor
.
cast
(
ub_bool
,
dtype
=
value
.
dtype
)
return
elementwise_sub
(
return
paddle
.
subtract
(
nn
.
log
(
lb
*
ub
),
nn
.
log
(
self
.
high
-
self
.
low
),
name
=
name
)
...
...
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
ub_bool
=
value
<
self
.
high
lb
=
tensor
.
cast
(
lb_bool
,
dtype
=
value
.
dtype
)
ub
=
tensor
.
cast
(
ub_bool
,
dtype
=
value
.
dtype
)
return
elementwise_div
((
lb
*
ub
),
(
self
.
high
-
self
.
low
),
name
=
name
)
return
paddle
.
divide
((
lb
*
ub
),
(
self
.
high
-
self
.
low
),
name
=
name
)
def
entropy
(
self
):
r
"""Shannon entropy in nats.
...
...
python/paddle/fluid/clip.py
浏览文件 @
048e0c55
...
...
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
need_clip
=
False
if
not
self
.
auto_skip_clip
:
# always apply clip
need_clip
=
True
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
elif
global_norm_var
>
max_global_norm
:
# only when global_norm_var > max_global_norm, grad need clip
need_clip
=
True
clip_var
=
layers
.
elementwise_div
(
x
=
max_global_norm
,
y
=
global_norm_var
)
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
global_norm_var
)
for
p
,
g
in
params_grads
:
if
g
is
None
:
...
...
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
if
clip_var
.
dtype
!=
g
.
dtype
else
clip_var
)
new_grad
=
layers
.
elementwise_mul
(
g
,
clip_input
)
new_grad
=
paddle
.
multiply
(
g
,
clip_input
)
params_and_grads
.
append
((
p
,
new_grad
))
else
:
params_and_grads
.
append
((
p
,
g
))
...
...
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
max_global_norm
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
scale_var
=
layers
.
elementwise_div
(
scale_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
max_global_norm
,
y
=
global_norm_var
),
)
...
...
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
group_norm_var
=
layers
.
sums
(
input
=
self
.
context
[
self
.
group_name
])
group_norm_var
=
paddle
.
sqrt
(
x
=
group_norm_var
)
clip_var
=
self
.
context
[
self
.
group_name
+
"_clip"
]
group_scale_var
=
layers
.
elementwise_div
(
group_scale_var
=
paddle
.
divide
(
x
=
clip_var
,
y
=
paddle
.
maximum
(
x
=
clip_var
,
y
=
group_norm_var
),
)
...
...
python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
浏览文件 @
048e0c55
...
...
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
with
param
.
block
.
program
.
_optimized_guard
(
[
param
,
grad
]
),
framework
.
name_scope
(
'weight decay'
):
updated_param
=
paddle
.
fluid
.
layers
.
elementwise_sub
(
x
=
param
,
y
=
scaled_param
)
updated_param
=
paddle
.
subtract
(
x
=
param
,
y
=
scaled_param
)
paddle
.
fluid
.
layers
.
assign
(
input
=
updated_param
,
output
=
param
)
optimize_ops
=
self
.
apply_optimize
(
...
...
python/paddle/fluid/contrib/layers/rnn_impl.py
浏览文件 @
048e0c55
...
...
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):
gate_input
=
layers
.
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_gate_weight
)
gate_input
=
layers
.
elementwise_
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
self
.
_gate_activation
(
gate_input
)
r
,
u
=
layers
.
split
(
gate_input
,
num_or_sections
=
2
,
dim
=
1
)
...
...
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
candidate
=
layers
.
matmul
(
layers
.
concat
([
input
,
r_hidden
],
1
),
self
.
_candidate_weight
)
candidate
=
layers
.
elementwise_
add
(
candidate
,
self
.
_candidate_bias
)
candidate
=
paddle
.
add
(
candidate
,
self
.
_candidate_bias
)
c
=
self
.
_activation
(
candidate
)
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
...
...
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
concat_input_hidden
=
layers
.
concat
([
input
,
pre_hidden
],
1
)
gate_input
=
layers
.
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_weight
)
gate_input
=
layers
.
elementwise_
add
(
gate_input
,
self
.
_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_bias
)
i
,
j
,
f
,
o
=
layers
.
split
(
gate_input
,
num_or_sections
=
4
,
dim
=-
1
)
new_cell
=
layers
.
elementwise_
add
(
layers
.
elementwise_mul
(
new_cell
=
paddle
.
add
(
paddle
.
multiply
(
pre_cell
,
paddle
.
nn
.
functional
.
sigmoid
(
layers
.
elementwise_add
(
f
,
self
.
_forget_bias
)
),
),
layers
.
elementwise_mul
(
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)
paddle
.
nn
.
functional
.
sigmoid
(
paddle
.
add
(
f
,
self
.
_forget_bias
)),
),
paddle
.
multiply
(
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)),
)
new_hidden
=
paddle
.
tanh
(
new_cell
)
*
paddle
.
nn
.
functional
.
sigmoid
(
o
)
...
...
python/paddle/fluid/dygraph/rnn.py
浏览文件 @
048e0c55
...
...
@@ -18,7 +18,6 @@ from ..layers import (
concat
,
fill_constant
,
matmul
,
elementwise_add
,
elementwise_mul
,
split
,
)
...
...
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
if
self
.
_use_cudnn_impl
:
igates
=
matmul
(
input
,
y
=
self
.
_weight_ih
,
transpose_y
=
True
)
igates
=
elementwise_
add
(
igates
,
self
.
_bias_ih
)
igates
=
paddle
.
add
(
igates
,
self
.
_bias_ih
)
hgates
=
matmul
(
pre_hidden
,
self
.
_weight_hh
,
transpose_y
=
True
)
hgates
=
elementwise_
add
(
hgates
,
self
.
_bias_hh
)
hgates
=
paddle
.
add
(
hgates
,
self
.
_bias_hh
)
chunked_igates
=
split
(
igates
,
num_or_sections
=
4
,
dim
=
1
)
chunked_hgates
=
split
(
hgates
,
num_or_sections
=
4
,
dim
=
1
)
ingate
=
elementwise_
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
ingate
=
paddle
.
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
ingate
=
self
.
_gate_activation
(
ingate
)
forgetgate
=
elementwise_
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
forgetgate
=
paddle
.
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
forgetgate
=
self
.
_gate_activation
(
forgetgate
)
cellgate
=
elementwise_
add
(
chunked_igates
[
2
],
chunked_hgates
[
2
])
cellgate
=
paddle
.
add
(
chunked_igates
[
2
],
chunked_hgates
[
2
])
cellgate
=
self
.
_activation
(
cellgate
)
outgate
=
elementwise_
add
(
chunked_igates
[
3
],
chunked_hgates
[
3
])
outgate
=
paddle
.
add
(
chunked_igates
[
3
],
chunked_hgates
[
3
])
outgate
=
self
.
_gate_activation
(
outgate
)
new_cell
=
(
forgetgate
*
pre_cell
)
+
(
ingate
*
cellgate
)
...
...
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
concat_input_hidden
=
concat
([
input
,
pre_hidden
],
1
)
gate_input
=
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_weight
)
gate_input
=
elementwise_
add
(
gate_input
,
self
.
_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_bias
)
i
,
j
,
f
,
o
=
split
(
gate_input
,
num_or_sections
=
4
,
dim
=-
1
)
new_cell
=
elementwise_
add
(
elementwise_mul
(
new_cell
=
paddle
.
add
(
paddle
.
multiply
(
pre_cell
,
self
.
_gate_activation
(
elementwise_add
(
f
,
self
.
_forget_bias
)
self
.
_gate_activation
(
paddle
.
add
(
f
,
self
.
_forget_bias
)),
),
),
elementwise_mul
(
paddle
.
multiply
(
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)
),
)
...
...
@@ -466,21 +463,21 @@ class GRUCell(Layer):
if
self
.
_use_cudnn_impl
:
igates
=
matmul
(
input
,
y
=
self
.
_weight_ih
,
transpose_y
=
True
)
igates
=
elementwise_
add
(
igates
,
self
.
_bias_ih
)
igates
=
paddle
.
add
(
igates
,
self
.
_bias_ih
)
hgates
=
matmul
(
pre_hidden
,
self
.
_weight_hh
,
transpose_y
=
True
)
hgates
=
elementwise_
add
(
hgates
,
self
.
_bias_hh
)
hgates
=
paddle
.
add
(
hgates
,
self
.
_bias_hh
)
chunked_igates
=
split
(
igates
,
num_or_sections
=
3
,
dim
=
1
)
chunked_hgates
=
split
(
hgates
,
num_or_sections
=
3
,
dim
=
1
)
reset_gate
=
elementwise_
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
reset_gate
=
paddle
.
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
reset_gate
=
self
.
_gate_activation
(
reset_gate
)
input_gate
=
elementwise_
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
input_gate
=
paddle
.
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
input_gate
=
self
.
_gate_activation
(
input_gate
)
_temp
=
reset_gate
*
chunked_hgates
[
2
]
new_gate
=
elementwise_
add
(
chunked_igates
[
2
],
_temp
)
new_gate
=
paddle
.
add
(
chunked_igates
[
2
],
_temp
)
new_gate
=
self
.
_activation
(
new_gate
)
new_hidden
=
(
pre_hidden
-
new_gate
)
*
input_gate
+
new_gate
...
...
@@ -491,7 +488,7 @@ class GRUCell(Layer):
gate_input
=
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_gate_weight
)
gate_input
=
elementwise_
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
self
.
_gate_activation
(
gate_input
)
r
,
u
=
split
(
gate_input
,
num_or_sections
=
2
,
dim
=
1
)
...
...
@@ -500,7 +497,7 @@ class GRUCell(Layer):
candidate
=
matmul
(
concat
([
input
,
r_hidden
],
1
),
self
.
_candidate_weight
)
candidate
=
elementwise_
add
(
candidate
,
self
.
_candidate_bias
)
candidate
=
paddle
.
add
(
candidate
,
self
.
_candidate_bias
)
c
=
self
.
_activation
(
candidate
)
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
...
...
python/paddle/fluid/layer_helper_base.py
浏览文件 @
048e0c55
...
...
@@ -115,7 +115,7 @@ class LayerHelperBase:
)
def
_create_weight_normalize
(
self
,
attr
,
shape
,
dtype
):
from
.layers
import
elementwise_mul
,
elementwise_div
from
.layers
import
elementwise_mul
# Remove these ops when LayerHelper and layers support indicating
# program and block.
...
...
@@ -266,7 +266,7 @@ class LayerHelperBase:
norm
=
__norm_except_dim
(
v
,
dim
=
dim
,
block
=
self
.
main_program
.
current_block
()
)
scale
=
elementwise_div
(
scale
=
paddle
.
divide
(
x
=
g
,
y
=
norm
)
# The shapes of g and norm are the same.
# Currently, elementwise_mul only support broadcast when the shape
...
...
python/paddle/fluid/layers/rnn.py
浏览文件 @
048e0c55
...
...
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
)
# TODO: use where_op
finished
=
tensor
.
cast
(
finished
,
dtype
=
probs
.
dtype
)
probs
=
nn
.
elementwise_mul
(
probs
=
paddle
.
multiply
(
paddle
.
tile
(
nn
.
unsqueeze
(
finished
,
[
2
]),
[
1
,
1
,
self
.
vocab_size
]),
self
.
noend_mask_tensor
,
axis
=-
1
,
)
-
nn
.
elementwise_mul
(
probs
,
(
finished
-
1
),
axis
=
0
)
return
probs
...
...
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
# To confirm states.finished/finished be consistent with
# next_finished.
tensor
.
assign
(
next_finished
,
finished
)
next_sequence_lengths
=
nn
.
elementwise_
add
(
next_sequence_lengths
=
paddle
.
add
(
sequence_lengths
,
tensor
.
cast
(
paddle
.
logical_not
(
finished
),
sequence_lengths
.
dtype
...
...
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
# Otherwise, perform logical OR which would not change the already
# finished.
next_finished
=
paddle
.
logical_or
(
next_finished
,
global_finished
)
next_sequence_lengths
=
nn
.
elementwise_
add
(
next_sequence_lengths
=
paddle
.
add
(
sequence_lengths
,
tensor
.
cast
(
paddle
.
logical_not
(
global_finished
),
...
...
python/paddle/fluid/nets.py
浏览文件 @
048e0c55
...
...
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
)
a
,
b
=
layers
.
split
(
input
,
num_or_sections
=
2
,
dim
=
dim
)
act_b
=
paddle
.
nn
.
functional
.
sigmoid
(
x
=
b
)
out
=
layers
.
elementwise_mul
(
x
=
a
,
y
=
act_b
)
out
=
paddle
.
multiply
(
x
=
a
,
y
=
act_b
)
return
out
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
048e0c55
...
...
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
for
param_name
in
params
:
fast_var
=
main_block
.
var
(
param_name
)
slow_var
=
param_to_slow
[
param_name
]
tmp_var
=
layers
.
elementwise_
add
(
layers
.
elementwise_mul
(
fast_var
,
alpha
),
layers
.
elementwise_mul
(
slow_var
,
layers
.
elementwise_sub
(
one_var
,
alpha
)
tmp_var
=
paddle
.
add
(
paddle
.
multiply
(
fast_var
,
alpha
),
paddle
.
multiply
(
slow_var
,
paddle
.
subtract
(
one_var
,
alpha
)
),
)
layers
.
assign
(
input
=
tmp_var
,
output
=
slow_var
)
...
...
python/paddle/incubate/distributed/models/moe/grad_clip.py
浏览文件 @
048e0c55
...
...
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
max_global_norm
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
...
...
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
if
g
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
else
clip_var
)
new_grad
=
layers
.
elementwise_mul
(
x
=
g
,
y
=
clip_input
)
new_grad
=
paddle
.
multiply
(
x
=
g
,
y
=
clip_input
)
params_and_grads
.
append
((
p
,
new_grad
))
return
params_and_grads
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录