Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
048e0c55
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2312
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
048e0c55
编写于
11月 29, 2022
作者:
H
HongyuJia
提交者:
GitHub
11月 29, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
clean elem_arithmetic not test.py (#48460)
上级
41f15537
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
55 addition
and
73 deletion
+55
-73
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
...buted/fleet/meta_parallel/sharding/group_sharded_utils.py
+1
-1
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
...istributed/fleet/meta_parallel/sharding/sharding_utils.py
+1
-1
python/paddle/distribution/normal.py
python/paddle/distribution/normal.py
+6
-9
python/paddle/distribution/uniform.py
python/paddle/distribution/uniform.py
+4
-7
python/paddle/fluid/clip.py
python/paddle/fluid/clip.py
+5
-7
python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
...ib/extend_optimizer/extend_optimizer_with_weight_decay.py
+1
-3
python/paddle/fluid/contrib/layers/rnn_impl.py
python/paddle/fluid/contrib/layers/rnn_impl.py
+7
-11
python/paddle/fluid/dygraph/rnn.py
python/paddle/fluid/dygraph/rnn.py
+18
-21
python/paddle/fluid/layer_helper_base.py
python/paddle/fluid/layer_helper_base.py
+2
-2
python/paddle/fluid/layers/rnn.py
python/paddle/fluid/layers/rnn.py
+3
-4
python/paddle/fluid/nets.py
python/paddle/fluid/nets.py
+1
-1
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+4
-4
python/paddle/incubate/distributed/models/moe/grad_clip.py
python/paddle/incubate/distributed/models/moe/grad_clip.py
+2
-2
未找到文件。
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
浏览文件 @
048e0c55
...
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
...
@@ -138,7 +138,7 @@ class GroupShardedClipGrad:
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
)
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
)
...
...
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
浏览文件 @
048e0c55
...
@@ -135,7 +135,7 @@ class ShardingClipGrad:
...
@@ -135,7 +135,7 @@ class ShardingClipGrad:
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
)
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
)
...
...
python/paddle/distribution/normal.py
浏览文件 @
048e0c55
...
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
...
@@ -22,9 +22,6 @@ from paddle.distribution import distribution
from
paddle.fluid.data_feeder
import
check_type
,
convert_dtype
from
paddle.fluid.data_feeder
import
check_type
,
convert_dtype
from
paddle.fluid.framework
import
_non_static_mode
from
paddle.fluid.framework
import
_non_static_mode
from
paddle.fluid.layers
import
(
from
paddle.fluid.layers
import
(
elementwise_add
,
elementwise_div
,
elementwise_sub
,
nn
,
nn
,
tensor
,
tensor
,
)
)
...
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
...
@@ -191,14 +188,14 @@ class Normal(distribution.Distribution):
zero_tmp_shape
,
mean
=
0.0
,
std
=
1.0
,
seed
=
seed
,
dtype
=
self
.
dtype
zero_tmp_shape
,
mean
=
0.0
,
std
=
1.0
,
seed
=
seed
,
dtype
=
self
.
dtype
)
)
output
=
normal_random_tmp
*
(
zero_tmp_reshape
+
self
.
scale
)
output
=
normal_random_tmp
*
(
zero_tmp_reshape
+
self
.
scale
)
output
=
elementwise_
add
(
output
,
self
.
loc
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
loc
,
name
=
name
)
return
output
return
output
else
:
else
:
output_shape
=
shape
+
batch_shape
output_shape
=
shape
+
batch_shape
output
=
nn
.
gaussian_random
(
output
=
nn
.
gaussian_random
(
output_shape
,
mean
=
0.0
,
std
=
1.0
,
seed
=
seed
,
dtype
=
self
.
dtype
output_shape
,
mean
=
0.0
,
std
=
1.0
,
seed
=
seed
,
dtype
=
self
.
dtype
)
*
(
tensor
.
zeros
(
output_shape
,
dtype
=
self
.
dtype
)
+
self
.
scale
)
)
*
(
tensor
.
zeros
(
output_shape
,
dtype
=
self
.
dtype
)
+
self
.
scale
)
output
=
elementwise_
add
(
output
,
self
.
loc
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
loc
,
name
=
name
)
if
self
.
all_arg_is_float
:
if
self
.
all_arg_is_float
:
return
paddle
.
reshape
(
output
,
shape
,
name
=
name
)
return
paddle
.
reshape
(
output
,
shape
,
name
=
name
)
else
:
else
:
...
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
...
@@ -243,7 +240,7 @@ class Normal(distribution.Distribution):
zero_tmp
=
tensor
.
fill_constant_batch_size_like
(
zero_tmp
=
tensor
.
fill_constant_batch_size_like
(
self
.
loc
+
self
.
scale
,
batch_shape
,
self
.
dtype
,
0.0
self
.
loc
+
self
.
scale
,
batch_shape
,
self
.
dtype
,
0.0
)
)
return
elementwise_
add
(
return
paddle
.
add
(
0.5
+
zero_tmp
,
0.5
+
zero_tmp
,
0.5
*
math
.
log
(
2
*
math
.
pi
)
+
nn
.
log
((
self
.
scale
+
zero_tmp
)),
0.5
*
math
.
log
(
2
*
math
.
pi
)
+
nn
.
log
((
self
.
scale
+
zero_tmp
)),
name
=
name
,
name
=
name
,
...
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
...
@@ -264,7 +261,7 @@ class Normal(distribution.Distribution):
var
=
self
.
scale
*
self
.
scale
var
=
self
.
scale
*
self
.
scale
log_scale
=
nn
.
log
(
self
.
scale
)
log_scale
=
nn
.
log
(
self
.
scale
)
return
elementwise_sub
(
return
paddle
.
subtract
(
-
1.0
*
((
value
-
self
.
loc
)
*
(
value
-
self
.
loc
))
/
(
2.0
*
var
),
-
1.0
*
((
value
-
self
.
loc
)
*
(
value
-
self
.
loc
))
/
(
2.0
*
var
),
log_scale
+
math
.
log
(
math
.
sqrt
(
2.0
*
math
.
pi
)),
log_scale
+
math
.
log
(
math
.
sqrt
(
2.0
*
math
.
pi
)),
name
=
name
,
name
=
name
,
...
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
...
@@ -284,7 +281,7 @@ class Normal(distribution.Distribution):
value
=
self
.
_check_values_dtype_in_probs
(
self
.
loc
,
value
)
value
=
self
.
_check_values_dtype_in_probs
(
self
.
loc
,
value
)
var
=
self
.
scale
*
self
.
scale
var
=
self
.
scale
*
self
.
scale
return
elementwise_div
(
return
paddle
.
divide
(
paddle
.
exp
(
paddle
.
exp
(
-
1.0
*
((
value
-
self
.
loc
)
*
(
value
-
self
.
loc
))
/
(
2.0
*
var
)
-
1.0
*
((
value
-
self
.
loc
)
*
(
value
-
self
.
loc
))
/
(
2.0
*
var
)
),
),
...
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
...
@@ -333,6 +330,6 @@ class Normal(distribution.Distribution):
var_ratio
=
var_ratio
*
var_ratio
var_ratio
=
var_ratio
*
var_ratio
t1
=
(
self
.
loc
-
other
.
loc
)
/
other
.
scale
t1
=
(
self
.
loc
-
other
.
loc
)
/
other
.
scale
t1
=
t1
*
t1
t1
=
t1
*
t1
return
elementwise_
add
(
return
paddle
.
add
(
0.5
*
var_ratio
,
0.5
*
(
t1
-
1.0
-
nn
.
log
(
var_ratio
)),
name
=
name
0.5
*
var_ratio
,
0.5
*
(
t1
-
1.0
-
nn
.
log
(
var_ratio
)),
name
=
name
)
)
python/paddle/distribution/uniform.py
浏览文件 @
048e0c55
...
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
...
@@ -24,9 +24,6 @@ from paddle.fluid.framework import (
in_dygraph_mode
,
in_dygraph_mode
,
)
)
from
paddle.fluid.layers
import
(
from
paddle.fluid.layers
import
(
elementwise_add
,
elementwise_div
,
elementwise_sub
,
nn
,
nn
,
tensor
,
tensor
,
)
)
...
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
...
@@ -184,7 +181,7 @@ class Uniform(distribution.Distribution):
output
=
uniform_random_tmp_reshape
*
(
output
=
uniform_random_tmp_reshape
*
(
zero_tmp_reshape
+
self
.
high
-
self
.
low
zero_tmp_reshape
+
self
.
high
-
self
.
low
)
)
output
=
elementwise_
add
(
output
,
self
.
low
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
low
,
name
=
name
)
return
output
return
output
else
:
else
:
output_shape
=
shape
+
batch_shape
output_shape
=
shape
+
batch_shape
...
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
...
@@ -194,7 +191,7 @@ class Uniform(distribution.Distribution):
tensor
.
zeros
(
output_shape
,
dtype
=
self
.
dtype
)
tensor
.
zeros
(
output_shape
,
dtype
=
self
.
dtype
)
+
(
self
.
high
-
self
.
low
)
+
(
self
.
high
-
self
.
low
)
)
)
output
=
elementwise_
add
(
output
,
self
.
low
,
name
=
name
)
output
=
paddle
.
add
(
output
,
self
.
low
,
name
=
name
)
if
self
.
all_arg_is_float
:
if
self
.
all_arg_is_float
:
return
paddle
.
reshape
(
output
,
shape
,
name
=
name
)
return
paddle
.
reshape
(
output
,
shape
,
name
=
name
)
else
:
else
:
...
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
...
@@ -235,7 +232,7 @@ class Uniform(distribution.Distribution):
ub_bool
=
value
<
self
.
high
ub_bool
=
value
<
self
.
high
lb
=
tensor
.
cast
(
lb_bool
,
dtype
=
value
.
dtype
)
lb
=
tensor
.
cast
(
lb_bool
,
dtype
=
value
.
dtype
)
ub
=
tensor
.
cast
(
ub_bool
,
dtype
=
value
.
dtype
)
ub
=
tensor
.
cast
(
ub_bool
,
dtype
=
value
.
dtype
)
return
elementwise_sub
(
return
paddle
.
subtract
(
nn
.
log
(
lb
*
ub
),
nn
.
log
(
self
.
high
-
self
.
low
),
name
=
name
nn
.
log
(
lb
*
ub
),
nn
.
log
(
self
.
high
-
self
.
low
),
name
=
name
)
)
...
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
...
@@ -273,7 +270,7 @@ class Uniform(distribution.Distribution):
ub_bool
=
value
<
self
.
high
ub_bool
=
value
<
self
.
high
lb
=
tensor
.
cast
(
lb_bool
,
dtype
=
value
.
dtype
)
lb
=
tensor
.
cast
(
lb_bool
,
dtype
=
value
.
dtype
)
ub
=
tensor
.
cast
(
ub_bool
,
dtype
=
value
.
dtype
)
ub
=
tensor
.
cast
(
ub_bool
,
dtype
=
value
.
dtype
)
return
elementwise_div
((
lb
*
ub
),
(
self
.
high
-
self
.
low
),
name
=
name
)
return
paddle
.
divide
((
lb
*
ub
),
(
self
.
high
-
self
.
low
),
name
=
name
)
def
entropy
(
self
):
def
entropy
(
self
):
r
"""Shannon entropy in nats.
r
"""Shannon entropy in nats.
...
...
python/paddle/fluid/clip.py
浏览文件 @
048e0c55
...
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
...
@@ -548,16 +548,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
need_clip
=
False
need_clip
=
False
if
not
self
.
auto_skip_clip
:
# always apply clip
if
not
self
.
auto_skip_clip
:
# always apply clip
need_clip
=
True
need_clip
=
True
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
)
elif
global_norm_var
>
max_global_norm
:
elif
global_norm_var
>
max_global_norm
:
# only when global_norm_var > max_global_norm, grad need clip
# only when global_norm_var > max_global_norm, grad need clip
need_clip
=
True
need_clip
=
True
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
y
=
global_norm_var
)
x
=
max_global_norm
,
y
=
global_norm_var
)
for
p
,
g
in
params_grads
:
for
p
,
g
in
params_grads
:
if
g
is
None
:
if
g
is
None
:
...
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
...
@@ -572,7 +570,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
if
clip_var
.
dtype
!=
g
.
dtype
if
clip_var
.
dtype
!=
g
.
dtype
else
clip_var
else
clip_var
)
)
new_grad
=
layers
.
elementwise_mul
(
g
,
clip_input
)
new_grad
=
paddle
.
multiply
(
g
,
clip_input
)
params_and_grads
.
append
((
p
,
new_grad
))
params_and_grads
.
append
((
p
,
new_grad
))
else
:
else
:
params_and_grads
.
append
((
p
,
g
))
params_and_grads
.
append
((
p
,
g
))
...
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
...
@@ -652,7 +650,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
max_global_norm
=
layers
.
fill_constant
(
max_global_norm
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
)
scale_var
=
layers
.
elementwise_div
(
scale_var
=
paddle
.
divide
(
x
=
max_global_norm
,
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
max_global_norm
,
y
=
global_norm_var
),
y
=
paddle
.
maximum
(
x
=
max_global_norm
,
y
=
global_norm_var
),
)
)
...
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
...
@@ -729,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
group_norm_var
=
layers
.
sums
(
input
=
self
.
context
[
self
.
group_name
])
group_norm_var
=
layers
.
sums
(
input
=
self
.
context
[
self
.
group_name
])
group_norm_var
=
paddle
.
sqrt
(
x
=
group_norm_var
)
group_norm_var
=
paddle
.
sqrt
(
x
=
group_norm_var
)
clip_var
=
self
.
context
[
self
.
group_name
+
"_clip"
]
clip_var
=
self
.
context
[
self
.
group_name
+
"_clip"
]
group_scale_var
=
layers
.
elementwise_div
(
group_scale_var
=
paddle
.
divide
(
x
=
clip_var
,
x
=
clip_var
,
y
=
paddle
.
maximum
(
x
=
clip_var
,
y
=
group_norm_var
),
y
=
paddle
.
maximum
(
x
=
clip_var
,
y
=
group_norm_var
),
)
)
...
...
python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py
浏览文件 @
048e0c55
...
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
...
@@ -95,9 +95,7 @@ class DecoupledWeightDecay:
with
param
.
block
.
program
.
_optimized_guard
(
with
param
.
block
.
program
.
_optimized_guard
(
[
param
,
grad
]
[
param
,
grad
]
),
framework
.
name_scope
(
'weight decay'
):
),
framework
.
name_scope
(
'weight decay'
):
updated_param
=
paddle
.
fluid
.
layers
.
elementwise_sub
(
updated_param
=
paddle
.
subtract
(
x
=
param
,
y
=
scaled_param
)
x
=
param
,
y
=
scaled_param
)
paddle
.
fluid
.
layers
.
assign
(
input
=
updated_param
,
output
=
param
)
paddle
.
fluid
.
layers
.
assign
(
input
=
updated_param
,
output
=
param
)
optimize_ops
=
self
.
apply_optimize
(
optimize_ops
=
self
.
apply_optimize
(
...
...
python/paddle/fluid/contrib/layers/rnn_impl.py
浏览文件 @
048e0c55
...
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):
...
@@ -153,7 +153,7 @@ class BasicGRUUnit(Layer):
gate_input
=
layers
.
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_gate_weight
)
gate_input
=
layers
.
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_gate_weight
)
gate_input
=
layers
.
elementwise_
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
self
.
_gate_activation
(
gate_input
)
gate_input
=
self
.
_gate_activation
(
gate_input
)
r
,
u
=
layers
.
split
(
gate_input
,
num_or_sections
=
2
,
dim
=
1
)
r
,
u
=
layers
.
split
(
gate_input
,
num_or_sections
=
2
,
dim
=
1
)
...
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
...
@@ -163,7 +163,7 @@ class BasicGRUUnit(Layer):
candidate
=
layers
.
matmul
(
candidate
=
layers
.
matmul
(
layers
.
concat
([
input
,
r_hidden
],
1
),
self
.
_candidate_weight
layers
.
concat
([
input
,
r_hidden
],
1
),
self
.
_candidate_weight
)
)
candidate
=
layers
.
elementwise_
add
(
candidate
,
self
.
_candidate_bias
)
candidate
=
paddle
.
add
(
candidate
,
self
.
_candidate_bias
)
c
=
self
.
_activation
(
candidate
)
c
=
self
.
_activation
(
candidate
)
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
...
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
...
@@ -876,18 +876,14 @@ class BasicLSTMUnit(Layer):
concat_input_hidden
=
layers
.
concat
([
input
,
pre_hidden
],
1
)
concat_input_hidden
=
layers
.
concat
([
input
,
pre_hidden
],
1
)
gate_input
=
layers
.
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_weight
)
gate_input
=
layers
.
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_weight
)
gate_input
=
layers
.
elementwise_
add
(
gate_input
,
self
.
_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_bias
)
i
,
j
,
f
,
o
=
layers
.
split
(
gate_input
,
num_or_sections
=
4
,
dim
=-
1
)
i
,
j
,
f
,
o
=
layers
.
split
(
gate_input
,
num_or_sections
=
4
,
dim
=-
1
)
new_cell
=
layers
.
elementwise_
add
(
new_cell
=
paddle
.
add
(
layers
.
elementwise_mul
(
paddle
.
multiply
(
pre_cell
,
pre_cell
,
paddle
.
nn
.
functional
.
sigmoid
(
paddle
.
nn
.
functional
.
sigmoid
(
paddle
.
add
(
f
,
self
.
_forget_bias
)),
layers
.
elementwise_add
(
f
,
self
.
_forget_bias
)
),
),
layers
.
elementwise_mul
(
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)
),
),
paddle
.
multiply
(
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)),
)
)
new_hidden
=
paddle
.
tanh
(
new_cell
)
*
paddle
.
nn
.
functional
.
sigmoid
(
o
)
new_hidden
=
paddle
.
tanh
(
new_cell
)
*
paddle
.
nn
.
functional
.
sigmoid
(
o
)
...
...
python/paddle/fluid/dygraph/rnn.py
浏览文件 @
048e0c55
...
@@ -18,7 +18,6 @@ from ..layers import (
...
@@ -18,7 +18,6 @@ from ..layers import (
concat
,
concat
,
fill_constant
,
fill_constant
,
matmul
,
matmul
,
elementwise_add
,
elementwise_mul
,
elementwise_mul
,
split
,
split
,
)
)
...
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
...
@@ -217,23 +216,23 @@ class LSTMCell(Layer):
if
self
.
_use_cudnn_impl
:
if
self
.
_use_cudnn_impl
:
igates
=
matmul
(
input
,
y
=
self
.
_weight_ih
,
transpose_y
=
True
)
igates
=
matmul
(
input
,
y
=
self
.
_weight_ih
,
transpose_y
=
True
)
igates
=
elementwise_
add
(
igates
,
self
.
_bias_ih
)
igates
=
paddle
.
add
(
igates
,
self
.
_bias_ih
)
hgates
=
matmul
(
pre_hidden
,
self
.
_weight_hh
,
transpose_y
=
True
)
hgates
=
matmul
(
pre_hidden
,
self
.
_weight_hh
,
transpose_y
=
True
)
hgates
=
elementwise_
add
(
hgates
,
self
.
_bias_hh
)
hgates
=
paddle
.
add
(
hgates
,
self
.
_bias_hh
)
chunked_igates
=
split
(
igates
,
num_or_sections
=
4
,
dim
=
1
)
chunked_igates
=
split
(
igates
,
num_or_sections
=
4
,
dim
=
1
)
chunked_hgates
=
split
(
hgates
,
num_or_sections
=
4
,
dim
=
1
)
chunked_hgates
=
split
(
hgates
,
num_or_sections
=
4
,
dim
=
1
)
ingate
=
elementwise_
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
ingate
=
paddle
.
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
ingate
=
self
.
_gate_activation
(
ingate
)
ingate
=
self
.
_gate_activation
(
ingate
)
forgetgate
=
elementwise_
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
forgetgate
=
paddle
.
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
forgetgate
=
self
.
_gate_activation
(
forgetgate
)
forgetgate
=
self
.
_gate_activation
(
forgetgate
)
cellgate
=
elementwise_
add
(
chunked_igates
[
2
],
chunked_hgates
[
2
])
cellgate
=
paddle
.
add
(
chunked_igates
[
2
],
chunked_hgates
[
2
])
cellgate
=
self
.
_activation
(
cellgate
)
cellgate
=
self
.
_activation
(
cellgate
)
outgate
=
elementwise_
add
(
chunked_igates
[
3
],
chunked_hgates
[
3
])
outgate
=
paddle
.
add
(
chunked_igates
[
3
],
chunked_hgates
[
3
])
outgate
=
self
.
_gate_activation
(
outgate
)
outgate
=
self
.
_gate_activation
(
outgate
)
new_cell
=
(
forgetgate
*
pre_cell
)
+
(
ingate
*
cellgate
)
new_cell
=
(
forgetgate
*
pre_cell
)
+
(
ingate
*
cellgate
)
...
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
...
@@ -244,16 +243,14 @@ class LSTMCell(Layer):
concat_input_hidden
=
concat
([
input
,
pre_hidden
],
1
)
concat_input_hidden
=
concat
([
input
,
pre_hidden
],
1
)
gate_input
=
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_weight
)
gate_input
=
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_weight
)
gate_input
=
elementwise_
add
(
gate_input
,
self
.
_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_bias
)
i
,
j
,
f
,
o
=
split
(
gate_input
,
num_or_sections
=
4
,
dim
=-
1
)
i
,
j
,
f
,
o
=
split
(
gate_input
,
num_or_sections
=
4
,
dim
=-
1
)
new_cell
=
elementwise_
add
(
new_cell
=
paddle
.
add
(
elementwise_mul
(
paddle
.
multiply
(
pre_cell
,
pre_cell
,
self
.
_gate_activation
(
self
.
_gate_activation
(
paddle
.
add
(
f
,
self
.
_forget_bias
)),
elementwise_add
(
f
,
self
.
_forget_bias
)
),
),
),
elementwise_mul
(
paddle
.
multiply
(
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)
paddle
.
nn
.
functional
.
sigmoid
(
i
),
paddle
.
tanh
(
j
)
),
),
)
)
...
@@ -466,21 +463,21 @@ class GRUCell(Layer):
...
@@ -466,21 +463,21 @@ class GRUCell(Layer):
if
self
.
_use_cudnn_impl
:
if
self
.
_use_cudnn_impl
:
igates
=
matmul
(
input
,
y
=
self
.
_weight_ih
,
transpose_y
=
True
)
igates
=
matmul
(
input
,
y
=
self
.
_weight_ih
,
transpose_y
=
True
)
igates
=
elementwise_
add
(
igates
,
self
.
_bias_ih
)
igates
=
paddle
.
add
(
igates
,
self
.
_bias_ih
)
hgates
=
matmul
(
pre_hidden
,
self
.
_weight_hh
,
transpose_y
=
True
)
hgates
=
matmul
(
pre_hidden
,
self
.
_weight_hh
,
transpose_y
=
True
)
hgates
=
elementwise_
add
(
hgates
,
self
.
_bias_hh
)
hgates
=
paddle
.
add
(
hgates
,
self
.
_bias_hh
)
chunked_igates
=
split
(
igates
,
num_or_sections
=
3
,
dim
=
1
)
chunked_igates
=
split
(
igates
,
num_or_sections
=
3
,
dim
=
1
)
chunked_hgates
=
split
(
hgates
,
num_or_sections
=
3
,
dim
=
1
)
chunked_hgates
=
split
(
hgates
,
num_or_sections
=
3
,
dim
=
1
)
reset_gate
=
elementwise_
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
reset_gate
=
paddle
.
add
(
chunked_igates
[
0
],
chunked_hgates
[
0
])
reset_gate
=
self
.
_gate_activation
(
reset_gate
)
reset_gate
=
self
.
_gate_activation
(
reset_gate
)
input_gate
=
elementwise_
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
input_gate
=
paddle
.
add
(
chunked_igates
[
1
],
chunked_hgates
[
1
])
input_gate
=
self
.
_gate_activation
(
input_gate
)
input_gate
=
self
.
_gate_activation
(
input_gate
)
_temp
=
reset_gate
*
chunked_hgates
[
2
]
_temp
=
reset_gate
*
chunked_hgates
[
2
]
new_gate
=
elementwise_
add
(
chunked_igates
[
2
],
_temp
)
new_gate
=
paddle
.
add
(
chunked_igates
[
2
],
_temp
)
new_gate
=
self
.
_activation
(
new_gate
)
new_gate
=
self
.
_activation
(
new_gate
)
new_hidden
=
(
pre_hidden
-
new_gate
)
*
input_gate
+
new_gate
new_hidden
=
(
pre_hidden
-
new_gate
)
*
input_gate
+
new_gate
...
@@ -491,7 +488,7 @@ class GRUCell(Layer):
...
@@ -491,7 +488,7 @@ class GRUCell(Layer):
gate_input
=
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_gate_weight
)
gate_input
=
matmul
(
x
=
concat_input_hidden
,
y
=
self
.
_gate_weight
)
gate_input
=
elementwise_
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
paddle
.
add
(
gate_input
,
self
.
_gate_bias
)
gate_input
=
self
.
_gate_activation
(
gate_input
)
gate_input
=
self
.
_gate_activation
(
gate_input
)
r
,
u
=
split
(
gate_input
,
num_or_sections
=
2
,
dim
=
1
)
r
,
u
=
split
(
gate_input
,
num_or_sections
=
2
,
dim
=
1
)
...
@@ -500,7 +497,7 @@ class GRUCell(Layer):
...
@@ -500,7 +497,7 @@ class GRUCell(Layer):
candidate
=
matmul
(
candidate
=
matmul
(
concat
([
input
,
r_hidden
],
1
),
self
.
_candidate_weight
concat
([
input
,
r_hidden
],
1
),
self
.
_candidate_weight
)
)
candidate
=
elementwise_
add
(
candidate
,
self
.
_candidate_bias
)
candidate
=
paddle
.
add
(
candidate
,
self
.
_candidate_bias
)
c
=
self
.
_activation
(
candidate
)
c
=
self
.
_activation
(
candidate
)
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
new_hidden
=
u
*
pre_hidden
+
(
1
-
u
)
*
c
...
...
python/paddle/fluid/layer_helper_base.py
浏览文件 @
048e0c55
...
@@ -115,7 +115,7 @@ class LayerHelperBase:
...
@@ -115,7 +115,7 @@ class LayerHelperBase:
)
)
def
_create_weight_normalize
(
self
,
attr
,
shape
,
dtype
):
def
_create_weight_normalize
(
self
,
attr
,
shape
,
dtype
):
from
.layers
import
elementwise_mul
,
elementwise_div
from
.layers
import
elementwise_mul
# Remove these ops when LayerHelper and layers support indicating
# Remove these ops when LayerHelper and layers support indicating
# program and block.
# program and block.
...
@@ -266,7 +266,7 @@ class LayerHelperBase:
...
@@ -266,7 +266,7 @@ class LayerHelperBase:
norm
=
__norm_except_dim
(
norm
=
__norm_except_dim
(
v
,
dim
=
dim
,
block
=
self
.
main_program
.
current_block
()
v
,
dim
=
dim
,
block
=
self
.
main_program
.
current_block
()
)
)
scale
=
elementwise_div
(
scale
=
paddle
.
divide
(
x
=
g
,
y
=
norm
x
=
g
,
y
=
norm
)
# The shapes of g and norm are the same.
)
# The shapes of g and norm are the same.
# Currently, elementwise_mul only support broadcast when the shape
# Currently, elementwise_mul only support broadcast when the shape
...
...
python/paddle/fluid/layers/rnn.py
浏览文件 @
048e0c55
...
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
...
@@ -1125,10 +1125,9 @@ class BeamSearchDecoder(Decoder):
)
)
# TODO: use where_op
# TODO: use where_op
finished
=
tensor
.
cast
(
finished
,
dtype
=
probs
.
dtype
)
finished
=
tensor
.
cast
(
finished
,
dtype
=
probs
.
dtype
)
probs
=
nn
.
elementwise_mul
(
probs
=
paddle
.
multiply
(
paddle
.
tile
(
nn
.
unsqueeze
(
finished
,
[
2
]),
[
1
,
1
,
self
.
vocab_size
]),
paddle
.
tile
(
nn
.
unsqueeze
(
finished
,
[
2
]),
[
1
,
1
,
self
.
vocab_size
]),
self
.
noend_mask_tensor
,
self
.
noend_mask_tensor
,
axis
=-
1
,
)
-
nn
.
elementwise_mul
(
probs
,
(
finished
-
1
),
axis
=
0
)
)
-
nn
.
elementwise_mul
(
probs
,
(
finished
-
1
),
axis
=
0
)
return
probs
return
probs
...
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
...
@@ -1503,7 +1502,7 @@ def _dynamic_decode_imperative(
# To confirm states.finished/finished be consistent with
# To confirm states.finished/finished be consistent with
# next_finished.
# next_finished.
tensor
.
assign
(
next_finished
,
finished
)
tensor
.
assign
(
next_finished
,
finished
)
next_sequence_lengths
=
nn
.
elementwise_
add
(
next_sequence_lengths
=
paddle
.
add
(
sequence_lengths
,
sequence_lengths
,
tensor
.
cast
(
tensor
.
cast
(
paddle
.
logical_not
(
finished
),
sequence_lengths
.
dtype
paddle
.
logical_not
(
finished
),
sequence_lengths
.
dtype
...
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
...
@@ -1663,7 +1662,7 @@ def _dynamic_decode_declarative(
# Otherwise, perform logical OR which would not change the already
# Otherwise, perform logical OR which would not change the already
# finished.
# finished.
next_finished
=
paddle
.
logical_or
(
next_finished
,
global_finished
)
next_finished
=
paddle
.
logical_or
(
next_finished
,
global_finished
)
next_sequence_lengths
=
nn
.
elementwise_
add
(
next_sequence_lengths
=
paddle
.
add
(
sequence_lengths
,
sequence_lengths
,
tensor
.
cast
(
tensor
.
cast
(
paddle
.
logical_not
(
global_finished
),
paddle
.
logical_not
(
global_finished
),
...
...
python/paddle/fluid/nets.py
浏览文件 @
048e0c55
...
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
...
@@ -390,7 +390,7 @@ def glu(input, dim=-1):
)
)
a
,
b
=
layers
.
split
(
input
,
num_or_sections
=
2
,
dim
=
dim
)
a
,
b
=
layers
.
split
(
input
,
num_or_sections
=
2
,
dim
=
dim
)
act_b
=
paddle
.
nn
.
functional
.
sigmoid
(
x
=
b
)
act_b
=
paddle
.
nn
.
functional
.
sigmoid
(
x
=
b
)
out
=
layers
.
elementwise_mul
(
x
=
a
,
y
=
act_b
)
out
=
paddle
.
multiply
(
x
=
a
,
y
=
act_b
)
return
out
return
out
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
048e0c55
...
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
...
@@ -7298,10 +7298,10 @@ class LookaheadOptimizer:
for
param_name
in
params
:
for
param_name
in
params
:
fast_var
=
main_block
.
var
(
param_name
)
fast_var
=
main_block
.
var
(
param_name
)
slow_var
=
param_to_slow
[
param_name
]
slow_var
=
param_to_slow
[
param_name
]
tmp_var
=
layers
.
elementwise_
add
(
tmp_var
=
paddle
.
add
(
layers
.
elementwise_mul
(
fast_var
,
alpha
),
paddle
.
multiply
(
fast_var
,
alpha
),
layers
.
elementwise_mul
(
paddle
.
multiply
(
slow_var
,
layers
.
elementwise_sub
(
one_var
,
alpha
)
slow_var
,
paddle
.
subtract
(
one_var
,
alpha
)
),
),
)
)
layers
.
assign
(
input
=
tmp_var
,
output
=
slow_var
)
layers
.
assign
(
input
=
tmp_var
,
output
=
slow_var
)
...
...
python/paddle/incubate/distributed/models/moe/grad_clip.py
浏览文件 @
048e0c55
...
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
...
@@ -212,7 +212,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
max_global_norm
=
layers
.
fill_constant
(
max_global_norm
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
shape
=
[
1
],
dtype
=
global_norm_var
.
dtype
,
value
=
self
.
clip_norm
)
)
clip_var
=
layers
.
elementwise_div
(
clip_var
=
paddle
.
divide
(
x
=
max_global_norm
,
x
=
max_global_norm
,
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
y
=
paddle
.
maximum
(
x
=
global_norm_var
,
y
=
max_global_norm
),
)
)
...
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
...
@@ -228,7 +228,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
if
g
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
if
g
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
else
clip_var
else
clip_var
)
)
new_grad
=
layers
.
elementwise_mul
(
x
=
g
,
y
=
clip_input
)
new_grad
=
paddle
.
multiply
(
x
=
g
,
y
=
clip_input
)
params_and_grads
.
append
((
p
,
new_grad
))
params_and_grads
.
append
((
p
,
new_grad
))
return
params_and_grads
return
params_and_grads
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录