PaddlePaddle / hapi
Commit 6431daed
Authored Apr 08, 2020 by guosheng
Parent: 3c682920

Fix condition var is None
Showing 3 changed files with 21 additions and 24 deletions (+21, -24):

transformer/predict.py      +0  -2
transformer/train.py        +18 -20
transformer/transformer.py  +3  -2
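The commit title refers to a recurring Python pitfall fixed across these files: guarding an optional tensor argument with "if var:" instead of "if var is not None:". Truthiness evaluation calls the value's __bool__, which is ambiguous or an outright error for array-like objects, and it also wrongly skips legitimate falsy values such as an all-zero bias. A minimal sketch of the failure mode, using NumPy as a stand-in for a framework tensor:

import numpy as np

def add_bias(logits, bias=None):
    # Buggy guard: "if bias:" invokes bias.__bool__(), which raises
    # "ValueError: The truth value of an array ... is ambiguous"
    # for any multi-element array.
    # if bias:
    #     logits = logits + bias

    # Fixed guard: tests only for the None sentinel.
    if bias is not None:
        logits = logits + bias
    return logits

logits = np.zeros((2, 3))
mask = np.array([[0., -1e9, 0.], [0., 0., -1e9]])
print(add_bias(logits))        # no bias given: fine
print(add_bias(logits, mask))  # raises under the buggy guard, works here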
transformer/predict.py
@@ -91,8 +91,6 @@ def do_predict(args):
         dataset=dataset,
         batch_sampler=batch_sampler,
         places=device,
-        feed_list=None
-        if fluid.in_dygraph_mode() else [x.forward() for x in inputs],
         collate_fn=partial(
             prepare_infer_input, src_pad_idx=args.eos_idx, n_head=args.n_head),
         num_workers=0,
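As context for the collate_fn=partial(...) argument kept above: functools.partial pre-binds the dataset-specific keywords (src_pad_idx, n_head) so the loader can call the collate function with just a list of samples. A self-contained sketch; pad_batch and its arguments are hypothetical, not from this repo:

from functools import partial

def pad_batch(samples, pad_idx=0):
    # Hypothetical collate helper: right-pad variable-length samples.
    max_len = max(len(s) for s in samples)
    return [s + [pad_idx] * (max_len - len(s)) for s in samples]

collate = partial(pad_batch, pad_idx=1)  # binds pad_idx, as with src_pad_idx above
print(collate([[7, 8], [9]]))            # [[7, 8], [9, 1]]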
transformer/train.py
@@ -22,7 +22,6 @@ from functools import partial
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.dygraph import to_variable
 from paddle.fluid.io import DataLoader
 from utils.configure import PDConfig
@@ -31,32 +30,33 @@ from utils.check import check_gpu, check_version
 from model import Input, set_device
 from callbacks import ProgBarLogger
 from reader import prepare_train_input, Seq2SeqDataset, Seq2SeqBatchSampler
-from transformer import Transformer, CrossEntropyCriterion, NoamDecay
+from transformer import Transformer, CrossEntropyCriterion
 
 
-class LoggerCallback(ProgBarLogger):
+class TrainCallback(ProgBarLogger):
     def __init__(self, log_freq=1, verbose=2, loss_normalizer=0.):
-        super(LoggerCallback, self).__init__(log_freq, verbose)
+        super(TrainCallback, self).__init__(log_freq, verbose)
         # TODO: wrap these override function to simplify
         self.loss_normalizer = loss_normalizer
 
     def on_train_begin(self, logs=None):
-        super(LoggerCallback, self).on_train_begin(logs)
+        super(TrainCallback, self).on_train_begin(logs)
         self.train_metrics += ["normalized loss", "ppl"]
 
     def on_train_batch_end(self, step, logs=None):
         logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
         logs["ppl"] = np.exp(min(logs["loss"][0], 100))
-        super(LoggerCallback, self).on_train_batch_end(step, logs)
+        super(TrainCallback, self).on_train_batch_end(step, logs)
 
     def on_eval_begin(self, logs=None):
-        super(LoggerCallback, self).on_eval_begin(logs)
-        self.eval_metrics += ["normalized loss", "ppl"]
+        super(TrainCallback, self).on_eval_begin(logs)
+        self.eval_metrics = list(
+            self.eval_metrics) + ["normalized loss", "ppl"]
 
     def on_eval_batch_end(self, step, logs=None):
         logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
         logs["ppl"] = np.exp(min(logs["loss"][0], 100))
-        super(LoggerCallback, self).on_eval_batch_end(step, logs)
+        super(TrainCallback, self).on_eval_batch_end(step, logs)
 
 
 def do_train(args):
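One detail worth noting in this hunk: on_eval_begin now rebinds self.eval_metrics to a fresh list instead of extending it with +=. If the attribute aliases a list owned elsewhere (for example, by the base logger), += mutates that list in place, so the extra metric names would accumulate in it across runs; rebinding leaves the original untouched. A minimal sketch of the difference:

base = ["loss"]

metrics = base            # alias, not a copy
metrics += ["ppl"]        # in-place extend mutates the shared list
print(base)               # ['loss', 'ppl']

base = ["loss"]
metrics = base
metrics = list(metrics) + ["ppl"]  # rebinds to a fresh list
print(base)               # ['loss'] -- unchanged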
@@ -127,8 +127,6 @@ def do_train(args):
         dataset=dataset,
         batch_sampler=batch_sampler,
         places=device,
-        feed_list=None
-        if fluid.in_dygraph_mode() else [x.forward() for x in inputs + labels],
         collate_fn=partial(
             prepare_train_input,
             src_pad_idx=args.eos_idx,
@@ -149,8 +147,10 @@ def do_train(args):
     transformer.prepare(
         fluid.optimizer.Adam(
-            learning_rate=fluid.layers.noam_decay(args.d_model,
-                                                  args.warmup_steps),
+            learning_rate=fluid.layers.noam_decay(
+                args.d_model,
+                args.warmup_steps,
+                learning_rate=args.learning_rate),
             beta1=args.beta1,
             beta2=args.beta2,
             epsilon=float(args.eps),
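The rewritten optimizer call threads args.learning_rate through to noam_decay as an overall scale on the schedule. As commonly defined (this sketch follows the textbook schedule, not the framework source), the Noam schedule warms up linearly for warmup_steps steps and then decays as the inverse square root of the step count:

def noam_lr(step, d_model, warmup_steps, learning_rate=1.0):
    # lr = scale * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
    step = max(step, 1)  # avoid 0**-0.5 at the first step
    return learning_rate * d_model**-0.5 * min(step**-0.5,
                                               step * warmup_steps**-1.5)

# Rises until step == warmup_steps, then decays:
for s in (1000, 4000, 16000):
    print(s, noam_lr(s, d_model=512, warmup_steps=4000))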
@@ -161,13 +161,10 @@ def do_train(args):
     ## init from some checkpoint, to resume the previous training
     if args.init_from_checkpoint:
         transformer.load(
-            args.init_from_checkpoint)
+            os.path.join(args.init_from_checkpoint, "transformer"))
 
     ## init from some pretrain models, to better solve the current task
     if args.init_from_pretrain_model:
         transformer.load(
-            args.init_from_pretrain_model, reset_optimizer=True)
+            os.path.join(args.init_from_pretrain_model, "transformer"),
+            reset_optimizer=True)
 
     # the best cross-entropy value with label smoothing
     loss_normalizer = -(
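The load paths now point at a file prefix inside the checkpoint directory (dir/transformer) rather than at the directory itself, which implies the save side writes its files under that prefix. A sketch of the layout this suggests; the exact file names and extensions are an assumption, not confirmed by this diff:

import os

ckpt_dir = "saved_models/epoch_0"
prefix = os.path.join(ckpt_dir, "transformer")
# load would then resolve files such as (names assumed):
#   saved_models/epoch_0/transformer.pdparams  (parameters)
#   saved_models/epoch_0/transformer.pdopt     (optimizer state)
print(prefix)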
@@ -178,12 +175,13 @@ def do_train(args):
     # model train
     transformer.fit(train_data=train_loader,
                     eval_data=eval_loader,
-                    epochs=1,
+                    epochs=args.epoch,
                     eval_freq=1,
                     save_freq=1,
+                    save_dir=args.save_model,
                     verbose=2,
                     callbacks=[
-                        LoggerCallback(
+                        TrainCallback(
                             log_freq=args.print_step,
                             loss_normalizer=loss_normalizer)
                     ])
transformer/transformer.py
@@ -79,7 +79,8 @@ class PrePostProcessLayer(Layer):
         self.functors = []
         for cmd in self.process_cmd:
             if cmd == "a":  # add residual connection
-                self.functors.append(lambda x, y: x + y if y else x)
+                self.functors.append(
+                    lambda x, y: x + y if y is not None else x)
             elif cmd == "n":  # add layer normalization
                 self.functors.append(
                     self.add_sublayer(
@@ -169,7 +170,7 @@ class MultiHeadAttention(Layer):
         # scale dot product attention
         product = layers.matmul(
             x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
-        if attn_bias:
+        if attn_bias is not None:
             product += attn_bias
         weights = layers.softmax(product)
         if self.dropout_rate:
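The attn_bias guarded here is an additive mask applied to the attention logits before the softmax: positions to ignore receive a large negative bias so their post-softmax weight is effectively zero. Because a perfectly valid mask can be all zeros (falsy), truthiness is the wrong test even when it does not raise, and "is not None" is the only safe guard. A NumPy sketch of the pattern:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

product = np.array([[1.0, 2.0, 3.0]])     # scaled q.k^T logits
attn_bias = np.array([[0.0, 0.0, -1e9]])  # mask out the last position

if attn_bias is not None:                 # the fixed guard
    product = product + attn_bias
print(softmax(product))                   # last position gets ~0 weight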