PaddlePaddle / models
Commit 0f2fec44
Authored Nov 28, 2017 by wangmeng28

restructure the code of scheduled sampling

Parent: 4ccf9345
Showing 6 changed files with 526 additions and 61 deletions (+526 -61):
- scheduled_sampling/README.md (+66 -59)
- scheduled_sampling/generate.py (+91 -0)
- scheduled_sampling/network_conf.py (+202 -0)
- scheduled_sampling/reader.py (+42 -0)
- scheduled_sampling/train.py (+123 -0)
- scheduled_sampling/utils.py (+2 -2)
scheduled_sampling/README.md
```diff
@@ -37,7 +37,7 @@ Scheduled Sampling is mainly applied in the training stage of sequence-to-sequence models, while gen…
 ## Model Implementation
-Since Scheduled Sampling is an improvement on the sequence-to-sequence model, its overall implementation framework is similar to that of the sequence-to-sequence model. To keep the focus of this article, only the parts related to Scheduled Sampling are described here; the complete code can be found in `scheduled_sampling.py`.
+Since Scheduled Sampling is an improvement on the sequence-to-sequence model, its overall implementation framework is similar to that of the sequence-to-sequence model. To keep the focus of this article, only the parts related to Scheduled Sampling are described here; the complete code can be found in `network_conf.py`.
 First, import the required packages and define the class `RandomScheduleGenerator`, which controls the decay probability, as follows:
```
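(The class body itself is collapsed in this diff view. For orientation only, and not part of the commit, here is a minimal sketch of a generator with this interface, assuming the linear decay `max(a - i/b, 0)` over the number of processed samples `i`; `getScheduleRate` and `processBatch` are the method names the surrounding code calls.)

```python
import numpy as np


class RandomScheduleGenerator(object):
    """Sketch: per-token decision whether to feed the ground-truth token.

    Assumes a linearly decaying true-token rate eps_i = max(a - i / b, 0),
    where i counts samples processed so far; other decay types are omitted.
    """

    def __init__(self, schedule_type, a, b):
        assert schedule_type == "linear"  # this sketch covers linear decay only
        self.a, self.b = a, b
        self.data_processed_ = 0

    def getScheduleRate(self):
        # current probability of using the true token
        return max(self.a - float(self.data_processed_) / self.b, 0.)

    def processBatch(self, batch_size):
        # 0 -> use the true token, 1 -> use the generated token
        rate = self.getScheduleRate()
        self.data_processed_ += batch_size
        return [0 if x < rate else 1 for x in np.random.rand(batch_size)]
```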
````diff
@@ -119,9 +119,10 @@ true_token_flags = paddle.layer.data(
 The original reader also needs to be wrapped to add a data generator for `true_token_flag`. The following takes linear decay as an example to show how the `RandomScheduleGenerator` defined above is called to produce the input data for `true_token_flag`.
 ```python
-schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
-
-def gen_schedule_data(reader):
+def gen_schedule_data(reader,
+                      schedule_type="linear",
+                      decay_a=0.75,
+                      decay_b=1000000):
     """
     Creates a data reader for scheduled sampling.
@@ -130,10 +131,17 @@ def gen_schedule_data(reader):
     :param reader: the original reader.
     :type reader: callable
+    :param schedule_type: the type of sampling rate decay.
+    :type schedule_type: str
+    :param decay_a: the decay parameter a.
+    :type decay_a: float
+    :param decay_b: the decay parameter b.
+    :type decay_b: float
     :return: the new reader with the field "true_token_flag".
     :rtype: callable
     """
+    schedule_generator = RandomScheduleGenerator(schedule_type, decay_a,
+                                                 decay_b)

     def data_reader():
         for src_ids, trg_ids, trg_ids_next in reader():
````
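To see what the wrapper produces, here is a toy exercise of `gen_schedule_data` (the three-field reader below is made up; the real input is the WMT14 reader). Each yielded sample gains a fourth field whose first flag is forced to `0`, so the first decoder step always sees the true begin-of-sentence token:

```python
def toy_reader():
    # illustrative (source ids, target ids, next-target ids) triple
    yield [2, 5, 7], [0, 4, 9], [4, 9, 1]


wrapped = gen_schedule_data(toy_reader, "linear", 0.75, 1000000)
for src_ids, trg_ids, trg_ids_next, flags in wrapped():
    # flags has len(trg_ids) entries: a leading 0 plus len(trg_ids) - 1
    # draws from RandomScheduleGenerator.processBatch
    print(flags)  # e.g. [0, 0, 1]
```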
````diff
@@ -149,61 +157,60 @@ def gen_schedule_data(reader):
 ```python
 def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
                                      true_token_flag):
     """
     The decoder step for training.
     :param enc_vec: the encoder vector for attention
     :type enc_vec: LayerOutput
     :param enc_proj: the encoder projection for attention
     :type enc_proj: LayerOutput
     :param true_word: the ground-truth target word
     :type true_word: LayerOutput
     :param true_token_flag: the flag of using the ground-truth target word
     :type true_token_flag: LayerOutput
     :return: the softmax output layer
     :rtype: LayerOutput
     """
     decoder_mem = paddle.layer.memory(
         name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

     context = paddle.networks.simple_attention(
         encoded_sequence=enc_vec,
         encoded_proj=enc_proj,
         decoder_state=decoder_mem)

     gru_out_memory = paddle.layer.memory(
         name='gru_out', size=target_dict_dim)

     generated_word = paddle.layer.max_id(input=gru_out_memory)

     generated_word_emb = paddle.layer.embedding(
         input=generated_word,
         size=word_vector_dim,
         param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))

     current_word = paddle.layer.multiplex(
         input=[true_token_flag, true_word, generated_word_emb])

-    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-        decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-        decoder_inputs += paddle.layer.full_matrix_projection(
-            input=current_word)
+    decoder_inputs = paddle.layer.fc(
+        input=[context, current_word],
+        size=decoder_size * 3,
+        act=paddle.activation.Linear(),
+        bias_attr=False)

     gru_step = paddle.layer.gru_step(
         name='gru_decoder',
         input=decoder_inputs,
         output_mem=decoder_mem,
         size=decoder_size)

-    with paddle.layer.mixed(
-            name='gru_out',
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-        out += paddle.layer.full_matrix_projection(input=gru_step)
+    out = paddle.layer.fc(
+        name='gru_out',
+        input=gru_step,
+        size=target_dict_dim,
+        act=paddle.activation.Softmax())

     return out
 ```
````
This function uses the `memory` layer `gru_out_memory` to remember the element generated at the previous time step, and selects the highest-probability word from `gru_out_memory` as the generated word `generated_word`. The `multiplex` layer then chooses between the true element `true_word` and the generated element `generated_word`, and uses the chosen result as the decoder input. The `multiplex` layer takes three inputs: `true_token_flag`, `true_word`, and `generated_word_emb`. For each element of these inputs, if the value in `true_token_flag` is `0`, the `multiplex` layer outputs the corresponding element in `true_word`; if the value in `true_token_flag` is `1`, it outputs the corresponding element in `generated_word_emb`.
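In effect, the first input of `multiplex` acts as a per-timestep row selector over the remaining inputs. A plain numpy sketch of that selection rule (illustrative only, outside of PaddlePaddle):

```python
import numpy as np

# one row per timestep: embeddings of the true word and the generated word
true_word = np.array([[1., 1.], [2., 2.], [3., 3.]])
generated_word_emb = np.array([[10., 10.], [20., 20.], [30., 30.]])
true_token_flag = np.array([0, 1, 0])  # 0 -> true word, 1 -> generated word

# emulate the multiplex selection: pick one candidate row per timestep
candidates = np.stack([true_word, generated_word_emb])  # shape (2, T, dim)
current_word = candidates[true_token_flag, np.arange(len(true_token_flag))]
print(current_word)  # [[ 1.  1.], [20. 20.], [ 3.  3.]]
```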
scheduled_sampling/generate.py (new file, mode 100644)
```python
import gzip
import argparse
import distutils.util

import paddle.v2 as paddle
from network_conf import seqToseq_net


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Scheduled Sampling")
    parser.add_argument(
        '--model_path',
        type=str,
        required=True,
        help="The path of the trained model to load.")
    parser.add_argument(
        '--beam_size',
        type=int,
        default=3,
        help='The width of beam expansion. (default: %(default)s)')
    parser.add_argument(
        "--use_gpu",
        type=distutils.util.strtobool,
        default=False,
        help="Use gpu or not. (default: %(default)s)")
    parser.add_argument(
        "--trainer_count",
        type=int,
        default=1,
        help="Trainer number. (default: %(default)s)")
    return parser.parse_args()


def generate(gen_data, dict_size, model_path, beam_size):
    beam_gen = seqToseq_net(dict_size, dict_size, beam_size,
                            is_generating=True)

    with gzip.open(model_path, 'r') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # prob is the prediction probabilities, and id is the prediction word.
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=gen_data,
        field=['prob', 'id'])

    # get the dictionary
    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

    # the delimiting element of generated sequences is -1,
    # the first element of each generated sequence is the sequence length
    seq_list = []
    seq = []
    for w in beam_result[1]:
        if w != -1:
            seq.append(w)
        else:
            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
            seq = []

    prob = beam_result[0]
    for i in xrange(len(gen_data)):
        print "\n*******************************************************\n"
        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
        for j in xrange(beam_size):
            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


if __name__ == '__main__':
    args = parse_args()
    dict_size = 30000

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

    # use the first 3 samples for generation
    gen_creator = paddle.dataset.wmt14.gen(dict_size)
    gen_data = []
    gen_num = 3
    for item in gen_creator():
        gen_data.append((item[0], ))
        if len(gen_data) == gen_num:
            break

    generate(
        gen_data,
        dict_size=dict_size,
        model_path=args.model_path,
        beam_size=args.beam_size)
```
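The parsing loop in `generate` relies on the flattened layout of `beam_result[1]`: each beam candidate is emitted as its length, then its word ids, then a `-1` delimiter. A standalone illustration with made-up ids (not real model output):

```python
# toy flattened beam output: <length> <word ids ...> -1, per candidate
toy_ids = [3, 101, 102, 103, -1,
           2, 104, 105, -1]

seq_list, seq = [], []
for w in toy_ids:
    if w != -1:
        seq.append(w)
    else:
        seq_list.append(seq[1:])  # drop the leading length element
        seq = []

print(seq_list)  # [[101, 102, 103], [104, 105]]
```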
scheduled_sampling/scheduled_sampling.py → scheduled_sampling/network_conf.py (renamed)
```diff
-import sys
 import paddle.v2 as paddle
-from random_schedule_generator import RandomScheduleGenerator

-schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
+__all__ = ["seqToseq_net"]

-### Network Architecture
-word_vector_dim = 512  # dimension of word vector
-decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-
-
-def gen_schedule_data(reader):
-    """
-    Creates a data reader for scheduled sampling.
-
-    Output from the iterator that created by original reader will be
-    appended with "true_token_flag" to indicate whether to use true token.
-
-    :param reader: the original reader.
-    :type reader: callable
-    :return: the new reader with the field "true_token_flag".
-    :rtype: callable
-    """
-
-    def data_reader():
-        for src_ids, trg_ids, trg_ids_next in reader():
-            yield src_ids, trg_ids, trg_ids_next, \
-                  [0] + schedule_generator.processBatch(len(trg_ids) - 1)
-
-    return data_reader
-
-
-def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
+def seqToseq_net(source_dict_dim, target_dict_dim, beam_size,
+                 is_generating=False):
     """
     The definition of the sequence to sequence model
     :param source_dict_dim: the dictionary size of the source language
     :type source_dict_dim: int
     :param target_dict_dim: the dictionary size of the target language
     :type target_dict_dim: int
+    :param beam_size: The width of beam expansion
+    :type beam_size: int
     :param is_generating: whether in generating mode
     :type is_generating: Bool
     :return: the last layer of the network
     :rtype: LayerOutput
     """
+    ### Network Architecture
+    word_vector_dim = 512  # dimension of word vector
+    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-    beam_size = 3
     max_length = 250

     #### Encoder
     src_word_id = paddle.layer.data(
@@ -55,21 +36,24 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
         input=src_word_id, size=word_vector_dim)
     src_forward = paddle.networks.simple_gru(
         input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
+    src_reverse = paddle.networks.simple_gru(
         input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_reverse])

     #### Decoder
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
+    encoded_proj = paddle.layer.fc(
+        input=encoded_vector,
+        size=decoder_size,
+        act=paddle.activation.Linear(),
+        bias_attr=False)

-    backward_first = paddle.layer.first_seq(input=src_backward)
+    reverse_first = paddle.layer.first_seq(input=src_reverse)

-    with paddle.layer.mixed(
-            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
+    decoder_boot = paddle.layer.fc(
+        input=reverse_first,
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        bias_attr=False)

     def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
                                          true_token_flag):
@@ -108,10 +92,11 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
         current_word = paddle.layer.multiplex(
             input=[true_token_flag, true_word, generated_word_emb])

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.fc(
+            input=[context, current_word],
+            size=decoder_size * 3,
+            act=paddle.activation.Linear(),
+            bias_attr=False)

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -119,16 +104,14 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-                name='gru_out',
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.fc(
+            name='gru_out',
+            input=gru_step,
+            size=target_dict_dim,
+            act=paddle.activation.Softmax())

         return out

-    def gru_decoder_with_attention_test(enc_vec, enc_proj, current_word):
+    def gru_decoder_with_attention_gen(enc_vec, enc_proj, current_word):
         """
         The decoder step for generating.
         :param enc_vec: the encoder vector for attention
@@ -149,10 +132,11 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
             encoded_proj=enc_proj,
             decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.fc(
+            input=[context, current_word],
+            size=decoder_size * 3,
+            act=paddle.activation.Linear(),
+            bias_attr=False)

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -160,17 +144,16 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.fc(
+            name='gru_out',
+            input=gru_step,
+            size=target_dict_dim,
+            act=paddle.activation.Softmax())

         return out

     decoder_group_name = "decoder_group"
     group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
     group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]

     if not is_generating:
         trg_embedding = paddle.layer.embedding(
@@ -179,12 +162,14 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
                 type=paddle.data_type.integer_value_sequence(target_dict_dim)),
             size=word_vector_dim,
             param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)

         true_token_flags = paddle.layer.data(
             name='true_token_flag',
             type=paddle.data_type.integer_value_sequence(2))
-        group_inputs.append(true_token_flags)
+        group_inputs = [
+            group_input1, group_input2, trg_embedding, true_token_flags
+        ]

         decoder = paddle.layer.recurrent_group(
             name=decoder_group_name,
@@ -194,6 +179,7 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
         lbl = paddle.layer.data(
             name='target_language_next_word',
             type=paddle.data_type.integer_value_sequence(target_dict_dim))
+
         cost = paddle.layer.classification_cost(input=decoder, label=lbl)

         return cost
@@ -202,122 +188,15 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
+        group_inputs = [group_input1, group_input2, trg_embedding]

         beam_gen = paddle.layer.beam_search(
             name=decoder_group_name,
-            step=gru_decoder_with_attention_test,
+            step=gru_decoder_with_attention_gen,
             input=group_inputs,
             bos_id=0,
             eos_id=1,
             beam_size=beam_size,
             max_length=max_length)

         return beam_gen
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-    is_generating = False
-    model_path_for_generating = 'params_pass_1.tar.gz'
-
-    # source and target dict dim.
-    dict_size = 30000
-    source_dict_dim = target_dict_dim = dict_size
-
-    # train the network
-    if not is_generating:
-        cost = seqToseq_net(source_dict_dim, target_dict_dim)
-        parameters = paddle.parameters.create(cost)
-
-        # define optimize method and trainer
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=5e-5,
-            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-        trainer = paddle.trainer.SGD(
-            cost=cost, parameters=parameters, update_equation=optimizer)
-
-        # define data reader
-        wmt14_reader = paddle.batch(
-            gen_schedule_data(
-                paddle.reader.shuffle(
-                    paddle.dataset.wmt14.train(dict_size), buf_size=8192)),
-            batch_size=5)
-
-        feeding = {
-            'source_language_word': 0,
-            'target_language_word': 1,
-            'target_language_next_word': 2,
-            'true_token_flag': 3
-        }
-
-        # define event_handler callback
-        def event_handler(event):
-            if isinstance(event, paddle.event.EndIteration):
-                if event.batch_id % 10 == 0:
-                    print "\nPass %d, Batch %d, Cost %f, %s" % (
-                        event.pass_id, event.batch_id, event.cost,
-                        event.metrics)
-                else:
-                    sys.stdout.write('.')
-                    sys.stdout.flush()
-            if isinstance(event, paddle.event.EndPass):
-                # save parameters
-                with gzip.open('params_pass_%d.tar.gz' % event.pass_id,
-                               'w') as f:
-                    trainer.save_parameter_to_tar(f)
-
-        # start to train
-        trainer.train(
-            reader=wmt14_reader,
-            event_handler=event_handler,
-            feeding=feeding,
-            num_passes=2)
-    # generate a english sequence to french
-    else:
-        # use the first 3 samples for generation
-        gen_creator = paddle.dataset.wmt14.gen(dict_size)
-        gen_data = []
-        gen_num = 3
-        for item in gen_creator():
-            gen_data.append((item[0], ))
-            if len(gen_data) == gen_num:
-                break
-
-        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim,
-                                is_generating)
-        # get the trained model
-        with gzip.open(model_path_for_generating, 'r') as f:
-            parameters = Parameters.from_tar(f)
-        # prob is the prediction probabilities, and id is the prediction word.
-        beam_result = paddle.infer(
-            output_layer=beam_gen,
-            parameters=parameters,
-            input=gen_data,
-            field=['prob', 'id'])
-        # get the dictionary
-        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        beam_size = 3
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
-
-
-if __name__ == '__main__':
-    main()
```
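The recurring edit in this file swaps a `paddle.layer.mixed` block that accumulates `full_matrix_projection`s for a single `paddle.layer.fc` call. With a linear activation and `bias_attr=False`, the fc over a list of inputs should compute the same sum of per-input affine maps, which appears to be the point of the refactor. Shown side by side (taken from the hunks above; the equivalence is assumed rather than verified here):

```python
# before: projections accumulated inside a mixed layer
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
    decoder_inputs += paddle.layer.full_matrix_projection(input=context)
    decoder_inputs += paddle.layer.full_matrix_projection(input=current_word)

# after: one fc over both inputs computing W1 * context + W2 * current_word
decoder_inputs = paddle.layer.fc(
    input=[context, current_word],
    size=decoder_size * 3,
    act=paddle.activation.Linear(),
    bias_attr=False)
```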
scheduled_sampling/reader.py (new file, mode 100644)
```python
from utils import RandomScheduleGenerator


def gen_schedule_data(reader,
                      schedule_type="linear",
                      decay_a=0.75,
                      decay_b=1000000):
    """
    Creates a data reader for scheduled sampling.

    Output from the iterator created by the original reader will be
    appended with "true_token_flag" to indicate whether to use the true token.

    :param reader: the original reader.
    :type reader: callable
    :param schedule_type: the type of sampling rate decay.
    :type schedule_type: str
    :param decay_a: the decay parameter a.
    :type decay_a: float
    :param decay_b: the decay parameter b.
    :type decay_b: float
    :return: the new reader with the field "true_token_flag".
    :rtype: callable
    """
    schedule_generator = RandomScheduleGenerator(schedule_type, decay_a,
                                                 decay_b)

    def data_reader():
        for src_ids, trg_ids, trg_ids_next in reader():
            yield src_ids, trg_ids, trg_ids_next, \
                  [0] + schedule_generator.processBatch(len(trg_ids) - 1)

    return data_reader


feeding = {
    'source_language_word': 0,
    'target_language_word': 1,
    'target_language_next_word': 2,
    'true_token_flag': 3
}
```
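`feeding` maps each `paddle.layer.data` name defined in `network_conf.py` to a position in the tuples that `data_reader()` yields; `train.py` below passes it straight through as `feeding=reader.feeding`. Schematically, for one made-up sample:

```python
sample = ([2, 5, 7],   # index 0 -> 'source_language_word'
          [0, 4, 9],   # index 1 -> 'target_language_word'
          [4, 9, 1],   # index 2 -> 'target_language_next_word'
          [0, 1, 0])   # index 3 -> 'true_token_flag'
```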
scheduled_sampling/train.py (new file, mode 100644)
```python
import os
import sys
import gzip
import argparse
import distutils.util

import paddle.v2 as paddle

import reader
from network_conf import seqToseq_net


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Scheduled Sampling")
    parser.add_argument(
        '--schedule_type',
        type=str,
        default="linear",
        help='The type of sampling rate decay. Supported types: constant, '
        'linear, exponential, inverse_sigmoid. (default: %(default)s)')
    parser.add_argument(
        '--decay_a',
        type=float,
        default=0.75,
        help='The sampling rate decay parameter a. (default: %(default)s)')
    parser.add_argument(
        '--decay_b',
        type=float,
        default=1000000,
        help='The sampling rate decay parameter b. (default: %(default)s)')
    parser.add_argument(
        '--beam_size',
        type=int,
        default=3,
        help='The width of beam expansion. (default: %(default)s)')
    parser.add_argument(
        "--use_gpu",
        type=distutils.util.strtobool,
        default=False,
        help="Use gpu or not. (default: %(default)s)")
    parser.add_argument(
        "--trainer_count",
        type=int,
        default=1,
        help="Trainer number. (default: %(default)s)")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help="Size of a mini-batch. (default: %(default)s)")
    parser.add_argument(
        '--num_passes',
        type=int,
        default=10,
        help="Number of passes to train. (default: %(default)s)")
    parser.add_argument(
        '--model_output_dir',
        type=str,
        default='models',
        help="The path for the model to store. (default: %(default)s)")
    return parser.parse_args()


def train(dict_size, batch_size, num_passes, beam_size, schedule_type, decay_a,
          decay_b, model_dir):
    optimizer = paddle.optimizer.Adam(
        learning_rate=1e-4,
        regularization=paddle.optimizer.L2Regularization(rate=1e-5))

    cost = seqToseq_net(dict_size, dict_size, beam_size)

    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    wmt14_reader = reader.gen_schedule_data(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=8192),
        schedule_type, decay_a, decay_b)

    # define event_handler callback
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print "\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            # save parameters
            with gzip.open(
                    os.path.join(model_dir,
                                 'params_pass_%d.tar.gz' % event.pass_id),
                    'w') as f:
                trainer.save_parameter_to_tar(f)

    # start to train
    trainer.train(
        reader=paddle.batch(wmt14_reader, batch_size=batch_size),
        event_handler=event_handler,
        feeding=reader.feeding,
        num_passes=num_passes)


if __name__ == '__main__':
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

    train(
        dict_size=30000,
        batch_size=args.batch_size,
        num_passes=args.num_passes,
        beam_size=args.beam_size,
        schedule_type=args.schedule_type,
        decay_a=args.decay_a,
        decay_b=args.decay_b,
        model_dir=args.model_output_dir)
```
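The `--schedule_type` choices accepted above follow the decay curves from the Scheduled Sampling paper (Bengio et al., 2015). As a hedged sketch of how the true-token rate could be computed for each type, with `a` and `b` in the roles of `--decay_a` and `--decay_b` (the authoritative formulas live in `utils.py`, which this commit only renames, and may scale the parameters differently):

```python
import math


def schedule_rate(schedule_type, a, b, i):
    # probability of feeding the true token after i training samples (sketch)
    if schedule_type == "constant":
        return a
    elif schedule_type == "linear":           # linear decay, floored at zero
        return max(a - float(i) / b, 0.)
    elif schedule_type == "exponential":      # a ** (i / b), with 0 < a < 1
        return a ** (float(i) / b)
    elif schedule_type == "inverse_sigmoid":  # k / (k + exp(i / k)), k = b
        return b / (b + math.exp(float(i) / b))
    raise ValueError("unknown schedule type: %s" % schedule_type)
```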
scheduled_sampling/random_schedule_generator.py → scheduled_sampling/utils.py (renamed)
```diff
-import numpy as np
 import math
+import numpy as np


 class RandomScheduleGenerator:
     """
-    The random sampling rate for scheduled sampling algoithm, which uses devcayed
+    The random sampling rate for scheduled sampling algoithm, which uses decayed
     sampling rate.
     """
```