Commit 0f2fec44 authored by wangmeng28

restructure the code of scheduled sampling

Parent 4ccf9345
@@ -37,7 +37,7 @@ Scheduled Sampling is mainly applied in the training phase of sequence-to-sequence models
## Model Implementation
Since Scheduled Sampling is an improvement on the sequence-to-sequence model, its overall implementation framework is quite similar to that of a sequence-to-sequence model. To keep the focus of this article, only the parts related to Scheduled Sampling are described here; see `network_conf.py` for the complete code.
First, import the required packages and define the class `RandomScheduleGenerator`, which controls the decay probability, as follows:
@@ -119,9 +119,10 @@ true_token_flags = paddle.layer.data(
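The class body itself is collapsed in this diff view. For reference, here is a minimal sketch of the interface the surrounding code relies on: a constructor taking `(schedule_type, a, b)` and a `processBatch(size)` method returning a list of 0/1 flags. The decay formulas below are assumptions matching the four types named in `train.py`'s `--schedule_type` help, not necessarily the exact implementation:

```python
import math

import numpy as np


class RandomScheduleGenerator:
    """
    Sketch of the sampling-rate scheduler: tracks how many tokens have
    been processed and converts the current rate into per-token flags.
    """

    def __init__(self, schedule_type, a, b):
        self.a = a
        self.b = float(b)
        self.data_processed_ = 0
        # d is the number of training tokens processed so far; these
        # formulas are assumptions, not necessarily the repo's exact code.
        self.schedule_computers = {
            "constant": lambda a, b, d: a,
            "linear": lambda a, b, d: max(a, 1. - d / b),
            "exponential": lambda a, b, d: pow(a, d / b),
            "inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
        }
        assert schedule_type in self.schedule_computers
        self.schedule_computer = self.schedule_computers[schedule_type]

    def getScheduleRate(self):
        """The current probability of feeding the ground-truth token."""
        return self.schedule_computer(self.a, self.b, self.data_processed_)

    def processBatch(self, batch_size):
        """
        Sample batch_size flags: 0 keeps the true token, 1 switches to
        the token generated at the previous time step.
        """
        rate = self.getScheduleRate()
        flags = (np.random.rand(batch_size) >= rate).astype('int32').tolist()
        self.data_processed_ += batch_size
        return flags
```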
The original reader also needs to be wrapped with a data generator for `true_token_flag`. Taking linear decay as an example, the following shows how to call the `RandomScheduleGenerator` defined above to produce the input data for `true_token_flag`.
```python
def gen_schedule_data(reader,
                      schedule_type="linear",
                      decay_a=0.75,
                      decay_b=1000000):
    """
    Creates a data reader for scheduled sampling.

    Output from the iterator created by the original reader will be
    appended with "true_token_flag" to indicate whether to use the true token.

    :param reader: the original reader.
    :type reader: callable
    :param schedule_type: the type of sampling rate decay.
    :type schedule_type: str
    :param decay_a: the decay parameter a.
    :type decay_a: float
    :param decay_b: the decay parameter b.
    :type decay_b: float
    :return: the new reader with the field "true_token_flag".
    :rtype: callable
    """
    schedule_generator = RandomScheduleGenerator(schedule_type, decay_a, decay_b)

    def data_reader():
        for src_ids, trg_ids, trg_ids_next in reader():
            yield src_ids, trg_ids, trg_ids_next, \
                [0] + schedule_generator.processBatch(len(trg_ids) - 1)

    return data_reader
```
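As a quick usage sketch, the wrapped reader is shuffled and batched exactly like the original one; the values below mirror the training script (which assumes `dict_size = 30000`):

```python
import paddle.v2 as paddle

dict_size = 30000  # the dictionary size used by the scripts in this repo
wmt14_reader = paddle.batch(
    gen_schedule_data(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=8192)),
    batch_size=32)
```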
@@ -149,61 +157,60 @@
```python
def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
                                     true_token_flag):
    """
    The decoder step for training.

    :param enc_vec: the encoder vector for attention
    :type enc_vec: LayerOutput
    :param enc_proj: the encoder projection for attention
    :type enc_proj: LayerOutput
    :param true_word: the ground-truth target word
    :type true_word: LayerOutput
    :param true_token_flag: the flag of using the ground-truth target word
    :type true_token_flag: LayerOutput
    :return: the softmax output layer
    :rtype: LayerOutput
    """
    decoder_mem = paddle.layer.memory(
        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

    context = paddle.networks.simple_attention(
        encoded_sequence=enc_vec,
        encoded_proj=enc_proj,
        decoder_state=decoder_mem)

    gru_out_memory = paddle.layer.memory(
        name='gru_out', size=target_dict_dim)

    generated_word = paddle.layer.max_id(input=gru_out_memory)

    generated_word_emb = paddle.layer.embedding(
        input=generated_word,
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))

    current_word = paddle.layer.multiplex(
        input=[true_token_flag, true_word, generated_word_emb])

    decoder_inputs = paddle.layer.fc(
        input=[context, current_word],
        size=decoder_size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)

    gru_step = paddle.layer.gru_step(
        name='gru_decoder',
        input=decoder_inputs,
        output_mem=decoder_mem,
        size=decoder_size)

    out = paddle.layer.fc(
        name='gru_out',
        input=gru_step,
        size=target_dict_dim,
        act=paddle.activation.Softmax())

    return out
```
This function uses a `memory` layer, `gru_out_memory`, to remember the output of the previous time step, and selects the most probable word `generated_word` from `gru_out_memory` as the generated word. The `multiplex` layer chooses between the true token `true_word` and the generated token `generated_word`, and feeds the result to the decoder as input. The `multiplex` layer takes three inputs: `true_token_flag`, `true_word`, and `generated_word_emb`. For each element of these inputs, if the value in `true_token_flag` is `0`, the `multiplex` layer outputs the corresponding element of `true_word`; if it is `1`, it outputs the corresponding element of `generated_word_emb`.
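To make the selection rule concrete, here is a small NumPy analogy (a toy illustration, not the PaddlePaddle API): treating `true_word` and `generated_word_emb` as per-time-step embedding rows, the flag picks one row or the other.

```python
import numpy as np

# Toy stand-ins: 3 time steps, embedding size 2.
true_word = np.array([[1., 1.], [2., 2.], [3., 3.]])
generated_word_emb = np.array([[9., 9.], [8., 8.], [7., 7.]])
true_token_flag = np.array([0, 1, 0])  # 0: keep true token, 1: use generated

# Equivalent of the multiplex selection for this two-way case.
current_word = np.where(true_token_flag[:, None] == 0,
                        true_word, generated_word_emb)
print(current_word)  # [[1. 1.] [8. 8.] [3. 3.]]
```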
The new `generate.py`:
```python
import gzip
import argparse
import distutils.util

import paddle.v2 as paddle

from network_conf import seqToseq_net


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Scheduled Sampling")
    parser.add_argument(
        '--model_path',
        type=str,
        required=True,
        help="The path of the trained model to load.")
    parser.add_argument(
        '--beam_size',
        type=int,
        default=3,
        help='The width of beam expansion. (default: %(default)s)')
    parser.add_argument(
        "--use_gpu",
        type=distutils.util.strtobool,
        default=False,
        help="Use gpu or not. (default: %(default)s)")
    parser.add_argument(
        "--trainer_count",
        type=int,
        default=1,
        help="Trainer number. (default: %(default)s)")
    return parser.parse_args()


def generate(gen_data, dict_size, model_path, beam_size):
    beam_gen = seqToseq_net(dict_size, dict_size, beam_size, is_generating=True)

    with gzip.open(model_path, 'r') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # prob is the prediction probabilities, and id is the prediction word.
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=gen_data,
        field=['prob', 'id'])

    # get the dictionary
    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

    # the delimiter of generated sequences is -1,
    # the first element of each generated sequence is the sequence length
    seq_list = []
    seq = []
    for w in beam_result[1]:
        if w != -1:
            seq.append(w)
        else:
            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
            seq = []

    prob = beam_result[0]
    for i in xrange(len(gen_data)):
        print "\n*******************************************************\n"
        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
        for j in xrange(beam_size):
            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


if __name__ == '__main__':
    args = parse_args()

    dict_size = 30000

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

    # use the first 3 samples for generation
    gen_creator = paddle.dataset.wmt14.gen(dict_size)
    gen_data = []
    gen_num = 3
    for item in gen_creator():
        gen_data.append((item[0], ))
        if len(gen_data) == gen_num:
            break

    generate(
        gen_data,
        dict_size=dict_size,
        model_path=args.model_path,
        beam_size=args.beam_size)
```
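The parsing loop in `generate()` relies on the flat layout of `beam_result[1]` described in the comments; here is a toy walk-through of that layout with made-up ids:

```python
# Hypothetical beam output for beam_size = 2: each candidate is laid out
# as [length, word ids..., -1], concatenated into one flat list.
beam_ids = [3, 10, 11, 12, -1, 2, 10, 13, -1]

seq_list, seq = [], []
for w in beam_ids:
    if w != -1:
        seq.append(w)
    else:
        seq_list.append(seq[1:])  # drop the leading length element
        seq = []

print(seq_list)  # [[10, 11, 12], [10, 13]]
```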
The new `network_conf.py` (extracted from the old `scheduled_sampling.py`):

```python
import paddle.v2 as paddle

__all__ = ["seqToseq_net"]

### Network Architecture
word_vector_dim = 512  # dimension of word vector
decoder_size = 512  # dimension of hidden unit in GRU Decoder network
encoder_size = 512  # dimension of hidden unit in GRU Encoder network

max_length = 250


def seqToseq_net(source_dict_dim,
                 target_dict_dim,
                 beam_size,
                 is_generating=False):
    """
    The definition of the sequence to sequence model

    :param source_dict_dim: the dictionary size of the source language
    :type source_dict_dim: int
    :param target_dict_dim: the dictionary size of the target language
    :type target_dict_dim: int
    :param beam_size: the width of beam expansion
    :type beam_size: int
    :param is_generating: whether in generating mode
    :type is_generating: bool
    :return: the last layer of the network
    :rtype: LayerOutput
    """
    #### Encoder
    src_word_id = paddle.layer.data(
```
@@ -55,21 +36,24 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
        input=src_word_id, size=word_vector_dim)
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_reverse = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size, reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_reverse])

    #### Decoder
    encoded_proj = paddle.layer.fc(
        input=encoded_vector,
        size=decoder_size,
        act=paddle.activation.Linear(),
        bias_attr=False)

    reverse_first = paddle.layer.first_seq(input=src_reverse)

    decoder_boot = paddle.layer.fc(
        input=reverse_first,
        size=decoder_size,
        act=paddle.activation.Tanh(),
        bias_attr=False)

    def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
                                         true_token_flag):
```
@@ -108,10 +92,11 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
        current_word = paddle.layer.multiplex(
            input=[true_token_flag, true_word, generated_word_emb])

        decoder_inputs = paddle.layer.fc(
            input=[context, current_word],
            size=decoder_size * 3,
            act=paddle.activation.Linear(),
            bias_attr=False)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
```
@@ -119,16 +104,14 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
            output_mem=decoder_mem,
            size=decoder_size)

        out = paddle.layer.fc(
            name='gru_out',
            input=gru_step,
            size=target_dict_dim,
            act=paddle.activation.Softmax())

        return out

    def gru_decoder_with_attention_gen(enc_vec, enc_proj, current_word):
        """
        The decoder step for generating.

        :param enc_vec: the encoder vector for attention
```
@@ -149,10 +132,11 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

        decoder_inputs = paddle.layer.fc(
            input=[context, current_word],
            size=decoder_size * 3,
            act=paddle.activation.Linear(),
            bias_attr=False)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
```
@@ -160,17 +144,16 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
            output_mem=decoder_mem,
            size=decoder_size)

        out = paddle.layer.fc(
            name='gru_out',
            input=gru_step,
            size=target_dict_dim,
            act=paddle.activation.Softmax())

        return out

    decoder_group_name = "decoder_group"
    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)

    if not is_generating:
        trg_embedding = paddle.layer.embedding(
```
@@ -179,12 +162,14 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
            size=word_vector_dim,
            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))

        true_token_flags = paddle.layer.data(
            name='true_token_flag',
            type=paddle.data_type.integer_value_sequence(2))

        group_inputs = [
            group_input1, group_input2, trg_embedding, true_token_flags
        ]

        decoder = paddle.layer.recurrent_group(
            name=decoder_group_name,
```
@@ -194,6 +179,7 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
        lbl = paddle.layer.data(
            name='target_language_next_word',
            type=paddle.data_type.integer_value_sequence(target_dict_dim))

        cost = paddle.layer.classification_cost(input=decoder, label=lbl)

        return cost
```
@@ -202,122 +188,15 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
```python
            size=target_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vector_dim)

        group_inputs = [group_input1, group_input2, trg_embedding]

        beam_gen = paddle.layer.beam_search(
            name=decoder_group_name,
            step=gru_decoder_with_attention_gen,
            input=group_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=max_length)

        return beam_gen
```
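As a usage note, the same function builds both networks; the calls below mirror the way `train.py` and `generate.py` invoke it:

```python
# Training network: returns the classification cost.
cost = seqToseq_net(30000, 30000, beam_size=3)

# Generation network: returns the beam-search layer.
beam_gen = seqToseq_net(30000, 30000, beam_size=3, is_generating=True)
```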
Removed from the old `scheduled_sampling.py` (this monolithic `main()` is superseded by the new `train.py` and `generate.py`):

```python
def main():
    paddle.init(use_gpu=False, trainer_count=1)

    is_generating = False
    model_path_for_generating = 'params_pass_1.tar.gz'

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

    # train the network
    if not is_generating:
        cost = seqToseq_net(source_dict_dim, target_dict_dim)
        parameters = paddle.parameters.create(cost)

        # define optimization method and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=5e-5,
            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
        trainer = paddle.trainer.SGD(
            cost=cost, parameters=parameters, update_equation=optimizer)

        # define data reader
        wmt14_reader = paddle.batch(
            gen_schedule_data(
                paddle.reader.shuffle(
                    paddle.dataset.wmt14.train(dict_size), buf_size=8192)),
            batch_size=5)

        feeding = {
            'source_language_word': 0,
            'target_language_word': 1,
            'target_language_next_word': 2,
            'true_token_flag': 3
        }

        # define event_handler callback
        def event_handler(event):
            if isinstance(event, paddle.event.EndIteration):
                if event.batch_id % 10 == 0:
                    print "\nPass %d, Batch %d, Cost %f, %s" % (
                        event.pass_id, event.batch_id, event.cost,
                        event.metrics)
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            if isinstance(event, paddle.event.EndPass):
                # save parameters
                with gzip.open('params_pass_%d.tar.gz' % event.pass_id,
                               'w') as f:
                    trainer.save_parameter_to_tar(f)

        # start to train
        trainer.train(
            reader=wmt14_reader,
            event_handler=event_handler,
            feeding=feeding,
            num_passes=2)
    # otherwise, generate translations for a few source sequences
    else:
        # use the first 3 samples for generation
        gen_creator = paddle.dataset.wmt14.gen(dict_size)
        gen_data = []
        gen_num = 3
        for item in gen_creator():
            gen_data.append((item[0], ))
            if len(gen_data) == gen_num:
                break

        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)

        # load the trained model
        with gzip.open(model_path_for_generating, 'r') as f:
            parameters = paddle.parameters.Parameters.from_tar(f)

        # prob is the prediction probabilities, and id is the prediction word.
        beam_result = paddle.infer(
            output_layer=beam_gen,
            parameters=parameters,
            input=gen_data,
            field=['prob', 'id'])

        # get the dictionary
        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

        # the delimiter of generated sequences is -1,
        # the first element of each generated sequence is the sequence length
        seq_list = []
        seq = []
        for w in beam_result[1]:
            if w != -1:
                seq.append(w)
            else:
                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
                seq = []

        prob = beam_result[0]
        beam_size = 3
        for i in xrange(gen_num):
            print "\n*******************************************************\n"
            print "src:", ' '.join(
                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
            for j in xrange(beam_size):
                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


if __name__ == '__main__':
    main()
```
The new `reader.py`:

```python
from utils import RandomScheduleGenerator


def gen_schedule_data(reader,
                      schedule_type="linear",
                      decay_a=0.75,
                      decay_b=1000000):
    """
    Creates a data reader for scheduled sampling.

    Output from the iterator created by the original reader will be
    appended with "true_token_flag" to indicate whether to use the true token.

    :param reader: the original reader.
    :type reader: callable
    :param schedule_type: the type of sampling rate decay.
    :type schedule_type: str
    :param decay_a: the decay parameter a.
    :type decay_a: float
    :param decay_b: the decay parameter b.
    :type decay_b: float
    :return: the new reader with the field "true_token_flag".
    :rtype: callable
    """
    schedule_generator = RandomScheduleGenerator(schedule_type, decay_a,
                                                 decay_b)

    def data_reader():
        for src_ids, trg_ids, trg_ids_next in reader():
            yield src_ids, trg_ids, trg_ids_next, \
                [0] + schedule_generator.processBatch(len(trg_ids) - 1)

    return data_reader


# maps data layer names to their positions in the tuples yielded above
feeding = {
    'source_language_word': 0,
    'target_language_word': 1,
    'target_language_next_word': 2,
    'true_token_flag': 3
}
```
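A tiny sanity check of the wrapper on a hypothetical reader (the token ids are made up; `processBatch` is assumed to return a Python list of 0/1 flags):

```python
def toy_reader():
    # (src_ids, trg_ids, trg_ids_next) with made-up token ids
    yield [2, 3, 4], [1, 5, 6, 7], [5, 6, 7, 1]


for src_ids, trg_ids, trg_ids_next, flags in gen_schedule_data(toy_reader)():
    # One flag per decoder step; the first step always keeps the true
    # start token because of the leading [0].
    assert len(flags) == len(trg_ids)
```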
The new `train.py`:

```python
import os
import sys
import gzip
import argparse
import distutils.util

import paddle.v2 as paddle

import reader
from network_conf import seqToseq_net


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Scheduled Sampling")
    parser.add_argument(
        '--schedule_type',
        type=str,
        default="linear",
        help='The type of sampling rate decay. Supported types: constant, '
        'linear, exponential, inverse_sigmoid. (default: %(default)s)')
    parser.add_argument(
        '--decay_a',
        type=float,
        default=0.75,
        help='The sampling rate decay parameter a. (default: %(default)s)')
    parser.add_argument(
        '--decay_b',
        type=float,
        default=1000000,
        help='The sampling rate decay parameter b. (default: %(default)s)')
    parser.add_argument(
        '--beam_size',
        type=int,
        default=3,
        help='The width of beam expansion. (default: %(default)s)')
    parser.add_argument(
        "--use_gpu",
        type=distutils.util.strtobool,
        default=False,
        help="Use gpu or not. (default: %(default)s)")
    parser.add_argument(
        "--trainer_count",
        type=int,
        default=1,
        help="Trainer number. (default: %(default)s)")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help="Size of a mini-batch. (default: %(default)s)")
    parser.add_argument(
        '--num_passes',
        type=int,
        default=10,
        help="Number of passes to train. (default: %(default)s)")
    parser.add_argument(
        '--model_output_dir',
        type=str,
        default='models',
        help="The directory to store the trained models. (default: %(default)s)")
    return parser.parse_args()


def train(dict_size, batch_size, num_passes, beam_size, schedule_type, decay_a,
          decay_b, model_dir):
    optimizer = paddle.optimizer.Adam(
        learning_rate=1e-4,
        regularization=paddle.optimizer.L2Regularization(rate=1e-5))

    cost = seqToseq_net(dict_size, dict_size, beam_size)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    wmt14_reader = reader.gen_schedule_data(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=8192),
        schedule_type, decay_a, decay_b)

    # define event_handler callback
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print "\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            # save parameters
            with gzip.open(
                    os.path.join(model_dir, 'params_pass_%d.tar.gz' %
                                 event.pass_id), 'w') as f:
                trainer.save_parameter_to_tar(f)

    # start to train
    trainer.train(
        reader=paddle.batch(wmt14_reader, batch_size=batch_size),
        event_handler=event_handler,
        feeding=reader.feeding,
        num_passes=num_passes)


if __name__ == '__main__':
    args = parse_args()

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

    train(
        dict_size=30000,
        batch_size=args.batch_size,
        num_passes=args.num_passes,
        beam_size=args.beam_size,
        schedule_type=args.schedule_type,
        decay_a=args.decay_a,
        decay_b=args.decay_b,
        model_dir=args.model_output_dir)
```
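For reference, a programmatic call equivalent to running the script with its defaults (all values are just the argparse defaults above):

```python
# Hypothetical direct call, assuming this runs inside train.py's __main__.
paddle.init(use_gpu=False, trainer_count=1)
train(
    dict_size=30000,
    batch_size=32,
    num_passes=10,
    beam_size=3,
    schedule_type="linear",
    decay_a=0.75,
    decay_b=1000000,
    model_dir="models")
```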
`utils.py` (renamed from `random_schedule_generator.py`):

```python
import math

import numpy as np


class RandomScheduleGenerator:
    """
    The random sampling rate for the scheduled sampling algorithm, which uses
    a decayed sampling rate.
    """
    # ... (the rest of the class is unchanged and collapsed in this diff)
```