Unverified commit 3a83e026, authored by Wang Meng, committed by GitHub

Merge pull request #498 from will-am/restructure_scheduled_sampling

restructure the code of scheduled sampling
......@@ -37,7 +37,7 @@ Scheduled Sampling is mainly used in the training stage of sequence-to-sequence models, while the gen…
## 模型实现
Since Scheduled Sampling is an improvement on the sequence-to-sequence model, its overall implementation framework is quite similar to that of the sequence-to-sequence model. To keep this article focused, only the parts related to Scheduled Sampling are described here; see `scheduled_sampling.py` for the complete code.
Since Scheduled Sampling is an improvement on the sequence-to-sequence model, its overall implementation framework is quite similar to that of the sequence-to-sequence model. To keep this article focused, only the parts related to Scheduled Sampling are described here; see `network_conf.py` for the complete code.
First, import the required packages and define `RandomScheduleGenerator`, the class that controls the decay probability, as follows:
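The body of `RandomScheduleGenerator` is collapsed in this diff view. For orientation, below is a minimal sketch of what such a generator can look like. `processBatch` is the method actually called by the reader code in this PR; `getScheduleRate` and the exact decay formulas (styled after Bengio et al., 2015) are assumptions, so the repo's `utils.py` may differ in detail:

```python
import math
import numpy as np

class RandomScheduleGenerator:
    """Sketch: per target word, emit 0 to feed the ground-truth token or 1
    to feed the model's own previous output, with a decaying true-token rate."""

    def __init__(self, schedule_type, a, b):
        self.a, self.b = a, b
        self.data_processed_ = 0  # number of target words processed so far
        schedules = {
            "constant": lambda a, b, d: a,
            "linear": lambda a, b, d: max(a, 1.0 - float(d) / b),
            "exponential": lambda a, b, d: pow(a, float(d) / b),
            "inverse_sigmoid": lambda a, b, d: b / (b + math.exp(a * float(d) / b)),
        }
        self.schedule_computer = schedules[schedule_type]

    def getScheduleRate(self):
        # Probability of feeding the true token at the current progress.
        return self.schedule_computer(self.a, self.b, self.data_processed_)

    def processBatch(self, batch_size):
        # One flag per word: 0 -> use the true token, 1 -> use the generated token.
        rate = self.getScheduleRate()
        flags = (np.random.rand(batch_size) >= rate).astype('int32').tolist()
        self.data_processed_ += batch_size
        return flags
```

With the `("linear", 0.75, 1000000)` arguments used below, the true-token probability would decay linearly from 1.0 down to a floor of 0.75.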
......@@ -119,9 +119,10 @@ true_token_flags = paddle.layer.data(
We also need to wrap the original reader to add a data generator for `true_token_flag`. Taking linear decay as an example, the following shows how to call the `RandomScheduleGenerator` defined above to produce input data for `true_token_flag`.
```python
schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
def gen_schedule_data(reader):
def gen_schedule_data(reader,
schedule_type="linear",
decay_a=0.75,
decay_b=1000000):
"""
Creates a data reader for scheduled sampling.
......@@ -130,10 +131,17 @@ def gen_schedule_data(reader):
:param reader: the original reader.
:type reader: callable
:param schedule_type: the type of sampling rate decay.
:type schedule_type: str
:param decay_a: the decay parameter a.
:type decay_a: float
:param decay_b: the decay parameter b.
:type decay_b: float
:return: the new reader with the field "true_token_flag".
:rtype: callable
"""
schedule_generator = RandomScheduleGenerator(schedule_type, decay_a, decay_b)
def data_reader():
for src_ids, trg_ids, trg_ids_next in reader():
```
......@@ -149,61 +157,60 @@ def gen_schedule_data(reader):
```python
def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
true_token_flag):
"""
The decoder step for training.
:param enc_vec: the encoder vector for attention
:type enc_vec: LayerOutput
:param enc_proj: the encoder projection for attention
:type enc_proj: LayerOutput
:param true_word: the ground-truth target word
:type true_word: LayerOutput
:param true_token_flag: the flag of using the ground-truth target word
:type true_token_flag: LayerOutput
:return: the softmax output layer
:rtype: LayerOutput
"""
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
gru_out_memory = paddle.layer.memory(
name='gru_out', size=target_dict_dim)
generated_word = paddle.layer.max_id(input=gru_out_memory)
generated_word_emb = paddle.layer.embedding(
input=generated_word,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
current_word = paddle.layer.multiplex(
input=[true_token_flag, true_word, generated_word_emb])
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
name='gru_out',
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
true_token_flag):
"""
The decoder step for training.
:param enc_vec: the encoder vector for attention
:type enc_vec: LayerOutput
:param enc_proj: the encoder projection for attention
:type enc_proj: LayerOutput
:param true_word: the ground-truth target word
:type true_word: LayerOutput
:param true_token_flag: the flag of using the ground-truth target word
:type true_token_flag: LayerOutput
:return: the softmax output layer
:rtype: LayerOutput
"""
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
gru_out_memory = paddle.layer.memory(
name='gru_out', size=target_dict_dim)
generated_word = paddle.layer.max_id(input=gru_out_memory)
generated_word_emb = paddle.layer.embedding(
input=generated_word,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
current_word = paddle.layer.multiplex(
input=[true_token_flag, true_word, generated_word_emb])
decoder_inputs = paddle.layer.fc(
input=[context, current_word],
size=decoder_size * 3,
act=paddle.activation.Linear(),
bias_attr=False)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
out = paddle.layer.fc(
name='gru_out',
input=gru_step,
size=target_dict_dim,
act=paddle.activation.Softmax())
return out
```
This function uses a `memory` layer, `gru_out_memory`, to remember the output of the previous time step, and picks the highest-probability word from `gru_out_memory` as the generated word `generated_word`. The `multiplex` layer then chooses between the true word `true_word` and the generated word `generated_word`, and feeds the chosen result to the decoder as input. The `multiplex` layer takes three inputs: `true_token_flag`, `true_word`, and `generated_word_emb`. For each position in these inputs, if the value in `true_token_flag` is `0`, the `multiplex` layer outputs the corresponding element of `true_word`; if the value is `1`, it outputs the corresponding element of `generated_word_emb`.
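To make the `multiplex` selection concrete, here is a toy illustration of the indexing semantics in plain Python, with made-up values (not a layer from this repo):

```python
# Per-position selection as performed by the multiplex layer (toy model).
true_token_flag = [0, 1, 0]  # 0 -> pick from true_word, 1 -> pick from generated_word_emb
true_word = ["w1_true", "w2_true", "w3_true"]
generated_word_emb = ["w1_gen", "w2_gen", "w3_gen"]

current_word = [(true_word, generated_word_emb)[flag][i]
                for i, flag in enumerate(true_token_flag)]
print current_word  # ['w1_true', 'w2_gen', 'w3_true']
```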
......
import gzip
import argparse
import distutils.util
import paddle.v2 as paddle
from network_conf import seqToseq_net
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Scheduled Sampling")
parser.add_argument(
'--model_path',
type=str,
required=True,
help="The path for trained model to load.")
parser.add_argument(
'--beam_size',
type=int,
default=3,
help='The width of beam expansion. (default: %(default)s)')
parser.add_argument(
"--use_gpu",
type=distutils.util.strtobool,
default=False,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
type=int,
default=1,
help="Trainer number. (default: %(default)s)")
return parser.parse_args()
def generate(gen_data, dict_size, model_path, beam_size):
beam_gen = seqToseq_net(dict_size, dict_size, beam_size, is_generating=True)
with gzip.open(model_path, 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
# get the dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
# generated sequences are delimited by -1, and the first element
# of each generated sequence is the sequence length
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
for i in xrange(len(gen_data)):
print "\n*******************************************************\n"
print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
if __name__ == '__main__':
args = parse_args()
dict_size = 30000
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
# use the first 3 samples for generation
gen_creator = paddle.dataset.wmt14.gen(dict_size)
gen_data = []
gen_num = 3
for item in gen_creator():
gen_data.append((item[0], ))
if len(gen_data) == gen_num:
break
generate(
gen_data,
dict_size=dict_size,
model_path=args.model_path,
beam_size=args.beam_size)
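As a hedged illustration of the post-processing in `generate` above: the `id` field comes back as a single `-1`-delimited stream of word ids per beam candidate, with a length element at the head of each sequence. The ids below are made up:

```python
# Toy parse of the -1-delimited id stream (made-up ids).
beam_ids = [3, 42, 17, 1, -1,  # candidate 1: leading length element, word ids, <e>
            3, 42, 99, 1, -1]  # candidate 2
seq_list, seq = [], []
for w in beam_ids:
    if w != -1:
        seq.append(w)
    else:
        seq_list.append(seq[1:])  # drop the leading length element
        seq = []
print seq_list  # [[42, 17, 1], [42, 99, 1]]
```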
import sys
import paddle.v2 as paddle
from random_schedule_generator import RandomScheduleGenerator
schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
__all__ = ["seqToseq_net"]
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
max_length = 250
def gen_schedule_data(reader):
"""
Creates a data reader for scheduled sampling.
Output from the iterator created by the original reader will be
appended with "true_token_flag" to indicate whether to use the true token.
:param reader: the original reader.
:type reader: callable
:return: the new reader with the field "true_token_flag".
:rtype: callable
"""
def data_reader():
for src_ids, trg_ids, trg_ids_next in reader():
yield src_ids, trg_ids, trg_ids_next, \
[0] + schedule_generator.processBatch(len(trg_ids) - 1)
return data_reader
def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
def seqToseq_net(source_dict_dim,
target_dict_dim,
beam_size,
is_generating=False):
"""
The definition of the sequence to sequence model
:param source_dict_dim: the dictionary size of the source language
:type source_dict_dim: int
:param target_dict_dim: the dictionary size of the target language
:type target_dict_dim: int
:param beam_size: The width of beam expansion
:type beam_size: int
:param is_generating: whether in generating mode
:type is_generating: Bool
:return: the last layer of the network
:rtype: LayerOutput
"""
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
beam_size = 3
max_length = 250
#### Encoder
src_word_id = paddle.layer.data(
......@@ -55,21 +36,24 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
input=src_word_id, size=word_vector_dim)
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru(
src_reverse = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
encoded_vector = paddle.layer.concat(input=[src_forward, src_reverse])
#### Decoder
with paddle.layer.mixed(size=decoder_size) as encoded_proj:
encoded_proj += paddle.layer.full_matrix_projection(
input=encoded_vector)
encoded_proj = paddle.layer.fc(
input=encoded_vector,
size=decoder_size,
act=paddle.activation.Linear(),
bias_attr=False)
backward_first = paddle.layer.first_seq(input=src_backward)
reverse_first = paddle.layer.first_seq(input=src_reverse)
with paddle.layer.mixed(
size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
decoder_boot += paddle.layer.full_matrix_projection(
input=backward_first)
decoder_boot = paddle.layer.fc(
input=reverse_first,
size=decoder_size,
act=paddle.activation.Tanh(),
bias_attr=False)
def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
true_token_flag):
......@@ -108,10 +92,11 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
current_word = paddle.layer.multiplex(
input=[true_token_flag, true_word, generated_word_emb])
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
decoder_inputs = paddle.layer.fc(
input=[context, current_word],
size=decoder_size * 3,
act=paddle.activation.Linear(),
bias_attr=False)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
......@@ -119,16 +104,14 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
name='gru_out',
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
out = paddle.layer.fc(
name='gru_out',
input=gru_step,
size=target_dict_dim,
act=paddle.activation.Softmax())
return out
def gru_decoder_with_attention_test(enc_vec, enc_proj, current_word):
def gru_decoder_with_attention_gen(enc_vec, enc_proj, current_word):
"""
The decoder step for generating.
:param enc_vec: the encoder vector for attention
......@@ -149,10 +132,11 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
encoded_proj=enc_proj,
decoder_state=decoder_mem)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
decoder_inputs = paddle.layer.fc(
input=[context, current_word],
size=decoder_size * 3,
act=paddle.activation.Linear(),
bias_attr=False)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
......@@ -160,17 +144,16 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
out = paddle.layer.fc(
name='gru_out',
input=gru_step,
size=target_dict_dim,
act=paddle.activation.Softmax())
return out
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
if not is_generating:
trg_embedding = paddle.layer.embedding(
......@@ -179,12 +162,14 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
true_token_flags = paddle.layer.data(
name='true_token_flag',
type=paddle.data_type.integer_value_sequence(2))
group_inputs.append(true_token_flags)
group_inputs = [
group_input1, group_input2, trg_embedding, true_token_flags
]
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
......@@ -194,6 +179,7 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
......@@ -202,122 +188,15 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
group_inputs = [group_input1, group_input2, trg_embedding]
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention_test,
step=gru_decoder_with_attention_gen,
input=group_inputs,
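        # ids as used by paddle.dataset.wmt14: 0 is the start token <s>, 1 the end token <e>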
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
def main():
paddle.init(use_gpu=False, trainer_count=1)
is_generating = False
model_path_for_generating = 'params_pass_1.tar.gz'
# source and target dict dim.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# train the network
if not is_generating:
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
gen_schedule_data(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=8192)),
batch_size=5)
feeding = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2,
'true_token_flag': 3
}
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost,
event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id,
'w') as f:
trainer.save_parameter_to_tar(f)
# start to train
trainer.train(
reader=wmt14_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=2)
# generate an English sequence to French
else:
# use the first 3 samples for generation
gen_creator = paddle.dataset.wmt14.gen(dict_size)
gen_data = []
gen_num = 3
for item in gen_creator():
gen_data.append((item[0], ))
if len(gen_data) == gen_num:
break
beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
# get the trained model
with gzip.open(model_path_for_generating, 'r') as f:
parameters = Parameters.from_tar(f)
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
# get the dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
# generated sequences are delimited by -1, and the first element
# of each generated sequence is the sequence length
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
beam_size = 3
for i in xrange(gen_num):
print "\n*******************************************************\n"
print "src:", ' '.join(
[src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
if __name__ == '__main__':
main()
from utils import RandomScheduleGenerator
def gen_schedule_data(reader,
schedule_type="linear",
decay_a=0.75,
decay_b=1000000):
"""
Creates a data reader for scheduled sampling.
Output from the iterator created by the original reader will be
appended with "true_token_flag" to indicate whether to use the true token.
:param reader: the original reader.
:type reader: callable
:param schedule_type: the type of sampling rate decay.
:type schedule_type: str
:param decay_a: the decay parameter a.
:type decay_a: float
:param decay_b: the decay parameter b.
:type decay_b: float
:return: the new reader with the field "true_token_flag".
:rtype: callable
"""
schedule_generator = RandomScheduleGenerator(schedule_type, decay_a,
decay_b)
def data_reader():
for src_ids, trg_ids, trg_ids_next in reader():
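            # The leading 0 below pins the first decoding step to the
            # ground-truth start token; flags for the rest are sampled.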
yield src_ids, trg_ids, trg_ids_next, \
[0] + schedule_generator.processBatch(len(trg_ids) - 1)
return data_reader
feeding = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2,
'true_token_flag': 3
}
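As a quick, hedged sanity check with a toy reader and made-up ids, each sample yielded by the wrapped reader now carries four fields, matching the column indices declared in `feeding`:

```python
# Toy reader; the field order matches the `feeding` dict above.
def toy_reader():
    yield [2, 5, 7], [0, 4, 6], [4, 6, 1]  # src, trg, trg_next

src, trg, trg_next, flags = next(gen_schedule_data(toy_reader)())
print flags  # e.g. [0, 1, 0]: 0 -> feed the true word, 1 -> feed the generated word
```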
import os
import sys
import gzip
import argparse
import distutils.util
import paddle.v2 as paddle
import reader
from network_conf import seqToseq_net
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Scheduled Sampling")
parser.add_argument(
'--schedule_type',
type=str,
default="linear",
help='The type of sampling rate decay. Supported type: constant, linear, exponential, inverse_sigmoid. (default: %(default)s)'
)
parser.add_argument(
'--decay_a',
type=float,
default=0.75,
help='The sampling rate decay parameter a. (default: %(default)s)')
parser.add_argument(
'--decay_b',
type=float,
default=1000000,
help='The sampling rate decay parameter b. (default: %(default)s)')
parser.add_argument(
'--beam_size',
type=int,
default=3,
help='The width of beam expansion. (default: %(default)s)')
parser.add_argument(
"--use_gpu",
type=distutils.util.strtobool,
default=False,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
type=int,
default=1,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help="Size of a mini-batch. (default: %(default)s)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="Number of passes to train. (default: %(default)s)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help="The path for model to store. (default: %(default)s)")
return parser.parse_args()
def train(dict_size, batch_size, num_passes, beam_size, schedule_type, decay_a,
decay_b, model_dir):
optimizer = paddle.optimizer.Adam(
learning_rate=1e-4,
regularization=paddle.optimizer.L2Regularization(rate=1e-5))
cost = seqToseq_net(dict_size, dict_size, beam_size)
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
wmt14_reader = reader.gen_schedule_data(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=8192),
schedule_type, decay_a, decay_b)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open(
os.path.join(model_dir, 'params_pass_%d.tar.gz' %
event.pass_id), 'w') as f:
trainer.save_parameter_to_tar(f)
# start to train
trainer.train(
reader=paddle.batch(wmt14_reader, batch_size=batch_size),
event_handler=event_handler,
feeding=reader.feeding,
num_passes=num_passes)
if __name__ == '__main__':
args = parse_args()
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train(
dict_size=30000,
batch_size=args.batch_size,
num_passes=args.num_passes,
beam_size=args.beam_size,
schedule_type=args.schedule_type,
decay_a=args.decay_a,
decay_b=args.decay_b,
model_dir=args.model_output_dir)
import math
import numpy as np
class RandomScheduleGenerator:
"""
The random sampling rate for the scheduled sampling algorithm, which uses devcayed
The random sampling rate for the scheduled sampling algorithm, which uses decayed
sampling rate.
"""
......
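For intuition about the decay itself, one could probe the generator directly. The snippet below uses the method names from the sketch earlier on this page (`getScheduleRate` is an assumption; `processBatch` appears in the reader code), and the values are what a linear schedule with `a=0.75, b=1000000` would produce:

```python
g = RandomScheduleGenerator("linear", 0.75, 1000000)
print g.getScheduleRate()  # 1.0 at the start: always feed the true token
g.processBatch(500000)     # advance the schedule by half a million target words
print g.getScheduleRate()  # max(0.75, 1 - 0.5) == 0.75: the decay floor
```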