提交 bca3c03d 编写于 作者: G guosheng

Add reader, ParallelExecutor and refine for Transformer

上级 e7684f07
class TrainTaskConfig(object): class TrainTaskConfig(object):
use_gpu = False use_gpu = True
# the epoch number to train. # the epoch number to train.
pass_num = 2 pass_num = 30
# the number of sequences contained in a mini-batch. # the number of sequences contained in a mini-batch.
batch_size = 64 batch_size = 32
# the hyper parameters for Adam optimizer. # the hyper parameters for Adam optimizer.
learning_rate = 0.001 # This static learning_rate will multiply LearningRateScheduler
# derived learning rate to get the final learning rate.
learning_rate = 1
beta1 = 0.9 beta1 = 0.9
beta2 = 0.98 beta2 = 0.98
eps = 1e-9 eps = 1e-9
# the parameters for learning rate scheduling. # the parameters for learning rate scheduling.
warmup_steps = 4000 warmup_steps = 4000
# the flag indicating to use average loss or sum loss when training. # the flag indicating to use average loss or sum loss when training.
use_avg_cost = False use_avg_cost = True
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models. # the directory for saving trained models.
model_dir = "trained_models" model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided if use checkpoints, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
class InferTaskConfig(object): class InferTaskConfig(object):
use_gpu = False use_gpu = True
# the number of examples in one run for sequence generation. # the number of examples in one run for sequence generation.
batch_size = 10 batch_size = 10
# the parameters for beam search. # the parameters for beam search.
beam_size = 5 beam_size = 5
max_length = 30 max_length = 30
# the number of decoded sentences to output. # the number of decoded sentences to output.
n_best = 1 n_best = 1
# the flags indicating whether to output the special tokens. # the flags indicating whether to output the special tokens.
output_bos = False output_bos = False
output_eos = False output_eos = False
output_unk = False output_unk = False
# the directory for loading the trained model. # the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model" model_path = "trained_models/pass_1.infer.model"
...@@ -47,30 +54,24 @@ class ModelHyperParams(object): ...@@ -47,30 +54,24 @@ class ModelHyperParams(object):
# <unk> token has already been added. As for the <pad> token, any token # <unk> token has already been added. As for the <pad> token, any token
# included in dict can be used to pad, since the paddings' loss will be # included in dict can be used to pad, since the paddings' loss will be
# masked out and make no effect on parameter gradients. # masked out and make no effect on parameter gradients.
# size of source word dictionary. # size of source word dictionary.
src_vocab_size = 10000 src_vocab_size = 10000
# size of target word dictionary # size of target word dictionary
trg_vocab_size = 10000 trg_vocab_size = 10000
# index for <bos> token # index for <bos> token
bos_idx = 0 bos_idx = 0
# index for <eos> token # index for <eos> token
eos_idx = 1 eos_idx = 1
# index for <unk> token # index for <unk> token
unk_idx = 2 unk_idx = 2
# max length of sequences. # max length of sequences.
# The size of position encoding table should at least plus 1, since the # The size of position encoding table should at least plus 1, since the
# sinusoid position encoding starts from 1 and 0 can be used as the padding # sinusoid position encoding starts from 1 and 0 can be used as the padding
# token for position encoding. # token for position encoding.
max_length = 50 max_length = 50
# the dimension for word embeddings, which is also the last dimension of # the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward # the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder. # networks, encoder and decoder.
d_model = 512 d_model = 512
# size of the hidden layer in position-wise feed-forward networks. # size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 1024 d_inner_hid = 1024
...@@ -86,34 +87,116 @@ class ModelHyperParams(object): ...@@ -86,34 +87,116 @@ class ModelHyperParams(object):
dropout = 0.1 dropout = 0.1
def merge_cfg_from_list(cfg_list, g_cfgs):
    """
    Set the above global configurations using the cfg_list.

    :param cfg_list: A flat list of alternating option names and values,
        i.e. [key1, value1, key2, value2, ...].
    :type cfg_list: list
    :param g_cfgs: The configuration classes to update. For each key, the
        first class that already has an attribute of that name receives the
        value; keys matched by no class are ignored.
    :type g_cfgs: list

    Values are parsed as Python literals (numbers, booleans, None, strings,
    tuples, lists, dicts). Anything that is not a literal, such as a file
    path, is stored unchanged as the raw string.
    """
    # Imported locally so the function does not rely on a file-level import.
    import ast

    assert len(cfg_list) % 2 == 0
    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
        for g_cfg in g_cfgs:
            if hasattr(g_cfg, key):
                try:
                    # literal_eval never executes code, unlike eval, and it
                    # reports bare words such as `trained_models` as
                    # ValueError instead of the NameError eval would raise.
                    value = ast.literal_eval(value)
                except (ValueError, SyntaxError):  # for file path
                    pass
                setattr(g_cfg, key, value)
                break
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
# Each entry maps an input name to [shape tuple, dtype string]. Shape
# dimensions are written as plain ints: the `L` long-literal suffix is a
# syntax error on Python 3 and is not needed on Python 2.
input_descs = {
    # The actual data shape of src_word is:
    # [batch_size * max_src_len_in_batch, 1]
    "src_word": [(1 * (ModelHyperParams.max_length + 1), 1), "int64"],
    # The actual data shape of src_pos is:
    # [batch_size * max_src_len_in_batch, 1]
    "src_pos": [(1 * (ModelHyperParams.max_length + 1), 1), "int64"],
    # This input is used to remove attention weights on paddings in the
    # encoder.
    # The actual data shape of src_slf_attn_bias is:
    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
    "src_slf_attn_bias":
    [(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
      (ModelHyperParams.max_length + 1)), "float32"],
    # This shape input is used to reshape the output of embedding layer.
    "src_data_shape": [(3, ), "int32"],
    # This shape input is used to reshape before softmax in self attention.
    "src_slf_attn_pre_softmax_shape": [(2, ), "int32"],
    # This shape input is used to reshape after softmax in self attention.
    "src_slf_attn_post_softmax_shape": [(4, ), "int32"],
    # The actual data shape of trg_word is:
    # [batch_size * max_trg_len_in_batch, 1]
    "trg_word": [(1 * (ModelHyperParams.max_length + 1), 1), "int64"],
    # The actual data shape of trg_pos is:
    # [batch_size * max_trg_len_in_batch, 1]
    "trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1), "int64"],
    # This input is used to remove attention weights on paddings and
    # subsequent words in the decoder.
    # The actual data shape of trg_slf_attn_bias is:
    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
    "trg_slf_attn_bias": [(1, ModelHyperParams.n_head,
                           (ModelHyperParams.max_length + 1),
                           (ModelHyperParams.max_length + 1)), "float32"],
    # This input is used to remove attention weights on paddings of the source
    # input in the encoder-decoder attention.
    # The actual data shape of trg_src_attn_bias is:
    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
    "trg_src_attn_bias": [(1, ModelHyperParams.n_head,
                           (ModelHyperParams.max_length + 1),
                           (ModelHyperParams.max_length + 1)), "float32"],
    # This shape input is used to reshape the output of embedding layer.
    "trg_data_shape": [(3, ), "int32"],
    # This shape input is used to reshape before softmax in self attention.
    "trg_slf_attn_pre_softmax_shape": [(2, ), "int32"],
    # This shape input is used to reshape after softmax in self attention.
    "trg_slf_attn_post_softmax_shape": [(4, ), "int32"],
    # This shape input is used to reshape before softmax in encoder-decoder
    # attention.
    "trg_src_attn_pre_softmax_shape": [(2, ), "int32"],
    # This shape input is used to reshape after softmax in encoder-decoder
    # attention.
    "trg_src_attn_post_softmax_shape": [(4, ), "int32"],
    # This input is used in independent decoder program for inference.
    # The actual data shape of enc_output is:
    # [batch_size, max_src_len_in_batch, d_model]
    "enc_output": [(1, (ModelHyperParams.max_length + 1),
                    ModelHyperParams.d_model), "float32"],
    # The actual data shape of label_word is:
    # [batch_size * max_trg_len_in_batch, 1]
    "lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1), "int64"],
    # This input is used to mask out the loss of padding tokens.
    # The actual data shape of label_weight is:
    # [batch_size * max_trg_len_in_batch, 1]
    "lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1), "float32"],
}
# Names of position encoding table which will be initialized externally. # Names of position encoding table which will be initialized externally.
pos_enc_param_names = ( pos_enc_param_names = (
"src_pos_enc_table", "src_pos_enc_table",
"trg_pos_enc_table", ) "trg_pos_enc_table", )
# separated inputs for different usages.
# Names of all data layers in encoder listed in order. encoder_data_input_fields = (
encoder_input_data_names = (
"src_word", "src_word",
"src_pos", "src_pos",
"src_slf_attn_bias", "src_slf_attn_bias", )
encoder_util_input_fields = (
"src_data_shape", "src_data_shape",
"src_slf_attn_pre_softmax_shape", "src_slf_attn_pre_softmax_shape",
"src_slf_attn_post_softmax_shape", ) "src_slf_attn_post_softmax_shape", )
decoder_data_input_fields = (
# Names of all data layers in decoder listed in order.
decoder_input_data_names = (
"trg_word", "trg_word",
"trg_pos", "trg_pos",
"trg_slf_attn_bias", "trg_slf_attn_bias",
"trg_src_attn_bias", "trg_src_attn_bias",
"enc_output", )
decoder_util_input_fields = (
"trg_data_shape", "trg_data_shape",
"trg_slf_attn_pre_softmax_shape", "trg_slf_attn_pre_softmax_shape",
"trg_slf_attn_post_softmax_shape", "trg_slf_attn_post_softmax_shape",
"trg_src_attn_pre_softmax_shape", "trg_src_attn_pre_softmax_shape",
"trg_src_attn_post_softmax_shape", "trg_src_attn_post_softmax_shape", )
"enc_output", ) label_data_input_fields = (
# Names of label related data layers listed in order.
label_data_names = (
"lbl_word", "lbl_word",
"lbl_weight", ) "lbl_weight", )
import argparse
import numpy as np import numpy as np
import paddle import paddle
...@@ -6,9 +7,52 @@ import paddle.fluid as fluid ...@@ -6,9 +7,52 @@ import paddle.fluid as fluid
import model import model
from model import wrap_encoder as encoder from model import wrap_encoder as encoder
from model import wrap_decoder as decoder from model import wrap_decoder as decoder
from config import InferTaskConfig, ModelHyperParams, \ from config import *
encoder_input_data_names, decoder_input_data_names
from train import pad_batch_data from train import pad_batch_data
import reader
def parse_args():
    """
    Parse the command line arguments of the inference script.

    Besides the named options, every trailing token is collected into `opts`
    as alternating key/value pairs and merged into InferTaskConfig and
    ModelHyperParams through merge_cfg_from_list.

    :return: The parsed arguments.
    :rtype: argparse.Namespace
    """
    # This parser belongs to the inference entry point; the label used to
    # read "Training for Transformer.", copied over from the training script.
    parser = argparse.ArgumentParser("Inference for Transformer.")
    parser.add_argument(
        "--src_vocab_fpath",
        type=str,
        required=True,
        help="The path of vocabulary file of source language.")
    parser.add_argument(
        "--trg_vocab_fpath",
        type=str,
        required=True,
        help="The path of vocabulary file of target language.")
    parser.add_argument(
        "--test_file_pattern",
        type=str,
        required=True,
        help="The pattern to match test data files.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=50,
        help="The number of examples in one run for sequence generation.")
    parser.add_argument(
        "--pool_size",
        type=int,
        default=10000,
        help="The buffer size to pool data.")
    parser.add_argument(
        "--special_token",
        type=str,
        default=["<s>", "<e>", "<unk>"],
        nargs=3,
        help="The <bos>, <eos> and <unk> tokens in the dictionary.")
    parser.add_argument(
        'opts',
        help='See config.py for all options',
        default=None,
        nargs=argparse.REMAINDER)
    args = parser.parse_args()
    # Apply the free-form overrides to the global configuration classes.
    merge_cfg_from_list(args.opts, [InferTaskConfig, ModelHyperParams])
    return args
def translate_batch(exe, def translate_batch(exe,
...@@ -243,7 +287,7 @@ def translate_batch(exe, ...@@ -243,7 +287,7 @@ def translate_batch(exe,
return seqs, scores[:, :n_best].tolist() return seqs, scores[:, :n_best].tolist()
def main(): def infer(args):
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -292,13 +336,23 @@ def main(): ...@@ -292,13 +336,23 @@ def main():
decoder_program = fluid.io.get_inference_program( decoder_program = fluid.io.get_inference_program(
target_vars=[predict], main_program=decoder_program) target_vars=[predict], main_program=decoder_program)
test_data = paddle.batch( test_data = reader.DataReader(
paddle.dataset.wmt16.test(ModelHyperParams.src_vocab_size, src_vocab_fpath=args.src_vocab_fpath,
ModelHyperParams.trg_vocab_size), trg_vocab_fpath=args.trg_vocab_fpath,
batch_size=InferTaskConfig.batch_size) fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
clip_last_batch=False)
trg_idx2word = paddle.dataset.wmt16.get_dict( trg_idx2word = test_data._load_dict(
"de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True) dict_path=args.trg_vocab_fpath, reverse=True)
def post_process_seq(seq, def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx, bos_idx=ModelHyperParams.bos_idx,
...@@ -320,15 +374,16 @@ def main(): ...@@ -320,15 +374,16 @@ def main():
(output_eos or idx != eos_idx), (output_eos or idx != eos_idx),
seq) seq)
for batch_id, data in enumerate(test_data()): for batch_id, data in enumerate(test_data.batch_generator()):
batch_seqs, batch_scores = translate_batch( batch_seqs, batch_scores = translate_batch(
exe, exe,
[item[0] for item in data], [item[0] for item in data],
encoder_program, encoder_program,
encoder_input_data_names, encoder_data_input_fields + encoder_util_input_fields,
[enc_output.name], [enc_output.name],
decoder_program, decoder_program,
decoder_input_data_names, decoder_data_input_fields[:-1] + decoder_util_input_fields +
(decoder_data_input_fields[-1], ),
[predict.name], [predict.name],
InferTaskConfig.beam_size, InferTaskConfig.beam_size,
InferTaskConfig.max_length, InferTaskConfig.max_length,
...@@ -351,4 +406,5 @@ def main(): ...@@ -351,4 +406,5 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() args = parse_args()
infer(args)
...@@ -4,8 +4,7 @@ import numpy as np ...@@ -4,8 +4,7 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from config import TrainTaskConfig, pos_enc_param_names, \ from config import *
encoder_input_data_names, decoder_input_data_names, label_data_names
def position_encoding_init(n_position, d_pos_vec): def position_encoding_init(n_position, d_pos_vec):
...@@ -171,7 +170,6 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): ...@@ -171,7 +170,6 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
""" """
Add residual connection, layer normalization and droput to the out tensor Add residual connection, layer normalization and droput to the out tensor
optionally according to the value of process_cmd. optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise This will be used before or after multi-head attention and position-wise
feed-forward networks. feed-forward networks.
""" """
...@@ -206,7 +204,6 @@ def prepare_encoder(src_word, ...@@ -206,7 +204,6 @@ def prepare_encoder(src_word,
"""Add word embeddings and position encodings. """Add word embeddings and position encodings.
The output tensor has a shape of: The output tensor has a shape of:
[batch_size, max_src_length_in_batch, d_model]. [batch_size, max_src_length_in_batch, d_model].
This module is used at the bottom of the encoder stacks. This module is used at the bottom of the encoder stacks.
""" """
src_word_emb = layers.embedding( src_word_emb = layers.embedding(
...@@ -245,7 +242,6 @@ def encoder_layer(enc_input, ...@@ -245,7 +242,6 @@ def encoder_layer(enc_input,
pre_softmax_shape=None, pre_softmax_shape=None,
post_softmax_shape=None): post_softmax_shape=None):
"""The encoder layers that can be stacked to form a deep encoder. """The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization with the post_process_layer to add residual connection, layer normalization
...@@ -306,7 +302,6 @@ def decoder_layer(dec_input, ...@@ -306,7 +302,6 @@ def decoder_layer(dec_input,
src_attn_pre_softmax_shape=None, src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None): src_attn_post_softmax_shape=None):
""" The layer to be stacked in decoder part. """ The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except The structure of this module is similar to that in the encoder part except
a multi-head attention is added to implement encoder-decoder attention. a multi-head attention is added to implement encoder-decoder attention.
""" """
...@@ -394,116 +389,19 @@ def decoder(dec_input, ...@@ -394,116 +389,19 @@ def decoder(dec_input,
return dec_output return dec_output
def make_inputs(input_data_names, def make_all_inputs(input_fields):
n_head,
d_model,
max_length,
is_pos,
slf_attn_bias_flag,
src_attn_bias_flag,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=True):
""" """
Define the input data layers for the transformer model. Define the input data layers for the transformer model.
""" """
input_layers = [] inputs = []
batch_size = 1 # Only for the infer-shape in compile time. for input_field in input_fields:
# The shapes here act as placeholder and are set to pass the infer-shape in input_var = layers.data(
# compile time. name=input_field,
# The actual data shape of word is: shape=input_descs[input_field][0],
# [batch_size * max_len_in_batch, 1] dtype=input_descs[input_field][1],
word = layers.data(
name=input_data_names[len(input_layers)],
shape=[batch_size * max_length, 1],
dtype="int64",
append_batch_size=False)
input_layers += [word]
# This is used for position data or label weight.
# The actual data shape of pos is:
# [batch_size * max_len_in_batch, 1]
pos = layers.data(
name=input_data_names[len(input_layers)],
shape=[batch_size * max_length, 1],
dtype="int64" if is_pos else "float32",
append_batch_size=False)
input_layers += [pos]
if slf_attn_bias_flag:
# This input is used to remove attention weights on paddings for the
# encoder and to remove attention weights on subsequent words for the
# decoder.
# The actual data shape of slf_attn_bias_flag is:
# [batch_size, n_head, max_len_in_batch, max_len_in_batch]
slf_attn_bias = layers.data(
name=input_data_names[len(input_layers)],
shape=[batch_size, n_head, max_length, max_length],
dtype="float32",
append_batch_size=False)
input_layers += [slf_attn_bias]
if src_attn_bias_flag:
# This input is used to remove attention weights on paddings. It's used
# in encoder-decoder attention.
# The actual data shape of slf_attn_bias_flag is:
# [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch]
src_attn_bias = layers.data(
name=input_data_names[len(input_layers)],
shape=[batch_size, n_head, max_length, max_length],
dtype="float32",
append_batch_size=False)
input_layers += [src_attn_bias]
if data_shape_flag:
# This input is used to reshape the output of embedding layer.
data_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[3],
dtype="int32",
append_batch_size=False) append_batch_size=False)
input_layers += [data_shape] inputs.append(input_var)
if slf_attn_shape_flag: return inputs
# This shape input is used to reshape before softmax in self attention.
slf_attn_pre_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[2],
dtype="int32",
append_batch_size=False)
input_layers += [slf_attn_pre_softmax_shape]
# This shape input is used to reshape after softmax in self attention.
slf_attn_post_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[4],
dtype="int32",
append_batch_size=False)
input_layers += [slf_attn_post_softmax_shape]
if src_attn_shape_flag:
# This shape input is used to reshape before softmax in encoder-decoder
# attention.
src_attn_pre_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[2],
dtype="int32",
append_batch_size=False)
input_layers += [src_attn_pre_softmax_shape]
# This shape input is used to reshape after softmax in encoder-decoder
# attention.
src_attn_post_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[4],
dtype="int32",
append_batch_size=False)
input_layers += [src_attn_post_softmax_shape]
if enc_output_flag:
# This input is used in independent decoder program for inference.
# The actual data shape of slf_attn_bias_flag is:
# [batch_size, max_len_in_batch, d_model]
enc_output = layers.data(
name=input_data_names[len(input_layers)],
shape=[batch_size, max_length, d_model],
dtype="float32",
append_batch_size=False)
input_layers += [enc_output]
return input_layers
def transformer( def transformer(
...@@ -516,19 +414,10 @@ def transformer( ...@@ -516,19 +414,10 @@ def transformer(
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, ): dropout_rate,
enc_inputs = make_inputs( label_smooth_eps, ):
encoder_input_data_names, enc_inputs = make_all_inputs(encoder_data_input_fields +
n_head, encoder_util_input_fields)
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=False,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=False)
enc_output = wrap_encoder( enc_output = wrap_encoder(
src_vocab_size, src_vocab_size,
...@@ -542,18 +431,8 @@ def transformer( ...@@ -542,18 +431,8 @@ def transformer(
dropout_rate, dropout_rate,
enc_inputs, ) enc_inputs, )
dec_inputs = make_inputs( dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] +
decoder_input_data_names, decoder_util_input_fields)
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=True,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=True)
predict = wrap_decoder( predict = wrap_decoder(
trg_vocab_size, trg_vocab_size,
...@@ -570,19 +449,17 @@ def transformer( ...@@ -570,19 +449,17 @@ def transformer(
# Padding index do not contribute to the total loss. The weights is used to # Padding index do not contribute to the total loss. The weights is used to
# cancel padding index in calculating the loss. # cancel padding index in calculating the loss.
gold, weights = make_inputs( label, weights = make_all_inputs(label_data_input_fields)
label_data_names, if label_smooth_eps:
n_head, label = layers.label_smooth(
d_model, label=layers.one_hot(
max_length, input=label, depth=trg_vocab_size),
is_pos=False, epsilon=label_smooth_eps)
slf_attn_bias_flag=False, cost = layers.softmax_with_cross_entropy(
src_attn_bias_flag=False, logits=predict,
enc_output_flag=False, label=label,
data_shape_flag=False, soft_label=True if label_smooth_eps else False)
slf_attn_shape_flag=False, # cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
src_attn_shape_flag=False)
cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
weighted_cost = cost * weights weighted_cost = cost * weights
sum_cost = layers.reduce_sum(weighted_cost) sum_cost = layers.reduce_sum(weighted_cost)
token_num = layers.reduce_sum(weights) token_num = layers.reduce_sum(weights)
...@@ -607,18 +484,8 @@ def wrap_encoder(src_vocab_size, ...@@ -607,18 +484,8 @@ def wrap_encoder(src_vocab_size,
# This is used to implement independent encoder program in inference. # This is used to implement independent encoder program in inference.
src_word, src_pos, src_slf_attn_bias, src_data_shape, \ src_word, src_pos, src_slf_attn_bias, src_data_shape, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
make_inputs( make_all_inputs(encoder_data_input_fields +
encoder_input_data_names, encoder_util_input_fields)
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=False,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=False)
else: else:
src_word, src_pos, src_slf_attn_bias, src_data_shape, \ src_word, src_pos, src_slf_attn_bias, src_data_shape, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
...@@ -663,20 +530,10 @@ def wrap_decoder(trg_vocab_size, ...@@ -663,20 +530,10 @@ def wrap_decoder(trg_vocab_size,
if dec_inputs is None: if dec_inputs is None:
# This is used to implement independent decoder program in inference. # This is used to implement independent decoder program in inference.
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, slf_attn_pre_softmax_shape, \ enc_output, trg_data_shape, slf_attn_pre_softmax_shape, \
slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
src_attn_post_softmax_shape, enc_output = make_inputs( src_attn_post_softmax_shape = make_all_inputs(
decoder_input_data_names, decoder_data_input_fields + decoder_util_input_fields)
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=True,
enc_output_flag=True,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=True)
else: else:
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, slf_attn_pre_softmax_shape, \ trg_data_shape, slf_attn_pre_softmax_shape, \
......
...@@ -14,27 +14,24 @@ class LearningRateScheduler(object): ...@@ -14,27 +14,24 @@ class LearningRateScheduler(object):
def __init__(self, def __init__(self,
d_model, d_model,
warmup_steps, warmup_steps,
place,
learning_rate=0.001, learning_rate=0.001,
current_steps=0, current_steps=0,
name="learning_rate"): name="learning_rate"):
self.current_steps = current_steps self.current_steps = current_steps
self.warmup_steps = warmup_steps self.warmup_steps = warmup_steps
self.d_model = d_model self.d_model = d_model
self.static_lr = learning_rate
self.learning_rate = layers.create_global_var( self.learning_rate = layers.create_global_var(
name=name, name=name,
shape=[1], shape=[1],
value=float(learning_rate), value=float(learning_rate),
dtype="float32", dtype="float32",
persistable=True) persistable=True)
self.place = place
def update_learning_rate(self, data_input): def update_learning_rate(self):
self.current_steps += 1 self.current_steps += 1
lr_value = np.power(self.d_model, -0.5) * np.min([ lr_value = np.power(self.d_model, -0.5) * np.min([
np.power(self.current_steps, -0.5), np.power(self.current_steps, -0.5),
np.power(self.warmup_steps, -1.5) * self.current_steps np.power(self.warmup_steps, -1.5) * self.current_steps
]) ]) * self.static_lr
lr_tensor = fluid.LoDTensor() return np.array([lr_value], dtype="float32")
lr_tensor.set(np.array([lr_value], dtype="float32"), self.place)
data_input[self.learning_rate.name] = lr_tensor
import os
import tarfile
import glob
import random
class SortType(object):
    """The granularity at which samples are sorted by length."""

    # Sort all instances.
    GLOBAL = "global"
    # Sort the instances held in the pool.
    POOL = "pool"
    # Do not sort.
    NONE = "none"
class EndEpoch():
    """Marker object that Pool appends once its sample generator is exhausted."""
class Pool(object):
    """
    A bounded buffer of samples drawn from a generator.

    :param sample_generator: A callable returning an iterator of samples.
    :param pool_size: The maximum number of samples kept in the buffer.
    :type pool_size: int
    :param sort: Whether to sort the buffered samples by length after
        every refill.
    :type sort: bool
    """

    def __init__(self, sample_generator, pool_size, sort):
        self._pool_size = pool_size
        self._pool = []
        self._sample_generator = sample_generator()
        self._end = False
        self._sort = sort

    def _fill(self):
        """Top the buffer up to `pool_size`, sorting it if requested."""
        while len(self._pool) < self._pool_size and not self._end:
            try:
                # The builtin next() works on Python 2.6+ and Python 3,
                # whereas the iterator method `.next()` is Python 2 only.
                sample = next(self._sample_generator)
                self._pool.append(sample)
            except StopIteration:
                self._end = True
                break

        if self._sort:
            # A sample is ordered by its longest sequence: the longer of the
            # first two fields, or the only field when there is just one.
            self._pool.sort(
                key=lambda sample: max(len(sample[0]), len(sample[1]))
                if len(sample) > 1 else len(sample[0]))

        # Once the generator is exhausted and the buffer is not full, an
        # EndEpoch marker is appended after the remaining samples.
        if self._end and len(self._pool) < self._pool_size:
            self._pool.append(EndEpoch())

    def push_back(self, samples):
        """
        Put `samples` back into an empty buffer, then refill it.

        :raises Exception: If the buffer is not empty, or if `samples` holds
            `pool_size` or more items.
        """
        if len(self._pool) != 0:
            raise Exception("Pool should be empty.")

        if len(samples) >= self._pool_size:
            raise Exception("Capacity of pool should be greater than a batch. "
                            "Please enlarge `pool_size`.")

        for sample in samples:
            self._pool.append(sample)

        self._fill()

    def next(self, look=False):
        """
        Return the first buffered item, or None when the buffer is empty.

        :param look: If True, leave the item in the buffer; otherwise
            remove it.
        :type look: bool
        """
        if len(self._pool) == 0:
            return None
        else:
            return self._pool[0] if look else self._pool.pop(0)
class DataReader(object):
"""
The data reader loads all data from files and produces batches of data
in the way corresponding to settings; a batch can be sized either by
number of tokens or by number of sequences.
"""
def __init__(self,
             src_vocab_fpath,
             trg_vocab_fpath,
             fpattern,
             batch_size,
             pool_size,
             sort_type=SortType.NONE,
             clip_last_batch=True,
             tar_fname=None,
             min_length=0,
             max_length=100,
             shuffle=True,
             shuffle_batch=False,
             use_token_batch=False,
             delimiter="\t",
             start_mark="<s>",
             end_mark="<e>",
             unk_mark="<unk>",
             seed=0):
    """
    Load all data from files and set the settings to make mini-batches.

    :param src_vocab_fpath: The path of vocabulary file of source language.
    :type src_vocab_fpath: basestring
    :param trg_vocab_fpath: The path of vocabulary file of target language.
        If None, only source sequences are loaded.
    :type trg_vocab_fpath: basestring
    :param fpattern: The pattern to match data files.
    :type fpattern: basestring
    :param batch_size: The number of sequences contained in a mini-batch.
        or the maximum number of tokens (include paddings) contained in a
        mini-batch.
    :type batch_size: int
    :param pool_size: The buffer size to pool data.
    :type pool_size: int
    :param sort_type: The grain to sort by length: 'global' for all
        instances; 'pool' for instances in pool; 'none' for no sort.
    :type sort_type: basestring
    :param clip_last_batch: Whether to clip the last uncompleted batch.
    :type clip_last_batch: bool
    :param tar_fname: The data file in tar if fpattern matches a tar file.
    :type tar_fname: basestring
    :param min_length: The minimum length used to filter sequences.
    :type min_length: int
    :param max_length: The maximum length used to filter sequences.
    :type max_length: int
    :param shuffle: Whether to shuffle all instances.
    :type shuffle: bool
    :param shuffle_batch: Whether to shuffle the generated batches.
    :type shuffle_batch: bool
    :param use_token_batch: Whether to produce batch data according to
        token number.
    :type use_token_batch: bool
    :param delimiter: The delimiter used to split source and target in each
        line of data file.
    :type delimiter: basestring
    :param start_mark: The token representing for the beginning of
        sentences in dictionary.
    :type start_mark: basestring
    :param end_mark: The token representing for the end of sentences
        in dictionary.
    :type end_mark: basestring
    :param unk_mark: The token representing for unknown word in dictionary.
    :type unk_mark: basestring
    :param seed: The seed for random.
    :type seed: int
    :raises Exception: If the numbers of source and target sequences differ.
    """
    self._src_vocab = self._load_dict(src_vocab_fpath)
    self._only_src = True
    if trg_vocab_fpath is not None:
        self._trg_vocab = self._load_dict(trg_vocab_fpath)
        self._only_src = False

    # These settings are read while parsing the data files, so they must be
    # stored before _load_data is called.
    self._pool_size = pool_size
    self._batch_size = batch_size
    self._use_token_batch = use_token_batch
    self._sort_type = sort_type
    self._clip_last_batch = clip_last_batch
    self._shuffle = shuffle
    self._shuffle_batch = shuffle_batch
    self._min_length = min_length
    self._max_length = max_length
    self._delimiter = delimiter
    self._epoch_batches = []

    src_seq_words, trg_seq_words = self._load_data(fpattern, tar_fname)

    # Words missing from the vocabulary map to the id of unk_mark; every
    # sequence is wrapped with the start and end marks before conversion.
    src_unk_id = self._src_vocab.get(unk_mark)
    self._src_seq_ids = [[
        self._src_vocab.get(word, src_unk_id)
        for word in ([start_mark] + src_seq + [end_mark])
    ] for src_seq in src_seq_words]

    self._sample_count = len(self._src_seq_ids)

    if not self._only_src:
        trg_unk_id = self._trg_vocab.get(unk_mark)
        self._trg_seq_ids = [[
            self._trg_vocab.get(word, trg_unk_id)
            for word in ([start_mark] + trg_seq + [end_mark])
        ] for trg_seq in trg_seq_words]
        if len(self._trg_seq_ids) != self._sample_count:
            raise Exception("Inconsistent sample count between "
                            "source sequences and target sequences.")
    else:
        self._trg_seq_ids = None

    # list(range(...)) builds the same index list on Python 2 and Python 3,
    # whereas xrange exists on Python 2 only.
    self._sample_idxs = list(range(self._sample_count))
    self._sorted = False

    # Seeds the module-level random generator.
    random.seed(seed)
def _parse_file(self, f_obj):
    """
    Parse the lines of an opened data file into tokenized sequences.

    Each line is stripped and split by ``self._delimiter``. A line is
    kept only if it has the expected number of fields (one for
    source-only data, two for parallel data) and every field, once
    split on whitespace, is non-empty, within
    ``[self._min_length, self._max_length]`` and, when token batching
    is enabled, no longer than ``self._batch_size`` words.

    :param f_obj: An iterable yielding the lines of the data file.
    :type f_obj: iterable
    :return: A tuple ``(src_seq_words, trg_seq_words)`` of word lists;
        ``trg_seq_words`` stays empty for source-only data.
    :rtype: tuple
    """
    # Source-only data carries a single field per line, parallel data
    # carries two. The previous check demanded two fields in both modes,
    # which discarded every line of source-only data.
    expected_field_num = 1 if self._only_src else 2

    src_seq_words = []
    trg_seq_words = []
    for line in f_obj:
        fields = line.strip().split(self._delimiter)
        if len(fields) != expected_field_num:
            continue

        sample_words = []
        is_valid_sample = True
        max_len = -1
        for seq in fields:
            seq_words = seq.split()
            max_len = max(max_len, len(seq_words))
            if len(seq_words) == 0 or \
                    len(seq_words) < self._min_length or \
                    len(seq_words) > self._max_length or \
                    (self._use_token_batch and
                     max_len > self._batch_size):
                is_valid_sample = False
                break
            sample_words.append(seq_words)
        if not is_valid_sample:
            continue

        src_seq_words.append(sample_words[0])
        if not self._only_src:
            trg_seq_words.append(sample_words[1])

    return (src_seq_words, trg_seq_words)
def _load_data(self, fpattern, tar_fname):
    """
    Load and parse all data files matched by a glob pattern.

    If the pattern matches exactly one path and that path is a tar
    archive, the member named ``tar_fname`` is parsed. Otherwise every
    matched path is parsed as a plain text file and the results are
    concatenated in the order returned by ``glob.glob``.

    :param fpattern: The glob pattern of the data files.
    :type fpattern: basestring
    :param tar_fname: The member to read when a tar archive is matched.
    :type tar_fname: basestring
    :return: A tuple ``(src_seq_words, trg_seq_words)`` as produced by
        ``self._parse_file``.
    :rtype: tuple
    :raises Exception: If a tar archive is matched but ``tar_fname`` is
        None.
    :raises IOError: If a matched path is not a regular file.
    """
    fpaths = glob.glob(fpattern)
    src_seq_words = []
    trg_seq_words = []

    if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
        if tar_fname is None:
            raise Exception("If tar file provided, please set tar_fname.")
        # Parse while the archive is open, then release the handle
        # instead of leaving it to the garbage collector.
        with tarfile.open(fpaths[0], 'r') as tar_obj:
            part_file_data = self._parse_file(
                tar_obj.extractfile(tar_fname))
        src_seq_words = part_file_data[0]
        trg_seq_words = part_file_data[1]
    else:
        for fpath in fpaths:
            if not os.path.isfile(fpath):
                raise IOError("Invalid file: %s" % fpath)
            # Close each data file as soon as it has been parsed.
            with open(fpath, 'r') as f_obj:
                part_file_data = self._parse_file(f_obj)
            src_seq_words.extend(part_file_data[0])
            trg_seq_words.extend(part_file_data[1])

    return src_seq_words, trg_seq_words
def _load_dict(self, dict_path, reverse=False):
    """
    Read a vocabulary file holding one entry per line.

    The zero-based line number is used as the id of the (stripped)
    entry on that line.

    :param dict_path: The path of the vocabulary file.
    :type dict_path: basestring
    :param reverse: If True map id to word, otherwise map word to id.
    :type reverse: bool
    :return: The loaded mapping. For repeated words the last line wins
        in the word-to-id direction.
    :rtype: dict
    """
    with open(dict_path, "r") as fdict:
        entries = [line.strip() for line in fdict]

    if reverse:
        return dict(enumerate(entries))
    return dict((word, idx) for idx, word in enumerate(entries))
def _sample_generator(self):
    """
    Yield the loaded samples one by one for a single pass.

    With ``SortType.GLOBAL`` the sample indices are sorted once by
    sequence length (the longer of source and target) and reused on
    later calls; otherwise they are reshuffled on every call when
    ``self._shuffle`` is set.

    :return: A generator yielding the source id list for source-only
        data, or a ``(src_ids, trg_ids[:-1], trg_ids[1:])`` tuple for
        parallel data.
    """
    if self._sort_type == SortType.GLOBAL:
        if not self._sorted:

            def sample_len(idx):
                # There is no target side for source-only data. The
                # previous key evaluated len(0) in that case and raised
                # TypeError.
                src_len = len(self._src_seq_ids[idx])
                if self._only_src:
                    return src_len
                return max(src_len, len(self._trg_seq_ids[idx]))

            self._sample_idxs.sort(key=sample_len)
            self._sorted = True
    elif self._shuffle:
        random.shuffle(self._sample_idxs)

    for sample_idx in self._sample_idxs:
        if self._only_src:
            # NOTE(review): the parentheses carry no trailing comma, so
            # this yields the bare id list rather than a 1-tuple --
            # confirm that consumers expect this shape.
            yield (self._src_seq_ids[sample_idx])
        else:
            yield (self._src_seq_ids[sample_idx],
                   self._trg_seq_ids[sample_idx][:-1],
                   self._trg_seq_ids[sample_idx][1:])
def batch_generator(self):
    """
    Yield the batches of one pass over the data.

    Samples are drawn through a ``Pool`` wrapped around
    ``self._sample_generator``. A batch is closed once adding the next
    sample would reach the budget ``self._batch_size``, counted either
    in samples or, with token batching, in padded tokens
    (longest length * number of samples). If ``self._shuffle_batch`` is
    set, all batches of the pass are collected into
    ``self._epoch_batches`` and shuffled before being yielded; the
    cached batches are rebuilt when sorting is done per pool or the
    cache is empty.

    :return: A generator yielding lists of samples.
    """
    pool = Pool(self._sample_generator, self._pool_size,
                bool(self._sort_type == SortType.POOL))

    def next_batch():
        # Returns (batch, longest length among its samples, whether the
        # end-of-epoch marker was reached).
        # NOTE(review): when the peeked sample does not fit into an
        # empty batch, an empty batch is returned without taking the
        # sample from the pool -- confirm that upstream length filtering
        # rules this case out.
        batch = []
        running_max = -1
        batch_max_seq_len = -1
        while True:
            sample = pool.next(look=True)
            if sample is None:
                pool.push_back(batch)
                batch = []
                continue
            if isinstance(sample, EndEpoch):
                return batch, batch_max_seq_len, True

            running_max = max(running_max, len(sample[0]))
            if not self._only_src:
                running_max = max(running_max, len(sample[1]))

            if self._use_token_batch:
                has_room = \
                    running_max * (len(batch) + 1) < self._batch_size
            else:
                has_room = len(batch) < self._batch_size
            if not has_room:
                return batch, batch_max_seq_len, False

            batch_max_seq_len = running_max
            batch.append(pool.next())

    def epoch_batches():
        # Lazily produce every batch of the pass, including the trailing
        # one when it is non-empty and clipping is off, or when it
        # exactly fills the budget.
        batch, max_seq_len, is_last = next_batch()
        while not is_last:
            yield batch
            batch, max_seq_len, is_last = next_batch()

        size = len(batch)
        if self._use_token_batch:
            size *= max_seq_len
        if (not self._clip_last_batch and len(batch) > 0) \
                or (size == self._batch_size):
            yield batch

    if not self._shuffle_batch:
        for batch in epoch_batches():
            yield batch
        return

    # should re-generate batches
    if self._sort_type == SortType.POOL \
            or len(self._epoch_batches) == 0:
        self._epoch_batches = []
        for batch in epoch_batches():
            self._epoch_batches.append(batch)

    random.shuffle(self._epoch_batches)
    for batch in self._epoch_batches:
        yield batch
import os import os
import time import time
import argparse
import ast
import numpy as np import numpy as np
import paddle import paddle
...@@ -7,8 +9,78 @@ import paddle.fluid as fluid ...@@ -7,8 +9,78 @@ import paddle.fluid as fluid
from model import transformer, position_encoding_init from model import transformer, position_encoding_init
from optim import LearningRateScheduler from optim import LearningRateScheduler
from config import TrainTaskConfig, ModelHyperParams, pos_enc_param_names, \ from config import *
encoder_input_data_names, decoder_input_data_names, label_data_names import reader
def parse_args():
    """
    Parse the command line options of the training script.

    Everything left after the named options is collected in ``opts``
    and handed to ``merge_cfg_from_list`` together with
    ``TrainTaskConfig`` and ``ModelHyperParams``.

    :return: The parsed arguments.
    :rtype: argparse.Namespace
    """
    # (name, keyword arguments) pairs, registered in this order.
    option_specs = [
        ("--src_vocab_fpath", dict(
            type=str,
            required=True,
            help="The path of vocabulary file of source language.")),
        ("--trg_vocab_fpath", dict(
            type=str,
            required=True,
            help="The path of vocabulary file of target language.")),
        ("--train_file_pattern", dict(
            type=str,
            required=True,
            help="The pattern to match training data files.")),
        ("--val_file_pattern", dict(
            type=str,
            help="The pattern to match validation data files.")),
        ("--use_token_batch", dict(
            type=ast.literal_eval,
            default=True,
            help="The flag indicating whether to "
            "produce batch data according to token number.")),
        ("--batch_size", dict(
            type=int,
            default=2000,
            help="The number of sequences contained in a mini-batch, "
            "or the maximum "
            "number of tokens (include paddings) contained in a "
            "mini-batch.")),
        ("--pool_size", dict(
            type=int,
            default=10000,
            help="The buffer size to pool data.")),
        ("--sort_type", dict(
            default="pool",
            choices=("global", "pool", "none"),
            help="The grain to sort by length: global for all "
            "instances; pool for "
            "instances in pool; none for no sort.")),
        ("--shuffle", dict(
            type=ast.literal_eval,
            default=True,
            help="The flag indicating whether to shuffle instances in "
            "each pass.")),
        ("--shuffle_batch", dict(
            type=ast.literal_eval,
            default=True,
            help="The flag indicating whether to shuffle the data "
            "batches.")),
        ("--special_token", dict(
            type=str,
            default=["<s>", "<e>", "<unk>"],
            nargs=3,
            help="The <bos>, <eos> and <unk> tokens in the dictionary.")),
        ("opts", dict(
            help="See config.py for all options",
            default=None,
            nargs=argparse.REMAINDER)),
    ]

    parser = argparse.ArgumentParser("Training for Transformer.")
    for option_name, option_kwargs in option_specs:
        parser.add_argument(option_name, **option_kwargs)

    cli_args = parser.parse_args()
    merge_cfg_from_list(cli_args.opts, [TrainTaskConfig, ModelHyperParams])
    return cli_args
def pad_batch_data(insts, def pad_batch_data(insts,
...@@ -17,13 +89,16 @@ def pad_batch_data(insts, ...@@ -17,13 +89,16 @@ def pad_batch_data(insts,
is_target=False, is_target=False,
is_label=False, is_label=False,
return_attn_bias=True, return_attn_bias=True,
return_max_len=True): return_max_len=True,
return_num_token=False):
""" """
Pad the instances to the max sequence length in batch, and generate the Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. corresponding position data and attention bias.
""" """
return_list = [] return_list = []
max_len = max(len(inst) for inst in insts) max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss # Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients. # will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array( inst_data = np.array(
...@@ -44,8 +119,8 @@ def pad_batch_data(insts, ...@@ -44,8 +119,8 @@ def pad_batch_data(insts,
# This is used to avoid attention on paddings and subsequent # This is used to avoid attention on paddings and subsequent
# words. # words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len)) slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( slf_attn_bias_data = np.triu(slf_attn_bias_data,
[-1, 1, max_len, max_len]) 1).reshape([-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data, slf_attn_bias_data = np.tile(slf_attn_bias_data,
[1, n_head, 1, 1]) * [-1e9] [1, n_head, 1, 1]) * [-1e9]
else: else:
...@@ -59,11 +134,13 @@ def pad_batch_data(insts, ...@@ -59,11 +134,13 @@ def pad_batch_data(insts,
return_list += [slf_attn_bias_data.astype("float32")] return_list += [slf_attn_bias_data.astype("float32")]
if return_max_len: if return_max_len:
return_list += [max_len] return_list += [max_len]
if return_num_token:
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0] return return_list if len(return_list) > 1 else return_list[0]
def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx,
n_head, d_model): trg_pad_idx, n_head, d_model):
""" """
Put all padded data needed by training into a dict. Put all padded data needed by training into a dict.
""" """
...@@ -75,139 +152,254 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -75,139 +152,254 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
[1, 1, trg_max_len, 1]).astype("float32") [1, 1, trg_max_len, 1]).astype("float32")
# These shape tensors are used in reshape_op. # These shape tensors are used in reshape_op.
src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") trg_data_shape = np.array([-1, trg_max_len, d_model], dtype="int32")
src_slf_attn_pre_softmax_shape = np.array( src_slf_attn_pre_softmax_shape = np.array(
[-1, src_slf_attn_bias.shape[-1]], dtype="int32") [-1, src_slf_attn_bias.shape[-1]], dtype="int32")
src_slf_attn_post_softmax_shape = np.array( src_slf_attn_post_softmax_shape = np.array(
src_slf_attn_bias.shape, dtype="int32") [-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32")
trg_slf_attn_pre_softmax_shape = np.array( trg_slf_attn_pre_softmax_shape = np.array(
[-1, trg_slf_attn_bias.shape[-1]], dtype="int32") [-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
trg_slf_attn_post_softmax_shape = np.array( trg_slf_attn_post_softmax_shape = np.array(
trg_slf_attn_bias.shape, dtype="int32") [-1] + list(trg_slf_attn_bias.shape[1:]), dtype="int32")
trg_src_attn_pre_softmax_shape = np.array( trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32") [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array( trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32") [-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32")
lbl_word, lbl_weight = pad_batch_data( lbl_word, lbl_weight, num_token = pad_batch_data(
[inst[2] for inst in insts], [inst[2] for inst in insts],
trg_pad_idx, trg_pad_idx,
n_head, n_head,
is_target=False, is_target=False,
is_label=True, is_label=True,
return_attn_bias=False, return_attn_bias=False,
return_max_len=False) return_max_len=False,
return_num_token=True)
input_dict = dict(
zip(input_data_names, [ data_input_dict = dict(
src_word, src_pos, src_slf_attn_bias, src_data_shape, zip(data_input_names, [
src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
trg_data_shape, trg_slf_attn_pre_softmax_shape,
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape,
trg_src_attn_post_softmax_shape, lbl_word, lbl_weight
])) ]))
return input_dict util_input_dict = dict(
zip(util_input_names, [
src_data_shape, src_slf_attn_pre_softmax_shape,
src_slf_attn_post_softmax_shape, trg_data_shape,
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape
]))
return data_input_dict, util_input_dict, np.asarray(
[num_token], dtype="float32")
def main(): def train(args):
place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() dev_count = fluid.core.get_cuda_device_count()
exe = fluid.Executor(place)
def read_multiple(reader,
                  count=dev_count if args.use_token_batch else 1,
                  clip_last=False):
    """
    Wrap a batch reader so that it yields ``count`` batches at a time.

    :param reader: A callable returning an iterable of batches.
    :param count: The number of batches stacked into one yielded list.
    :param clip_last: Whether to drop the leftover batches at the end.
        If False, the leftover instances are merged and cut into
        ``count`` equally sized parts (the remainder is dropped); this
        is only done when there are more than ``count`` instances.
    :return: A callable creating the stacked generator.
    """

    def __impl__():
        stacked = []
        for item in reader():
            stacked.append(item)
            if len(stacked) == count:
                yield stacked
                stacked = []

        if len(stacked) == count:
            yield stacked
            return
        if clip_last:
            return

        # Flatten the leftover batches into a single list of instances.
        merged = []
        for item in stacked:
            merged.extend(item)
        if len(merged) <= count:
            return

        per_part = len(merged) // count
        yield [
            merged[per_part * i:per_part * (i + 1)] for i in range(count)
        ]

    return __impl__
def split_data(data, num_part=dev_count):
    """
    Split the data of one step into one part per device.

    :param data: A list of batches. It is returned untouched if it
        already holds ``num_part`` entries; otherwise only its first
        entry is used.
    :param num_part: The number of parts to produce.
    :return: A list of ``num_part`` equally sized slices of the first
        batch; instances beyond the last full slice are dropped.
    """
    if len(data) == num_part:
        return data

    insts = data[0]
    chunk = len(insts) // num_part
    parts = []
    for part_id in range(num_part):
        parts.append(insts[chunk * part_id:chunk * (part_id + 1)])
    return parts
sum_cost, avg_cost, predict, token_num = transformer( sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout) ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
TrainTaskConfig.label_smooth_eps)
lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps, place, TrainTaskConfig.warmup_steps,
TrainTaskConfig.learning_rate) TrainTaskConfig.learning_rate)
optimizer = fluid.optimizer.Adam( optimizer = fluid.optimizer.Adam(
learning_rate=lr_scheduler.learning_rate, learning_rate=lr_scheduler.learning_rate,
beta1=TrainTaskConfig.beta1, beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2, beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps) epsilon=TrainTaskConfig.eps)
optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost) optimizer.minimize(sum_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
buf_size=100000),
batch_size=TrainTaskConfig.batch_size)
# Program to do validation.
test_program = fluid.default_main_program().clone()
with fluid.program_guard(test_program):
test_program = fluid.io.get_inference_program([avg_cost])
val_data = paddle.batch(
paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=TrainTaskConfig.batch_size)
def test(exe):
test_total_cost = 0
test_total_token = 0
for batch_id, data in enumerate(val_data()):
data_input = prepare_batch_input(
data, encoder_input_data_names + decoder_input_data_names[:-1] +
label_data_names, ModelHyperParams.eos_idx,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
test_sum_cost, test_token_num = exe.run(
test_program,
feed=data_input,
fetch_list=[sum_cost, token_num],
use_program_cache=True)
test_total_cost += test_sum_cost
test_total_token += test_token_num
test_avg_cost = test_total_cost / test_total_token
test_ppl = np.exp([min(test_avg_cost, 100)])
return test_avg_cost, test_ppl
place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
# Initialize the parameters. # Initialize the parameters.
exe.run(fluid.framework.default_startup_program()) if TrainTaskConfig.ckpt_path:
for pos_enc_param_name in pos_enc_param_names: fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
pos_enc_param = fluid.global_scope().find_var( lr_scheduler.current_steps = TrainTaskConfig.start_step
pos_enc_param_name).get_tensor() else:
pos_enc_param.set( exe.run(fluid.framework.default_startup_program())
position_encoding_init(ModelHyperParams.max_length + 1,
ModelHyperParams.d_model), place)
train_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.train_file_pattern,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size * (1 if args.use_token_batch else dev_count),
pool_size=args.pool_size,
sort_type=args.sort_type,
shuffle=args.shuffle,
shuffle_batch=args.shuffle_batch,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
clip_last_batch=False)
train_data = read_multiple(reader=train_data.batch_generator)
train_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
loss_name=sum_cost.name,
use_default_grad_scale=False)
def test_context():
    """
    Build everything needed for validation and return the evaluator.

    Clones the default main program into an inference program for
    ``avg_cost``, creates a non-shuffling reader over the validation
    files and a ParallelExecutor sharing variables with ``train_exe``.

    :return: A function that runs one pass over the validation data and
        returns ``(average cost per token, perplexity)``.
    """
    # Context to do validation.
    test_program = fluid.default_main_program().clone()
    with fluid.program_guard(test_program):
        test_program = fluid.io.get_inference_program([avg_cost])

    bos_mark, eos_mark, unk_mark = (args.special_token[0],
                                    args.special_token[1],
                                    args.special_token[2])
    val_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.val_file_pattern,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        start_mark=bos_mark,
        end_mark=eos_mark,
        unk_mark=unk_mark,
        clip_last_batch=False,
        shuffle=False,
        shuffle_batch=False)

    test_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        main_program=test_program,
        share_vars_from=train_exe)

    def test(exe=test_exe):
        cost_total = 0
        token_total = 0
        test_data = read_multiple(reader=val_data.batch_generator)
        for data in test_data():
            # One feed dict per device.
            feed_list = []
            for data_buffer in split_data(data):
                data_input_dict, util_input_dict, _ = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                feed = dict(data_input_dict)
                feed.update(util_input_dict)
                feed_list.append(feed)

            outs = exe.run(feed=feed_list,
                           fetch_list=[sum_cost.name, token_num.name])
            # Sum the fetched values over all devices.
            cost_total += np.array(outs[0]).sum()
            token_total += np.array(outs[1]).sum()

        test_avg_cost = cost_total / token_total
        # The exponent is capped at 100 before computing the perplexity.
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    return test
if args.val_file_pattern is not None:
test = test_context()
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
util_input_names = encoder_util_input_fields + decoder_util_input_fields
init = False
for pass_id in xrange(TrainTaskConfig.pass_num): for pass_id in xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time() pass_start_time = time.time()
for batch_id, data in enumerate(train_data()): for batch_id, data in enumerate(train_data()):
if len(data) != TrainTaskConfig.batch_size: feed_list = []
continue total_num_token = 0
data_input = prepare_batch_input( lr_rate = lr_scheduler.update_learning_rate()
data, encoder_input_data_names + decoder_input_data_names[:-1] + for place_id, data_buffer in enumerate(split_data(data)):
label_data_names, ModelHyperParams.eos_idx, data_input_dict, util_input_dict, num_token = prepare_batch_input(
ModelHyperParams.eos_idx, ModelHyperParams.n_head, data_buffer, data_input_names, util_input_names,
ModelHyperParams.d_model) ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
lr_scheduler.update_learning_rate(data_input) ModelHyperParams.n_head, ModelHyperParams.d_model)
outs = exe.run(fluid.framework.default_main_program(), total_num_token += num_token
feed=data_input, feed_list.append(
fetch_list=[sum_cost, avg_cost], dict(data_input_dict.items() + util_input_dict.items() +
use_program_cache=True) {lr_scheduler.learning_rate.name: lr_rate}.items()))
sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
if not init:
for pos_enc_param_name in pos_enc_param_names:
pos_enc = position_encoding_init(
ModelHyperParams.max_length + 1,
ModelHyperParams.d_model)
feed_list[place_id][pos_enc_param_name] = pos_enc
for feed_dict in feed_list:
feed_dict[
sum_cost.name +
"@GRAD"] = 1. / total_num_token if TrainTaskConfig.use_avg_cost else np.asarray(
[1.], dtype="float32")
outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
feed=feed_list)
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
total_sum_cost = sum_cost_val.sum(
) # sum the cost from multi-devices
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
(pass_id, batch_id, sum_cost_val, avg_cost_val, (pass_id, batch_id, total_sum_cost, total_avg_cost,
np.exp([min(avg_cost_val[0], 100)]))) np.exp([min(total_avg_cost, 100)])))
init = True
# Validate and save the model for inference. # Validate and save the model for inference.
val_avg_cost, val_ppl = test(exe) print("epoch: %d, " % pass_id + (
pass_end_time = time.time() "val avg loss: %f, val ppl: %f, " % test()
time_consumed = pass_end_time - pass_start_time if args.val_file_pattern is not None else "") + "consumed %fs" % (
print("epoch: %d, val avg loss: %f, val ppl: %f, " time.time() - pass_start_time))
"consumed %fs" % (pass_id, val_avg_cost, val_ppl, time_consumed)) fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir,
"pass_" + str(pass_id) + ".checkpoint"))
fluid.io.save_inference_model( fluid.io.save_inference_model(
os.path.join(TrainTaskConfig.model_dir, os.path.join(TrainTaskConfig.model_dir,
"pass_" + str(pass_id) + ".infer.model"), "pass_" + str(pass_id) + ".infer.model"),
encoder_input_data_names + decoder_input_data_names[:-1], data_input_names[:-2] + util_input_names, [predict], exe)
[predict], exe)
if __name__ == "__main__": if __name__ == "__main__":
main() args = parse_args()
train(args)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册