Commit 6fec6837 authored by guosheng

Add ParallelExecutor for Transformer

Parent 26b3788b
 class TrainTaskConfig(object):
-    use_gpu = False
+    use_gpu = True
     # the epoch number to train.
-    pass_num = 2
+    pass_num = 20
     # the number of sequences contained in a mini-batch.
     batch_size = 64
@@ -117,3 +117,52 @@ decoder_input_data_names = (
 label_data_names = (
     "lbl_word",
     "lbl_weight", )
+encoder_data_input_fields = (
+    "src_word",
+    "src_pos",
+    "src_slf_attn_bias", )
+encoder_util_input_fields = (
+    "src_data_shape",
+    "src_slf_attn_pre_softmax_shape",
+    "src_slf_attn_post_softmax_shape", )
+decoder_data_input_fields = (
+    "trg_word",
+    "trg_pos",
+    "trg_slf_attn_bias",
+    "trg_src_attn_bias",
+    "enc_output", )
+decoder_util_input_fields = (
+    "trg_data_shape",
+    "trg_slf_attn_pre_softmax_shape",
+    "trg_slf_attn_post_softmax_shape",
+    "trg_src_attn_pre_softmax_shape",
+    "trg_src_attn_post_softmax_shape", )
+input_descs = {
+    "src_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "src_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "src_slf_attn_bias":
+    [(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
+      (ModelHyperParams.max_length + 1)), "float32"],
+    "src_data_shape": [(3L, ), "int32"],
+    "src_slf_attn_pre_softmax_shape": [(2L, ), "int32"],
+    "src_slf_attn_post_softmax_shape": [(4L, ), "int32"],
+    "trg_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "trg_slf_attn_bias": [(1, ModelHyperParams.n_head,
+                           (ModelHyperParams.max_length + 1),
+                           (ModelHyperParams.max_length + 1)), "float32"],
+    "trg_src_attn_bias": [(1, ModelHyperParams.n_head,
+                           (ModelHyperParams.max_length + 1),
+                           (ModelHyperParams.max_length + 1)), "float32"],
+    "trg_data_shape": [(3L, ), "int32"],
+    "trg_slf_attn_pre_softmax_shape": [(2L, ), "int32"],
+    "trg_slf_attn_post_softmax_shape": [(4L, ), "int32"],
+    "trg_src_attn_pre_softmax_shape": [(2L, ), "int32"],
+    "trg_src_attn_post_softmax_shape": [(4L, ), "int32"],
+    "enc_output": [(1, (ModelHyperParams.max_length + 1),
+                    ModelHyperParams.d_model), "float32"],
+    "lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"],
+}
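The leading 1 (or the 1 * factor) in these shapes is only a placeholder batch dimension; the tensors actually fed at runtime can be larger, and the *_data_shape / *_softmax_shape entries supply the real shapes to the reshape ops. A minimal sketch, not part of the diff, of how one descriptor maps onto a data layer; this is essentially what make_all_inputs (added later in this diff) does for every field:

import paddle.fluid.layers as layers

# Sketch only: turn a single input_descs entry into a data layer.
shape, dtype = input_descs["src_word"]  # shape (max_length + 1, 1), dtype "int64"
src_word = layers.data(
    name="src_word",
    shape=shape,
    dtype=dtype,
    # The descriptor already carries the (placeholder) batch dimension,
    # so no extra batch dimension is appended automatically.
    append_batch_size=False)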
@@ -4,8 +4,7 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from config import TrainTaskConfig, pos_enc_param_names, \
-    encoder_input_data_names, decoder_input_data_names, label_data_names
+from config import *
 def position_encoding_init(n_position, d_pos_vec):
@@ -506,6 +505,22 @@ def make_inputs(input_data_names,
     return input_layers
+def make_all_inputs(input_fields):
+    """
+    Define the input data layers for the transformer model.
+    """
+    inputs = []
+    for input_field in input_fields:
+        input_var = layers.data(
+            name=input_field,
+            shape=input_descs[input_field][0],
+            dtype=input_descs[input_field][1],
+            append_batch_size=False)
+        inputs.append(input_var)
+        fluid.default_startup_program().global_block().clone_variable(input_var)
+    return inputs
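As a usage note (not part of the diff): the helper returns the created variables in the same order as the field tuples from the config, so call sites can unpack or concatenate them directly, as the rewritten transformer() below does.

# Hedged usage sketch of make_all_inputs.
gold, weights = make_all_inputs(label_data_names)  # "lbl_word", "lbl_weight"
enc_inputs = make_all_inputs(encoder_data_input_fields +
                             encoder_util_input_fields)  # six variables, in order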
 def transformer(
         src_vocab_size,
         trg_vocab_size,
@@ -517,18 +532,8 @@ def transformer(
         d_model,
         d_inner_hid,
         dropout_rate, ):
-    enc_inputs = make_inputs(
-        encoder_input_data_names,
-        n_head,
-        d_model,
-        max_length,
-        is_pos=True,
-        slf_attn_bias_flag=True,
-        src_attn_bias_flag=False,
-        enc_output_flag=False,
-        data_shape_flag=True,
-        slf_attn_shape_flag=True,
-        src_attn_shape_flag=False)
+    enc_inputs = make_all_inputs(encoder_data_input_fields +
+                                 encoder_util_input_fields)
     enc_output = wrap_encoder(
         src_vocab_size,
@@ -542,18 +547,8 @@ def transformer(
         dropout_rate,
         enc_inputs, )
-    dec_inputs = make_inputs(
-        decoder_input_data_names,
-        n_head,
-        d_model,
-        max_length,
-        is_pos=True,
-        slf_attn_bias_flag=True,
-        src_attn_bias_flag=True,
-        enc_output_flag=False,
-        data_shape_flag=True,
-        slf_attn_shape_flag=True,
-        src_attn_shape_flag=True)
+    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] +
+                                 decoder_util_input_fields)
     predict = wrap_decoder(
         trg_vocab_size,
@@ -570,18 +565,7 @@ def transformer(
     # The padding index does not contribute to the total loss; the weights are
     # used to cancel the cost of padding indices when computing the loss.
-    gold, weights = make_inputs(
-        label_data_names,
-        n_head,
-        d_model,
-        max_length,
-        is_pos=False,
-        slf_attn_bias_flag=False,
-        src_attn_bias_flag=False,
-        enc_output_flag=False,
-        data_shape_flag=False,
-        slf_attn_shape_flag=False,
-        src_attn_shape_flag=False)
+    gold, weights = make_all_inputs(label_data_names)
     cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
     weighted_cost = cost * weights
     sum_cost = layers.reduce_sum(weighted_cost)
...
@@ -14,7 +14,6 @@ class LearningRateScheduler(object):
     def __init__(self,
                  d_model,
                  warmup_steps,
-                 place,
                  learning_rate=0.001,
                  current_steps=0,
                  name="learning_rate"):
@@ -27,14 +26,11 @@ class LearningRateScheduler(object):
             value=float(learning_rate),
             dtype="float32",
             persistable=True)
-        self.place = place
-    def update_learning_rate(self, data_input):
+    def update_learning_rate(self):
         self.current_steps += 1
         lr_value = np.power(self.d_model, -0.5) * np.min([
             np.power(self.current_steps, -0.5),
             np.power(self.warmup_steps, -1.5) * self.current_steps
         ])
-        lr_tensor = fluid.LoDTensor()
-        lr_tensor.set(np.array([lr_value], dtype="float32"), self.place)
-        data_input[self.learning_rate.name] = lr_tensor
+        return np.array([lr_value], dtype="float32")
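The update above is the warm-up schedule from the Transformer paper, lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5); the only change here is that the value is now returned as a numpy array instead of being written into the feed dict. A small standalone check, not part of the diff, with illustrative values d_model=512 and warmup_steps=4000:

import numpy as np

# Illustrative helper, not from the repo.
def noam_lr(step, d_model=512, warmup_steps=4000):
    # Linear warm-up followed by inverse-square-root decay.
    return np.power(d_model, -0.5) * min(
        np.power(step, -0.5), np.power(warmup_steps, -1.5) * step)

print(noam_lr(1))      # ~1.75e-07: tiny learning rate at the first step
print(noam_lr(4000))   # ~6.99e-04: peak around warmup_steps
print(noam_lr(16000))  # ~3.49e-04: decays as step**-0.5 afterwards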
@@ -7,8 +7,7 @@ import paddle.fluid as fluid
 from model import transformer, position_encoding_init
 from optim import LearningRateScheduler
-from config import TrainTaskConfig, ModelHyperParams, pos_enc_param_names, \
-    encoder_input_data_names, decoder_input_data_names, label_data_names
+from config import *
 def pad_batch_data(insts,
@@ -62,8 +61,8 @@ def pad_batch_data(insts,
     return return_list if len(return_list) > 1 else return_list[0]
-def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
-                        n_head, d_model):
+def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx,
+                        trg_pad_idx, n_head, d_model):
     """
     Put all padded data needed by training into a dict.
     """
@@ -75,20 +74,20 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
         [1, 1, trg_max_len, 1]).astype("float32")
     # These shape tensors are used in reshape_op.
-    src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32")
-    trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32")
+    src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32")
+    trg_data_shape = np.array([-1, trg_max_len, d_model], dtype="int32")
     src_slf_attn_pre_softmax_shape = np.array(
         [-1, src_slf_attn_bias.shape[-1]], dtype="int32")
     src_slf_attn_post_softmax_shape = np.array(
-        src_slf_attn_bias.shape, dtype="int32")
+        [-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32")
     trg_slf_attn_pre_softmax_shape = np.array(
         [-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
     trg_slf_attn_post_softmax_shape = np.array(
-        trg_slf_attn_bias.shape, dtype="int32")
+        [-1] + list(trg_slf_attn_bias.shape[1:]), dtype="int32")
     trg_src_attn_pre_softmax_shape = np.array(
         [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
     trg_src_attn_post_softmax_shape = np.array(
-        trg_src_attn_bias.shape, dtype="int32")
+        [-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32")
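Replacing the hard-coded batch size (len(insts)) with -1 is what makes these reshape targets device-count agnostic: ParallelExecutor splits each mini-batch across the devices, so every device sees a different leading dimension, and -1 lets reshape infer it. A tiny numpy illustration with made-up sizes, not taken from the diff:

import numpy as np

# Made-up sizes: the full batch has 64 sentences but each of 4 devices gets 16.
attn_bias = np.zeros((16, 8, 33, 33), dtype="float32")   # per-device slice
post_softmax_shape = [-1] + list(attn_bias.shape[1:])    # [-1, 8, 33, 33]
flat = attn_bias.reshape([-1, attn_bias.shape[-1]])      # (16 * 8 * 33, 33)
restored = flat.reshape(post_softmax_shape)              # back to (16, 8, 33, 33)
assert restored.shape == attn_bias.shape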
     lbl_word, lbl_weight = pad_batch_data(
         [inst[2] for inst in insts],
@@ -99,16 +98,19 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
         return_attn_bias=False,
         return_max_len=False)
-    input_dict = dict(
-        zip(input_data_names, [
-            src_word, src_pos, src_slf_attn_bias, src_data_shape,
-            src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape,
-            trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
-            trg_data_shape, trg_slf_attn_pre_softmax_shape,
-            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape,
-            trg_src_attn_post_softmax_shape, lbl_word, lbl_weight
-        ]))
-    return input_dict
+    data_input_dict = dict(
+        zip(data_input_names, [
+            src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
+            trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+        ]))
+    util_input_dict = dict(
+        zip(util_input_names, [
+            src_data_shape, src_slf_attn_pre_softmax_shape,
+            src_slf_attn_post_softmax_shape, trg_data_shape,
+            trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
+            trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape
+        ]))
+    return data_input_dict, util_input_dict
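With the batch preparation split in two, only data_input_dict goes through the executor's feed (and gets sliced per device), while the utility shape tensors, which are the same for every device, are written directly into the global scope; the training loop below does this through set_util_input. A rough sketch of that feeding pattern, reusing names from this diff (data, place, train_exe, sum_cost, token_num), for orientation only:

# Sketch mirroring the training loop below; not runnable on its own.
data_input_dict, util_input_dict = prepare_batch_input(
    data, data_input_names, util_input_names, ModelHyperParams.eos_idx,
    ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.d_model)

# Shape/utility tensors: set once as global-scope tensors, not fed.
for name, value in util_input_dict.items():
    fluid.global_scope().find_var(name).get_tensor().set(value, place)

# Data tensors: fed normally and split across devices by the executor.
outs = train_exe.run(feed_dict=data_input_dict,
                     fetch_list=[sum_cost.name, token_num.name])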
 def main():
@@ -123,7 +125,7 @@ def main():
         ModelHyperParams.d_inner_hid, ModelHyperParams.dropout)
     lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
-                                         TrainTaskConfig.warmup_steps, place,
+                                         TrainTaskConfig.warmup_steps,
                                          TrainTaskConfig.learning_rate)
     optimizer = fluid.optimizer.Adam(
         learning_rate=lr_scheduler.learning_rate,
@@ -152,14 +154,13 @@ def main():
         test_total_cost = 0
         test_total_token = 0
         for batch_id, data in enumerate(val_data()):
-            data_input = prepare_batch_input(
-                data, encoder_input_data_names + decoder_input_data_names[:-1] +
-                label_data_names, ModelHyperParams.eos_idx,
-                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
-                ModelHyperParams.d_model)
+            data_input_dict, util_input_dict = prepare_batch_input(
+                data, data_input_names, util_input_names,
+                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
+                ModelHyperParams.n_head, ModelHyperParams.d_model)
             test_sum_cost, test_token_num = exe.run(
                 test_program,
-                feed=data_input,
+                feed=dict(data_input_dict.items() + util_input_dict.items()),
                 fetch_list=[sum_cost, token_num],
                 use_program_cache=True)
             test_total_cost += test_sum_cost
@@ -168,34 +169,46 @@ def main():
         test_ppl = np.exp([min(test_avg_cost, 100)])
         return test_avg_cost, test_ppl
+    def set_util_input(input_name_value):
+        tensor = fluid.global_scope().find_var(input_name_value[0]).get_tensor()
+        tensor.set(input_name_value[1], place)
     # Initialize the parameters.
     exe.run(fluid.framework.default_startup_program())
     for pos_enc_param_name in pos_enc_param_names:
-        pos_enc_param = fluid.global_scope().find_var(
-            pos_enc_param_name).get_tensor()
-        pos_enc_param.set(
-            position_encoding_init(ModelHyperParams.max_length + 1,
-                                   ModelHyperParams.d_model), place)
+        set_util_input((pos_enc_param_name, position_encoding_init(
+            ModelHyperParams.max_length + 1, ModelHyperParams.d_model)))
+    data_input_names = encoder_data_input_fields + \
+        decoder_data_input_fields[:-1] + label_data_names
+    util_input_names = encoder_util_input_fields + decoder_util_input_fields
+    train_exe = fluid.ParallelExecutor(
+        use_cuda=TrainTaskConfig.use_gpu,
+        loss_name=avg_cost.name
+        if TrainTaskConfig.use_avg_cost else sum_cost.name)
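For context: loss_name tells ParallelExecutor which loss's gradients to aggregate across devices, and TrainTaskConfig.use_avg_cost (assumed here to be a flag defined in the config, not shown in this excerpt) selects between the averaged and the summed cost. A compact sketch of the construct-once, run-per-batch pattern that the loop below follows; it reuses this diff's variable names and is illustrative only:

import numpy as np
import paddle.fluid as fluid

# Sketch (not the diff itself): build once, then run per batch with
# name-based fetches.
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=sum_cost.name)
outs = train_exe.run(feed_dict=data_input_dict,
                     fetch_list=[sum_cost.name, token_num.name])
# Each fetched entry holds one value per device; aggregate them by hand.
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
total_avg_cost = sum_cost_val.sum() / token_num_val.sum()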
     for pass_id in xrange(TrainTaskConfig.pass_num):
         pass_start_time = time.time()
         for batch_id, data in enumerate(train_data()):
-            if len(data) != TrainTaskConfig.batch_size:
-                continue
-            data_input = prepare_batch_input(
-                data, encoder_input_data_names + decoder_input_data_names[:-1] +
-                label_data_names, ModelHyperParams.eos_idx,
-                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
-                ModelHyperParams.d_model)
-            lr_scheduler.update_learning_rate(data_input)
-            outs = exe.run(fluid.framework.default_main_program(),
-                           feed=data_input,
-                           fetch_list=[sum_cost, avg_cost],
-                           use_program_cache=True)
-            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
+            data_input_dict, util_input_dict = prepare_batch_input(
+                data, data_input_names, util_input_names,
+                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
+                ModelHyperParams.n_head, ModelHyperParams.d_model)
+            map(set_util_input,
+                zip(util_input_dict.keys() + [lr_scheduler.learning_rate.name],
+                    util_input_dict.values() +
+                    [lr_scheduler.update_learning_rate()]))
+            outs = train_exe.run(feed_dict=data_input_dict,
+                                 fetch_list=[sum_cost.name, token_num.name])
+            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
+            total_sum_cost = sum_cost_val.sum()  # sum the cost from multi devices
+            total_token_num = token_num_val.sum()
+            total_avg_cost = total_sum_cost / total_token_num
             print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
-                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
-                   np.exp([min(avg_cost_val[0], 100)])))
+                  (pass_id, batch_id, total_sum_cost, total_avg_cost,
+                   np.exp([min(total_avg_cost, 100)])))
         # Validate and save the model for inference.
         val_avg_cost, val_ppl = test(exe)
         pass_end_time = time.time()
...