diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py index 737568ba35ffe3bdd86e6b3d2a39ed6897940979..6744e1a4af677c7008370cea1a2f914dff89feac 100644 --- a/fluid/neural_machine_translation/transformer/config.py +++ b/fluid/neural_machine_translation/transformer/config.py @@ -4,7 +4,7 @@ class TrainTaskConfig(object): pass_num = 2 # number of sequences contained in a mini-batch. - batch_size = 32 + batch_size = 64 # the hyper params for Adam optimizer. learning_rate = 0.001 diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py index 379a17221c3aaa4daf7f530f9553bcef89b42de6..5732fe4232db8e2c92fee56dcaec7ef619956e4c 100644 --- a/fluid/neural_machine_translation/transformer/model.py +++ b/fluid/neural_machine_translation/transformer/model.py @@ -387,60 +387,34 @@ def transformer( src_pad_idx, trg_pad_idx, pos_pad_idx, ): - # The shapes here act as placeholder. - # The shapes set here is to pass the infer-shape in compile time. The actual - # shape of src_word in run time is: - # [batch_size * max_src_length_in_a_batch, 1]. - src_word = layers.data( - name=input_data_names[0], - shape=[batch_size * max_length, 1], - dtype="int64", - append_batch_size=False) - # The actual shape of src_pos in runtime is: - # [batch_size * max_src_length_in_a_batch, 1]. - src_pos = layers.data( - name=input_data_names[1], - shape=[batch_size * max_length, 1], - dtype="int64", - append_batch_size=False) - # The actual shape of trg_word is in runtime is: - # [batch_size * max_trg_length_in_a_batch, 1]. - trg_word = layers.data( - name=input_data_names[2], - shape=[batch_size * max_length, 1], - dtype="int64", - append_batch_size=False) - # The actual shape of trg_pos in runtime is: - # [batch_size * max_trg_length_in_a_batch, 1]. - trg_pos = layers.data( - name=input_data_names[3], - shape=[batch_size * max_length, 1], - dtype="int64", - append_batch_size=False) - # The actual shape of src_slf_attn_bias in runtime is: - # [batch_size, n_head, max_src_length_in_a_batch, max_src_length_in_a_batch]. - # This input is used to remove attention weights on paddings. - src_slf_attn_bias = layers.data( - name=input_data_names[4], - shape=[batch_size, n_head, max_length, max_length], - dtype="float32", - append_batch_size=False) - # The actual shape of trg_slf_attn_bias in runtime is: - # [batch_size, n_head, max_trg_length_in_batch, max_trg_length_in_batch]. - # This is used to remove attention weights on paddings and subsequent words. - trg_slf_attn_bias = layers.data( - name=input_data_names[5], - shape=[batch_size, n_head, max_length, max_length], - dtype="float32", - append_batch_size=False) - # The actual shape of trg_src_attn_bias in runtime is: - # [batch_size, n_head, max_trg_length_in_batch, max_src_length_in_batch]. - # This is used to remove attention weights on paddings. - trg_src_attn_bias = layers.data( - name=input_data_names[6], - shape=[batch_size, n_head, max_length, max_length], - dtype="float32", - append_batch_size=False) + file_obj = fluid.layers.open_recordio_file( + filename='./wmt16.recordio', + shapes=[ + [batch_size * max_length, 1], + [batch_size * max_length, 1], + [batch_size * max_length, 1], + [batch_size * max_length, 1], + [batch_size, n_head, max_length, max_length], + [batch_size, n_head, max_length, max_length], + [batch_size, n_head, max_length, max_length], + [batch_size * max_length, 1], + [batch_size * max_length, 1], + ], + dtypes=[ + 'int64', + 'int64', + 'int64', + 'int64', + 'float32', + 'float32', + 'float32', + 'int64', + 'float32', + ], + lod_levels=[0] * 9) + + src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file( + file_obj) enc_input = prepare_encoder( src_word, @@ -492,22 +466,6 @@ def transformer( num_flatten_dims=2), shape=[-1, trg_vocab_size], act="softmax") - # The actual shape of gold in runtime is: - # [batch_size * max_trg_length_in_a_batch, 1]. - gold = layers.data( - name=input_data_names[7], - shape=[batch_size * max_length, 1], - dtype="int64", - append_batch_size=False) cost = layers.cross_entropy(input=predict, label=gold) - # The actual shape of weights in runtime is: - # [batch_size * max_trg_length_in_a_batch, 1]. - # Padding index do not contribute to the total loss. This Weight is used to - # cancel padding index in calculating the loss. - weights = layers.data( - name=input_data_names[8], - shape=[batch_size * max_length, 1], - dtype="float32", - append_batch_size=False) weighted_cost = cost * weights return layers.reduce_sum(weighted_cost) diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index d4ddd78b7770dc87e47ca679596c7c464103ed36..41dadf8a712dd5cc057090af11f55b89ecaf2ab0 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -2,9 +2,10 @@ import numpy as np import sys import time -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import paddle.fluid.profiler as profiler +import paddle.dataset.wmt16 as wmt16 from model import transformer, position_encoding_init from optim import LearningRateScheduler @@ -12,8 +13,7 @@ from config import TrainTaskConfig, ModelHyperParams, \ pos_enc_param_names, input_data_names -def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, - max_length, n_head, place): +def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): """ Pad the instances to the max sequence length in batch, and generate the corresponding position data and attention bias. Then, convert the numpy @@ -28,9 +28,9 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, return_attn_bias=True, return_max_len=True): """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ return_list = [] max_len = max(len(inst) for inst in insts) inst_data = np.array( @@ -66,13 +66,6 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, return_list += [max_len] return return_list if len(return_list) > 1 else return_list[0] - def data_to_tensor(data_list, name_list, input_dict, place): - assert len(data_list) == len(name_list) - for i in range(len(name_list)): - tensor = fluid.LoDTensor() - tensor.set(data_list[i], place) - input_dict[name_list[i]] = tensor - src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( [inst[0] for inst in insts], src_pad_idx, is_target=False) trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( @@ -83,18 +76,13 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, False, False, False) lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) - data_to_tensor([ + return [ src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight - ], input_data_names, input_dict, place) - - return input_dict + ] def main(): - place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - cost = transformer( ModelHyperParams.src_vocab_size + 1, ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, @@ -104,11 +92,8 @@ def main(): ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) - lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, - TrainTaskConfig.warmup_steps, place, - TrainTaskConfig.learning_rate) optimizer = fluid.optimizer.Adam( - learning_rate=lr_scheduler.learning_rate, + learning_rate=TrainTaskConfig.learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) @@ -121,26 +106,27 @@ def main(): buf_size=100000), batch_size=TrainTaskConfig.batch_size) - # Initialize the parameters. - exe.run(fluid.framework.default_startup_program()) - for pos_enc_param_name in pos_enc_param_names: - pos_enc_param = fluid.global_scope().find_var( - pos_enc_param_name).get_tensor() - pos_enc_param.set( - position_encoding_init(ModelHyperParams.max_length + 1, - ModelHyperParams.d_model), place) - def fn(pass_id, batch_id, data): + reader = paddle.batch( + wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size) + + with fluid.recordio_writer.create_recordio_writer( + "./wmt16.recordio") as writer: + for batch in reader(): + for tensor in prepare_batch_input( + batch, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): + t = fluid.LoDTensor() + t.set(tensor, fluid.CPUPlace()) + writer.append_tensor(t) + writer.complete_append_tensor() + + exe = fluid.ParallelExecutor(loss_name=cost.name, use_cuda=True) + def fn(pass_id, batch_id): t1 = time.time() - data_input = prepare_batch_input( - data, input_data_names, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, - ModelHyperParams.n_head, place) - lr_scheduler.update_learning_rate(data_input) - outs = exe.run(fluid.framework.default_main_program(), - feed=data_input, - fetch_list=[cost], - use_program_cache=True) + outs = exe.run([cost.name]) cost_val = np.array(outs[0]) print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) + " cost = " + str(cost_val)) @@ -151,16 +137,13 @@ def main(): total_time = 0.0 count = 0 for pass_id in xrange(TrainTaskConfig.pass_num): - for batch_id, data in enumerate(train_data()): - # The current program desc is coupled with batch_size, thus all - # mini-batches must have the same number of instances currently. - if len(data) != TrainTaskConfig.batch_size: - continue - if pass_id == 0 and batch_id >= 10 and batch_id < 12: + for batch_id in xrange(10000): + if batch_id == 1: with profiler.profiler('All', 'total', '/tmp/transformer'): - duration = fn(pass_id, batch_id, data) + duration = fn(pass_id, batch_id) + duration = fn(pass_id, batch_id) else: - duration = fn(pass_id, batch_id, data) + duration = fn(pass_id, batch_id) count += 1 total_time += duration print("avg: " + str(total_time / count) + " cur: " + str(duration))