diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py
index 0fb00ed3c2fb8bfe74796bf91a8dd711ed44866c..0fb89f45c6c63f70fa91238f091bc93f274881ab 100644
--- a/fluid/neural_machine_translation/transformer/config.py
+++ b/fluid/neural_machine_translation/transformer/config.py
@@ -15,6 +15,9 @@ class TrainTaskConfig(object):
     # the parameters for learning rate scheduling.
     warmup_steps = 4000
 
+    # the flag indicating to use average loss or sum loss when training.
+    use_avg_cost = False
+
     # the directory for saving trained models.
     model_dir = "trained_models"
 
diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py
index deba195c34f6c71fb4a78f46b4e2678f67feaa75..c149e3085769e357e9891c2f43c0dd38d3dd86c1 100644
--- a/fluid/neural_machine_translation/transformer/model.py
+++ b/fluid/neural_machine_translation/transformer/model.py
@@ -591,7 +591,10 @@ def transformer(
         src_attn_shape_flag=False)
     cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
     weighted_cost = cost * weights
-    return layers.reduce_sum(weighted_cost), predict
+    sum_cost = layers.reduce_sum(weighted_cost)
+    token_num = layers.reduce_sum(weights)
+    avg_cost = sum_cost / token_num
+    return sum_cost, avg_cost, predict
 
 
 def wrap_encoder(src_vocab_size,
diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py
index 83098e9b8720ef9652f4071f37c1fb7af83109c0..cca8096660de5efd9188748453cf3304d94d3e5c 100644
--- a/fluid/neural_machine_translation/transformer/train.py
+++ b/fluid/neural_machine_translation/transformer/train.py
@@ -1,4 +1,5 @@
 import os
+import time
 
 import numpy as np
 import paddle
@@ -103,7 +104,7 @@ def main():
     place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
 
-    cost, predict = transformer(
+    sum_cost, avg_cost, predict = transformer(
         ModelHyperParams.src_vocab_size + 1,
         ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
         ModelHyperParams.n_layer, ModelHyperParams.n_head,
@@ -120,7 +121,7 @@
         beta1=TrainTaskConfig.beta1,
         beta2=TrainTaskConfig.beta2,
         epsilon=TrainTaskConfig.eps)
-    optimizer.minimize(cost)
+    optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
@@ -132,29 +133,30 @@
     # Program to do validation.
     test_program = fluid.default_main_program().clone()
     with fluid.program_guard(test_program):
-        test_program = fluid.io.get_inference_program([cost])
+        test_program = fluid.io.get_inference_program([avg_cost])
     val_data = paddle.batch(
         paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
                                         ModelHyperParams.trg_vocab_size),
         batch_size=TrainTaskConfig.batch_size)
 
     def test(exe):
-        test_costs = []
+        test_sum_costs = []
+        test_avg_costs = []
         for batch_id, data in enumerate(val_data()):
             if len(data) != TrainTaskConfig.batch_size:
-                # Since we use the sum cost, keep comparable cost by fixing the
-                # batch size. Remove this if the cost is mean.
+                # Fix the batch size to keep comparable cost among all
+                # mini-batches and compute the mean.
                 continue
             data_input = prepare_batch_input(
                 data, encoder_input_data_names + decoder_input_data_names[:-1] +
                 label_data_names, ModelHyperParams.src_pad_idx,
                 ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head,
                 ModelHyperParams.d_model)
-            test_cost = exe.run(test_program,
-                                feed=data_input,
-                                fetch_list=[cost])[0]
-            test_costs.append(test_cost)
-        return np.mean(test_costs)
+            test_sum_cost, test_avg_cost = exe.run(
+                test_program, feed=data_input, fetch_list=[sum_cost, avg_cost])
+            test_sum_costs.append(test_sum_cost)
+            test_avg_costs.append(test_avg_cost)
+        return np.mean(test_sum_costs), np.mean(test_avg_costs)
 
     # Initialize the parameters.
     exe.run(fluid.framework.default_startup_program())
@@ -166,6 +168,7 @@ def main():
                                              ModelHyperParams.d_model), place)
 
     for pass_id in xrange(TrainTaskConfig.pass_num):
+        pass_start_time = time.time()
         for batch_id, data in enumerate(train_data()):
             data_input = prepare_batch_input(
                 data, encoder_input_data_names + decoder_input_data_names[:-1] +
@@ -175,14 +178,20 @@
             lr_scheduler.update_learning_rate(data_input)
             outs = exe.run(fluid.framework.default_main_program(),
                            feed=data_input,
-                           fetch_list=[cost],
+                           fetch_list=[sum_cost, avg_cost],
                            use_program_cache=True)
-            cost_val = np.array(outs[0])
-            print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
-                  " cost = " + str(cost_val))
+            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
+            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
+                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
+                   np.exp([min(avg_cost_val[0], 100)])))
         # Validate and save the model for inference.
-        val_cost = test(exe)
-        print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost))
+        val_sum_cost, val_avg_cost = test(exe)
+        pass_end_time = time.time()
+        time_consumed = pass_end_time - pass_start_time
+        print("epoch: %d, val sum loss: %f, val avg loss: %f, val ppl: %f, "
+              "consumed %fs" %
+              (pass_id, val_sum_cost, val_avg_cost,
+               np.exp([min(val_avg_cost, 100)]), time_consumed))
         fluid.io.save_inference_model(
             os.path.join(TrainTaskConfig.model_dir,
                          "pass_" + str(pass_id) + ".infer.model"),
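
For reference, the relationship this patch sets up between the summed cost, the per-token average cost, and the perplexity printed in the training log can be checked in isolation. The following is a minimal NumPy-only sketch with made-up toy values; it is not part of the patch and only mirrors the arithmetic introduced above.

import numpy as np

# Toy per-token cross-entropy costs and 0/1 padding weights for one mini-batch.
cost = np.array([2.3, 1.7, 0.9, 4.1])
weights = np.array([1.0, 1.0, 1.0, 0.0])  # the last position is padding

sum_cost = np.sum(cost * weights)  # summed loss over non-padding tokens
token_num = np.sum(weights)        # number of non-padding tokens
avg_cost = sum_cost / token_num    # per-token loss, the second value returned by transformer()
ppl = np.exp(min(avg_cost, 100))   # perplexity, clipped the same way as in the training log

print("sum loss: %f, avg loss: %f, ppl: %f" % (sum_cost, avg_cost, ppl))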