提交 86fe83f2 编写于 作者: G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/models into fix-transformer-batchsize-dev

@@ -15,6 +15,9 @@ class TrainTaskConfig(object):
     # the parameters for learning rate scheduling.
     warmup_steps = 4000
+    # the flag indicating to use average loss or sum loss when training.
+    use_avg_cost = False
     # the directory for saving trained models.
     model_dir = "trained_models"
...
@@ -591,7 +591,10 @@ def transformer(
         src_attn_shape_flag=False)
     cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
     weighted_cost = cost * weights
-    return layers.reduce_sum(weighted_cost), predict
+    sum_cost = layers.reduce_sum(weighted_cost)
+    token_num = layers.reduce_sum(weights)
+    avg_cost = sum_cost / token_num
+    return sum_cost, avg_cost, predict


 def wrap_encoder(src_vocab_size,
...
 import os
+import time

 import numpy as np
 import paddle
@@ -103,7 +104,7 @@ def main():
     place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)

-    cost, predict = transformer(
+    sum_cost, avg_cost, predict = transformer(
         ModelHyperParams.src_vocab_size + 1,
         ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
         ModelHyperParams.n_layer, ModelHyperParams.n_head,
...@@ -120,7 +121,7 @@ def main(): ...@@ -120,7 +121,7 @@ def main():
beta1=TrainTaskConfig.beta1, beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2, beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps) epsilon=TrainTaskConfig.eps)
optimizer.minimize(cost) optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
@@ -132,29 +133,30 @@ def main():
     # Program to do validation.
     test_program = fluid.default_main_program().clone()
     with fluid.program_guard(test_program):
-        test_program = fluid.io.get_inference_program([cost])
+        test_program = fluid.io.get_inference_program([avg_cost])
     val_data = paddle.batch(
         paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
                                         ModelHyperParams.trg_vocab_size),
         batch_size=TrainTaskConfig.batch_size)

     def test(exe):
-        test_costs = []
+        test_sum_costs = []
+        test_avg_costs = []
         for batch_id, data in enumerate(val_data()):
             if len(data) != TrainTaskConfig.batch_size:
-                # Since we use the sum cost, keep comparable cost by fixing the
-                # batch size. Remove this if the cost is mean.
+                # Fix the batch size to keep comparable cost among all
+                # mini-batches and compute the mean.
                 continue
             data_input = prepare_batch_input(
                 data, encoder_input_data_names + decoder_input_data_names[:-1] +
                 label_data_names, ModelHyperParams.src_pad_idx,
                 ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head,
                 ModelHyperParams.d_model)
-            test_cost = exe.run(test_program,
-                                feed=data_input,
-                                fetch_list=[cost])[0]
-            test_costs.append(test_cost)
-        return np.mean(test_costs)
+            test_sum_cost, test_avg_cost = exe.run(
+                test_program, feed=data_input, fetch_list=[sum_cost, avg_cost])
+            test_sum_costs.append(test_sum_cost)
+            test_avg_costs.append(test_avg_cost)
+        return np.mean(test_sum_costs), np.mean(test_avg_costs)

     # Initialize the parameters.
     exe.run(fluid.framework.default_startup_program())
@@ -166,6 +168,7 @@ def main():
                 ModelHyperParams.d_model), place)

     for pass_id in xrange(TrainTaskConfig.pass_num):
+        pass_start_time = time.time()
         for batch_id, data in enumerate(train_data()):
             data_input = prepare_batch_input(
                 data, encoder_input_data_names + decoder_input_data_names[:-1] +
@@ -175,14 +178,20 @@ def main():
             lr_scheduler.update_learning_rate(data_input)
             outs = exe.run(fluid.framework.default_main_program(),
                            feed=data_input,
-                           fetch_list=[cost],
+                           fetch_list=[sum_cost, avg_cost],
                            use_program_cache=True)
-            cost_val = np.array(outs[0])
-            print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
-                  " cost = " + str(cost_val))
+            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
+            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
+                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
+                   np.exp([min(avg_cost_val[0], 100)])))

         # Validate and save the model for inference.
-        val_cost = test(exe)
-        print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost))
+        val_sum_cost, val_avg_cost = test(exe)
+        pass_end_time = time.time()
+        time_consumed = pass_end_time - pass_start_time
+        print("epoch: %d, val sum loss: %f, val avg loss: %f, val ppl: %f, "
+              "consumed %fs" %
+              (pass_id, val_sum_cost, val_avg_cost,
+               np.exp([min(val_avg_cost, 100)]), time_consumed))
         fluid.io.save_inference_model(
             os.path.join(TrainTaskConfig.model_dir,
                          "pass_" + str(pass_id) + ".infer.model"),
...
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册