Commit df3309a1 authored by guosheng

Add CE for Transformer

Parent 68e73327
#!/bin/bash
DATA_PATH=$HOME/.cache/paddle/dataset/wmt16
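# Fetch the WMT16 data through the paddle.dataset API if it is not cached yet,
# then unpack it so train.py can read it from files.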
if [ ! -d $DATA_PATH ] ; then
    python -c 'import paddle;paddle.dataset.wmt16.train(10000, 10000, "en")().next()'
    tar -zxf $DATA_PATH/wmt16.tar.gz -C $DATA_PATH
fi
train(){
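    # The "--" flags are consumed by train.py's argument parser; the trailing
    # key/value pairs (weight_sharing, pass_num, dropout_seed) are assumed to
    # override the matching configuration options (e.g. ModelHyperParams.dropout_seed).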
    python -u train.py \
        --src_vocab_fpath $DATA_PATH/en_10000.dict \
        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
        --special_token '<s>' '<e>' '<unk>' \
        --train_file_pattern $DATA_PATH/wmt16/train \
        --val_file_pattern $DATA_PATH/wmt16/val \
        --use_token_batch True \
        --batch_size 2048 \
        --sort_type pool \
        --pool_size 10000 \
        --enable_ce True \
        weight_sharing False \
        pass_num 20 \
        dropout_seed 10
}
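# Pipe the training log into _ce.py, which extracts the "kpis\t<name>\t<value>"
# lines and persists them as CE records.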
train | python _ce.py
#### This file is only used for the continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
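# Each KPI is built with a name and a numeric tolerance; the tolerance is
# assumed to be the relative change the CE system accepts before flagging the
# KPI as failed.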
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
test_cost_kpi = CostKpi('test_cost', 0.005, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
tracking_kpis = [
    train_cost_kpi,
    test_cost_kpi,
    train_duration_kpi,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    Each KPI line of the log is expected to be tab separated in the form
    "kpis\t<kpi_name>\t<kpi_value>", for example:
    "
    kpis\ttrain_cost\t1.0
    kpis\ttest_cost\t1.0
    kpis\ttrain_duration\t1.0
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi
    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
\ No newline at end of file
@@ -81,6 +81,8 @@ class ModelHyperParams(object):
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing = True
......
@@ -111,7 +111,10 @@ def multi_head_attention(queries,
x=weights, shape=product.shape, actual_shape=post_softmax_shape)
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False)
weights,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = layers.matmul(weights, v)
return out
@@ -171,7 +174,10 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out, dropout_prob=dropout_rate, is_test=False)
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
@@ -211,7 +217,9 @@ def prepare_encoder(src_word,
shape=[batch_size, seq_len, src_emb_dim],
actual_shape=src_data_shape)
return layers.dropout(
enc_input, dropout_prob=dropout_rate,
enc_input,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if dropout_rate else enc_input
......
@@ -103,6 +103,12 @@ def parse_args():
help="The device type.")
parser.add_argument(
'--sync', type=ast.literal_eval, default=True, help="sync mode.")
parser.add_argument(
"--enable_ce",
type=ast.literal_eval,
default=True,
help="The flag indicating whether to run the task "
"for continuous evaluation.")
args = parser.parse_args()
# Append args related to dict
@@ -382,6 +388,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
data_input_names, util_input_names, sum_cost,
token_num)
# the best cross-entropy value with label smoothing
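# With label smoothing the target distribution puts (1 - label_smooth_eps) on
# the gold token and label_smooth_eps / (trg_vocab_size - 1) on each other
# token; its entropy, computed below, is the lowest reachable cross-entropy and
# is later subtracted from the training loss to report a normalized loss.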
loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
(1. - TrainTaskConfig.label_smooth_eps
)) + TrainTaskConfig.label_smooth_eps *
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
@@ -421,19 +433,27 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
) # sum the cost from multi-devices
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
(pass_id, batch_id, total_sum_cost, total_avg_cost,
np.exp([min(total_avg_cost, 100)])))
print("epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
" ppl: %f" % (pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
if batch_id > 0 and batch_id % 1000 == 0:
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint"))
init = True
time_consumed = time.time() - pass_start_time
# Validate and save the model for inference.
print("epoch: %d, " % pass_id +
("val avg loss: %f, val ppl: %f, " % test()
if args.val_file_pattern is not None else "") + "consumed %fs" %
(time.time() - pass_start_time))
if args.val_file_pattern is not None:
val_avg_cost, val_ppl = test()
print(
"epoch: %d, val avg loss: %f, val normalized loss: %f, val ppl: %f,"
" consumed %fs" % (pass_id, val_avg_cost,
val_avg_cost - loss_normalizer, val_ppl,
time_consumed))
else:
print("epoch: %d, consumed %fs" % (pass_id, time_consumed))
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir,
@@ -442,6 +462,10 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
os.path.join(TrainTaskConfig.model_dir,
"pass_" + str(pass_id) + ".infer.model"),
data_input_names[:-2] + util_input_names, [predict], exe)
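# NOTE: val_avg_cost is only assigned when args.val_file_pattern is given, so a
# CE run is expected to provide a validation set (the CE script above passes
# --val_file_pattern).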
if args.enable_ce: # For CE
print("kpis\ttrain_cost\t%f" % total_avg_cost)
print("kpis\ttest_cost\t%f" % val_avg_cost)
print("kpis\ttrain_duration\t%f" % time_consumed)
def train(args):
@@ -465,6 +489,9 @@ def train(args):
exe = fluid.Executor(place)
if args.enable_ce:
fluid.default_startup_program().random_seed = 1000
sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
......