diff --git a/fluid/neural_machine_translation/rnn_search/.run_ce.sh b/fluid/neural_machine_translation/rnn_search/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e423795e8c22532332b46e4ddb714ec2e44e517a
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/.run_ce.sh
@@ -0,0 +1,5 @@
+###!/bin/bash
+####This file is only used for continuous evaluation.
+
+model_file='train.py'
+python $model_file --pass_num 1 --learning_rate 0.001 --save_interval 10 --enable_ce
diff --git a/fluid/neural_machine_translation/rnn_search/_ce.py b/fluid/neural_machine_translation/rnn_search/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3d7beef6b8d55485ec87f40ce2aa8e7fe2ddc49
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/_ce.py
@@ -0,0 +1,63 @@
+####This file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!!!!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
+test_cost_kpi = CostKpi('test_cost', 0.005, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
+
+tracking_kpis = [
+    train_cost_kpi,
+    test_cost_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each KPI line in the log should be tab-separated as "kpis\tkey\tvalue", for example:
+
+    "
+    kpis\ttrain_cost\t1.0
+    kpis\ttest_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/fluid/neural_machine_translation/rnn_search/args.py b/fluid/neural_machine_translation/rnn_search/args.py
index 50eacdf417b4f57bfb28436c9f76e04670689733..fc0b61b2da1f1a4c2ddbe5785cb4f2f6aad92af6 100644
--- a/fluid/neural_machine_translation/rnn_search/args.py
+++ b/fluid/neural_machine_translation/rnn_search/args.py
@@ -89,5 +89,9 @@ def parse_args():
         default=1,
         help="Save the trained model every n passes."
         "(default: %(default)d)")
+    parser.add_argument(
+        "--enable_ce",
+        action='store_true',
+        help="If set, run the task with continuous evaluation logs.")
     args = parser.parse_args()
     return args
diff --git a/fluid/neural_machine_translation/rnn_search/train.py b/fluid/neural_machine_translation/rnn_search/train.py
index 589fd160d9138fd20cb931b96c71d0c2c2cfdb49..7bd3628917d83eb1221312e01f8b0fd906f46a64 100644
--- a/fluid/neural_machine_translation/rnn_search/train.py
+++ b/fluid/neural_machine_translation/rnn_search/train.py
@@ -35,6 +35,9 @@ import no_attention_model
 def train():
     args = parse_args()
 
+    if args.enable_ce:
+        framework.default_startup_program().random_seed = 111
+
     # Training process
     if args.no_attention:
         avg_cost, feed_order = no_attention_model.seq_to_seq_net(
@@ -68,17 +71,28 @@ def train():
 
     optimizer.minimize(avg_cost)
 
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
-        batch_size=args.batch_size,
-        drop_last=False)
+    if not args.enable_ce:
+        train_batch_generator = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+            batch_size=args.batch_size,
+            drop_last=False)
+
+        test_batch_generator = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+            batch_size=args.batch_size,
+            drop_last=False)
+    else:
+        train_batch_generator = paddle.batch(
+            paddle.dataset.wmt14.train(args.dict_size),
+            batch_size=args.batch_size,
+            drop_last=False)
 
-    test_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
-        batch_size=args.batch_size,
-        drop_last=False)
+        test_batch_generator = paddle.batch(
+            paddle.dataset.wmt14.test(args.dict_size),
+            batch_size=args.batch_size,
+            drop_last=False)
 
     place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
     exe = Executor(place)
@@ -123,6 +137,9 @@ def train():
             avg_cost_train = np.array(fetch_outs[0])
             print('pass_id=%d, batch_id=%d, train_loss: %f' %
                   (pass_id, batch_id, avg_cost_train))
+            # This is for continuous evaluation only
+            if args.enable_ce and batch_id >= 100:
+                break
 
         pass_end_time = time.time()
         test_loss = validation()
@@ -131,6 +148,12 @@ def train():
         print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
               (pass_id, test_loss, words_per_sec, time_consumed))
 
+        # This log is for continuous evaluation only
+        if args.enable_ce:
+            print("kpis\ttrain_cost\t%f" % avg_cost_train)
+            print("kpis\ttest_cost\t%f" % test_loss)
+            print("kpis\ttrain_duration\t%f" % time_consumed)
+
         if pass_id % args.save_interval == 0:
             model_path = os.path.join(args.save_dir, str(pass_id))
             if not os.path.isdir(model_path):
diff --git a/fluid/neural_machine_translation_rnn_search b/fluid/neural_machine_translation_rnn_search
new file mode 120000
index 0000000000000000000000000000000000000000..29002f1776a3f4e0bfa0b32a1aebc44d66b65628
--- /dev/null
+++ b/fluid/neural_machine_translation_rnn_search
@@ -0,0 +1 @@
+./neural_machine_translation/rnn_search
\ No newline at end of file