From 7e0e08a842e70243f371bdf287c2978b44d0471a Mon Sep 17 00:00:00 2001 From: zhengya01 Date: Fri, 1 Mar 2019 03:59:31 +0000 Subject: [PATCH] add word2vec ce --- fluid/PaddleRec/word2vec/.run_ce.sh | 13 ++++++ fluid/PaddleRec/word2vec/_ce.py | 62 +++++++++++++++++++++++++++++ fluid/PaddleRec/word2vec/train.py | 33 ++++++++++++++- 3 files changed, 107 insertions(+), 1 deletion(-) create mode 100755 fluid/PaddleRec/word2vec/.run_ce.sh create mode 100644 fluid/PaddleRec/word2vec/_ce.py diff --git a/fluid/PaddleRec/word2vec/.run_ce.sh b/fluid/PaddleRec/word2vec/.run_ce.sh new file mode 100755 index 00000000..dc7635fe --- /dev/null +++ b/fluid/PaddleRec/word2vec/.run_ce.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + + +export CPU_NUM=1 + +FLAGS_benchmark=true python train.py --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict --with_hs --is_local --num_passes 10 --enable_ce | python _ce.py + +export CPU_NUM=8 + +FLAGS_benchmark=true python train.py --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict --with_hs --is_local --num_passes 10 --enable_ce | python _ce.py diff --git a/fluid/PaddleRec/word2vec/_ce.py b/fluid/PaddleRec/word2vec/_ce.py new file mode 100644 index 00000000..c8eefd74 --- /dev/null +++ b/fluid/PaddleRec/word2vec/_ce.py @@ -0,0 +1,62 @@ +# this file is only used for continuous evaluation test! + +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi +from kpi import DurationKpi +from kpi import AccKpi + + +each_pass_duration_cpu1_thread1_kpi = DurationKpi('each_pass_duration_cpu1_thread1', 0.08, 0, actived=True) +train_loss_cpu1_thread1_kpi = CostKpi('train_loss_cpu1_thread1', 0.08, 0) +each_pass_duration_cpu8_thread8_kpi = DurationKpi('each_pass_duration_cpu8_thread8', 0.08, 0, actived=True) +train_loss_cpu8_thread8_kpi = CostKpi('train_loss_cpu8_thread8', 0.08, 0) + +tracking_kpis = [ + each_pass_duration_cpu1_thread1_kpi, + train_loss_cpu1_thread1_kpi, + each_pass_duration_cpu8_thread8_kpi, + train_loss_cpu8_thread8_kpi, + ] + + +def parse_log(log): + ''' + This method should be implemented by model developers. + + The suggestion: + + each line in the log should be key, value, for example: + + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + log_to_ce(log) diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py index 2c8512e4..31ee54f6 100644 --- a/fluid/PaddleRec/word2vec/train.py +++ b/fluid/PaddleRec/word2vec/train.py @@ -129,6 +129,11 @@ def parse_args(): default=4, help="find rank_num-nearest result for test (default: 4)") + parser.add_argument( + '--enable_ce', + action='store_true', + help='If set, run the task with continuous evaluation logs.') + return parser.parse_args() @@ -198,6 +203,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id): profiler_step_start = 20 profiler_step_end = 30 + total_time = 0 + ce_info = [] for pass_id in range(args.num_passes): py_reader.start() time.sleep(10) @@ -206,11 +213,14 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id): start = time.time() try: - while True: + start_time = time.time() loss_val = train_exe.run(fetch_list=[loss.name]) loss_val = np.mean(loss_val) + total_time += time.time() - start_time + ce_info.append(loss_val.mean()) + if batch_id % 50 == 0: logger.info( "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}". @@ -250,6 +260,27 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id): fluid.io.save_persistables(executor=exe, dirname=model_dir) with open(model_dir + "/_success", 'w+') as f: f.write(str(pass_id)) + # only for ce + if args.enable_ce: + threads_num, cpu_num = get_cards(args) + epoch_idx = args.num_passes + ce_loss = 0 + try: + ce_loss = ce_info[-1] + except: + logger.error("ce info error") + + print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" % + (cpu_num, threads_num, total_time / epoch_idx)) + print("kpis\ttrain_loss_cpu%s_thread%s\t%s" % + (cpu_num, threads_num, ce_loss)) + + +def get_cards(args): + threads_num = os.environ.get('CPU_NUM', 1) + cpu_num = os.environ.get('CPU_NUM', 1) + return int(threads_num), int(cpu_num) + def GetFileList(data_path): -- GitLab