diff --git a/PaddleNLP/language_model/.run_ce.sh b/PaddleNLP/language_model/.run_ce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..96c7e71defd5c14675ce3303bc652b43dd1cd484
--- /dev/null
+++ b/PaddleNLP/language_model/.run_ce.sh
@@ -0,0 +1,15 @@
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py \
+    --data_path data/simple-examples/data/ \
+    --model_type test \
+    --use_gpu True \
+    --rnn_model static \
+    --enable_ce | python _ce.py
+
+python train.py \
+    --data_path data/simple-examples/data/ \
+    --model_type test \
+    --use_gpu True \
+    --rnn_model padding \
+    --enable_ce | python _ce.py
diff --git a/PaddleNLP/language_model/__init__.py b/PaddleNLP/language_model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleNLP/language_model/_ce.py b/PaddleNLP/language_model/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a2cadae315c5e0101b41f8d3e1f949be3c1001f
--- /dev/null
+++ b/PaddleNLP/language_model/_ce.py
@@ -0,0 +1,62 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imikolov_20_avg_ppl_kpi_card1 = CostKpi('lstm_language_model_static_loss_card1', 0.01, 0)
+imikolov_20_pass_duration_kpi_card1 = DurationKpi(
+    'lstm_language_model_static_duration_card1', 0.03, 0, actived=True)
+imikolov_20_avg_ppl_kpi_card1_padding = CostKpi('lstm_language_model_padding_loss_card1', 0.01, 0)
+imikolov_20_pass_duration_kpi_card1_padding = DurationKpi(
+    'lstm_language_model_padding_duration_card1', 0.03, 0, actived=True)
+
+tracking_kpis = [
+    imikolov_20_avg_ppl_kpi_card1,
+    imikolov_20_pass_duration_kpi_card1,
+    imikolov_20_avg_ppl_kpi_card1_padding,
+    imikolov_20_pass_duration_kpi_card1_padding,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'ptblm':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/PaddleNLP/language_model/train.py b/PaddleNLP/language_model/train.py
index 115b803e7bd61b72dea76c7137a00144b6240128..e399e9989d6a621de745e2e8b2aae512a529ce47 100644
--- a/PaddleNLP/language_model/train.py
+++ b/PaddleNLP/language_model/train.py
@@ -286,9 +286,13 @@ def train():
             print("train ppl", ppl[0])
 
             if epoch_id == max_epoch - 1 and args.enable_ce:
-                print("ptblm\tlstm_language_model_duration\t%s" %
-                      (total_time / max_epoch))
-                print("ptblm\tlstm_language_model_loss\t%s" % ppl[0])
+                # KPI names must match those registered in _ce.py, which are
+                # suffixed with the rnn_model variant and the card count.
+                card_num = get_cards()
+                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
+                      (args.rnn_model, card_num, total_time / max_epoch))
+                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
+                      (args.rnn_model, card_num, ppl[0]))
 
             model_path = os.path.join("model_new/", str(epoch_id))
             if not os.path.isdir(model_path):
@@ -301,5 +305,14 @@
         print("test ppl", test_ppl[0])
 
 
+def get_cards():
+    """Return the number of GPUs listed in CUDA_VISIBLE_DEVICES (0 if unset)."""
+    num = 0
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if cards != '':
+        num = len(cards.split(","))
+    return num
+
+
 if __name__ == '__main__':
     train()