From d5d4806f9e24ee40465f7a5fbcb40692d546ab12 Mon Sep 17 00:00:00 2001
From: liyang109
Date: Thu, 30 May 2019 20:59:11 +0800
Subject: [PATCH] Add CE (continuous evaluation) scripts for transformer
 pserver training

---
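[Patch note, outside the diff] _ce.py reads a complete trainer log on stdin and keeps only the tab-separated "kpis" records, which is why the run scripts below pipe each trainer's combined output through it. The sketch below shows the producer side of that protocol, for orientation only; the helper name emit_kpi and the sample values are illustrative, on the assumption that train.py run with --enable_ce True prints records of this shape into its log.

    def emit_kpi(name, value):
        # One "kpis\t<key>\t<value>" record per line; parse_log() in
        # _ce.py ignores every log line that does not match this shape.
        print('kpis\t%s\t%s' % (name, value))

    emit_kpi('train_cost_card1', 4.2512)      # matched against train_cost_card1_kpi
    emit_kpi('test_cost_card1', 5.0133)
    emit_kpi('train_duration_card1', 132.7)   # wall-clock seconds, a DurationKpi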
 .../transformer/_ce.py                             | 66 ++++++++++++++++++
 .../transformer/run_ps_ce_card1.sh                 | 68 +++++++++++++++++++
 .../transformer/run_ps_ce_card4.sh                 | 68 +++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 PaddleNLP/neural_machine_translation/transformer/_ce.py
 create mode 100644 PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card1.sh
 create mode 100644 PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card4.sh

diff --git a/PaddleNLP/neural_machine_translation/transformer/_ce.py b/PaddleNLP/neural_machine_translation/transformer/_ce.py
new file mode 100644
index 00000000..a2d03d8e
--- /dev/null
+++ b/PaddleNLP/neural_machine_translation/transformer/_ce.py
@@ -0,0 +1,66 @@
+import os
+import sys
+sys.path.insert(0, os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way!
+
+train_cost_card1_kpi = CostKpi('train_cost_card1', 0.02, 0, actived=True)
+test_cost_card1_kpi = CostKpi('test_cost_card1', 0.008, 0, actived=True)
+train_duration_card1_kpi = DurationKpi(
+    'train_duration_card1', 0.06, 0, actived=True)
+train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
+test_cost_card4_kpi = CostKpi('test_cost_card4', 0.008, 0, actived=True)
+train_duration_card4_kpi = DurationKpi(
+    'train_duration_card4', 0.06, 0, actived=True)
+
+tracking_kpis = [
+    train_cost_card1_kpi,
+    test_cost_card1_kpi,
+    train_duration_card1_kpi,
+    train_cost_card4_kpi,
+    test_cost_card4_kpi,
+    train_duration_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Each KPI line in the log should have the form "kpis\tkey\tvalue",
+    for example:
+
+    "
+    kpis\ttrain_cost_card1\t1.0
+    kpis\ttest_cost_card1\t1.0
+    kpis\ttrain_duration_card1\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card1.sh b/PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card1.sh
new file mode 100644
index 00000000..35dc7db9
--- /dev/null
+++ b/PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card1.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+train(){
+
+    DATA_PATH=./dataset/wmt16
+
+    python train.py \
+        --src_vocab_fpath $DATA_PATH/en_10000.dict \
+        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
+        --special_token '<s>' '<e>' '<unk>' \
+        --train_file_pattern $DATA_PATH/wmt16/train \
+        --val_file_pattern $DATA_PATH/wmt16/val \
+        --use_token_batch True \
+        --batch_size 1024 \
+        --sort_type pool \
+        --pool_size 200000 \
+        --shuffle False \
+        --enable_ce True \
+        --local False \
+        --shuffle_batch False \
+        --use_py_reader True \
+        --use_mem_opt True \
+        --fetch_steps 100 "$@" \
+        dropout_seed 10 \
+        learning_rate 2.0 \
+        warmup_steps 8000 \
+        beta2 0.997 \
+        d_model 512 \
+        d_inner_hid 2048 \
+        n_head 8 \
+        prepostprocess_dropout 0.1 \
+        attention_dropout 0.1 \
+        relu_dropout 0.1 \
+        weight_sharing True \
+        pass_num 2 \
+        model_dir 'tmp_models' \
+        ckpt_dir 'tmp_ckpts'
+}
+
+export PADDLE_PSERVERS="127.0.0.1:7160,127.0.0.1:7161"
+export PADDLE_TRAINERS_NUM="2"
+mkdir -p logs
+
+run_ps_ce_card1(){
+    TRAINING_ROLE="PSERVER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7160" \
+    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
+    train &> logs/ps0.log &
+
+    TRAINING_ROLE="PSERVER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7161" \
+    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
+    train &> logs/ps1.log &
+
+    TRAINING_ROLE="TRAINER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7162" \
+    PADDLE_TRAINER_ID="0" \
+    CUDA_VISIBLE_DEVICES="6" \
+    train 2>&1 | tee logs/tr0.log | python _ce.py &
+
+    TRAINING_ROLE="TRAINER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7163" \
+    PADDLE_TRAINER_ID="1" \
+    CUDA_VISIBLE_DEVICES="7" \
+    train 2>&1 | tee logs/tr1.log | python _ce.py &
+}
+
+run_ps_ce_card1
diff --git a/PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card4.sh b/PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card4.sh
new file mode 100644
index 00000000..563e919d
--- /dev/null
+++ b/PaddleNLP/neural_machine_translation/transformer/run_ps_ce_card4.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+train(){
+
+    DATA_PATH=./dataset/wmt16
+
+    python train.py \
+        --src_vocab_fpath $DATA_PATH/en_10000.dict \
+        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
+        --special_token '<s>' '<e>' '<unk>' \
+        --train_file_pattern $DATA_PATH/wmt16/train \
+        --val_file_pattern $DATA_PATH/wmt16/val \
+        --use_token_batch True \
+        --batch_size 1024 \
+        --sort_type pool \
+        --pool_size 200000 \
+        --shuffle False \
+        --enable_ce True \
+        --local False \
+        --shuffle_batch False \
+        --use_py_reader True \
+        --use_mem_opt True \
+        --fetch_steps 100 "$@" \
+        dropout_seed 10 \
+        learning_rate 2.0 \
+        warmup_steps 8000 \
+        beta2 0.997 \
+        d_model 512 \
+        d_inner_hid 2048 \
+        n_head 8 \
+        prepostprocess_dropout 0.1 \
+        attention_dropout 0.1 \
+        relu_dropout 0.1 \
+        weight_sharing True \
+        pass_num 2 \
+        model_dir 'tmp_models' \
+        ckpt_dir 'tmp_ckpts'
+}
+
+export PADDLE_PSERVERS="127.0.0.1:7160,127.0.0.1:7161"
+export PADDLE_TRAINERS_NUM="2"
+mkdir -p logs
+
+run_ps_ce_card4(){
+    TRAINING_ROLE="PSERVER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7160" \
+    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
+    train &> logs/ps2.log &
+
+    TRAINING_ROLE="PSERVER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7161" \
+    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
+    train &> logs/ps3.log &
+
+    TRAINING_ROLE="TRAINER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7162" \
+    PADDLE_TRAINER_ID="0" \
+    CUDA_VISIBLE_DEVICES="0,1,2,3" \
+    train 2>&1 | tee logs/tr2.log | python _ce.py &
+
+    TRAINING_ROLE="TRAINER" \
+    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7163" \
+    PADDLE_TRAINER_ID="1" \
+    CUDA_VISIBLE_DEVICES="4,5,6,7" \
+    train 2>&1 | tee logs/tr3.log | python _ce.py &
+}
+
+run_ps_ce_card4
-- 
GitLab
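[Post-patch note] A quick, framework-free sanity check of the record format: the loop below inlines the same parsing rule that parse_log() applies, copied here so it runs without kpi.py or the ceroot environment variable; the sample log lines and values are made up.

    sample_log = '\n'.join([
        'step 100, avg loss 4.2512',            # ordinary log line: ignored
        'kpis\ttrain_cost_card1\t4.2512',
        'kpis\ttest_cost_card1\t5.0133',
        'kpis\ttrain_duration_card1\t132.7',
    ])

    records = []
    for line in sample_log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':    # same test as parse_log()
            records.append((fs[1], float(fs[2])))

    print(records)
    # [('train_cost_card1', 4.2512), ('test_cost_card1', 5.0133), ('train_duration_card1', 132.7)]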