# Patch summary (descriptive preamble; text ahead of the first "diff --git"
# header is not part of any hunk).
#
# Files touched, all under fluid/neural_machine_translation/transformer/:
#
#   .run_ce.sh
#     Adds a first "train | python _ce.py" run with CUDA_VISIBLE_DEVICES taken
#     from ${transformer:=0}, then sets CUDA_VISIBLE_DEVICES from
#     ${transformer_m:=0,1,2,3} ahead of the pre-existing
#     "train | python _ce.py" line, so the script now runs the pipeline twice.
#
#   _ce.py
#     Replaces the three KPI objects (train_cost, test_cost, train_duration)
#     with six, named with a "_card1" or "_card4" suffix, and lists all six in
#     tracking_kpis. The second positional argument of the train_cost KPIs
#     changes from 0.02 to 0.01; the test_cost (0.005) and train_duration
#     (0.06) values are unchanged.
#
#   train.py
#     Inside train_loop, under "if args.enable_ce:", the three "kpis\t..."
#     print statements now append "_card%d" to the KPI name, formatted with
#     dev_count.
#
# NOTE(review): the KPI names printed by train.py are built from dev_count,
# while _ce.py declares only the card1 and card4 variants. Presumably
# dev_count matches the number of devices exposed by .run_ce.sh (1 and 4 with
# the defaults) -- confirm what happens when transformer / transformer_m are
# overridden with a different device count.
#
# NOTE(review): in this copy the diff's line breaks appear collapsed into
# spaces, so the original indentation of hunk lines cannot be recovered here.
#
diff --git a/fluid/neural_machine_translation/transformer/.run_ce.sh b/fluid/neural_machine_translation/transformer/.run_ce.sh index e37856e5055ac3689416355ca9b29e6b8911598f..50161ca4662547dd19a098ee1f360a717da1b9a7 100644 --- a/fluid/neural_machine_translation/transformer/.run_ce.sh +++ b/fluid/neural_machine_translation/transformer/.run_ce.sh @@ -24,4 +24,12 @@ train(){ dropout_seed 10 } +cudaid=${transformer:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +train | python _ce.py + +cudaid=${transformer_m:=0,1,2,3} # use 0,1,2,3 card as default +export CUDA_VISIBLE_DEVICES=$cudaid + train | python _ce.py diff --git a/fluid/neural_machine_translation/transformer/_ce.py b/fluid/neural_machine_translation/transformer/_ce.py index b774af075b028f4afafeb575f65cc8c9de8dc96b..49e67a63c8377d01ff196ccb49a2e3f93814bcc0 100644 --- a/fluid/neural_machine_translation/transformer/_ce.py +++ b/fluid/neural_machine_translation/transformer/_ce.py @@ -7,14 +7,22 @@ from kpi import CostKpi, DurationKpi, AccKpi #### NOTE kpi.py should shared in models in some way!!!! 
-train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True) -test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True) -train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True) +train_cost_card1_kpi = CostKpi('train_cost_card1', 0.01, 0, actived=True) +test_cost_card1_kpi = CostKpi('test_cost_card1', 0.005, 0, actived=True) +train_duration_card1_kpi = DurationKpi( + 'train_duration_card1', 0.06, 0, actived=True) +train_cost_card4_kpi = CostKpi('train_cost_card4', 0.01, 0, actived=True) +test_cost_card4_kpi = CostKpi('test_cost_card4', 0.005, 0, actived=True) +train_duration_card4_kpi = DurationKpi( + 'train_duration_card4', 0.06, 0, actived=True) tracking_kpis = [ - train_cost_kpi, - test_cost_kpi, - train_duration_kpi, + train_cost_card1_kpi, + test_cost_card1_kpi, + train_duration_card1_kpi, + train_cost_card4_kpi, + test_cost_card4_kpi, + train_duration_card4_kpi, ] diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index beb2b7d3e6e6db09ac9269cd285f71446bb69270..1148fe58845003f44eb80186ecd3b462c130c209 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -463,9 +463,9 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, "pass_" + str(pass_id) + ".infer.model"), data_input_names[:-2] + util_input_names, [predict], exe) if args.enable_ce: # For CE - print("kpis\ttrain_cost\t%f" % total_avg_cost) - print("kpis\ttest_cost\t%f" % val_avg_cost) - print("kpis\ttrain_duration\t%f" % time_consumed) + print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost)) + print("kpis\ttest_cost_card%d\t%f" % (dev_count, val_avg_cost)) + print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed)) def train(args):