From 868ed52da2eded9a3b20973eb1611af80bf4ac8d Mon Sep 17 00:00:00 2001 From: zhengya01 Date: Tue, 26 Feb 2019 13:46:29 +0000 Subject: [PATCH] add ctr ce --- fluid/PaddleRec/ctr/.run_ce.sh | 9 +++++++-- fluid/PaddleRec/ctr/_ce.py | 10 +++++++++- fluid/PaddleRec/ctr/train.py | 27 +++++++++++++++------------ 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/fluid/PaddleRec/ctr/.run_ce.sh b/fluid/PaddleRec/ctr/.run_ce.sh index b5b1c9dc..8bd8986d 100755 --- a/fluid/PaddleRec/ctr/.run_ce.sh +++ b/fluid/PaddleRec/ctr/.run_ce.sh @@ -4,8 +4,13 @@ export MKL_NUM_THREADS=1 export OMP_NUM_THREADS=1 -cudaid=${face_detection:=0} # use 0-th card as default -export CUDA_VISIBLE_DEVICES=$cudaid +#cudaid=${face_detection:=0} # use 0-th card as default +#export CUDA_VISIBLE_DEVICES=$cudaid +export NUM_THREADS=1 + +FLAGS_benchmark=true python train.py --is_local 1 --cloud_train 0 --train_data_path data/raw/train.txt --enable_ce | python _ce.py + +export NUM_THREADS=4 FLAGS_benchmark=true python train.py --is_local 1 --cloud_train 0 --train_data_path data/raw/train.txt --enable_ce | python _ce.py diff --git a/fluid/PaddleRec/ctr/_ce.py b/fluid/PaddleRec/ctr/_ce.py index 67e18566..7597dd52 100644 --- a/fluid/PaddleRec/ctr/_ce.py +++ b/fluid/PaddleRec/ctr/_ce.py @@ -13,12 +13,20 @@ each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0) train_auc_val_card1_kpi = AccKpi('train_auc_val_card1', 0.08, 0) train_batch_auc_val_card1_kpi = AccKpi('train_batch_auc_val_card1', 0.08, 0) +each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True) +train_loss_card4_kpi = CostKpi('train_loss_card4', 0.08, 0) +train_auc_val_card4_kpi = AccKpi('train_auc_val_card4', 0.08, 0) +train_batch_auc_val_card4_kpi = AccKpi('train_batch_auc_val_card4', 0.08, 0) tracking_kpis = [ each_pass_duration_card1_kpi, train_loss_card1_kpi, train_auc_val_card1_kpi, - train_batch_auc_val_card1_kpi + train_batch_auc_val_card1_kpi, + each_pass_duration_card4_kpi, + train_loss_card4_kpi, + train_auc_val_card4_kpi, + train_batch_auc_val_card4_kpi ] diff --git a/fluid/PaddleRec/ctr/train.py b/fluid/PaddleRec/ctr/train.py index 97c26dc0..bbeebafe 100644 --- a/fluid/PaddleRec/ctr/train.py +++ b/fluid/PaddleRec/ctr/train.py @@ -112,10 +112,10 @@ def parse_args(): action='store_true', help='If set, run the task with continuous evaluation logs.') parser.add_argument( - '--num_devices', + '--num_threads', type=int, - default=0, - help='The num of devices, (default: 1)') + default=1, + help='The num of threads, (default: 1)') return parser.parse_args() @@ -193,16 +193,17 @@ def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var, # only for ce if args.enable_ce: - gpu_num = get_cards(args) + cpu_num = get_cards(args) + print("cpu_num", cpu_num) epoch_idx = args.num_passes print("kpis\teach_pass_duration_card%s\t%s" % - (gpu_num, total_time / epoch_idx)) + (cpu_num, total_time / epoch_idx)) print("kpis\ttrain_loss_card%s\t%s" % - (gpu_num, loss_val/args.batch_size)) + (cpu_num, loss_val/args.batch_size)) print("kpis\ttrain_auc_val_card%s\t%s" % - (gpu_num, auc_val)) + (cpu_num, auc_val)) print("kpis\ttrain_batch_auc_val_card%s\t%s" % - (gpu_num, batch_auc_val)) + (cpu_num, batch_auc_val)) def train(): @@ -257,11 +258,13 @@ def train(): def get_cards(args): if args.enable_ce: - cards = os.environ.get('CUDA_VISIBLE_DEVICES') - num = len(cards.split(",")) - return num + cards = os.environ.get('NUM_THREADS', 1) + print("cards", cards) + + return int(cards) else: - return args.num_devices + print("return args.num_threads") + return args.num_threads if __name__ == '__main__': -- GitLab