diff --git a/fluid/PaddleCV/deeplabv3+/.run_ce.sh b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..540fb964ba94fd29dc28bb51342cdba839d433e7
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1  # one MKL thread
+export OMP_NUM_THREADS=1  # one OpenMP thread
+
+DATASET_PATH=${HOME}/.cache/paddle/dataset/cityscape/
+
+cudaid=${deeplabv3plus:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output1 \
+--dataset_path="$DATASET_PATH" \
+--enable_ce | python _ce.py
+
+cudaid=${deeplabv3plus_m:=0,1,2,3} # use 0,1,2,3 card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output4 \
+--dataset_path="$DATASET_PATH" \
+--enable_ce | python _ce.py
diff --git a/fluid/PaddleCV/deeplabv3+/__init__.py b/fluid/PaddleCV/deeplabv3+/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fluid/PaddleCV/deeplabv3+/_ce.py b/fluid/PaddleCV/deeplabv3+/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0127d6445213b9d3934220fa36e9eb44d3e04b4
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/_ce.py
@@ -0,0 +1,60 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.1, 0, actived=True)
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.05, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.1, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.05, 0)
+
+tracking_kpis = [
+        each_pass_duration_card1_kpi,
+        train_loss_card1_kpi,
+        each_pass_duration_card4_kpi,
+        train_loss_card4_kpi,
+        ]
+
+
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) for every KPI record found in the log.
+
+    A record is a line with exactly three tab-separated fields whose
+    first field is the literal 'kpis'; all other lines are skipped.
+    The third field is converted with float().
+
+    "
+    kpis\ttrain_loss_card1\t1.0
+    kpis\teach_pass_duration_card1\t1.0
+    kpis\ttrain_loss_card4\t1.0
+    kpis\teach_pass_duration_card4\t1.0
+    step 1, loss: 1.0          (ignored)
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)  # echoes the split fields of every input line
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)  # KeyError if name is not in tracking_kpis
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/PaddleCV/deeplabv3+/train.py b/fluid/PaddleCV/deeplabv3+/train.py
old mode 100644
new mode 100755
index 817d53d173467f9146918ec9bb6b44141eb0ac3f..a95fb12e4eee6c53a1046f067051f15c0d2dae6b
--- a/fluid/PaddleCV/deeplabv3+/train.py
+++ b/fluid/PaddleCV/deeplabv3+/train.py
@@ -34,6 +34,7 @@ def add_arguments():
     add_argument('parallel', bool, False, "using ParallelExecutor.")
     add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
     add_argument('num_classes', int, 19, "Number of classes.")
+    parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
 
 
 def load_model():
@@ -84,6 +85,14 @@ def loss(logit, label):
     return loss, label_nignore
 
 
+def get_cards(args):
+    if args.enable_ce:
+        # CE names KPIs by card count; unset/empty env falls back to one card.
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES') or '0'
+        return len(cards.split(","))
+    else:
+        return args.num_devices
+
 CityscapeDataset = reader.CityscapeDataset
 parser = argparse.ArgumentParser()
 
@@ -99,6 +108,13 @@ deeplabv3p = models.deeplabv3p
 
 sp = fluid.Program()
 tp = fluid.Program()
+
+# only for ce
+if args.enable_ce:
+    SEED = 102
+    sp.random_seed = SEED
+    tp.random_seed = SEED
+
 crop_size = args.train_crop_size
 batch_size = args.batch_size
 image_shape = [crop_size, crop_size]
@@ -155,7 +171,13 @@ if args.parallel:
 
 batches = dataset.get_batch_generator(batch_size, total_step)
 
+total_time = 0.0
+epoch_idx = 0
+train_loss = 0
+
 for i, imgs, labels, names in batches:
+    epoch_idx += 1
+    begin_time = time.time()
     prev_start_time = time.time()
     if args.parallel:
         retv = exe_p.run(fetch_list=[pred.name, loss_mean.name],
@@ -167,11 +189,22 @@ for i, imgs, labels, names in batches:
                              'label': labels},
                        fetch_list=[pred, loss_mean])
     end_time = time.time()
+    total_time += end_time - begin_time
     if i % 100 == 0:
         print("Model is saved to", args.save_weights_path)
         save_model()
     print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}".format(
         i, np.mean(retv[1]), end_time - prev_start_time))
+    # only for ce
+    train_loss = np.mean(retv[1])
+
+if args.enable_ce:
+    gpu_num = get_cards(args)
+    print("kpis\teach_pass_duration_card%s\t%s" %
+          (gpu_num, total_time / epoch_idx))
+    print("kpis\ttrain_loss_card%s\t%s" %
+          (gpu_num, train_loss))
+
 
 print("Training done. Model is saved to", args.save_weights_path)
 save_model()
diff --git a/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh b/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
old mode 100644
new mode 100755
index eca247a40a3f680a6a59c4a183bfba006ced8d44..f1bb7febd3f2c572544612baf24be14c711108e3
--- a/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
+++ b/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
@@ -6,9 +6,9 @@ export OMP_NUM_THREADS=1
 cudaid=${text_matching_on_quora:=0} # use 0-th card as default
 export CUDA_VISIBLE_DEVICES=$cudaid
 
-FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
+FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
 
 cudaid=${text_matching_on_quora_m:=0,1,2,3} # use 0,1,2,3 card as default
 export CUDA_VISIBLE_DEVICES=$cudaid
 
-FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
+FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
diff --git a/fluid/PaddleNLP/text_matching_on_quora/_ce.py b/fluid/PaddleNLP/text_matching_on_quora/_ce.py
index b38ad21a1e0eb7407f78d100a3cb3659f6c5d8d3..eadeb821da6f7049d1916a65a1ae4eb995c5cb6d 100644
--- a/fluid/PaddleNLP/text_matching_on_quora/_ce.py
+++ b/fluid/PaddleNLP/text_matching_on_quora/_ce.py
@@ -7,11 +7,11 @@ from kpi import CostKpi
 from kpi import DurationKpi
 
 
-each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.05, 0, actived=True)
-train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.2, 0)
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
+train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.08, 0)
 train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
-each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.05, 0, actived=True)
-train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.2, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
+train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.08, 0)
 train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
 
 tracking_kpis = [
diff --git a/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py b/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
index 714fa6f970d9f213efdc6b6e1799b244696fb20d..0f88c6b6ef13aec25e08527b7efabe8638a3af25 100755
--- a/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
+++ b/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
@@ -34,6 +34,7 @@ parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument('--model_name', type=str, default='cdssmNet', help="Which model to train")
 parser.add_argument('--config', type=str, default='cdssm_base', help="The global config setting")
 parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
+parser.add_argument('--epoch_num', type=int, help='Number of epoch')
 
 DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
 
@@ -241,6 +242,9 @@ def main():
     args = parser.parse_args()
     global_config = configs.__dict__[args.config]()
 
+    if args.epoch_num is not None:
+        global_config.epoch_num = args.epoch_num
+
     print("net_name: ", args.model_name)
     net = models.__dict__[args.model_name](global_config)
 