From 58347b8bc0270aecb5fabeeed9bf5d1382c32154 Mon Sep 17 00:00:00 2001
From: zhengya01
Date: Thu, 27 Dec 2018 13:01:54 +0000
Subject: [PATCH] add ce

---
 fluid/PaddleCV/deeplabv3+/.run_ce.sh  | 28 +++++++++++++
 fluid/PaddleCV/deeplabv3+/__init__.py |  0
 fluid/PaddleCV/deeplabv3+/_ce.py      | 60 +++++++++++++++++++++++++++
 fluid/PaddleCV/deeplabv3+/train.py    | 33 +++++++++++++++
 4 files changed, 121 insertions(+)
 create mode 100755 fluid/PaddleCV/deeplabv3+/.run_ce.sh
 create mode 100644 fluid/PaddleCV/deeplabv3+/__init__.py
 create mode 100644 fluid/PaddleCV/deeplabv3+/_ce.py
 mode change 100644 => 100755 fluid/PaddleCV/deeplabv3+/train.py

diff --git a/fluid/PaddleCV/deeplabv3+/.run_ce.sh b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
new file mode 100755
index 00000000..540fb964
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+DATASET_PATH=${HOME}/.cache/paddle/dataset/cityscape/
+
+cudaid=${deeplabv3plus:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output1 \
+--dataset_path=$DATASET_PATH \
+--enable_ce | python _ce.py
+
+cudaid=${deeplabv3plus_m:=0,1,2,3} # use 0,1,2,3 card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output4 \
+--dataset_path=$DATASET_PATH \
+--enable_ce | python _ce.py
diff --git a/fluid/PaddleCV/deeplabv3+/__init__.py b/fluid/PaddleCV/deeplabv3+/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/fluid/PaddleCV/deeplabv3+/_ce.py b/fluid/PaddleCV/deeplabv3+/_ce.py
new file mode 100644
index 00000000..b0127d64
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/_ce.py
@@ -0,0 +1,60 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.1, 0, actived=True)
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.05, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.1, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.05, 0)
+
+tracking_kpis = [
+    each_pass_duration_card1_kpi,
+    train_loss_card1_kpi,
+    each_pass_duration_card4_kpi,
+    train_loss_card4_kpi,
+    ]
+
+
+def parse_log(log):
+    '''
+    Yield (kpi_name, kpi_value) pairs found in the training log.
+
+    A KPI record is a line with exactly three tab-separated fields whose
+    first field is the literal 'kpis', for example:
+
+    "
+    kpis\teach_pass_duration_card1\t0.35
+    kpis\ttrain_loss_card1\t1.2
+    "
+
+    Every line is echoed to stdout as its list of fields; lines that are
+    not KPI records are otherwise skipped.
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    '''Record and persist every parsed KPI; an untracked name raises KeyError.'''
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/PaddleCV/deeplabv3+/train.py b/fluid/PaddleCV/deeplabv3+/train.py
old mode 100644
new mode 100755
index 817d53d1..a95fb12e
--- a/fluid/PaddleCV/deeplabv3+/train.py
+++ b/fluid/PaddleCV/deeplabv3+/train.py
@@ -34,6 +34,7 @@ def add_arguments():
     add_argument('parallel', bool, False, "using ParallelExecutor.")
     add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
     add_argument('num_classes', int, 19, "Number of classes.")
+    parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
 
 
 def load_model():
@@ -84,6 +85,14 @@ def loss(logit, label):
     return loss, label_nignore
 
 
+def get_cards(args):
+    """Return the card count used to suffix the CE KPI names."""
+    if not args.enable_ce:
+        return args.num_devices
+    # An unset or empty CUDA_VISIBLE_DEVICES is reported as a single card.
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    return max(len([c for c in cards.split(',') if c.strip()]), 1)
+
 CityscapeDataset = reader.CityscapeDataset
 parser = argparse.ArgumentParser()
 
@@ -99,6 +108,13 @@ deeplabv3p = models.deeplabv3p
 
 sp = fluid.Program()
 tp = fluid.Program()
+
+# only for ce
+if args.enable_ce:
+    SEED = 102
+    sp.random_seed = SEED
+    tp.random_seed = SEED
+
 crop_size = args.train_crop_size
 batch_size = args.batch_size
 image_shape = [crop_size, crop_size]
@@ -155,7 +171,13 @@ if args.parallel:
 
 batches = dataset.get_batch_generator(batch_size, total_step)
 
+total_time = 0.0
+epoch_idx = 0
+train_loss = 0
+
 for i, imgs, labels, names in batches:
+    epoch_idx += 1
+    begin_time = time.time()
     prev_start_time = time.time()
     if args.parallel:
         retv = exe_p.run(fetch_list=[pred.name, loss_mean.name],
@@ -167,11 +189,22 @@ for i, imgs, labels, names in batches:
                              'label': labels},
                        fetch_list=[pred, loss_mean])
     end_time = time.time()
+    total_time += end_time - begin_time
     if i % 100 == 0:
         print("Model is saved to", args.save_weights_path)
         save_model()
     print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}".format(
         i, np.mean(retv[1]), end_time - prev_start_time))
 
+    # only for ce
+    train_loss = np.mean(retv[1])
+
+if args.enable_ce:
+    gpu_num = get_cards(args)
+    print("kpis\teach_pass_duration_card%s\t%s" %
+          (gpu_num, total_time / max(epoch_idx, 1)))
+    print("kpis\ttrain_loss_card%s\t%s" %
+          (gpu_num, train_loss))
+
 print("Training done. Model is saved to", args.save_weights_path)
 save_model()
-- 
GitLab