From edc6b3623e7d9526570b594eb70a0d62b97f4aab Mon Sep 17 00:00:00 2001
From: baiyfbupt
Date: Wed, 8 Aug 2018 00:47:05 +0800
Subject: [PATCH] kpi fix

---
 fluid/object_detection/.run_ce.sh | 14 +++++++++---
 fluid/object_detection/_ce.py     | 10 ++++-----
 fluid/object_detection/train.py   | 37 ++++++++++++++++---
 3 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/fluid/object_detection/.run_ce.sh b/fluid/object_detection/.run_ce.sh
index 78318b20..50809e77 100755
--- a/fluid/object_detection/.run_ce.sh
+++ b/fluid/object_detection/.run_ce.sh
@@ -1,11 +1,19 @@
+###!/bin/bash
+####This file is only used for continuous evaluation.
+
 export MKL_NUM_THREADS=1
 export OMP_NUM_THREADS=1
-cudaid=${object_detection_cudaid:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
 if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then
     mkdir -p /root/.cache/paddle/dataset/pascalvoc
     ./data/pascalvoc/download.sh
     cp -r ./data/pascalvoc/. /home/.cache/paddle/dataset/pascalvoc
 fi
-FLAGS_benchmark=true python train.py --for_model_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
+
+cudaid=${object_detection_cudaid:=0}
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
+
+cudaid=${object_detection_cudaid:=0,1,2,3}
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
diff --git a/fluid/object_detection/_ce.py b/fluid/object_detection/_ce.py
index c33f61d0..5abb3c9b 100644
--- a/fluid/object_detection/_ce.py
+++ b/fluid/object_detection/_ce.py
@@ -8,15 +8,15 @@ from kpi import CostKpi, DurationKpi, AccKpi
 #### NOTE kpi.py should shared in models in some way!!!!

 train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
-test_acc_kpi = AccKpi('test_acc', 0.005, actived=True)
-train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
-train_acc_kpi = AccKpi('train_acc', 0.005, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.01, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.2, actived=True)
+train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, actived=True)

 tracking_kpis = [
-    train_acc_kpi,
     train_cost_kpi,
     test_acc_kpi,
-    train_duration_kpi,
+    train_speed_kpi,
+    train_speed_card4_kpi,
 ]


diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index 1bdd3796..bf3134d0 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -11,11 +11,6 @@ import reader
 from mobilenet_ssd import mobile_net
 from utility import add_arguments, print_arguments

-SEED = 90
-
-# random seed must set before configuring the network.
-fluid.default_startup_program().random_seed = SEED
-
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
@@ -38,7 +33,7 @@ add_arg('mean_value_G', float, 127.5, "Mean value for G channel which will
 add_arg('mean_value_R', float, 127.5, "Mean value for R channel which will be subtracted.") #103.94
 add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample.")
 add_arg('data_dir', str, 'data/pascalvoc', "data directory")
-add_arg('for_model_ce', bool, False, "Use CE to evaluate the model")
+add_arg('enable_ce', bool, False, "Whether use CE to evaluate the model")
 #yapf: enable


@@ -51,6 +46,9 @@ def train(args,
           num_passes,
           model_save_dir,
           pretrained_model=None):
+    if args.enable_ce:
+        fluid.framework.default_startup_program().random_seed = 111
+
     image_shape = [3, data_args.resize_h, data_args.resize_w]
     if 'coco' in data_args.dataset:
         num_classes = 91
@@ -124,8 +122,12 @@ def train(args,
     train_exe = fluid.ParallelExecutor(
         use_cuda=args.use_gpu, loss_name=loss.name)

-    train_reader = paddle.batch(
-        reader.train(data_args, train_file_list), batch_size=batch_size)
+    if not args.enable_ce:
+        train_reader = paddle.batch(
+            reader.train(data_args, train_file_list), batch_size=batch_size)
+    else:
+        train_reader = paddle.batch(
+            reader.train(data_args, train_file_list, False), batch_size=batch_size)
     test_reader = paddle.batch(
         reader.test(data_args, val_file_list), batch_size=batch_size)
     feeder = fluid.DataFeeder(
@@ -143,17 +145,20 @@ def train(args,
     def test(pass_id, best_map):
         _, accum_map = map_eval.get_map_var()
         map_eval.reset(exe)
+        every_pass_map=[]
         for batch_id, data in enumerate(test_reader()):
             test_map, = exe.run(test_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[accum_map])
             if batch_id % 20 == 0:
+                every_pass_map.append(test_map)
                 print("Batch {0}, map {1}".format(batch_id, test_map))
+        mean_map = np.mean(every_pass_map)
         if test_map[0] > best_map:
             best_map = test_map[0]
             save_model('best_model')
         print("Pass {0}, test map {1}".format(pass_id, test_map))
-        return best_map
+        return best_map, mean_map

     total_time = 0.0
     for pass_id in range(num_passes):
@@ -183,28 +188,24 @@ def train(args,
                         pass_id, batch_id, loss_v, start_time - prev_start_time))
         end_time = time.time()

-        if args.for_model_ce:
-            gpu_num = get_cards()
+        best_map, mean_map = test(pass_id, best_map)
+        if args.enable_ce and pass_id == 1:
             total_time += end_time - start_time
             train_avg_loss = np.mean(every_pass_loss)
-            if gpu_num == 1:
+            if devices_num == 1:
                 print ("kpis train_cost %s" % train_avg_loss)
+                print ("kpis test_acc %s" % mean_map)
                 print ("kpis train_speed %s" % (total_time / epoch_idx))
             else:
                 print ("kpis train_cost_card%s %s" % (gpu_num, train_avg_loss))
+                print ("kpis test_acc_card%s %s" % (gpu_num, mean_map))
                 print ("kpis train_speed_card%s %f" % (gpu_num, total_time / epoch_idx))

-        best_map = test(pass_id, best_map)
         if pass_id % 10 == 0 or pass_id == num_passes - 1:
             save_model(str(pass_id))
     print("Best test map {0}".format(best_map))


-def get_cards():
-    cards = os.environ.get('CUDA_VISIBLE_DEVICES')
-    num = len(cards.split(","))
-    return num
-
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
-- 
GitLab
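For context on the pipeline this patch touches: .run_ce.sh pipes the stdout of train.py into _ce.py, and _ce.py matches lines of the form "kpis <name> <value>" against the KPI objects it tracks after this change (train_cost, test_acc, train_speed, train_speed_card4), built from kpi.py's CostKpi/AccKpi classes. The sketch below illustrates only that log-parsing step; it is not the repository's _ce.py, a plain dict stands in for the CostKpi/AccKpi objects and their persistence API, and the whitespace field separator is an assumption.

# Illustrative sketch only: collect "kpis <name> <value>" lines emitted by
# train.py from stdin, mirroring the .run_ce.sh pipeline
# `python train.py ... | python _ce.py`. A dict replaces the real KPI objects;
# the names below are the ones tracked by _ce.py after this patch.
import sys

TRACKED_KPIS = ('train_cost', 'test_acc', 'train_speed', 'train_speed_card4')


def parse_kpi_log(stream):
    """Return {kpi_name: last reported value} for the tracked KPIs."""
    records = {}
    for line in stream:
        fields = line.strip().split()  # assumes whitespace-separated fields
        if len(fields) != 3 or fields[0] != 'kpis':
            continue  # not a KPI line, skip it
        name, value = fields[1], fields[2]
        if name in TRACKED_KPIS:
            records[name] = float(value)
    return records


if __name__ == '__main__':
    kpis = parse_kpi_log(sys.stdin)
    for name, value in sorted(kpis.items()):
        print('%s = %f' % (name, value))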