From 182d5092e4e9ada51a59db4ba0d167e3d3c296ec Mon Sep 17 00:00:00 2001
From: ruri
Date: Thu, 5 Dec 2019 20:09:35 +0800
Subject: [PATCH] add ce for image classification (#4030)

---
 PaddleCV/image_classification/reader.py        | 14 +++++--
 PaddleCV/image_classification/train.py         | 38 +++++++++++------
 .../image_classification/utils/utility.py      | 41 ++++++++++++++++++-
 3 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/PaddleCV/image_classification/reader.py b/PaddleCV/image_classification/reader.py
index 239f9eae..f353cc9a 100644
--- a/PaddleCV/image_classification/reader.py
+++ b/PaddleCV/image_classification/reader.py
@@ -287,17 +287,25 @@ class ImageNetReader:
             full_lines = [line.strip() for line in flist]
             if mode != "test" and len(full_lines) < settings.batch_size:
                 print(
-                    "Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!"
-                    .format(len(full_lines), settings.batch_size))
+                    "Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!".
+                    format(len(full_lines), settings.batch_size))
                 os._exit(1)
             if num_trainers > 1 and mode == "train":
                 assert self.shuffle_seed is not None, "multiprocess train, shuffle seed must be set!"
                 np.random.RandomState(self.shuffle_seed).shuffle(
                     full_lines)
             elif shuffle:
-                np.random.shuffle(full_lines)
+                if not settings.enable_ce or settings.same_feed:
+                    np.random.shuffle(full_lines)
 
             batch_data = []
+            if settings.same_feed:
+                temp_file = full_lines[0]
+                print("Same images({},nums:{}) will feed in the net".format(
+                    str(temp_file), settings.same_feed))
+                full_lines = []
+                for i in range(settings.same_feed):
+                    full_lines.append(temp_file)
             for line in full_lines:
                 img_path, label = line.split()
                 img_path = os.path.join(data_dir, img_path)
diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index b11dab99..baec7a90 100755
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -54,7 +54,7 @@ def build_program(is_train, main_prog, startup_prog, args):
     else:
         model = models.__dict__[args.model]()
     with fluid.program_guard(main_prog, startup_prog):
-        if args.random_seed:
+        if args.random_seed or args.enable_ce:
             main_prog.random_seed = args.random_seed
             startup_prog.random_seed = args.random_seed
         with fluid.unique_name.guard():
@@ -79,8 +79,14 @@ def build_program(is_train, main_prog, startup_prog, args):
     return loss_out
 
 
-def validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id,
-             train_batch_metrics_record):
+def validate(args,
+             test_iter,
+             exe,
+             test_prog,
+             test_fetch_list,
+             pass_id,
+             train_batch_metrics_record,
+             train_batch_time_record=None):
     test_batch_time_record = []
     test_batch_metrics_record = []
     test_batch_id = 0
@@ -96,12 +102,11 @@ def validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id,
         test_batch_metrics_avg = np.mean(np.array(test_batch_metrics), axis=1)
         test_batch_metrics_record.append(test_batch_metrics_avg)
 
-        print_info(pass_id, test_batch_id, args.print_step,
-                   test_batch_metrics_avg, test_batch_elapse, "batch")
+        print_info("batch", test_batch_metrics_avg, test_batch_elapse, pass_id,
+                   test_batch_id, args.print_step)
         sys.stdout.flush()
         test_batch_id += 1
 
-    #train_epoch_time_avg = np.mean(np.array(train_batch_time_record))
     train_epoch_metrics_avg = np.mean(
         np.array(train_batch_metrics_record), axis=0)
 
@@ -109,9 +114,18 @@
     test_epoch_metrics_avg = np.mean(
         np.array(test_batch_metrics_record), axis=0)
 
-    print_info(pass_id, 0, 0,
-               list(train_epoch_metrics_avg) + list(test_epoch_metrics_avg),
-               test_epoch_time_avg, "epoch")
+    print_info(
+        "epoch",
+        list(train_epoch_metrics_avg) + list(test_epoch_metrics_avg),
+        test_epoch_time_avg,
+        pass_id=pass_id)
+    if args.enable_ce:
+        device_num = fluid.core.get_cuda_device_count() if args.use_gpu else 1
+        print_info(
+            "ce",
+            list(train_epoch_metrics_avg) + list(test_epoch_metrics_avg),
+            train_batch_time_record,
+            device_num=device_num)
 
 
 def train(args):
@@ -207,8 +221,8 @@ def train(args):
                     np.array(train_batch_metrics), axis=1)
                 train_batch_metrics_record.append(train_batch_metrics_avg)
                 if trainer_id == 0:
-                    print_info(pass_id, train_batch_id, args.print_step,
-                               train_batch_metrics_avg, train_batch_elapse, "batch")
+                    print_info("batch", train_batch_metrics_avg, train_batch_elapse,
+                               pass_id, train_batch_id, args.print_step)
                     sys.stdout.flush()
                 train_batch_id += 1
                 t1 = time.time()
@@ -232,7 +246,7 @@ def train(args):
                 print('ExponentialMovingAverage validate over!')
 
             validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id,
-                     train_batch_metrics_record)
+                     train_batch_metrics_record, train_batch_time_record)
             #For now, save model per epoch.
             if pass_id % args.save_step == 0:
                 save_model(args, exe, train_prog, pass_id)
diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py
index 77045856..32b17fd9 100644
--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -136,7 +136,9 @@ def parse_args():
     add_arg('label_smoothing_epsilon', float, 0.1, "The value of label_smoothing_epsilon parameter")
     #NOTE: (2019/08/08) temporary disable use_distill
     #add_arg('use_distill', bool, False, "Whether to use distill")
+    add_arg("enable_ce", bool, False, "Whether to enable ce")
     add_arg('random_seed', int, None, "random seed")
+
     add_arg('use_ema', bool, False, "Whether to use ExponentialMovingAverage.")
     add_arg('ema_decay', float, 0.9999, "The value of ema decay rate")
     add_arg('padding_type', str, "SAME", "Padding type of convolution")
@@ -146,6 +148,7 @@
     add_arg('profiler_path', str, './', "the profiler output file path.(used for benchmark)")
     add_arg('max_iter', int, 0, "the max train batch num.(used for benchmark)")
     add_arg('validate', int, 1, "whether validate.(used for benchmark)")
+    add_arg('same_feed', int, 0, "whether to feed same images")
     # yapf: enable
 
 
@@ -263,6 +266,10 @@ def check_args(args):
         args.data_dir
     ), "Data doesn't exist in {}, please load right path".format(args.data_dir)
 
+    if args.enable_ce:
+        args.random_seed = 0
+        print("CE is running now!")
+
     #check gpu
     check_gpu()
 
@@ -344,7 +351,13 @@ def create_data_loader(is_train, args):
     return data_loader, [feed_image, feed_label]
 
 
-def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode):
+def print_info(info_mode,
+               metrics,
+               time_info,
+               pass_id=0,
+               batch_id=0,
+               print_step=1,
+               device_num=1):
     """print function
 
     Args:
@@ -355,6 +368,7 @@ def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode):
        time_info: time infomation
        info_mode: mode
    """
+    #XXX: Use specific name to choose pattern, not the length of metrics.
    if info_mode == "batch":
        if batch_id % print_step == 0:
            #if isinstance(metrics,np.ndarray):
@@ -402,11 +416,34 @@ def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode):
                        "%.5f" % test_acc5))
        sys.stdout.flush()
    elif info_mode == "ce":
-        raise Warning("CE code is not ready")
+        assert len(
+            metrics
+        ) == 7, "Enable CE: The Metrics should contain train_loss, train_acc1, train_acc5, test_loss, test_acc1, test_acc5, and train_speed"
+        assert len(
+            time_info
+        ) > 10, "0~9th batch statistics will drop when doing benchmark or ce, because it might be mixed with startup time, so please make sure training at least 10 batches."
+        print_ce(device_num, metrics, time_info)
+        #raise Warning("CE code is not ready")
    else:
        raise Exception("Illegal info_mode")
 
 
+def print_ce(device_num, metrics, time_info):
+    """ Print log for CE(for internal test).
+    """
+    train_loss, train_acc1, train_acc5, _, test_loss, test_acc1, test_acc5 = metrics
+
+    train_speed = np.mean(np.array(time_info[10:]))
+
+    print("kpis\ttrain_cost_card{}\t{}".format(device_num, train_loss))
+    print("kpis\ttrain_acc1_card{}\t{}".format(device_num, train_acc1))
+    print("kpis\ttrain_acc5_card{}\t{}".format(device_num, train_acc5))
+    print("kpis\ttest_loss_card{}\t{}".format(device_num, test_loss))
+    print("kpis\ttest_acc1_card{}\t{}".format(device_num, test_acc1))
+    print("kpis\ttest_acc5_card{}\t{}".format(device_num, test_acc5))
+    print("kpis\ttrain_speed_card{}\t{}".format(device_num, train_speed))
+
+
 def best_strategy_compiled(args, program, loss, exe):
     """make a program which wrapped by a compiled program
     """
--
GitLab
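Editorial note (not part of the patch): the same_feed change to the reader above makes CE runs reproducible by replacing the (optionally shuffled) file list with N copies of its first entry, so every batch feeds identical images. The sketch below is a minimal standalone restatement of that logic for reference; the Settings stand-in and the sample file list are hypothetical, and only the shuffle/duplication behavior mirrors the patch.

    # Sketch of the reader-side CE logic added above (illustrative only).
    from collections import namedtuple

    import numpy as np

    Settings = namedtuple("Settings", ["enable_ce", "same_feed"])


    def build_file_list(full_lines, settings, shuffle=True):
        """Return the lines the reader would iterate over."""
        if shuffle and (not settings.enable_ce or settings.same_feed):
            # Mirrors the patch: shuffling is skipped only when enable_ce is set
            # and same_feed is off, keeping the feed order reproducible.
            np.random.shuffle(full_lines)
        if settings.same_feed:
            # Feed the same sample `same_feed` times instead of the real list.
            temp_file = full_lines[0]
            print("Same images({},nums:{}) will feed in the net".format(
                str(temp_file), settings.same_feed))
            full_lines = [temp_file] * settings.same_feed
        return full_lines


    if __name__ == "__main__":
        lines = ["img/a.jpg 0", "img/b.jpg 1", "img/c.jpg 2"]  # made-up sample list
        print(build_file_list(lines, Settings(enable_ce=True, same_feed=4)))
        # Prints four copies of whichever line ended up first after shuffling.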
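Editorial note (not part of the patch): when CE is enabled, the new print_ce above writes one tab-separated KPI line per metric to stdout in the form kpis\t<name>_card<device_num>\t<value>, and the reported train_speed is the mean of time_info[10:], dropping the first ten batches that may include startup time. The helper below is a hypothetical illustration of how such lines could be picked out of a training log; it is not part of the repository, and the sample values are made up.

    # Illustrative parser for the "kpis\t<name>_card<N>\t<value>" lines emitted
    # by print_ce(); shown only to make the expected log format concrete.
    def parse_kpis(log_lines):
        kpis = {}
        for line in log_lines:
            parts = line.rstrip("\n").split("\t")
            if len(parts) == 3 and parts[0] == "kpis":
                kpis[parts[1]] = float(parts[2])
        return kpis


    if __name__ == "__main__":
        sample_log = [  # made-up values, for format illustration only
            "kpis\ttrain_cost_card1\t6.93",
            "kpis\ttrain_acc1_card1\t0.001",
            "kpis\ttrain_speed_card1\t812.5",
        ]
        print(parse_kpis(sample_log))
        # {'train_cost_card1': 6.93, 'train_acc1_card1': 0.001, 'train_speed_card1': 812.5}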