Support JSON output in multi-card eval and add an option to exclude FC params in finetune (#4095)

* Support JSON output in eval and add an option to exclude FC params in finetune * Fix CPU bug

Support JSON output in multi-card eval and add an option to exclude FC params in finetune (#4095)
* Support JSON output in eval and add an option to exclude FC params in finetune * Fix CPU bug
54f64c66 · ruri · GitHub · 7e493aa1 · 54f64c66 · 54f64c66
4 changed files
--- a/PaddleCV/image_classification/eval.py
+++ b/PaddleCV/image_classification/eval.py
@@ -50,6 +50,7 @@ add_arg('padding_type',     str,  "SAME",               "Padding type of convolu
 add_arg('use_se',           bool, True,                 "Whether to use Squeeze-and-Excitation module for EfficientNet.")
 add_arg('save_json_path',   str,  None,                 "Whether to save output in json file.")
 add_arg('same_feed',        int,  0,                    "Whether to feed same images")
+add_arg('print_step',       int,  1,                    "the batch step to print info")
 # yapf: enable
@@ -65,6 +66,11 @@ def eval(args):
    assert args.image_shape[
        1] <= args.resize_short_size, "Please check the args:image_shape and args:resize_short_size, The croped size(image_shape[1]) must smaller than or equal to the resized length(resize_short_size) "
+    # check gpu: when using gpu, the number of visible cards should divide batch size
+    if args.use_gpu:
+        assert args.batch_size % fluid.core.get_cuda_device_count(
+        ) == 0, "please support correct batch_size({}), which can be divided by available cards({}), you can change the number of cards by indicating: export CUDA_VISIBLE_DEVICES= ".format(
+            args.batch_size, fluid.core.get_cuda_device_count())
    image = fluid.data(
        name='image', shape=[None] + args.image_shape, dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
@@ -98,11 +104,9 @@ def eval(args):
        acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
-    #startup_prog = fluid.Program()
    test_program = fluid.default_main_program().clone(for_test=True)
-    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
+    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name, pred.name]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
@@ -118,34 +122,59 @@ def eval(args):
    fluid.io.load_persistables(exe, args.pretrained_model)
    imagenet_reader = reader.ImageNetReader()
    val_reader = imagenet_reader.val(settings=args)
-    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
-    val_reader = feeder.decorate_reader(val_reader, multi_devices=True)
+    # build the feeder over `places` so batches can be fed for multi-card evaluation
+    feeder = fluid.DataFeeder(place=places, feed_list=[image, label])
    test_info = [[], [], []]
    cnt = 0
+    parallel_data = []
+    parallel_id = []
+    place_num = paddle.fluid.core.get_cuda_device_count()
+    real_iter = 0
+    info_dict = {}
    for batch_id, data in enumerate(val_reader()):
-        t1 = time.time()
+        # each item is (image, label, image id): split the image/label pair from the id
-        loss, acc1, acc5 = exe.run(compiled_program,
+        image_data = [items[0:2] for items in data]
-                                   fetch_list=fetch_list,
+        image_id = [items[2] for items in data]
-                                   feed=data)
+        parallel_id.append(image_id)
-        t2 = time.time()
+        parallel_data.append(image_data)
-        period = t2 - t1
+        if place_num == len(parallel_data):
-        loss = np.mean(loss)
+            t1 = time.time()
-        acc1 = np.mean(acc1)
+            loss_set, acc1_set, acc5_set, pred_set = exe.run(
-        acc5 = np.mean(acc5)
+                compiled_program,
-        test_info[0].append(loss * len(data))
+                fetch_list=fetch_list,
-        test_info[1].append(acc1 * len(data))
+                feed=list(feeder.feed_parallel(parallel_data, place_num)))
-        test_info[2].append(acc5 * len(data))
+            t2 = time.time()
-        cnt += len(data)
+            period = t2 - t1
-        if batch_id % 10 == 0:
+            loss = np.mean(loss_set)
-            info = "Testbatch {0},loss {1}, acc1 {2},acc5 {3},time {4}".format(batch_id, \
+            acc1 = np.mean(acc1_set)
+            acc5 = np.mean(acc5_set)
+            test_info[0].append(loss * len(data))
+            test_info[1].append(acc1 * len(data))
+            test_info[2].append(acc5 * len(data))
+            cnt += len(data)
+            if batch_id % args.print_step == 0:
+                info = "Testbatch {0},loss {1}, acc1 {2},acc5 {3},time {4}".format(real_iter, \
                  "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5, \
                  "%2.2f sec" % period)
-            print(info)
+                print(info)
+                sys.stdout.flush()
            if args.save_json_path:
-                save_json(info, args.save_json_path)
+                for i, res in enumerate(pred_set):
-            sys.stdout.flush()
+                    pred_label = np.argsort(res)[::-1][:1]
+                    real_id = str(np.array(parallel_id).flatten()[i])
+                    _, real_id = os.path.split(real_id)
+                    info_dict[real_id] = {}
+                    info_dict[real_id]['score'], info_dict[real_id][
+                        'class'] = str(res[pred_label]), str(pred_label)
+                    save_json(info_dict, args.save_json_path)
+            parallel_id = []
+            parallel_data = []
+            real_iter += 1
    test_loss = np.sum(test_info[0]) / cnt
    test_acc1 = np.sum(test_info[1]) / cnt

--- a/PaddleCV/image_classification/infer.py
+++ b/PaddleCV/image_classification/infer.py
@@ -97,12 +97,13 @@ def infer(args):
    test_program = fluid.default_main_program().clone(for_test=True)
    fetch_list = [out.name]
+    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
+    places = place
-    places = fluid.framework.cuda_places()
+    if args.use_gpu:
+        places = fluid.framework.cuda_places()
    compiled_program = fluid.compiler.CompiledProgram(
        test_program).with_data_parallel(places=places)
@@ -140,7 +141,7 @@ def infer(args):
    info = {}
    parallel_data = []
    parallel_id = []
-    place_num = paddle.fluid.core.get_cuda_device_count()
+    place_num = paddle.fluid.core.get_cuda_device_count() if args.use_gpu else 1
    for batch_id, data in enumerate(test_reader()):
        image_data = [[items[0]] for items in data]

--- a/PaddleCV/image_classification/reader.py
+++ b/PaddleCV/image_classification/reader.py
@@ -236,11 +236,18 @@ def process_image(sample, settings, mode, color_jitter, rotate):
    img_std = np.array(std).reshape((3, 1, 1))
    img -= img_mean
    img /= img_std
+    # training, or validation when settings has no save_json_path (train.py): return image and label only
-    if mode == 'train' or mode == 'val':
+    if mode == 'train' or (mode == 'val' and
+                           not hasattr(settings, 'save_json_path')):
        return (img, sample[1])
+    # evaluation when settings has save_json_path (eval.py): also return sample[0] so results can be keyed per image
+    elif mode == 'val' and hasattr(settings, 'save_json_path'):
+        return (img, sample[1], sample[0])
+    # prediction (infer.py): return the image and sample[0]; no label is returned
    elif mode == 'test':
        return (img, sample[0])
+    else:
+        raise Exception("mode not implemented")
 def process_batch_data(input_data, settings, mode, color_jitter, rotate):
@@ -264,14 +271,14 @@ class ImageNetReader:
    def _get_single_card_bs(self, settings, mode):
        if settings.use_gpu:
-            if mode == "val" and settings.test_batch_size:
+            if mode == "val" and hasattr(settings, "test_batch_size"):
                single_card_bs = settings.test_batch_size // paddle.fluid.core.get_cuda_device_count(
                )
            else:
                single_card_bs = settings.batch_size // paddle.fluid.core.get_cuda_device_count(
                )
        else:
-            if mode == "val" and settings.test_batch_size:
+            if mode == "val" and hasattr(settings, "test_batch_size"):
                single_card_bs = settings.test_batch_size // int(
                    os.environ.get('CPU_NUM', 1))
            else:

--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -92,6 +92,7 @@ def parse_args():
    add_arg('model_save_dir',           str,    "./output",        "The directory path to save model.")
    add_arg('data_dir',                 str,    "./data/ILSVRC2012/",   "The ImageNet dataset root directory.")
    add_arg('pretrained_model',         str,    None,                   "Whether to load pretrained model.")
+    add_arg('finetune_exclude_pretrained_params', str, None,            "Ignore params when doing finetune")
    add_arg('checkpoint',               str,    None,                   "Whether to resume checkpoint.")
    add_arg('print_step',               int,    10,                     "The steps interval to print logs")
    add_arg('save_step',                int,    1,                      "The steps interval to save checkpoints")
@@ -293,9 +294,9 @@ def init_model(exe, args, program):
        print("Finish initing model from %s" % (args.checkpoint))
    if args.pretrained_model:
+        """
        # yapf: disable
+        # This is a dict of fc layers in all the classification models.
-        #XXX: should rename all models' final fc layers name as final_fc_weights and final_fc_offset!
        final_fc_name = [
                         "fc8_weights","fc8_offset", #alexnet
                         "fc_weights","fc_offset", #darknet, densenet, dpn, hrnet, mobilenet_v3, res2net, res2net_vd, resnext, resnext_vd, xception
@@ -312,6 +313,13 @@ def init_model(exe, args, program):
                         "fc_bias" #"fc_weights", xception_deeplab
                         ]
        # yapf: enable
+        """
+        final_fc_name = []
+        if args.finetune_exclude_pretrained_params:
+            final_fc_name = [
+                str(s)
+                for s in args.finetune_exclude_pretrained_params.split(",")
+            ]
        def is_parameter(var):
            fc_exclude_flag = False
@@ -324,8 +332,8 @@ def init_model(exe, args, program):
                Parameter) and not fc_exclude_flag and os.path.exists(
                    os.path.join(args.pretrained_model, var.name))
-        print("Load pretrain weights from {}, exclude fc layer.".format(
+        print("Load pretrain weights from {}, exclude params {}.".format(
-            args.pretrained_model))
+            args.pretrained_model, final_fc_name))
        vars = filter(is_parameter, program.list_vars())
        fluid.io.load_vars(
            exe, args.pretrained_model, vars=vars, main_program=program)
@@ -474,7 +482,6 @@ def print_info(info_mode,
            time_info
        ) > 10, "0~9th batch statistics will drop when doing benchmark or ce, because it might be mixed with startup time, so please make sure training at least 10 batches."
        print_ce(device_num, metrics, time_info)
-        #raise Warning("CE code is not ready")
    else:
        raise Exception("Illegal info_mode")