diff --git a/PaddleCV/image_classification/eval.py b/PaddleCV/image_classification/eval.py
index 254ec21d480a9e52e3f25f4cfcd8ab20567ac14f..773ea927cffbf2c9faf0eb26e52d6270072b92dc 100644
--- a/PaddleCV/image_classification/eval.py
+++ b/PaddleCV/image_classification/eval.py
@@ -50,6 +50,7 @@ add_arg('padding_type', str, "SAME", "Padding type of convolu
 add_arg('use_se', bool, True, "Whether to use Squeeze-and-Excitation module for EfficientNet.")
 add_arg('save_json_path', str, None, "Whether to save output in json file.")
 add_arg('same_feed', int, 0, "Whether to feed same images")
+add_arg('print_step', int, 1, "the batch step to print info")
 # yapf: enable
@@ -65,6 +66,11 @@ def eval(args):
     assert args.image_shape[
         1] <= args.resize_short_size, "Please check the args:image_shape and args:resize_short_size, The croped size(image_shape[1]) must smaller than or equal to the resized length(resize_short_size) "
 
+    # check gpu: when using gpu, the number of visible cards should divide batch size
+    if args.use_gpu:
+        assert args.batch_size % fluid.core.get_cuda_device_count(
+        ) == 0, "please support correct batch_size({}), which can be divided by available cards({}), you can change the number of cards by indicating: export CUDA_VISIBLE_DEVICES= ".format(
+            args.batch_size, fluid.core.get_cuda_device_count())
     image = fluid.data(
         name='image', shape=[None] + args.image_shape, dtype='float32')
     label = fluid.data(name='label', shape=[None, 1], dtype='int64')
@@ -98,11 +104,9 @@ def eval(args):
     acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
     acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
 
-    #startup_prog = fluid.Program()
-
     test_program = fluid.default_main_program().clone(for_test=True)
 
-    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
+    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name, pred.name]
 
     gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
     place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
@@ -118,34 +122,59 @@ def eval(args):
         fluid.io.load_persistables(exe, args.pretrained_model)
     imagenet_reader = reader.ImageNetReader()
     val_reader = imagenet_reader.val(settings=args)
-    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
-    val_reader = feeder.decorate_reader(val_reader, multi_devices=True)
+    # set places to run on the multi-card
+    feeder = fluid.DataFeeder(place=places, feed_list=[image, label])
 
     test_info = [[], [], []]
     cnt = 0
+    parallel_data = []
+    parallel_id = []
+    place_num = paddle.fluid.core.get_cuda_device_count()
+    real_iter = 0
+    info_dict = {}
+
     for batch_id, data in enumerate(val_reader()):
-        t1 = time.time()
-        loss, acc1, acc5 = exe.run(compiled_program,
-                                   fetch_list=fetch_list,
-                                   feed=data)
-        t2 = time.time()
-        period = t2 - t1
-        loss = np.mean(loss)
-        acc1 = np.mean(acc1)
-        acc5 = np.mean(acc5)
-        test_info[0].append(loss * len(data))
-        test_info[1].append(acc1 * len(data))
-        test_info[2].append(acc5 * len(data))
-        cnt += len(data)
-        if batch_id % 10 == 0:
-            info = "Testbatch {0},loss {1}, acc1 {2},acc5 {3},time {4}".format(batch_id, \
+        #image data and label
+        image_data = [items[0:2] for items in data]
+        image_id = [items[2] for items in data]
+        parallel_id.append(image_id)
+        parallel_data.append(image_data)
+        if place_num == len(parallel_data):
+            t1 = time.time()
+            loss_set, acc1_set, acc5_set, pred_set = exe.run(
+                compiled_program,
+                fetch_list=fetch_list,
+                feed=list(feeder.feed_parallel(parallel_data, place_num)))
+            t2 = time.time()
+            period = t2 - t1
+            loss = np.mean(loss_set)
+            acc1 = np.mean(acc1_set)
+            acc5 = np.mean(acc5_set)
+            test_info[0].append(loss * len(data))
+            test_info[1].append(acc1 * len(data))
+            test_info[2].append(acc5 * len(data))
+            cnt += len(data)
+            if batch_id % args.print_step == 0:
+                info = "Testbatch {0},loss {1}, acc1 {2},acc5 {3},time {4}".format(real_iter, \
                     "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5, \
                     "%2.2f sec" % period)
-            print(info)
+                print(info)
+                sys.stdout.flush()
+
             if args.save_json_path:
-                save_json(info, args.save_json_path)
-            sys.stdout.flush()
+                for i, res in enumerate(pred_set):
+                    pred_label = np.argsort(res)[::-1][:1]
+                    real_id = str(np.array(parallel_id).flatten()[i])
+                    _, real_id = os.path.split(real_id)
+                    info_dict[real_id] = {}
+                    info_dict[real_id]['score'], info_dict[real_id][
+                        'class'] = str(res[pred_label]), str(pred_label)
+                save_json(info_dict, args.save_json_path)
+
+            parallel_id = []
+            parallel_data = []
+            real_iter += 1
 
     test_loss = np.sum(test_info[0]) / cnt
     test_acc1 = np.sum(test_info[1]) / cnt
diff --git a/PaddleCV/image_classification/infer.py b/PaddleCV/image_classification/infer.py
index 8df267427dac2513472d917d0a6821e2c27f3e7d..708be476a8804cb767369c218d8177ecc5cd0a65 100644
--- a/PaddleCV/image_classification/infer.py
+++ b/PaddleCV/image_classification/infer.py
@@ -97,12 +97,13 @@ def infer(args):
     test_program = fluid.default_main_program().clone(for_test=True)
 
     fetch_list = [out.name]
-
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
+    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
-
-    places = fluid.framework.cuda_places()
+    places = place
+    if args.use_gpu:
+        places = fluid.framework.cuda_places()
     compiled_program = fluid.compiler.CompiledProgram(
         test_program).with_data_parallel(places=places)
@@ -140,7 +141,7 @@ def infer(args):
     info = {}
     parallel_data = []
     parallel_id = []
-    place_num = paddle.fluid.core.get_cuda_device_count()
+    place_num = paddle.fluid.core.get_cuda_device_count() if args.use_gpu else 1
 
     for batch_id, data in enumerate(test_reader()):
         image_data = [[items[0]] for items in data]
diff --git a/PaddleCV/image_classification/reader.py b/PaddleCV/image_classification/reader.py
index 6f7e097c072588eb5c750bcf63feab0abf76bfd3..53c53fb7b0bf64a42a3a3e6db3186a55b94d8915 100644
--- a/PaddleCV/image_classification/reader.py
+++ b/PaddleCV/image_classification/reader.py
@@ -236,11 +236,18 @@ def process_image(sample, settings, mode, color_jitter, rotate):
     img_std = np.array(std).reshape((3, 1, 1))
     img -= img_mean
     img /= img_std
-
-    if mode == 'train' or mode == 'val':
+    # doing training (train.py)
+    if mode == 'train' or (mode == 'val' and
+                           not hasattr(settings, 'save_json_path')):
         return (img, sample[1])
+    #doing testing (eval.py)
+    elif mode == 'val' and hasattr(settings, 'save_json_path'):
+        return (img, sample[1], sample[0])
+    #doing predict (infer.py)
     elif mode == 'test':
         return (img, sample[0])
+    else:
+        raise Exception("mode not implemented")
 
 
 def process_batch_data(input_data, settings, mode, color_jitter, rotate):
@@ -264,14 +271,14 @@ class ImageNetReader:
 
     def _get_single_card_bs(self, settings, mode):
         if settings.use_gpu:
-            if mode == "val" and settings.test_batch_size:
+            if mode == "val" and hasattr(settings, "test_batch_size"):
                 single_card_bs = settings.test_batch_size // paddle.fluid.core.get_cuda_device_count(
                 )
             else:
                 single_card_bs = settings.batch_size // paddle.fluid.core.get_cuda_device_count(
                 )
         else:
-            if mode == "val" and settings.test_batch_size:
+            if mode == "val" and hasattr(settings, "test_batch_size"):
                 single_card_bs = settings.test_batch_size // int(
                     os.environ.get('CPU_NUM', 1))
             else:
diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py
index 44668556e5c0c22de388d9f98403c5d2c81219c8..c621c2290a2a5d0d688a6fd647707e6b117d0d72 100644
--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -92,6 +92,7 @@ def parse_args():
     add_arg('model_save_dir', str, "./output", "The directory path to save model.")
     add_arg('data_dir', str, "./data/ILSVRC2012/", "The ImageNet dataset root directory.")
     add_arg('pretrained_model', str, None, "Whether to load pretrained model.")
+    add_arg('finetune_exclude_pretrained_params', str, None, "Ignore params when doing finetune")
     add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
     add_arg('print_step', int, 10, "The steps interval to print logs")
     add_arg('save_step', int, 1, "The steps interval to save checkpoints")
@@ -293,9 +294,9 @@ def init_model(exe, args, program):
         print("Finish initing model from %s" % (args.checkpoint))
 
     if args.pretrained_model:
+        """
         # yapf: disable
-
-        #XXX: should rename all models' final fc layers name as final_fc_weights and final_fc_offset!
+        # This is a dict of fc layers in all the classification models.
         final_fc_name = [
             "fc8_weights","fc8_offset", #alexnet
             "fc_weights","fc_offset", #darknet, densenet, dpn, hrnet, mobilenet_v3, res2net, res2net_vd, resnext, resnext_vd, xception
@@ -312,6 +313,13 @@ def init_model(exe, args, program):
             "fc_bias" #"fc_weights", xception_deeplab
         ]
         # yapf: enable
+        """
+        final_fc_name = []
+        if args.finetune_exclude_pretrained_params:
+            final_fc_name = [
+                str(s)
+                for s in args.finetune_exclude_pretrained_params.split(",")
+            ]
 
         def is_parameter(var):
             fc_exclude_flag = False
@@ -324,8 +332,8 @@ def init_model(exe, args, program):
                 Parameter) and not fc_exclude_flag and os.path.exists(
                     os.path.join(args.pretrained_model, var.name))
 
-        print("Load pretrain weights from {}, exclude fc layer.".format(
-            args.pretrained_model))
+        print("Load pretrain weights from {}, exclude params {}.".format(
+            args.pretrained_model, final_fc_name))
         vars = filter(is_parameter, program.list_vars())
         fluid.io.load_vars(
             exe, args.pretrained_model, vars=vars, main_program=program)
@@ -474,7 +482,6 @@ def print_info(info_mode,
             time_info
         ) > 10, "0~9th batch statistics will drop when doing benchmark or ce, because it might be mixed with startup time, so please make sure training at least 10 batches."
         print_ce(device_num, metrics, time_info)
-        #raise Warning("CE code is not ready")
     else:
         raise Exception("Illegal info_mode")
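The core of the eval.py change is that the validation reader now yields one mini-batch per visible card, and exe.run is only called once place_num mini-batches have been buffered. Below is a minimal, framework-free sketch of that grouping; fake_reader and the toy tuples are illustrative placeholders, not PaddleCV APIs.

# Sketch only (assumption: toy data, no Paddle installed): mirrors the
# per-card buffering that the new eval.py loop performs before each exe.run.
place_num = 2                      # e.g. CUDA_VISIBLE_DEVICES exposes two cards

def fake_reader():
    # each yielded list is one card's mini-batch of (image, label, path) tuples,
    # matching the 3-element samples the modified reader.py returns in val mode
    for i in range(6):
        yield [("img_tensor_%d" % i, i % 10, "data/val/img%d.jpg" % i)]

parallel_data, parallel_id = [], []
for batch_id, data in enumerate(fake_reader()):
    parallel_data.append([items[0:2] for items in data])   # (image, label) pairs for feeding
    parallel_id.append([items[2] for items in data])        # image paths kept for the json output
    if len(parallel_data) == place_num:
        # at this point eval.py calls:
        # exe.run(feed=list(feeder.feed_parallel(parallel_data, place_num)))
        print("would run on %d cards, ids=%s" % (place_num, parallel_id))
        parallel_data, parallel_id = [], []

This is also why the new assert requires batch_size to be divisible by the number of visible cards: each card must receive an equally sized slice of the batch.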
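The utility.py change replaces the hard-coded final_fc_name list with the new --finetune_exclude_pretrained_params flag. A hedged sketch of the resulting filtering follows, assuming a substring-style match in is_parameter (the matching code itself lies outside this hunk); all_var_names is made-up example data.

# Sketch only: shows how the comma-separated flag becomes an exclusion list;
# the real filtering lives in is_parameter() and fluid.io.load_vars.
finetune_exclude_pretrained_params = "fc_weights,fc_offset"
final_fc_name = [str(s) for s in finetune_exclude_pretrained_params.split(",")]

all_var_names = ["conv1_weights", "fc_weights", "fc_offset", "bn_conv1_scale"]  # made-up
loaded = [
    name for name in all_var_names
    if not any(excluded in name for excluded in final_fc_name)
]
print(loaded)   # ['conv1_weights', 'bn_conv1_scale']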