diff --git a/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml b/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml
index d16c0507dcd546f2dd34c8965f3be5abcbef8fb6..5349d6b1ed381705218f32daf17bff92a233d89e 100644
--- a/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml
+++ b/configs/ppyolo/ppyolo_r50vd_dcn_voc.yml
@@ -13,6 +13,12 @@ TrainReader:
   mixup_epoch: 350
   batch_size: 12
 
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
+
 epoch: 583
 
 LearningRate:
diff --git a/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml b/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml
index 45de7733a3f263b7dbf84a0f0bf933eedcc9b78e..feaec0c43273aecded4dc1d6c63164ceef50c487 100644
--- a/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml
+++ b/configs/ssd/ssd_mobilenet_v1_300_120e_voc.yml
@@ -6,3 +6,9 @@ _BASE_: [
   '_base_/ssd_mobilenet_reader.yml',
 ]
 weights: output/ssd_mobilenet_v1_300_120e_voc/model_final
+
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
diff --git a/configs/ssd/ssd_vgg16_300_240e_voc.yml b/configs/ssd/ssd_vgg16_300_240e_voc.yml
index 58cf4b9855a4ca414faf67eae2179f40fc63bf77..ff24242a1fb94a8a895b6230684865bb40fff44a 100644
--- a/configs/ssd/ssd_vgg16_300_240e_voc.yml
+++ b/configs/ssd/ssd_vgg16_300_240e_voc.yml
@@ -6,3 +6,9 @@ _BASE_: [
   '_base_/ssd_reader.yml',
 ]
 weights: output/ssd_vgg16_300_240e_voc/model_final
+
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
diff --git a/configs/yolov3/yolov3_darknet53_270e_voc.yml b/configs/yolov3/yolov3_darknet53_270e_voc.yml
index bb7a315ef29a09eb5d223447789af83ca40a1619..205fce756e318e5d0546e522942874264ddb7034 100644
--- a/configs/yolov3/yolov3_darknet53_270e_voc.yml
+++ b/configs/yolov3/yolov3_darknet53_270e_voc.yml
@@ -8,3 +8,9 @@ _BASE_: [
 
 snapshot_epoch: 5
 weights: output/yolov3_darknet53_270e_voc/model_final
+
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
diff --git a/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml
index 7b25cd0e38fa59794050048b7e1d100c1d403170..996757af6be052409b5a71f8d543e5da63cb491d 100644
--- a/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml
+++ b/configs/yolov3/yolov3_mobilenet_v1_270e_voc.yml
@@ -9,6 +9,12 @@ _BASE_: [
 snapshot_epoch: 5
 weights: output/yolov3_mobilenet_v1_270e_voc/model_final
 
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
+
 LearningRate:
   base_lr: 0.001
   schedulers:
diff --git a/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml
index 7a3e62fa1ee1effe9e7109938a2d8e217f9d5b9e..0f9c85fd981113ccbd1e1080000ea76a0cd680a6 100644
--- a/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml
+++ b/configs/yolov3/yolov3_mobilenet_v1_ssld_270e_voc.yml
@@ -10,6 +10,12 @@ snapshot_epoch: 5
 pretrain_weights:  https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV1_ssld_pretrained.pdparams
 weights: output/yolov3_mobilenet_v1_ssld_270e_voc/model_final
 
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
+
 LearningRate:
   base_lr: 0.001
   schedulers:
diff --git a/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml
index abf492e235eb7e1a9a3f58905383486a4900a4aa..e246c8bae484833e7e63034318f150c7fbba93d6 100644
--- a/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml
+++ b/configs/yolov3/yolov3_mobilenet_v3_large_270e_voc.yml
@@ -9,6 +9,12 @@ _BASE_: [
 snapshot_epoch: 5
 weights: output/yolov3_mobilenet_v3_large_270e_voc/model_final
 
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
+
 LearningRate:
   base_lr: 0.001
   schedulers:
diff --git a/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml b/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml
index 6d183e3e2207b0dc2023b81113513b4fbdcdd4f7..13a2583397bfda58ab7e06d9b1621edec47f506e 100644
--- a/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml
+++ b/configs/yolov3/yolov3_mobilenet_v3_large_ssld_270e_voc.yml
@@ -10,6 +10,12 @@ snapshot_epoch: 5
 pretrain_weights:  https://paddledet.bj.bcebos.com/models/pretrained/MobileNetV3_large_x1_0_ssld_pretrained.pdparams
 weights: output/yolov3_mobilenet_v3_large_ssld_270e_voc/model_final
 
+# Set collate_batch to false: VOC evaluation needs the ground-truth info,
+# whose size varies per image, so samples must not be collated into a
+# batch when the batch size is larger than 1.
+EvalReader:
+  collate_batch: false
+
 LearningRate:
   base_lr: 0.001
   schedulers:
diff --git a/ppdet/data/reader.py b/ppdet/data/reader.py
index b8d01247459bb3e391ba3e5db69b0622d9e5e6f0..b03be3d32e5fc1936f3f1fd1f2e704b98536025a 100644
--- a/ppdet/data/reader.py
+++ b/ppdet/data/reader.py
@@ -95,10 +95,9 @@ class BatchCompose(Compose):
                 tmp_data = []
                 for i in range(len(data)):
                     tmp_data.append(data[i][k])
-                if not 'gt_' in k and not 'is_crowd' in k:
+                if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
                     tmp_data = np.stack(tmp_data, axis=0)
                 batch_data[k] = tmp_data
-
         return batch_data
 
 
@@ -118,6 +117,11 @@ class BaseDataLoader(object):
         drop_empty (bool): whether to drop samples with no ground
                            truth labels, default True
         num_classes (int): class number of dataset, default 80
+        collate_batch (bool): whether to collate batch in dataloader.
+            If set to True, the samples will be collated into batches
+            according to the batch size. Otherwise, the ground-truth fields
+            are not collated, which is needed when the number of
+            ground-truths differs between samples.
         use_shared_memory (bool): whether to use shared memory to
                 accelerate data loading, enable this only if you
                 are sure that the shared memory size of your OS
diff --git a/ppdet/metrics/metrics.py b/ppdet/metrics/metrics.py
index e4ad1544f4808f721445390f07b5c81441ef21ca..ed81c4210ea41170b7ef22e7cbc6639d697a0b6f 100644
--- a/ppdet/metrics/metrics.py
+++ b/ppdet/metrics/metrics.py
@@ -202,9 +202,9 @@ class VOCMetric(Metric):
 
         if bboxes.shape == (1, 1) or bboxes is None:
             return
-        gt_boxes = inputs['gt_bbox'].numpy()
-        gt_labels = inputs['gt_class'].numpy()
-        difficults = inputs['difficult'].numpy() if not self.evaluate_difficult \
+        gt_boxes = inputs['gt_bbox']
+        gt_labels = inputs['gt_class']
+        difficults = inputs['difficult'] if not self.evaluate_difficult \
                             else None
 
         scale_factor = inputs['scale_factor'].numpy(
@@ -212,13 +212,13 @@ class VOCMetric(Metric):
             (gt_boxes.shape[0], 2)).astype('float32')
 
         bbox_idx = 0
-        for i in range(gt_boxes.shape[0]):
-            gt_box = gt_boxes[i]
+        for i in range(len(gt_boxes)):
+            gt_box = gt_boxes[i].numpy()
             h, w = scale_factor[i]
             gt_box = gt_box / np.array([w, h, w, h])
-            gt_label = gt_labels[i]
+            gt_label = gt_labels[i].numpy()
             difficult = None if difficults is None \
-                            else difficults[i]
+                            else difficults[i].numpy()
             bbox_num = bbox_lengths[i]
             bbox = bboxes[bbox_idx:bbox_idx + bbox_num]
             score = scores[bbox_idx:bbox_idx + bbox_num]