Unverified · Commit 356bb7e2 authored by: Kaipeng Deng, committed by: GitHub

add VOC visualize (#2547)

* add VOC visualize

* fix ssd_mobilenet_v1_voc.yml

* use default label

* clean TestFeed dataset config

* fix voc default label

* fix format

* fix as per review

* revert voc default

* use default label for all

* enable batch size != 1
Parent 32e54ca0
......@@ -135,7 +135,6 @@ FasterRCNNTestFeed:
pad_to_stride: 32
dataset:
annotation: annotations/instances_val2017.json
image_dir: val2017
drop_last: false
num_workers: 2
shuffle: false
......@@ -112,6 +112,4 @@ FasterRCNNEvalFeed:
FasterRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -135,8 +135,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -135,8 +135,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -136,8 +136,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -136,8 +136,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -112,6 +112,4 @@ FasterRCNNEvalFeed:
FasterRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -112,6 +112,4 @@ FasterRCNNEvalFeed:
FasterRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -135,7 +135,6 @@ FasterRCNNTestFeed:
pad_to_stride: 32
dataset:
annotation: annotations/instances_val2017.json
image_dir: val2017
drop_last: false
num_workers: 2
shuffle: false
......@@ -135,7 +135,6 @@ FasterRCNNTestFeed:
pad_to_stride: 32
dataset:
annotation: coco/annotations/instances_val2017.json
image_dir: coco/val2017
drop_last: false
num_workers: 2
shuffle: false
......@@ -114,6 +114,4 @@ FasterRCNNEvalFeed:
FasterRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -136,8 +136,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -121,8 +121,6 @@ FasterRCNNEvalFeed:
FasterRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -138,8 +138,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -138,8 +138,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -137,8 +137,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -137,8 +137,6 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
shuffle: False
......@@ -144,8 +144,6 @@ MaskRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
use_padded_im_info: True
......@@ -144,8 +144,6 @@ MaskRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
use_padded_im_info: True
......@@ -125,6 +125,4 @@ MaskRCNNEvalFeed:
MaskRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -126,6 +126,4 @@ MaskRCNNEvalFeed:
MaskRCNNTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -144,8 +144,6 @@ MaskRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
use_padded_im_info: True
......@@ -144,8 +144,6 @@ MaskRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
use_padded_im_info: True
......@@ -147,8 +147,6 @@ MaskRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
use_padded_im_info: True
......@@ -149,8 +149,6 @@ MaskRCNNTestFeed:
- !PadBatch
pad_to_stride: 32
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
num_workers: 2
use_padded_im_info: True
......@@ -148,9 +148,7 @@ FasterRCNNTestFeed:
- !PadBatch
pad_to_stride: 128
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
drop_last: false
image_shape: [3, 1333, 800]
num_workers: 2
......
......@@ -65,6 +65,7 @@ SSDTrainFeed:
dataset_dir: data/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
SSDEvalFeed:
batch_size: 64
......@@ -73,14 +74,11 @@ SSDEvalFeed:
dataset_dir: data/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: false
use_default_label: true
drop_last: false
SSDTestFeed:
batch_size: 1
dataset:
dataset_dir: data/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/test.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: false
use_default_label: true
drop_last: false
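
Setting use_default_label: true here makes the VOC tooling fall back to the built-in PASCAL VOC label map instead of reading labels from a label file, as the get_category_info changes further down suggest. A minimal sketch of that built-in map (import path taken from the new voc_eval.py below):

from ppdet.data.source.voc_loader import pascalvoc_label

labels = pascalvoc_label()
# {'aeroplane': 1, 'bicycle': 2, ..., 'tvmonitor': 20}
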
......@@ -77,6 +77,4 @@ YoloEvalFeed:
YoloTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -78,6 +78,4 @@ YoloEvalFeed:
YoloTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -80,6 +80,4 @@ YoloEvalFeed:
YoloTestFeed:
batch_size: 1
dataset:
dataset_dir: data/coco
annotation: annotations/instances_val2017.json
image_dir: val2017
......@@ -781,7 +781,7 @@ class SSDTestFeed(DataFeed):
def __init__(self,
dataset=SimpleDataSet(VOC_TEST_ANNOTATION).__dict__,
fields=['image'],
fields=['image', 'im_id'],
image_shape=[3, 300, 300],
sample_transforms=[
DecodeImage(to_rgb=True),
......
......@@ -243,25 +243,25 @@ def load(anno_path, sample_num=-1, use_default_label=True):
def pascalvoc_label():
labels_map = {
'aeroplane': 1,
'bicycle': 2,
'bird': 3,
'boat': 4,
'bottle': 5,
'bus': 6,
'car': 7,
'cat': 8,
'chair': 9,
'cow': 10,
'diningtable': 11,
'dog': 12,
'horse': 13,
'motorbike': 14,
'person': 15,
'pottedplant': 16,
'sheep': 17,
'sofa': 18,
'train': 19,
'tvmonitor': 20
'aeroplane': 1,
'bicycle': 2,
'bird': 3,
'boat': 4,
'bottle': 5,
'bus': 6,
'car': 7,
'cat': 8,
'chair': 9,
'cow': 10,
'diningtable': 11,
'dog': 12,
'horse': 13,
'motorbike': 14,
'person': 15,
'pottedplant': 16,
'sheep': 17,
'sofa': 18,
'train': 19,
'tvmonitor': 20
}
return labels_map
......@@ -109,7 +109,7 @@ def dump_voc_as_pickle(args):
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_dir = args.save_dir
anno_path = args.annotation
anno_path = os.path.expanduser(args.annotation)
roidb, cat2id = loader.load(
anno_path, samples, with_cat2id=True, use_default_label=None)
samples = len(roidb)
......
......@@ -183,7 +183,8 @@ class ArrangeTestSSD(BaseOperator):
sample: a tuple containing the following items: (image)
"""
im = sample['image']
outs = (im)
im_id = sample['im_id']
outs = (im, im_id)
return outs
......
......@@ -31,7 +31,7 @@ feed_var_def = [
{'name': 'is_crowd', 'shape': [1], 'dtype': 'int32', 'lod_level': 1},
{'name': 'gt_mask', 'shape': [2], 'dtype': 'float32', 'lod_level': 3},
{'name': 'is_difficult', 'shape': [1], 'dtype': 'int32', 'lod_level': 1},
{'name': 'gt_score', 'shape': None, 'dtype': 'float32', 'lod_level': 0},
{'name': 'gt_score', 'shape': [1], 'dtype': 'float32', 'lod_level': 0},
{'name': 'im_shape', 'shape': [3], 'dtype': 'float32', 'lod_level': 0},
]
# yapf: enable
......
......@@ -34,6 +34,14 @@ __all__ = [
]
def clip_bbox(bbox):
xmin = max(min(bbox[0], 1.), 0.)
ymin = max(min(bbox[1], 1.), 0.)
xmax = max(min(bbox[2], 1.), 0.)
ymax = max(min(bbox[3], 1.), 0.)
return xmin, ymin, xmax, ymax
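
A quick illustration of the new helper (the input values are hypothetical, not part of the commit): clip_bbox clamps each normalized coordinate into [0, 1] before widths and heights are computed.

# assumes clip_bbox as defined above
print(clip_bbox([-0.05, 0.20, 1.30, 0.85]))
# (0.0, 0.2, 1.0, 0.85)
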
def bbox_eval(results, anno_file, outfile, with_background=True):
assert 'bbox' in results[0]
assert outfile.endswith('.json')
......@@ -80,7 +88,7 @@ def mask_eval(results, anno_file, outfile, resolution, thresh_binarize=0.5):
coco_ev.summarize()
def bbox2out(results, clsid2catid):
def bbox2out(results, clsid2catid, is_bbox_normalized=False):
xywh_res = []
for t in results:
bboxes = t['bbox'][0]
......@@ -97,8 +105,16 @@ def bbox2out(results, clsid2catid):
dt = bboxes[k]
clsid, score, xmin, ymin, xmax, ymax = dt.tolist()
catid = clsid2catid[clsid]
w = xmax - xmin + 1
h = ymax - ymin + 1
if is_bbox_normalized:
xmin, ymin, xmax, ymax = \
clip_bbox([xmin, ymin, xmax, ymax])
w = xmax - xmin
h = ymax - ymin
else:
w = xmax - xmin + 1
h = ymax - ymin + 1
bbox = [xmin, ymin, w, h]
coco_res = {
'image_id': im_id,
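
To make the two branches above concrete (numbers are made up for illustration): normalized boxes use a plain difference for width and height after clipping, while pixel-space boxes keep the inclusive +1 convention.

xmin, ymin, xmax, ymax = 0.25, 0.25, 0.75, 0.75            # hypothetical normalized box
w, h = xmax - xmin, ymax - ymin                            # 0.5, 0.5

px_xmin, px_ymin, px_xmax, px_ymax = 100, 200, 600, 900    # hypothetical pixel box
w_px, h_px = px_xmax - px_xmin + 1, px_ymax - px_ymin + 1  # 501, 701
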
......@@ -211,8 +227,11 @@ def expand_boxes(boxes, scale):
return boxes_exp
def get_category_info(anno_file=None, with_background=True):
if anno_file is None or not os.path.exists(anno_file):
def get_category_info(anno_file=None,
with_background=True,
use_default_label=False):
if use_default_label or anno_file is None \
or not os.path.exists(anno_file):
logger.info("Not found annotation file {}, load "
"coco17 categories.".format(anno_file))
return coco17_category_info(with_background)
......
......@@ -17,6 +17,7 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import numpy as np
import pycocotools.mask as mask_util
from PIL import Image, ImageDraw
......@@ -25,23 +26,28 @@ from .colormap import colormap
__all__ = ['visualize_results']
logger = logging.getLogger(__name__)
def visualize_results(image,
im_id,
catid2name,
threshold=0.5,
bbox_results=None,
mask_results=None):
mask_results=None,
is_bbox_normalized=False):
"""
Visualize bbox and mask results
"""
if mask_results:
image = draw_mask(image, mask_results, threshold)
image = draw_mask(image, im_id, mask_results, threshold)
if bbox_results:
image = draw_bbox(image, catid2name, bbox_results, threshold)
image = draw_bbox(image, im_id, catid2name, bbox_results,
threshold, is_bbox_normalized)
return image
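
A usage sketch of the updated entry point (the image path and im_id are placeholders; catid2name and bbox_results would come from get_category_info and bbox2out, which is how tools/infer.py wires it up further down):

from PIL import Image

image = Image.open('some_test_image.jpg').convert('RGB')   # placeholder path
image = visualize_results(image, im_id=0, catid2name=catid2name,
                          threshold=0.5, bbox_results=bbox_results,
                          mask_results=None, is_bbox_normalized=True)
image.save('output/some_test_image.jpg')
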
def draw_mask(image, segms, threshold, alpha=0.7):
def draw_mask(image, im_id, segms, threshold, alpha=0.7):
"""
Draw mask on image
"""
......@@ -50,6 +56,8 @@ def draw_mask(image, segms, threshold, alpha=0.7):
w_ratio = .4
img_array = np.array(image).astype('float32')
for dt in np.array(segms):
if im_id != dt['image_id']:
continue
segm, score = dt['segmentation'], dt['score']
if score < threshold:
continue
......@@ -65,18 +73,28 @@ def draw_mask(image, segms, threshold, alpha=0.7):
return Image.fromarray(img_array.astype('uint8'))
def draw_bbox(image, catid2name, bboxes, threshold):
def draw_bbox(image, im_id, catid2name, bboxes, threshold,
is_bbox_normalized=False):
"""
Draw bbox on image
"""
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
for dt in np.array(bboxes):
if im_id != dt['image_id']:
continue
catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']
if score < threshold:
continue
xmin, ymin, w, h = bbox
if is_bbox_normalized:
im_width, im_height = image.size
xmin *= im_width
ymin *= im_height
w *= im_width
h *= im_height
xmax = xmin + w
ymax = ymin + h
draw.line(
......@@ -86,5 +104,7 @@ def draw_bbox(image, catid2name, bboxes, threshold):
fill='red')
if image.mode == 'RGB':
draw.text((xmin, ymin), catid2name[catid], (255, 255, 0))
logger.debug("\t {:15s} at {:25} score: {:.5f}".format(catid2name[catid],
str(list(map(int, list([xmin, ymin, xmax, ymax])))), score))
return image
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import numpy as np
from ..data.source.voc_loader import pascalvoc_label
from .coco_eval import bbox2out
import logging
logger = logging.getLogger(__name__)
__all__ = [
'bbox2out', 'get_category_info'
]
def get_category_info(anno_file=None,
with_background=True,
use_default_label=False):
if use_default_label or anno_file is None \
or not os.path.exists(anno_file):
logger.info("Not found annotation file {}, load "
"voc2012 categories.".format(anno_file))
return vocall_category_info(with_background)
else:
logger.info("Load categories from {}".format(anno_file))
return get_category_info_from_anno(anno_file, with_background)
def get_category_info_from_anno(anno_file, with_background=True):
"""
Get class id to category id map and category id
to category name map from annotation file.
Args:
anno_file (str): annotation file path
with_background (bool, default True):
whether load background as class 0.
"""
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] != 'background' and with_background:
cats.insert(0, 'background')
if cats[0] == 'background' and not with_background:
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def vocall_category_info(with_background=True):
"""
Get class id to category id map and category id
to category name map of mixup voc dataset
Args:
with_background (bool, default True):
whether load background as class 0.
"""
label_map = pascalvoc_label()
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
if with_background:
cats.insert(0, 'background')
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
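
For reference, with the default PASCAL VOC labels this yields 21 categories including background (assuming the new module is importable as ppdet.utils.voc_eval, which is how tools/infer.py below imports it):

from ppdet.utils.voc_eval import vocall_category_info   # assumed module path

clsid2catid, catid2name = vocall_category_info(with_background=True)
# clsid2catid: {0: 0, 1: 1, ..., 20: 20}
# catid2name:  {0: 'background', 1: 'aeroplane', ..., 20: 'tvmonitor'}
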
......@@ -119,18 +119,21 @@ def main():
extra_keys = []
if cfg['metric'] == 'COCO':
extra_keys = ['im_info', 'im_id', 'im_shape']
if cfg['metric'] == 'VOC':
extra_keys = ['im_id']
keys, values, _ = parse_fetches(test_fetches, infer_prog, extra_keys)
# 6. Parse dataset category
if cfg.metric == 'COCO':
from ppdet.utils.coco_eval import bbox2out, mask2out, get_category_info
if cfg.metric == "VOC":
# TODO(dengkaipeng): add VOC metric process
pass
from ppdet.utils.voc_eval import bbox2out, get_category_info
anno_file = getattr(test_feed.dataset, 'annotation', None)
with_background = getattr(test_feed, 'with_background', True)
clsid2catid, catid2name = get_category_info(anno_file, with_background)
use_default_label = getattr(test_feed, 'use_default_label', False)
clsid2catid, catid2name = get_category_info(anno_file, with_background,
use_default_label)
imid2path = reader.imid2path
for iter_id, data in enumerate(reader()):
......@@ -144,27 +147,27 @@ def main():
}
logger.info('Infer iter {}'.format(iter_id))
im_id = int(res['im_id'][0])
image_path = imid2path[im_id]
if cfg.metric == 'COCO':
bbox_results = None
mask_results = None
if 'bbox' in res:
bbox_results = bbox2out([res], clsid2catid)
if 'mask' in res:
mask_results = mask2out([res], clsid2catid,
cfg.MaskHead.resolution)
image = Image.open(image_path)
image = visualize_results(image, catid2name, 0.5,
bbox_results, mask_results)
bbox_results = None
mask_results = None
is_bbox_normalized = True if cfg.metric == 'VOC' else False
if 'bbox' in res:
bbox_results = bbox2out([res], clsid2catid,
is_bbox_normalized)
if 'mask' in res:
mask_results = mask2out([res], clsid2catid,
cfg.MaskHead['resolution'])
# visualize result
im_ids = res['im_id'][0]
for im_id in im_ids:
image_path = imid2path[int(im_id)]
image = Image.open(image_path).convert('RGB')
visualize_results(image, int(im_id), catid2name, 0.5, bbox_results,
mask_results, is_bbox_normalized)
save_name = get_save_image_name(FLAGS.output_dir, image_path)
logger.info("Detection bbox results save in {}".format(save_name))
image.save(save_name)
if cfg.metric == "VOC":
# TODO(dengkaipeng): add VOC metric process
pass
if __name__ == '__main__':
parser = ArgsParser()
......