Commit b00deb54 authored by Kaipeng Deng, committed by GitHub

Add Yolov3 model based on PascalVOC and add VOC metrics in Python. (#2801)

* add voc_eval and yolo_darknet_voc

* add yolov3_darknet_voc in MODEL_ZOO

* fix default im_size

* fix MODEL_ZOO note

* fix is_bbox_normalized

* extract map to map_utils.py

* update yolov3_darknet_voc mixup

* add yolov3_r34_voc

* add yolov3_mobilenet_v1_voc

* fix drop empty in VAL mode

* use cfg.num_classes

* assert metric valid

* allow difficult to be None

* add comment for bbox_eval

* num_classes in retinanet
Parent 2a5fd326
......@@ -10,6 +10,7 @@ log_smooth_window: 20
snapshot_iter: 10000
metric: COCO
save_dir: output
num_classes: 81
RetinaNet:
backbone: ResNet
......@@ -38,7 +39,6 @@ RetinaHead:
prior_prob: 0.01
base_scale: 4
num_scales_per_octave: 3
num_classes: 81
anchor_generator:
aspect_ratios: [1.0, 2.0, 0.5]
variance: [1.0, 1.0, 1.0, 1.0]
......
......@@ -10,6 +10,7 @@ log_smooth_window: 20
snapshot_iter: 10000
metric: COCO
save_dir: output
num_classes: 81
RetinaNet:
backbone: ResNet
......@@ -38,7 +39,6 @@ RetinaHead:
prior_prob: 0.01
base_scale: 4
num_scales_per_octave: 3
num_classes: 81
anchor_generator:
aspect_ratios: [1.0, 2.0, 0.5]
variance: [1.0, 1.0, 1.0, 1.0]
......
......@@ -30,6 +30,7 @@ SSD:
MobileNet:
norm_decay: 0.
conv_group_scale: 1
conv_learning_rate: 0.1
extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]]
with_extra_blocks: true
......
......@@ -10,6 +10,7 @@ snapshot_iter: 2000
metric: COCO
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar
weights: output/yolov3_darknet/model_final
num_classes: 80
YOLOv3:
backbone: DarkNet
......@@ -35,7 +36,6 @@ YOLOv3Head:
nms_top_k: 1000
normalized: false
score_threshold: 0.01
num_classes: 80
LearningRate:
base_lr: 0.001
......
architecture: YOLOv3
train_feed: YoloTrainFeed
eval_feed: YoloEvalFeed
test_feed: YoloTestFeed
use_gpu: true
max_iters: 70000
log_smooth_window: 20
save_dir: output
snapshot_iter: 2000
metric: VOC
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar
weights: output/yolov3_darknet_voc/model_final
num_classes: 20
YOLOv3:
backbone: DarkNet
yolo_head: YOLOv3Head
DarkNet:
norm_type: sync_bn
norm_decay: 0.
depth: 53
YOLOv3Head:
anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
anchors: [[10, 13], [16, 30], [33, 23],
[30, 61], [62, 45], [59, 119],
[116, 90], [156, 198], [373, 326]]
norm_decay: 0.
ignore_thresh: 0.7
label_smooth: false
nms:
background_label: -1
keep_top_k: 100
nms_threshold: 0.45
nms_top_k: 1000
normalized: false
score_threshold: 0.01
LearningRate:
base_lr: 0.001
schedulers:
- !PiecewiseDecay
gamma: 0.1
milestones:
- 55000
- 62000
- !LinearWarmup
start_factor: 0.
steps: 1000
OptimizerBuilder:
optimizer:
momentum: 0.9
type: Momentum
regularizer:
factor: 0.0005
type: L2
YoloTrainFeed:
batch_size: 8
dataset:
dataset_dir: dataset/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
num_workers: 8
bufsize: 128
use_process: true
mixup_epoch: 250
YoloEvalFeed:
batch_size: 8
dataset:
dataset_dir: dataset/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
YoloTestFeed:
batch_size: 1
dataset:
use_default_label: true
......@@ -10,6 +10,7 @@ snapshot_iter: 2000
metric: COCO
pretrain_weights: http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar
weights: output/yolov3_mobilenet_v1/model_final
num_classes: 80
YOLOv3:
backbone: MobileNet
......@@ -36,7 +37,6 @@ YOLOv3Head:
nms_top_k: 1000
normalized: false
score_threshold: 0.01
num_classes: 80
LearningRate:
base_lr: 0.001
......
architecture: YOLOv3
train_feed: YoloTrainFeed
eval_feed: YoloEvalFeed
test_feed: YoloTestFeed
use_gpu: true
max_iters: 70000
log_smooth_window: 20
save_dir: output
snapshot_iter: 2000
metric: VOC
pretrain_weights: http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar
weights: output/yolov3_mobilenet_v1_voc/model_final
num_classes: 20
YOLOv3:
backbone: MobileNet
yolo_head: YOLOv3Head
MobileNet:
norm_type: sync_bn
norm_decay: 0.
conv_group_scale: 1
with_extra_blocks: false
YOLOv3Head:
anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
anchors: [[10, 13], [16, 30], [33, 23],
[30, 61], [62, 45], [59, 119],
[116, 90], [156, 198], [373, 326]]
norm_decay: 0.
ignore_thresh: 0.7
label_smooth: false
nms:
background_label: -1
keep_top_k: 100
nms_threshold: 0.45
nms_top_k: 1000
normalized: false
score_threshold: 0.01
LearningRate:
base_lr: 0.001
schedulers:
- !PiecewiseDecay
gamma: 0.1
milestones:
- 55000
- 62000
- !LinearWarmup
start_factor: 0.
steps: 1000
OptimizerBuilder:
optimizer:
momentum: 0.9
type: Momentum
regularizer:
factor: 0.0005
type: L2
YoloTrainFeed:
batch_size: 8
dataset:
dataset_dir: dataset/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
num_workers: 8
bufsize: 128
use_process: true
mixup_epoch: 250
YoloEvalFeed:
batch_size: 8
dataset:
dataset_dir: dataset/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
YoloTestFeed:
batch_size: 1
dataset:
use_default_label: true
......@@ -10,6 +10,7 @@ snapshot_iter: 2000
metric: COCO
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet34_pretrained.tar
weights: output/yolov3_r34/model_final
num_classes: 80
YOLOv3:
backbone: ResNet
......@@ -38,7 +39,6 @@ YOLOv3Head:
nms_top_k: 1000
normalized: false
score_threshold: 0.01
num_classes: 80
LearningRate:
base_lr: 0.001
......
architecture: YOLOv3
train_feed: YoloTrainFeed
eval_feed: YoloEvalFeed
test_feed: YoloTestFeed
use_gpu: true
max_iters: 70000
log_smooth_window: 20
save_dir: output
snapshot_iter: 2000
metric: VOC
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet34_pretrained.tar
weights: output/yolov3_r34_voc/model_final
num_classes: 20
YOLOv3:
backbone: ResNet
yolo_head: YOLOv3Head
ResNet:
norm_type: sync_bn
freeze_at: 0
freeze_norm: false
norm_decay: 0.
depth: 34
feature_maps: [3, 4, 5]
YOLOv3Head:
anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
anchors: [[10, 13], [16, 30], [33, 23],
[30, 61], [62, 45], [59, 119],
[116, 90], [156, 198], [373, 326]]
norm_decay: 0.
ignore_thresh: 0.7
label_smooth: false
nms:
background_label: -1
keep_top_k: 100
nms_threshold: 0.45
nms_top_k: 1000
normalized: false
score_threshold: 0.01
LearningRate:
base_lr: 0.001
schedulers:
- !PiecewiseDecay
gamma: 0.1
milestones:
- 55000
- 62000
- !LinearWarmup
start_factor: 0.
steps: 1000
OptimizerBuilder:
optimizer:
momentum: 0.9
type: Momentum
regularizer:
factor: 0.0005
type: L2
YoloTrainFeed:
batch_size: 8
dataset:
dataset_dir: dataset/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
num_workers: 8
bufsize: 128
use_process: true
mixup_epoch: 250
YoloEvalFeed:
batch_size: 8
dataset:
dataset_dir: dataset/voc
annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt
image_dir: VOCdevkit/VOC_all/JPEGImages
use_default_label: true
YoloTestFeed:
batch_size: 1
dataset:
use_default_label: true
......@@ -69,6 +69,20 @@ The backbone models pretrained on ImageNet are available. All backbone models ar
| ResNet34 | 416 | 8 | 270e | 34.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
| ResNet34 | 320 | 8 | 270e | 31.4 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) |
### Yolo v3 on Pascal VOC
| Backbone | Size | Image/gpu | Lr schd | Box AP | Download |
| :----------- | :--: | :-----: | :-----: | :----: | :-------: |
| DarkNet53 | 608 | 8 | 270e | 83.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) |
| DarkNet53 | 416 | 8 | 270e | 83.6 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) |
| DarkNet53 | 320 | 8 | 270e | 82.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) |
| MobileNet-V1 | 608 | 8 | 270e | 76.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
| MobileNet-V1 | 416 | 8 | 270e | 76.7 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
| MobileNet-V1 | 320 | 8 | 270e | 75.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) |
| ResNet34 | 608 | 8 | 270e | 82.6 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) |
| ResNet34 | 416 | 8 | 270e | 81.9 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) |
| ResNet34 | 320 | 8 | 270e | 80.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) |
**NOTE**: Yolo v3 is trained on 8 GPUs with a total batch size of 64 for 270 epochs. Yolo v3 training data augmentations: mixup,
random color distortion, random cropping, random expansion, random interpolation method, random flipping. Yolo v3 uses randomly
reshaped minibatches in training (see the sketch below), so inference can be performed on different image sizes with the same model weights, and we provide evaluation
......
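The "randomly reshaped minibatch" above means one input size is drawn per batch. A minimal sketch of the idea (illustrative only, not the repo's actual `RandomShape` operator; the size list and helper name are assumptions):

```python
import numpy as np
import cv2  # assumed available; any resize routine works

# Illustrative sketch: draw one square size per minibatch and
# resize every image in that batch to it.
CANDIDATE_SIZES = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]

def random_shape_batch(images):
    """images: list of HxWx3 uint8 arrays -> resized copies."""
    target = int(np.random.choice(CANDIDATE_SIZES))
    return [cv2.resize(im, (target, target)) for im in images]
```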
......@@ -30,7 +30,8 @@ from ppdet.data.transform.operators import (
Permute)
from ppdet.data.transform.arrange_sample import (ArrangeRCNN, ArrangeTestRCNN,
ArrangeSSD, ArrangeTestSSD,
ArrangeYOLO, ArrangeTestYOLO)
ArrangeYOLO, ArrangeEvalYOLO,
ArrangeTestYOLO)
__all__ = [
'PadBatch', 'MultiScale', 'RandomShape', 'DataSet', 'CocoDataSet',
......@@ -891,7 +892,8 @@ class YoloEvalFeed(DataFeed):
def __init__(self,
dataset=CocoDataSet(COCO_VAL_ANNOTATION,
COCO_VAL_IMAGE_DIR).__dict__,
fields=['image', 'im_shape', 'im_id'],
fields=['image', 'im_size', 'im_id', 'gt_box',
'gt_label', 'is_difficult'],
image_shape=[3, 608, 608],
sample_transforms=[
DecodeImage(to_rgb=True),
......@@ -912,7 +914,7 @@ class YoloEvalFeed(DataFeed):
num_workers=8,
num_max_boxes=50,
use_process=False):
sample_transforms.append(ArrangeTestYOLO())
sample_transforms.append(ArrangeEvalYOLO())
super(YoloEvalFeed, self).__init__(
dataset,
fields,
......@@ -926,7 +928,6 @@ class YoloEvalFeed(DataFeed):
with_background=with_background,
num_workers=num_workers,
use_process=use_process)
self.num_max_boxes = num_max_boxes
self.mode = 'VAL'
self.bufsize = 128
......@@ -938,7 +939,7 @@ class YoloTestFeed(DataFeed):
def __init__(self,
dataset=SimpleDataSet(COCO_VAL_ANNOTATION,
COCO_VAL_IMAGE_DIR).__dict__,
fields=['image', 'im_shape', 'im_id'],
fields=['image', 'im_size', 'im_id'],
image_shape=[3, 608, 608],
sample_transforms=[
DecodeImage(to_rgb=True),
......@@ -974,6 +975,5 @@ class YoloTestFeed(DataFeed):
with_background=with_background,
num_workers=num_workers,
use_process=use_process)
self.num_max_boxes = num_max_boxes
self.mode = 'TEST'
self.bufsize = 128
......@@ -63,7 +63,10 @@ class Reader(object):
worker_args = {k.lower(): v for k, v in worker_args.items()}
mapped_ds = map(sc, mapper, worker_args)
batched_ds = batch(mapped_ds, batchsize, drop_last)
# In VAL mode, gt_bbox, gt_label can be empty, and should
# not be dropped
batched_ds = batch(mapped_ds, batchsize, drop_last,
drop_empty=(mode!="VAL"))
trans_conf = {k.lower(): v for k, v in self._trans_conf[mode].items()}
need_keys = {
......
......@@ -105,7 +105,7 @@ def map(ds, mapper, worker_args=None):
return MappedDataset(ds, mapper)
def batch(ds, batchsize, drop_last=False):
def batch(ds, batchsize, drop_last=False, drop_empty=True):
"""
Batch data samples to batches
Args:
......@@ -116,7 +116,10 @@ def batch(ds, batchsize, drop_last=False):
a batched dataset
"""
return BatchedDataset(ds, batchsize, drop_last=drop_last)
return BatchedDataset(ds,
batchsize,
drop_last=drop_last,
drop_empty=drop_empty)
def batch_map(ds, config):
......
......@@ -228,10 +228,44 @@ class ArrangeYOLO(BaseOperator):
return outs
@register_op
class ArrangeEvalYOLO(BaseOperator):
"""
Transform dict to the tuple format needed for evaluation.
"""
def __init__(self):
super(ArrangeEvalYOLO, self).__init__()
def __call__(self, sample, context=None):
"""
Args:
sample: a dict which contains image
info and annotation info.
context: a dict which contains additional info.
Returns:
sample: a tuple containing the following items:
(image, im_shape, im_id, gt_bbox,
gt_class, difficult)
"""
im = sample['image']
if len(sample['gt_bbox']) != len(sample['gt_class']):
raise ValueError("gt num mismatch: bbox and class.")
im_id = sample['im_id']
h = sample['h']
w = sample['w']
im_shape = np.array((h, w))
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
difficult = sample['difficult']
outs = (im, im_shape, im_id, gt_bbox, gt_class, difficult)
return outs
@register_op
class ArrangeTestYOLO(BaseOperator):
"""
Transform dict to the tuple format needed for training.
Transform dict to the tuple format needed for inference.
"""
def __init__(self):
......
......@@ -66,12 +66,14 @@ class BatchedDataset(ProxiedDataset):
ds (instance of Dataset): dataset to be batched
batchsize (int): sample number for each batch
drop_last (bool): drop the last samples when not enough for one batch
drop_empty (bool): drop samples that have an empty field
"""
def __init__(self, ds, batchsize, drop_last=False):
def __init__(self, ds, batchsize, drop_last=False, drop_empty=True):
super(BatchedDataset, self).__init__(ds)
self._batchsz = batchsize
self._drop_last = drop_last
self._drop_empty = drop_empty
def next(self):
"""proxy to self._ds.next"""
......@@ -95,7 +97,7 @@ class BatchedDataset(ProxiedDataset):
for _ in range(self._batchsz):
try:
out = self._ds.next()
while has_empty(out):
while self._drop_empty and has_empty(out):
out = self._ds.next()
batch.append(out)
except StopIteration:
......
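To make the VAL-mode behavior concrete, here is a self-contained toy (not the repo's actual `Dataset` classes) showing what `drop_empty` changes:

```python
# Toy sketch of drop_empty semantics; names and data are invented.
def has_empty(sample):
    # mimic ppdet's check: any zero-length field counts as empty
    return any(hasattr(f, '__len__') and len(f) == 0 for f in sample)

samples = [('img0', [0.1]), ('img1', []), ('img2', [0.3])]

def make_batches(samples, batchsize, drop_empty=True):
    batches, batch = [], []
    for s in samples:
        if drop_empty and has_empty(s):
            continue  # TRAIN mode: skip samples whose gt fields are empty
        batch.append(s)
        if len(batch) == batchsize:
            batches.append(batch)
            batch = []
    if batch:
        batches.append(batch)
    return batches

print(len(make_batches(samples, 2)))                    # 1 -- 'img1' dropped
print(len(make_batches(samples, 2, drop_empty=False)))  # 2 -- VAL keeps 'img1'
```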
......@@ -41,6 +41,7 @@ class YOLOv3Head(object):
nms (object): an instance of `MultiClassNMS`
"""
__inject__ = ['nms']
__shared__ = ['num_classes']
def __init__(self,
norm_decay=0.,
......@@ -277,13 +278,13 @@ class YOLOv3Head(object):
return sum(losses)
def get_prediction(self, input, im_shape):
def get_prediction(self, input, im_size):
"""
Get prediction result of YOLOv3 network
Args:
input (list): List of Variables, output of backbone stages
im_shape (Variable): Variable of shape([h, w]) of each image
im_size (Variable): Variable of size([h, w]) of each image
Returns:
pred (Variable): The prediction result after non-max suppress.
......@@ -298,7 +299,7 @@ class YOLOv3Head(object):
for i, output in enumerate(outputs):
box, score = fluid.layers.yolo_box(
x=output,
img_size=im_shape,
img_size=im_size,
anchors=self.mask_anchors[i],
class_num=self.num_classes,
conf_thresh=self.nms.score_threshold,
......
......@@ -59,8 +59,8 @@ class YOLOv3(object):
gt_score)
}
else:
im_shape = feed_vars['im_shape']
return self.yolo_head.get_prediction(body_feats, im_shape)
im_size = feed_vars['im_size']
return self.yolo_head.get_prediction(body_feats, im_size)
def train(self, feed_vars):
return self.build(feed_vars, mode='train')
......
......@@ -42,12 +42,14 @@ class MobileNet(object):
norm_type='bn',
norm_decay=0.,
conv_group_scale=1,
conv_learning_rate=1.0,
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256],
[64, 128]]):
self.norm_type = norm_type
self.norm_decay = norm_decay
self.conv_group_scale = conv_group_scale
self.conv_learning_rate = conv_learning_rate
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
......@@ -62,7 +64,7 @@ class MobileNet(object):
use_cudnn=True,
name=None):
parameter_attr = ParamAttr(
learning_rate=0.1,
learning_rate=self.conv_learning_rate,
initializer=fluid.initializer.MSRA(),
name=name + "_weights")
conv = fluid.layers.conv2d(
......
......@@ -32,7 +32,8 @@ feed_var_def = [
{'name': 'gt_mask', 'shape': [2], 'dtype': 'float32', 'lod_level': 3},
{'name': 'is_difficult', 'shape': [1], 'dtype': 'int32', 'lod_level': 1},
{'name': 'gt_score', 'shape': [1], 'dtype': 'float32', 'lod_level': 0},
{'name': 'im_shape', 'shape': [3], 'dtype': 'float32', 'lod_level': 0},
{'name': 'im_shape', 'shape': [3], 'dtype': 'float32', 'lod_level': 0},
{'name': 'im_size', 'shape': [2], 'dtype': 'int32', 'lod_level': 0},
]
# yapf: enable
......@@ -47,7 +48,8 @@ def create_feed(feed, use_pyreader=True):
'lod_level': 0
}
# YOLO var dim is fixed
# tensor padding with 0 is used instead of LoD tensor when
# num_max_boxes is set
if getattr(feed, 'num_max_boxes', None) is not None:
feed_var_map['gt_label']['shape'] = [feed.num_max_boxes]
feed_var_map['gt_score']['shape'] = [feed.num_max_boxes]
......@@ -55,8 +57,6 @@ def create_feed(feed, use_pyreader=True):
feed_var_map['gt_label']['lod_level'] = 0
feed_var_map['gt_score']['lod_level'] = 0
feed_var_map['gt_box']['lod_level'] = 0
feed_var_map['im_shape']['shape'] = [2]
feed_var_map['im_shape']['dtype'] = 'int32'
feed_vars = OrderedDict([(key, fluid.layers.data(
name=feed_var_map[key]['name'],
......
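The new comment on padding can be made concrete with a small sketch (shapes follow the `gt_box` feed var above; the box values are invented):

```python
import numpy as np

# Illustrative: with num_max_boxes set, per-image gt boxes are padded
# with zeros to a fixed [num_max_boxes, 4] tensor (lod_level 0) instead
# of being packed into a variable-length LoD tensor.
num_max_boxes = 50
gt = np.array([[10, 10, 50, 50], [20, 20, 40, 60]], dtype=np.float32)
padded = np.zeros((num_max_boxes, 4), dtype=np.float32)
padded[:len(gt)] = gt  # rows beyond the real boxes stay all-zero
```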
......@@ -21,6 +21,8 @@ import numpy as np
import paddle.fluid as fluid
from ppdet.utils.voc_eval import bbox_eval as voc_bbox_eval
__all__ = ['parse_fetches', 'eval_run', 'eval_results']
logger = logging.getLogger(__name__)
......@@ -88,7 +90,13 @@ def eval_run(exe, compile_program, pyreader, keys, values, cls):
return results
def eval_results(results, feed, metric, resolution=None, output_file=None):
def eval_results(results,
feed,
metric,
num_classes,
resolution=None,
is_bbox_normalized=False,
output_file=None):
"""Evaluation for evaluation program results"""
if metric == 'COCO':
from ppdet.utils.coco_eval import proposal_eval, bbox_eval, mask_eval
......@@ -110,5 +118,9 @@ def eval_results(results, feed, metric, resolution=None, output_file=None):
output = '{}_mask.json'.format(output_file)
mask_eval(results, anno_file, output, resolution)
else:
res = np.mean(results[-1]['accum_map'][0])
logger.info('Test mAP: {}'.format(res))
if 'accum_map' in results[-1]:
res = np.mean(results[-1]['accum_map'][0])
logger.info('mAP: {:.2f}'.format(res * 100.))
elif 'bbox' in results[0]:
voc_bbox_eval(results, num_classes,
is_bbox_normalized=is_bbox_normalized)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import sys
import numpy as np
import logging
logger = logging.getLogger(__name__)
__all__ = [
'bbox_area', 'jaccard_overlap', 'DetectionMAP'
]
def bbox_area(bbox, is_bbox_normalized):
"""
Calculate area of a bounding box
"""
norm = 1. - float(is_bbox_normalized)
width = bbox[2] - bbox[0] + norm
height = bbox[3] - bbox[1] + norm
return width * height
def jaccard_overlap(pred, gt, is_bbox_normalized=False):
"""
Calculate jaccard overlap ratio between two bounding boxes
"""
if pred[0] >= gt[2] or pred[2] <= gt[0] or \
pred[1] >= gt[3] or pred[3] <= gt[1]:
return 0.
inter_xmin = max(pred[0], gt[0])
inter_ymin = max(pred[1], gt[1])
inter_xmax = min(pred[2], gt[2])
inter_ymax = min(pred[3], gt[3])
inter_size = bbox_area([inter_xmin, inter_ymin,
inter_xmax, inter_ymax],
is_bbox_normalized)
pred_size = bbox_area(pred, is_bbox_normalized)
gt_size = bbox_area(gt, is_bbox_normalized)
overlap = float(inter_size) / (
pred_size + gt_size - inter_size)
return overlap
class DetectionMAP(object):
"""
Calculate detection mean average precision.
Currently supports two types: 11point and integral
Args:
class_num (int): the class number.
overlap_thresh (float): The threshold of overlap
ratio between prediction bounding box and
ground truth bounding box for deciding
true/false positive. Default 0.5.
map_type (str): calculation method of mean average
precision, currently support '11point' and
'integral'. Default '11point'.
is_bbox_normalized (bool): whether bounding boxes
are normalized to range [0, 1]. Default False.
evaluate_difficult (bool): whether to evaluate
difficult bounding boxes. Default False.
"""
def __init__(self,
class_num,
overlap_thresh=0.5,
map_type='11point',
is_bbox_normalized=False,
evaluate_difficult=False):
self.class_num = class_num
self.overlap_thresh = overlap_thresh
assert map_type in ['11point', 'integral'], \
"map_type currently only support '11point' "\
"and 'integral'"
self.map_type = map_type
self.is_bbox_normalized = is_bbox_normalized
self.evaluate_difficult = evaluate_difficult
self.reset()
def update(self, bbox, gt_box, gt_label, difficult=None):
"""
Update metric statistics from given prediction and ground
truth information.
"""
if difficult is None:
difficult = np.zeros_like(gt_label)
# record class gt count
for gtl, diff in zip(gt_label, difficult):
if self.evaluate_difficult or int(diff) == 0:
self.class_gt_counts[int(gtl[0])] += 1
# record class score positive
visited = [False] * len(gt_label)
for b in bbox:
label, score, xmin, ymin, xmax, ymax = b.tolist()
pred = [xmin, ymin, xmax, ymax]
max_idx = -1
max_overlap = -1.0
for i, gl in enumerate(gt_label):
if int(gl) == int(label):
overlap = jaccard_overlap(pred, gt_box[i],
self.is_bbox_normalized)
if overlap > max_overlap:
max_overlap = overlap
max_idx = i
if max_overlap > self.overlap_thresh:
if self.evaluate_difficult or \
int(difficult[max_idx]) == 0:
if not visited[max_idx]:
self.class_score_poss[
int(label)].append([score, 1.0])
visited[max_idx] = True
else:
self.class_score_poss[
int(label)].append([score, 0.0])
else:
self.class_score_poss[
int(label)].append([score, 0.0])
def reset(self):
"""
Reset metric statistics
"""
self.class_score_poss = [[] for _ in range(self.class_num)]
self.class_gt_counts = [0] * self.class_num
self.mAP = None
def accumulate(self):
"""
Accumulate metric results and calculate mAP
"""
mAP = 0.
valid_cnt = 0
for score_pos, count in zip(self.class_score_poss,
self.class_gt_counts):
if count == 0 or len(score_pos) == 0:
continue
accum_tp_list, accum_fp_list = \
self._get_tp_fp_accum(score_pos)
precision = []
recall = []
for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):
precision.append(float(ac_tp) / (ac_tp + ac_fp))
recall.append(float(ac_tp) / count)
if self.map_type == '11point':
max_precisions = [0.] * 11
start_idx = len(precision) - 1
for j in range(10, -1, -1):
for i in range(start_idx, -1, -1):
if recall[i] < float(j) / 10.:
start_idx = i
if j > 0:
max_precisions[j - 1] = max_precisions[j]
break
else:
if max_precisions[j] < precision[i]:
max_precisions[j] = precision[i]
mAP += sum(max_precisions) / 11.
valid_cnt += 1
elif self.map_type == 'integral':
import math
ap = 0.
prev_recall = 0.
for i in range(len(precision)):
recall_gap = math.fabs(recall[i] - prev_recall)
if recall_gap > 1e-6:
ap += precision[i] * recall_gap
prev_recall = recall[i]
mAP += ap
valid_cnt += 1
else:
logger.error("Unspported mAP type {}".format(map_type))
sys.exit(1)
self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP
def get_map(self):
"""
Get mAP result
"""
if self.mAP is None:
logger.error("mAP is not calculated.")
return self.mAP
def _get_tp_fp_accum(self, score_pos_list):
"""
Calculate accumulated true/false positive results from
[score, pos] records
"""
sorted_list = sorted(score_pos_list,
key=lambda s: s[0],
reverse=True)
accum_tp = 0
accum_fp = 0
accum_tp_list = []
accum_fp_list = []
for (score, pos) in sorted_list:
accum_tp += int(pos)
accum_tp_list.append(accum_tp)
accum_fp += 1 - int(pos)
accum_fp_list.append(accum_fp)
return accum_tp_list, accum_fp_list
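A minimal usage sketch of the new `DetectionMAP` class with dummy boxes. For `map_type='11point'`, AP is the mean of interpolated precisions max_{r' >= r} p(r') at recalls r = 0, 0.1, ..., 1.0. The import path is an assumption based on the `from .map_utils import DetectionMAP` line in voc_eval.py below:

```python
import numpy as np
from ppdet.utils.map_utils import DetectionMAP  # assumed module location

# Two classes, one image. bbox rows: [label, score, xmin, ymin, xmax, ymax]
metric = DetectionMAP(class_num=2, overlap_thresh=0.5, map_type='11point')

bbox = np.array([[0, 0.8, 10, 10, 50, 50],
                 [1, 0.7, 60, 60, 90, 90]], dtype=np.float32)
gt_box = np.array([[12, 12, 48, 48],
                   [55, 55, 95, 95]], dtype=np.float32)
gt_label = np.array([[0], [1]], dtype=np.int32)

metric.update(bbox, gt_box, gt_label)  # difficult defaults to all zeros
metric.accumulate()
print(metric.get_map())                # 1.0 here: each det matches its gt
```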
......@@ -22,16 +22,83 @@ import sys
import numpy as np
from ..data.source.voc_loader import pascalvoc_label
from .map_utils import DetectionMAP
from .coco_eval import bbox2out
import logging
logger = logging.getLogger(__name__)
__all__ = [
'bbox2out', 'get_category_info'
'bbox_eval', 'bbox2out', 'get_category_info'
]
def bbox_eval(results,
class_num,
overlap_thresh=0.5,
map_type='11point',
is_bbox_normalized=False,
evaluate_difficult=False):
"""
Bounding box evaluation for VOC dataset
Args:
results (list): prediction bounding box results.
class_num (int): evaluation class number.
overlap_thresh (float): the positive threshold of
bbox overlap
map_type (string): method for mAP calculation,
can only be '11point' or 'integral'
is_bbox_normalized (bool): whether bbox is normalized
to range [0, 1].
evaluate_difficult (bool): whether to evaluate
difficult gt bbox.
"""
assert 'bbox' in results[0]
logger.info("Start evaluate...")
detection_map = DetectionMAP(class_num=class_num,
overlap_thresh=overlap_thresh,
map_type=map_type,
is_bbox_normalized=is_bbox_normalized,
evaluate_difficult=evaluate_difficult)
for t in results:
bboxes = t['bbox'][0]
bbox_lengths = t['bbox'][1][0]
if bboxes is None or bboxes.shape == (1, 1):
continue
gt_boxes = t['gt_box'][0]
gt_box_lengths = t['gt_box'][1][0]
gt_labels = t['gt_label'][0]
assert len(gt_boxes) == len(gt_labels)
difficults = t['is_difficult'][0] if not evaluate_difficult \
else None
if not evaluate_difficult:
assert len(gt_labels) == len(difficults)
bbox_idx = 0
gt_box_idx = 0
for i in range(len(bbox_lengths)):
bbox_num = bbox_lengths[i]
gt_box_num = gt_box_lengths[i]
bbox = bboxes[bbox_idx: bbox_idx + bbox_num]
gt_box = gt_boxes[gt_box_idx: gt_box_idx + gt_box_num]
gt_label = gt_labels[gt_box_idx: gt_box_idx + gt_box_num]
difficult = None if difficults is None else \
difficults[gt_box_idx: gt_box_idx + gt_box_num]
detection_map.update(bbox, gt_box, gt_label, difficult)
bbox_idx += bbox_num
gt_box_idx += gt_box_num
logger.info("Accumulating evaluatation results...")
detection_map.accumulate()
logger.info("mAP({:.2f}, {}) = {:.2f}".format(overlap_thresh,
map_type, 100. * detection_map.get_map()))
def get_category_info(anno_file=None,
with_background=True,
use_default_label=False):
......
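The `results` argument to `bbox_eval` is a list of fetch dicts whose values are `(ndarray, LoD)` pairs. A dummy single-batch example (all coordinates and scores invented) that exercises the slicing above:

```python
import logging
import numpy as np
from ppdet.utils.voc_eval import bbox_eval

logging.basicConfig(level=logging.INFO)  # so the mAP log line shows

# One fetched batch of two images; every value is a (ndarray, LoD) pair.
results = [{
    'bbox': (np.array([[0, 0.9, 10, 10, 50, 50],    # image 1: 1 det
                       [1, 0.6, 20, 20, 40, 60],    # image 2: 2 dets
                       [1, 0.3, 25, 25, 45, 65]], dtype=np.float32),
             [[1, 2]]),                             # dets per image
    'gt_box': (np.array([[12, 12, 48, 48],
                         [22, 22, 42, 62]], dtype=np.float32),
               [[1, 1]]),                           # gt boxes per image
    'gt_label': (np.array([[0], [1]], dtype=np.int32), [[1, 1]]),
    'is_difficult': (np.array([[0], [0]], dtype=np.int32), [[1, 1]]),
}]

bbox_eval(results, class_num=20)  # logs mAP(0.50, 11point) = 100.00
```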
......@@ -93,18 +93,29 @@ def main():
if 'weights' in cfg:
checkpoint.load_pretrain(exe, eval_prog, cfg.weights)
assert cfg.metric in ['COCO', 'VOC'], \
"unknown metric type {}".format(cfg.metric)
extra_keys = []
if 'metric' in cfg and cfg.metric == 'COCO':
if cfg.metric == 'COCO':
extra_keys = ['im_info', 'im_id', 'im_shape']
if cfg.metric == 'VOC':
extra_keys = ['gt_box', 'gt_label', 'is_difficult']
keys, values, cls = parse_fetches(fetches, eval_prog, extra_keys)
# whether output bbox is normalized in model output layer
is_bbox_normalized = False
if hasattr(model, 'is_bbox_normalized') and \
callable(model.is_bbox_normalized):
is_bbox_normalized = model.is_bbox_normalized()
results = eval_run(exe, compile_program, pyreader, keys, values, cls)
# evaluation
resolution = None
if 'mask' in results[0]:
resolution = model.mask_head.resolution
eval_results(results, eval_feed, cfg.metric, resolution, FLAGS.output_file)
eval_results(results, eval_feed, cfg.metric, cfg.num_classes,
resolution, is_bbox_normalized, FLAGS.output_file)
if __name__ == '__main__':
......
......@@ -169,6 +169,8 @@ def main():
save_infer_model(FLAGS, exe, feed_vars, test_fetches, infer_prog)
# parse infer fetches
assert cfg.metric in ['COCO', 'VOC'], \
"unknown metric type {}".format(cfg.metric)
extra_keys = []
if cfg['metric'] == 'COCO':
extra_keys = ['im_info', 'im_id', 'im_shape']
......
......@@ -156,6 +156,12 @@ def main():
elif cfg.pretrain_weights:
checkpoint.load_pretrain(exe, train_prog, cfg.pretrain_weights)
# whether output bbox is normalized in model output layer
is_bbox_normalized = False
if hasattr(model, 'is_bbox_normalized') and \
callable(model.is_bbox_normalized):
is_bbox_normalized = model.is_bbox_normalized()
train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
train_pyreader.start()
start_time = time.time()
......@@ -191,8 +197,8 @@ def main():
resolution = None
if 'mask' in results[0]:
resolution = model.mask_head.resolution
eval_results(results, eval_feed, cfg.metric, resolution,
FLAGS.output_file)
eval_results(results, eval_feed, cfg.metric, cfg.num_classes,
resolution, is_bbox_normalized, FLAGS.output_file)
train_pyreader.reset()
......