From 7e172f88fedb72dc20c9b9da6abc214c1997df44 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Mon, 12 Aug 2019 16:14:43 +0800 Subject: [PATCH] [PaddleDetection] Add VGG-SSD on VOC and COCO dataset. (#3037) # Add VGG-SSD on VOC and COCO dataset. # Add config and model zoo. # Refine bbox2out and draw_bbox: - Add bbox de-normalize in bbox2out function. - Remove bbox de-normalize in draw_bbox. --- README.md | 20 +-- README_cn.md | 20 +-- configs/{ => ssd}/ssd_mobilenet_v1_voc.yml | 0 configs/ssd/ssd_vgg16_300.yml | 153 ++++++++++++++++ configs/ssd/ssd_vgg16_300_voc.yml | 155 ++++++++++++++++ configs/ssd/ssd_vgg16_512.yml | 155 ++++++++++++++++ configs/ssd/ssd_vgg16_512_voc.yml | 159 ++++++++++++++++ docs/MODEL_ZOO.md | 14 +- docs/MODEL_ZOO_cn.md | 14 +- ppdet/data/data_feed.py | 24 +-- ppdet/data/transform/arrange_sample.py | 50 ++++-- ppdet/modeling/architectures/ssd.py | 15 +- ppdet/modeling/backbones/__init__.py | 2 + ppdet/modeling/backbones/vgg.py | 200 +++++++++++++++++++++ ppdet/modeling/ops.py | 14 +- ppdet/utils/coco_eval.py | 25 ++- ppdet/utils/eval_utils.py | 5 +- ppdet/utils/visualizer.py | 14 +- tools/infer.py | 4 +- 19 files changed, 964 insertions(+), 79 deletions(-) rename configs/{ => ssd}/ssd_mobilenet_v1_voc.yml (100%) create mode 100644 configs/ssd/ssd_vgg16_300.yml create mode 100644 configs/ssd/ssd_vgg16_300_voc.yml create mode 100644 configs/ssd/ssd_vgg16_512.yml create mode 100644 configs/ssd/ssd_vgg16_512_voc.yml create mode 100644 ppdet/modeling/backbones/vgg.py diff --git a/README.md b/README.md index ac9c2df80..cad7dfb12 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,16 @@ multi-GPU training. 
Supported Architectures: -| | ResNet | ResNet-vd [1](#vd) | ResNeXt-vd | SENet | MobileNet | DarkNet | -|--------------------|:------:|------------------------------:|:----------:|:-----:|:---------:|:-------:| -| Faster R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | -| Faster R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | -| Mask R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | -| Mask R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | -| Cascade R-CNN | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | -| RetinaNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | -| Yolov3 | ✓ | ✗ | ✗ | ✗ | ✓ | ✓ | -| SSD | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | +| | ResNet | ResNet-vd [1](#vd) | ResNeXt-vd | SENet | MobileNet | DarkNet | VGG | +|--------------------|:------:|------------------------------:|:----------:|:-----:|:---------:|:-------:|:---:| +| Faster R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Faster R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Cascade R-CNN | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | +| RetinaNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | +| Yolov3 | ✓ | ✗ | ✗ | ✗ | ✓ | ✓ | ✗ | +| SSD | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | ✓ | [1] [ResNet-vd](https://arxiv.org/pdf/1812.01187) models offer much improved accuracy with negligible performance cost. 
diff --git a/README_cn.md b/README_cn.md index 6ca2efdbe..d4a7f792e 100644 --- a/README_cn.md +++ b/README_cn.md @@ -27,16 +27,16 @@ PaddleDetection的目的是为工业界和学术界提供大量易使用的目 支持的模型结构: -| | ResNet | ResNet-vd [1](#vd) | ResNeXt-vd | SENet | MobileNet | DarkNet | -|--------------------|:------:|------------------------------:|:----------:|:-----:|:---------:|:-------:| -| Faster R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | -| Faster R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | -| Mask R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | -| Mask R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | -| Cascade R-CNN | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | -| RetinaNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | -| Yolov3 | ✓ | ✗ | ✗ | ✗ | ✓ | ✓ | -| SSD | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | +| | ResNet | ResNet-vd [1](#vd) | ResNeXt-vd | SENet | MobileNet | DarkNet | VGG | +|--------------------|:------:|------------------------------:|:----------:|:-----:|:---------:|:-------:|:---:| +| Faster R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Faster R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Cascade R-CNN | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | +| RetinaNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | +| Yolov3 | ✓ | ✗ | ✗ | ✗ | ✓ | ✓ | ✗ | +| SSD | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | ✓ | [1] [ResNet-vd](https://arxiv.org/pdf/1812.01187) 模型提供了较大的精度提高和较少的性能损失。 diff --git a/configs/ssd_mobilenet_v1_voc.yml b/configs/ssd/ssd_mobilenet_v1_voc.yml similarity index 100% rename from configs/ssd_mobilenet_v1_voc.yml rename to configs/ssd/ssd_mobilenet_v1_voc.yml diff --git a/configs/ssd/ssd_vgg16_300.yml b/configs/ssd/ssd_vgg16_300.yml new file mode 100644 index 000000000..d44ba3d3b --- /dev/null +++ b/configs/ssd/ssd_vgg16_300.yml @@ -0,0 +1,153 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 400000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: COCO +pretrain_weights: 
https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_300/model_final +num_classes: 81 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + metric: + ap_version: 11point + evaluate_difficult: false + overlap_threshold: 0.5 + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1] + +MultiBoxHead: + base_size: 300 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 15 + max_ratio: 90 + min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0] + max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0] + steps: [8, 16, 32, 64, 100, 300] + offset: 0.5 + flip: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [280000, 360000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [104, 117, 123] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: false + - 
!RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 16 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + drop_last: false + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] diff --git a/configs/ssd/ssd_vgg16_300_voc.yml b/configs/ssd/ssd_vgg16_300_voc.yml new file mode 100644 index 000000000..5019ca552 --- /dev/null +++ b/configs/ssd/ssd_vgg16_300_voc.yml @@ -0,0 +1,155 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 120001 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: VOC +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_300_voc/model_final/ +num_classes: 21 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + metric: + ap_version: 11point + evaluate_difficult: false + overlap_threshold: 0.5 + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1] + +MultiBoxHead: + base_size: 300 + aspect_ratios: [[2.], [2., 3.], [2., 
3.], [2., 3.], [2.], [2.]] + min_ratio: 20 + max_ratio: 90 + min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0] + max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0] + steps: [8, 16, 32, 64, 100, 300] + offset: 0.5 + flip: true + min_max_aspect_ratios_order: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [80000, 100000] + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [104, 117, 123] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: False + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 32 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + drop_last: false + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: 
false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] diff --git a/configs/ssd/ssd_vgg16_512.yml b/configs/ssd/ssd_vgg16_512.yml new file mode 100644 index 000000000..82fe5c74c --- /dev/null +++ b/configs/ssd/ssd_vgg16_512.yml @@ -0,0 +1,155 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 400000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_512/model_final +num_classes: 81 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + metric: + ap_version: 11point + evaluate_difficult: false + overlap_threshold: 0.5 + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1, -1] + extra_block_filters: [[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 1, 4]] + + +MultiBoxHead: + base_size: 512 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 15 + max_ratio: 90 + min_sizes: [20.0, 51.0, 133.0, 215.0, 296.0, 378.0, 460.0] + max_sizes: [51.0, 133.0, 215.0, 296.0, 378.0, 460.0, 542.0] + steps: [8, 16, 32, 64, 128, 256, 512] + offset: 0.5 + flip: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [280000, 360000] + 
- !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [104, 117, 123] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + drop_last: false + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + 
std: [1, 1, 1] diff --git a/configs/ssd/ssd_vgg16_512_voc.yml b/configs/ssd/ssd_vgg16_512_voc.yml new file mode 100644 index 000000000..a55473cfe --- /dev/null +++ b/configs/ssd/ssd_vgg16_512_voc.yml @@ -0,0 +1,159 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 120000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: VOC +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_512_voc/model_final/ +num_classes: 21 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + metric: + ap_version: 11point + evaluate_difficult: false + overlap_threshold: 0.5 + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1, -1] + extra_block_filters: [[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 1, 4]] + + +MultiBoxHead: + base_size: 512 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 20 + max_ratio: 90 + min_sizes: [20.0, 51.0, 133.0, 215.0, 296.0, 378.0, 460.0] + max_sizes: [51.0, 133.0, 215.0, 296.0, 378.0, 460.0, 542.0] + steps: [8, 16, 32, 64, 128, 256, 512] + offset: 0.5 + flip: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [80000, 100000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + image_shape: [3, 512, 512] + sample_transforms: + - 
!DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [123, 117, 104] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [123, 117, 104] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 32 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + drop_last: false + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [123, 117, 104] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [123, 117, 104] + std: [1, 1, 1] diff --git a/docs/MODEL_ZOO.md b/docs/MODEL_ZOO.md index 668860ac4..f418656a9 100644 --- a/docs/MODEL_ZOO.md +++ b/docs/MODEL_ZOO.md @@ -118,11 +118,21 @@ results of image size 608/416/320 above. **Notes:** In RetinaNet, the base LR is changed to 0.01 for minibatch size 16. 
+### SSD + +| Backbone | Size | Image/gpu | Lr schd | Box AP | Download | +| VGG16 | 300 | 8 | 400k | 25.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300.tar) | +| VGG16 | 512 | 8 | 400k | 29.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512.tar) | + +**Notes:** VGG-SSD is trained on 4 GPUs with a total batch size of 32 for 400000 iterations. + ### SSD on Pascal VOC | Backbone | Size | Image/gpu | Lr schd | Box AP | Download | | :----------- | :--: | :-----: | :-----: | :----: | :-------: | -| MobileNet v1 | 300 | 32 | 120e | 73.13 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar) | +| MobileNet v1 | 300 | 32 | 120e | 73.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar) | +| VGG16 | 300 | 8 | 240e | 77.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300_voc.tar) | +| VGG16 | 512 | 8 | 240e | 80.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512_voc.tar) | -**Notes:** SSD is trained in 2 GPU with totoal batch size as 64 and trained 120 epoches. SSD training data augmentations: randomly color distortion, +**Notes:** MobileNet-SSD is trained on 2 GPUs with a total batch size of 64 for 120 epochs. VGG-SSD is trained on 4 GPUs with a total batch size of 32 for 240 epochs. SSD training data augmentations: randomly color distortion, randomly cropping, randomly expansion, randomly flipping. 
diff --git a/docs/MODEL_ZOO_cn.md b/docs/MODEL_ZOO_cn.md index 2e92c89f2..7cb3982f3 100644 --- a/docs/MODEL_ZOO_cn.md +++ b/docs/MODEL_ZOO_cn.md @@ -115,10 +115,20 @@ Paddle提供基于ImageNet的骨架网络预训练模型。所有预训练模型 **注意事项:** RetinaNet系列模型中,在总batch size为16下情况下,初始学习率改为0.01。 +### SSD + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 | Box AP | 下载 | +| VGG16 | 300 | 8 | 40万 | 25.1 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300.tar) | +| VGG16 | 512 | 8 | 40万 | 29.1 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512.tar) | + +**注意事项:** VGG-SSD在总batch size为32下训练40万轮。 + ### SSD 基于Pascal VOC数据集 | 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 | Box AP | 下载 | | :----------- | :--: | :-----: | :-----: | :----: | :-------: | -| MobileNet v1 | 300 | 32 | 120e | 73.13 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar) | +| MobileNet v1 | 300 | 32 | 120e | 73.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar) | +| VGG16 | 300 | 8 | 240e | 77.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300_voc.tar) | +| VGG16 | 512 | 8 | 240e | 80.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512_voc.tar) | -**注意事项:** SSD在2卡,总batch size为64下训练120轮。数据增强包括:随机颜色失真,随机剪裁,随机扩张,随机翻转。 +**注意事项:** MobileNet-SSD在2卡,总batch size为64下训练120周期。VGG-SSD在总batch size为32下训练240周期。数据增强包括:随机颜色失真,随机剪裁,随机扩张,随机翻转。 diff --git a/ppdet/data/data_feed.py b/ppdet/data/data_feed.py index 44256722b..9c0357c34 100644 --- a/ppdet/data/data_feed.py +++ b/ppdet/data/data_feed.py @@ -28,9 +28,10 @@ from ppdet.data.transform.operators import ( DecodeImage, MixupImage, NormalizeBox, NormalizeImage, RandomDistort, RandomFlipImage, RandomInterpImage, ResizeImage, ExpandImage, CropImage, Permute) + from ppdet.data.transform.arrange_sample import ( - ArrangeRCNN, ArrangeTestRCNN, ArrangeSSD, ArrangeTestSSD, ArrangeYOLO, - ArrangeEvalYOLO, ArrangeTestYOLO) + ArrangeRCNN, ArrangeTestRCNN, ArrangeSSD, 
ArrangeEvalSSD, ArrangeTestSSD, + ArrangeYOLO, ArrangeEvalYOLO, ArrangeTestYOLO) __all__ = [ 'PadBatch', 'MultiScale', 'RandomShape', 'DataSet', 'CocoDataSet', @@ -690,7 +691,7 @@ class SSDTrainFeed(DataFeed): def __init__(self, dataset=VocDataSet().__dict__, - fields=['image', 'gt_box', 'gt_label', 'is_difficult'], + fields=['image', 'gt_box', 'gt_label'], image_shape=[3, 300, 300], sample_transforms=[ DecodeImage(to_rgb=True, with_mixup=False), @@ -723,8 +724,6 @@ class SSDTrainFeed(DataFeed): bufsize=10, use_process=True): sample_transforms.append(ArrangeSSD()) - if isinstance(dataset, dict): - dataset = VocDataSet(**dataset) super(SSDTrainFeed, self).__init__( dataset, fields, @@ -736,6 +735,7 @@ class SSDTrainFeed(DataFeed): samples=samples, drop_last=drop_last, num_workers=num_workers, + bufsize=bufsize, use_process=use_process) self.mode = 'TRAIN' @@ -747,7 +747,8 @@ class SSDEvalFeed(DataFeed): def __init__( self, dataset=VocDataSet(VOC_VAL_ANNOTATION).__dict__, - fields=['image', 'gt_box', 'gt_label', 'is_difficult'], + fields=['image', 'im_shape', 'im_id', 'gt_box', + 'gt_label', 'is_difficult'], image_shape=[3, 300, 300], sample_transforms=[ DecodeImage(to_rgb=True, with_mixup=False), @@ -767,9 +768,7 @@ class SSDEvalFeed(DataFeed): num_workers=8, bufsize=10, use_process=False): - sample_transforms.append(ArrangeSSD()) - if isinstance(dataset, dict): - dataset = VocDataSet(**dataset) + sample_transforms.append(ArrangeEvalSSD()) super(SSDEvalFeed, self).__init__( dataset, fields, @@ -781,6 +780,7 @@ class SSDEvalFeed(DataFeed): samples=samples, drop_last=drop_last, num_workers=num_workers, + bufsize=bufsize, use_process=use_process) self.mode = 'VAL' @@ -791,7 +791,7 @@ class SSDTestFeed(DataFeed): def __init__(self, dataset=SimpleDataSet(VOC_TEST_ANNOTATION).__dict__, - fields=['image', 'im_id'], + fields=['image', 'im_id', 'im_shape'], image_shape=[3, 300, 300], sample_transforms=[ DecodeImage(to_rgb=True), @@ -823,7 +823,9 @@ class 
SSDTestFeed(DataFeed): shuffle=shuffle, samples=samples, drop_last=drop_last, - num_workers=num_workers) + num_workers=num_workers, + bufsize=bufsize, + use_process=use_process) self.mode = 'TEST' diff --git a/ppdet/data/transform/arrange_sample.py b/ppdet/data/transform/arrange_sample.py index 991eed515..13f70bfa3 100644 --- a/ppdet/data/transform/arrange_sample.py +++ b/ppdet/data/transform/arrange_sample.py @@ -131,15 +131,10 @@ class ArrangeTestRCNN(BaseOperator): class ArrangeSSD(BaseOperator): """ Transform dict to tuple format needed for training. - - Args: - is_mask (bool): whether to use include mask data """ - def __init__(self, is_mask=False): + def __init__(self): super(ArrangeSSD, self).__init__() - self.is_mask = is_mask - assert isinstance(self.is_mask, bool), "wrong type for is_mask" def __call__(self, sample, context=None): """ @@ -154,10 +149,40 @@ class ArrangeSSD(BaseOperator): im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] - difficult = sample['difficult'] - outs = (im, gt_bbox, gt_class, difficult) + outs = (im, gt_bbox, gt_class) return outs +@register_op +class ArrangeEvalSSD(BaseOperator): + """ + Transform dict to tuple format needed for training. + """ + + def __init__(self): + super(ArrangeEvalSSD, self).__init__() + + def __call__(self, sample, context=None): + """ + Args: + sample: a dict which contains image + info and annotation info. + context: a dict which contains additional info. 
+ Returns: + sample: a tuple containing the following items: (image, im_shape, im_id, gt_bbox, gt_class, difficult) + """ + im = sample['image'] + if len(sample['gt_bbox']) != len(sample['gt_class']): + raise ValueError("gt num mismatch: bbox and class.") + im_id = sample['im_id'] + h = sample['h'] + w = sample['w'] + im_shape = np.array((h, w)) + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + difficult = sample['difficult'] + outs = (im, im_shape, im_id, gt_bbox, gt_class, difficult) + + return outs @register_op class ArrangeTestSSD(BaseOperator): @@ -168,10 +193,8 @@ class ArrangeTestSSD(BaseOperator): is_mask (bool): whether to use include mask data """ - def __init__(self, is_mask=False): + def __init__(self): super(ArrangeTestSSD, self).__init__() - self.is_mask = is_mask - assert isinstance(self.is_mask, bool), "wrong type for is_mask" def __call__(self, sample, context=None): """ @@ -184,7 +207,10 @@ class ArrangeTestSSD(BaseOperator): """ im = sample['image'] im_id = sample['im_id'] - outs = (im, im_id) + h = sample['h'] + w = sample['w'] + im_shape = np.array((h, w)) + outs = (im, im_id, im_shape) return outs diff --git a/ppdet/modeling/architectures/ssd.py b/ppdet/modeling/architectures/ssd.py index f5f6bf9c1..c0fb68f92 100644 --- a/ppdet/modeling/architectures/ssd.py +++ b/ppdet/modeling/architectures/ssd.py @@ -63,7 +63,6 @@ class SSD(object): if mode == 'train' or mode == 'eval': gt_box = feed_vars['gt_box'] gt_label = feed_vars['gt_label'] - difficult = feed_vars['is_difficult'] body_feats = self.backbone(im) locs, confs, box, box_var = self.multi_box_head( @@ -76,17 +75,7 @@ class SSD(object): return {'loss': loss} else: pred = self.output_decoder(locs, confs, box, box_var) - if mode == 'eval': - map_eval = self.metric( - pred, - gt_label, - gt_box, - difficult, - class_num=self.num_classes) - _, accum_map = map_eval.get_map_var() - return {'map': map_eval, 'accum_map': accum_map} - else: - return {'bbox': pred} + return {'bbox': pred} def train(self, feed_vars): return 
self.build(feed_vars, 'train') @@ -99,5 +88,5 @@ class SSD(object): def is_bbox_normalized(self): # SSD use output_decoder in output layers, bbox is normalized - # to range [0, 1], is_bbox_normalized is used in infer.py + # to range [0, 1], is_bbox_normalized is used in eval.py and infer.py return True diff --git a/ppdet/modeling/backbones/__init__.py b/ppdet/modeling/backbones/__init__.py index f1e345fb3..80241c883 100644 --- a/ppdet/modeling/backbones/__init__.py +++ b/ppdet/modeling/backbones/__init__.py @@ -20,6 +20,7 @@ from . import darknet from . import mobilenet from . import senet from . import fpn +from . import vgg from .resnet import * from .resnext import * @@ -27,3 +28,4 @@ from .darknet import * from .mobilenet import * from .senet import * from .fpn import * +from .vgg import * diff --git a/ppdet/modeling/backbones/vgg.py b/ppdet/modeling/backbones/vgg.py new file mode 100644 index 000000000..28bd29272 --- /dev/null +++ b/ppdet/modeling/backbones/vgg.py @@ -0,0 +1,200 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@register
class VGG(object):
    """
    VGG backbone, see https://arxiv.org/abs/1409.1556

    The five VGG convolution stages are followed by the dilated ``fc6`` and
    the 1x1 ``fc7`` convolutions. Without extra blocks only the ``fc7``
    feature map is returned. With extra blocks a list of feature maps is
    returned: [output of the 4th conv stage, fc7, one output per extra block].

    Args:
        depth (int): the VGG net depth (16 or 19)
        with_extra_blocks (bool): whether or not extra blocks should be added
        normalizations (list): init scale of the l2 norm applied to each
            returned feature map (one entry per feature map, in output
            order); the l2 norm is skipped for entries equal to -1. Only
            used when ``with_extra_blocks`` is True.
        extra_block_filters (list): in each extra block, params:
            [num_filters of the 1x1 conv, num_filters of the second conv,
            padding_size, stride_size, filter_size], the last three being
            applied to the second conv.
    """

    def __init__(self,
                 depth=16,
                 with_extra_blocks=False,
                 normalizations=[20., -1, -1, -1, -1, -1],
                 extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3],
                                      [128, 256, 0, 1, 3], [128, 256, 0, 1, 3]]):
        # The message template was previously emitted without being filled in.
        assert depth in [16, 19], \
            "depth {} not in [16, 19]".format(depth)

        self.depth = depth
        # number of 3x3 convolutions in each of the five stages, per depth
        self.depth_cfg = {
            16: [2, 2, 3, 3, 3],
            19: [2, 2, 4, 4, 4]
        }
        self.with_extra_blocks = with_extra_blocks
        # The signature (including its defaults) is kept as-is for callers;
        # the lists are copied so that an instance never aliases the shared
        # default objects or a list owned by the caller.
        self.normalizations = list(normalizations)
        self.extra_block_filters = [list(v) for v in extra_block_filters]

    def __call__(self, input):
        """
        Build the network on top of `input`.

        Returns:
            The fc7 feature map when `with_extra_blocks` is False, otherwise
            the list [4th stage output, fc7, extra block outputs...] with the
            configured l2 normalizations applied.

        Raises:
            ValueError: if `normalizations` has fewer entries than there are
                feature maps to return.
        """
        layers = self._vgg_block(input)

        if not self.with_extra_blocks:
            return layers[-1]

        layers += self._add_extras_block(layers[-1])
        norm_cfg = self.normalizations
        if len(norm_cfg) < len(layers):
            # previously surfaced as a bare IndexError while indexing norm_cfg
            raise ValueError(
                "normalizations has {} entries but {} feature maps are "
                "produced".format(len(norm_cfg), len(layers)))

        # One output name per feature map. The first keeps the historical
        # "conv4_3_norm_scale"; the others follow the layer that produced
        # them, so two normalized feature maps never share an output name.
        names = ["conv4_3", "fc7"] + [
            "conv{}_2".format(6 + k)
            for k in range(len(self.extra_block_filters))
        ]
        for k, (feat, init_scale) in enumerate(zip(layers, norm_cfg)):
            if init_scale != -1:
                layers[k] = self._l2_norm_scale(
                    feat,
                    init_scale=init_scale,
                    name=names[k] + "_norm_scale")

        return layers

    @staticmethod
    def _sub_name(name, suffix):
        """Return `name + suffix`, or None when no name was given."""
        return None if name is None else name + suffix

    def _vgg_block(self, input):
        """
        Build the five conv stages plus fc6/fc7.

        Returns:
            [output of the 4th conv stage (before pooling), fc7]
        """
        nums = self.depth_cfg[self.depth]
        vgg_base = [64, 128, 256, 512, 512]
        last_stage = len(vgg_base) - 1
        conv = input
        layers = []
        for k, (num_filter, num_convs) in enumerate(zip(vgg_base, nums)):
            conv = self._conv_block(
                conv, num_filter, num_convs, name="conv{}_".format(k + 1))
            layers.append(conv)
            if k == last_stage:
                # last stage: 3x3 pooling with stride 1 and padding 1
                conv = self._pooling_block(conv, 3, 1, pool_padding=1)
            else:
                conv = self._pooling_block(conv, 2, 2)

        fc6 = self._conv_layer(conv, 1024, 3, 1, 6, dilation=6, name="fc6")
        fc7 = self._conv_layer(fc6, 1024, 1, 1, 0, name="fc7")

        return [layers[3], fc7]

    def _add_extras_block(self, input):
        """
        Chain one extra block per `extra_block_filters` entry after `input`
        and return the list of their outputs.
        """
        conv = input
        layers = []
        for k, v in enumerate(self.extra_block_filters):
            assert len(v) == 5, \
                "each extra_block_filters entry needs 5 values, " \
                "got {}".format(v)
            conv = self._extra_block(
                conv, v[0], v[1], v[2], v[3], v[4],
                name="conv{}_".format(6 + k))
            layers.append(conv)

        return layers

    def _conv_block(self, input, num_filter, groups, name=None):
        """
        Stack `groups` 3x3 / stride 1 / padding 1 relu convolutions with
        `num_filter` filters each; layer i is named `name + str(i)` (1-based).
        """
        conv = input
        for i in range(groups):
            conv = self._conv_layer(
                input=conv,
                num_filters=num_filter,
                filter_size=3,
                stride=1,
                padding=1,
                act='relu',
                name=self._sub_name(name, str(i + 1)))
        return conv

    def _extra_block(self,
                     input,
                     num_filters1,
                     num_filters2,
                     padding_size,
                     stride_size,
                     filter_size,
                     name=None):
        """
        Build a 1x1 relu conv (`num_filters1`) followed by a
        `filter_size` relu conv (`num_filters2`) using the given stride and
        padding. The two layers are named `name + "1"` and `name + "2"`.
        """
        # 1x1 conv
        conv_1 = self._conv_layer(
            input=input,
            num_filters=int(num_filters1),
            filter_size=1,
            stride=1,
            act='relu',
            padding=0,
            name=self._sub_name(name, "1"))

        # second conv (3x3 with the default configuration)
        conv_2 = self._conv_layer(
            input=conv_1,
            num_filters=int(num_filters2),
            filter_size=filter_size,
            stride=stride_size,
            act='relu',
            padding=padding_size,
            name=self._sub_name(name, "2"))
        return conv_2

    def _conv_layer(self,
                    input,
                    num_filters,
                    filter_size,
                    stride,
                    padding,
                    dilation=1,
                    act='relu',
                    use_cudnn=True,
                    name=None):
        """
        Thin wrapper around `fluid.layers.conv2d`.

        With a `name`, the parameters are called `<name>_weights` and
        `<name>_biases`. Without one, None is forwarded instead of failing on
        `None + str` (NOTE(review): this relies on Paddle generating names
        when `name=None` - confirm against the ParamAttr documentation).
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            act=act,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(name=self._sub_name(name, "_weights")),
            bias_attr=ParamAttr(name=self._sub_name(name, "_biases")),
            name=self._sub_name(name, '.conv2d.output.1'))
        return conv

    def _pooling_block(self,
                       conv,
                       pool_size,
                       pool_stride,
                       pool_padding=0,
                       ceil_mode=True):
        """Apply a max pooling with the given size, stride and padding."""
        pool = fluid.layers.pool2d(
            input=conv,
            pool_size=pool_size,
            pool_type='max',
            pool_stride=pool_stride,
            pool_padding=pool_padding,
            ceil_mode=ceil_mode)
        return pool

    def _l2_norm_scale(self,
                       input,
                       init_scale=1.0,
                       channel_shared=False,
                       name="conv4_3_norm_scale"):
        """
        L2-normalize `input` along axis 1 and multiply it by a learnable
        scale initialized to `init_scale`.

        Args:
            input: feature map to normalize.
            init_scale (float): initial value of the scale parameter.
            channel_shared (bool): use a single scale value instead of one
                value per entry of `input.shape[1]`.
            name (str): name given to the output of the multiplication; the
                default keeps the name used before this argument existed.
        """
        from paddle.fluid.layer_helper import LayerHelper
        from paddle.fluid.initializer import Constant
        helper = LayerHelper("Scale")
        l2_norm = fluid.layers.l2_normalize(
            input, axis=1)  # l2 norm along channel
        shape = [1] if channel_shared else [input.shape[1]]
        scale = helper.create_parameter(
            attr=helper.param_attr,
            shape=shape,
            dtype=input.dtype,
            default_initializer=Constant(init_scale))
        out = fluid.layers.elementwise_mul(
            x=l2_norm, y=scale, axis=-1 if channel_shared else 1,
            name=name)
        return out
+ with_background=True, + is_bbox_normalized=False): assert 'bbox' in results[0] assert outfile.endswith('.json') @@ -79,7 +84,9 @@ def bbox_eval(results, anno_file, outfile, with_background=True): {i + int(with_background): catid for i, catid in enumerate(cat_ids)}) - xywh_results = bbox2out(results, clsid2catid) + xywh_results = bbox2out( + results, clsid2catid, is_bbox_normalized=is_bbox_normalized) + if len(xywh_results) == 0: logger.warning("The number of valid bbox detected is zero.\n \ Please use reasonable model and check input data.\n \ @@ -111,6 +118,7 @@ def mask_eval(results, anno_file, outfile, resolution, thresh_binarize=0.5): cocoapi_eval(outfile, 'segm', coco_gt=coco_gt) + def cocoapi_eval(jsonfile, style, coco_gt=None, @@ -141,6 +149,7 @@ def cocoapi_eval(jsonfile, coco_eval.summarize() return coco_eval.stats + def proposal2out(results, is_bbox_normalized=False): xywh_res = [] for t in results: @@ -180,6 +189,13 @@ def proposal2out(results, is_bbox_normalized=False): def bbox2out(results, clsid2catid, is_bbox_normalized=False): + """ + Args: + results: request a dict, should include: `bbox`, `im_id`, + if is_bbox_normalized=True, also need `im_shape`. + clsid2catid: class id to category id map of COCO2017 dataset. + is_bbox_normalized: whether or not bbox is normalized. 
+ """ xywh_res = [] for t in results: bboxes = t['bbox'][0] @@ -202,6 +218,11 @@ def bbox2out(results, clsid2catid, is_bbox_normalized=False): clip_bbox([xmin, ymin, xmax, ymax]) w = xmax - xmin h = ymax - ymin + im_height, im_width = t['im_shape'][0][i].tolist() + xmin *= im_width + ymin *= im_height + w *= im_width + h *= im_height else: w = xmax - xmin + 1 h = ymax - ymin + 1 diff --git a/ppdet/utils/eval_utils.py b/ppdet/utils/eval_utils.py index d1e331361..63d0fa4e0 100644 --- a/ppdet/utils/eval_utils.py +++ b/ppdet/utils/eval_utils.py @@ -113,8 +113,11 @@ def eval_results(results, output = 'bbox.json' if output_directory: output = os.path.join(output_directory, 'bbox.json') + box_ap_stats = bbox_eval(results, anno_file, output, - with_background) + with_background, + is_bbox_normalized=is_bbox_normalized) + if 'mask' in results[0]: output = 'mask.json' if output_directory: diff --git a/ppdet/utils/visualizer.py b/ppdet/utils/visualizer.py index 21f787a84..ff35dc2d5 100644 --- a/ppdet/utils/visualizer.py +++ b/ppdet/utils/visualizer.py @@ -31,8 +31,7 @@ def visualize_results(image, catid2name, threshold=0.5, bbox_results=None, - mask_results=None, - is_bbox_normalized=False): + mask_results=None): """ Visualize bbox and mask results """ @@ -40,7 +39,7 @@ def visualize_results(image, image = draw_mask(image, im_id, mask_results, threshold) if bbox_results: image = draw_bbox(image, im_id, catid2name, bbox_results, - threshold, is_bbox_normalized) + threshold) return image @@ -69,8 +68,7 @@ def draw_mask(image, im_id, segms, threshold, alpha=0.7): return Image.fromarray(img_array.astype('uint8')) -def draw_bbox(image, im_id, catid2name, bboxes, threshold, - is_bbox_normalized=False): +def draw_bbox(image, im_id, catid2name, bboxes, threshold): """ Draw bbox on image """ @@ -86,12 +84,6 @@ def draw_bbox(image, im_id, catid2name, bboxes, threshold, continue xmin, ymin, w, h = bbox - if is_bbox_normalized: - im_width, im_height = image.size - xmin *= im_width - ymin 
*= im_height - w *= im_width - h *= im_height xmax = xmin + w ymax = ymin + h diff --git a/tools/infer.py b/tools/infer.py index 5e834f777..769fda765 100644 --- a/tools/infer.py +++ b/tools/infer.py @@ -186,7 +186,7 @@ def main(): if cfg['metric'] == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg['metric'] == 'VOC': - extra_keys = ['im_id'] + extra_keys = ['im_id', 'im_shape'] keys, values, _ = parse_fetches(test_fetches, infer_prog, extra_keys) # parse dataset category @@ -235,7 +235,7 @@ def main(): image = visualize_results(image, int(im_id), catid2name, FLAGS.draw_threshold, bbox_results, - mask_results, is_bbox_normalized) + mask_results) save_name = get_save_image_name(FLAGS.output_dir, image_path) logger.info("Detection bbox results save in {}".format(save_name)) image.save(save_name, quality=95) -- GitLab