diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..43369eea55459685633f4a95743b60834dad2d74 --- /dev/null +++ b/.gitignore @@ -0,0 +1,64 @@ +# Virtualenv +/.venv/ +/venv/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# json file +*.json + +# Distribution / packaging +/bin/ +/build/ +/develop-eggs/ +/dist/ +/eggs/ +/lib/ +/lib64/ +/output/ +/parts/ +/sdist/ +/var/ +/*.egg-info/ +/.installed.cfg +/*.egg +/.eggs + +# AUTHORS and ChangeLog will be generated while packaging +/AUTHORS +/ChangeLog + +# BCloud / BuildSubmitter +/build_submitter.* +/logger_client_log + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +.coverage +.cache +.pytest_cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Sphinx documentation +/docs/_build/ + +*.json + + +dataset/coco/annotations +dataset/coco/train2017 +dataset/coco/val2017 +dataset/voc/VOCdevkit diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000000000000000000000000000000000000..4741fb4f3bbc6681088cf9e960321e7b857a93a8 --- /dev/null +++ b/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = pep8 +column_limit = 80 diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3edd7a2fea40c2fa28efb45b7020a563464010a1 --- /dev/null +++ b/README.md @@ -0,0 +1,134 @@ +English | [简体中文](README_cn.md) + +# PaddleDetection + +The goal of PaddleDetection is to provide easy access to a wide range of object +detection models in both industry and research settings. We design +PaddleDetection to be not only performant, production-ready but also highly +flexible, catering to research needs. + + +
+ +
+ + +## Introduction + +Features: + +- Production Ready: + + Key operations are implemented in C++ and CUDA, which, together with PaddlePaddle's +highly efficient inference engine, enables easy deployment in server environments. + +- Highly Flexible: + + Components are designed to be modular. Model architectures, as well as data +preprocessing pipelines, can be easily customized with simple configuration +changes. + +- Performance Optimized: + + With the help of the underlying PaddlePaddle framework, faster training and +a reduced GPU memory footprint are achieved. Notably, YOLOv3 training is +much faster compared to other frameworks. Another example is Mask-RCNN +(ResNet50), where we managed to fit up to 4 images per GPU (Tesla V100 16GB) during +multi-GPU training. + +Supported Architectures: + +| | ResNet | ResNet-vd [1](#vd) | ResNeXt-vd | SENet | MobileNet | DarkNet | VGG | +| ------------------- | :----: | ----------------------------: | :--------: | :---: | :-------: | :-----: | :--: | +| Faster R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Faster R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Cascade Faster-RCNN | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | ✗ | +| Cascade Mask-RCNN | ✓ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | +| RetinaNet | ✓ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | +| YOLOv3 | ✓ | ✗ | ✗ | ✗ | ✓ | ✓ | ✗ | +| SSD | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | ✓ | + +[1] [ResNet-vd](https://arxiv.org/pdf/1812.01187) models offer much improved accuracy with negligible performance cost. + +Advanced Features: + +- [x] **Synchronized Batch Norm**: currently used by YOLOv3. +- [x] **Group Norm** +- [x] **Modulated Deformable Convolution** +- [x] **Deformable PSRoI Pooling** + +**NOTE:** Synchronized batch normalization can only be used on multiple GPU devices; it cannot be used on CPU devices or on a single GPU device. 
+ +## Get Started + +- [Installation guide](docs/INSTALL.md) +- [Quick start on small dataset](docs/QUICK_STARTED.md) +- [Guide to training, evaluation and argument descriptions](docs/GETTING_STARTED.md) +- [Guide to preprocess pipeline and custom dataset](docs/DATA.md) +- [Introduction to the configuration workflow](docs/CONFIG.md) +- [Examples for detailed configuration explanation](docs/config_example/) +- [IPython Notebook demo](demo/mask_rcnn_demo.ipynb) +- [Transfer learning document](docs/TRANSFER_LEARNING.md) + +## Model Zoo + +- Pretrained models are available in the [PaddleDetection model zoo](docs/MODEL_ZOO.md). +- [Face detection models](configs/face_detection/README.md) +- [Pretrained models for pedestrian and vehicle detection](contrib/README.md) + +## Model compression + +- [Quantization-aware training example](slim/quantization) +- [Pruning compression example](slim/prune) + +## Deploy + +- [Export model for inference deployment](docs/EXPORT_MODEL.md) +- [C++ inference deployment](inference/README.md) + +## Benchmark + +- [Inference benchmark](docs/BENCHMARK_INFER_cn.md) + + +## Updates + +#### 10/2019 + +- Face detection models included: BlazeFace, Faceboxes. +- Enrich COCO models, box mAP up to 51.9%. +- Add CACascade RCNN, one of the best single models of the Objects365 2019 Challenge Full Track champion. +- Add pretrained models for pedestrian and vehicle detection. +- Support mixed-precision training. +- Add C++ inference deployment. +- Add model compression examples. + +#### 2/9/2019 + +- Add retrained models for GroupNorm. + +- Add Cascade-Mask-RCNN+FPN. + +#### 5/8/2019 + +- Add a series of models related to Modulated Deformable Convolution. 
+ +#### 7/29/2019 + +- Update Chinese docs for PaddleDetection +- Fix bug in R-CNN models when train and test at the same time +- Add ResNext101-vd + Mask R-CNN + FPN models +- Add YOLOv3 on VOC models + +#### 7/3/2019 + +- Initial release of PaddleDetection and detection model zoo +- Models included: Faster R-CNN, Mask R-CNN, Faster R-CNN+FPN, Mask + R-CNN+FPN, Cascade-Faster-RCNN+FPN, RetinaNet, YOLOv3, and SSD. + + +## Contributing + +Contributions are highly welcomed and we would really appreciate your feedback!! diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..7793d6495b1ddb22146fb73847ede6bd6d4f76f1 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,122 @@ +[English](README.md) | 简体中文 + +# PaddleDetection + +PaddleDetection的目的是为工业界和学术界提供丰富、易用的目标检测模型。不仅性能优越、易于部署,而且能够灵活的满足算法研究的需求。 + +
+ +
+ + +## 简介 + +特性: + +- 易部署: + + PaddleDetection的模型中使用的核心算子均通过C++或CUDA实现,同时基于PaddlePaddle的高性能推理引擎可以方便地部署在多种硬件平台上。 + +- 高灵活度: + + PaddleDetection通过模块化设计来解耦各个组件,基于配置文件可以轻松地搭建各种检测模型。 + +- 高性能: + + 基于PaddlePaddle框架的高性能内核,在模型训练速度、显存占用上有一定的优势。例如,YOLOv3的训练速度快于其他框架,在Tesla V100 16GB环境下,Mask-RCNN(ResNet50)可以单卡Batch Size可以达到4 (甚至到5)。 + +支持的模型结构: + +| | ResNet | ResNet-vd [1](#vd) | ResNeXt-vd | SENet | MobileNet | DarkNet | VGG | +|--------------------|:------:|------------------------------:|:----------:|:-----:|:---------:|:-------:|:---:| +| Faster R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Faster R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN | ✓ | ✓ | x | ✓ | ✗ | ✗ | ✗ | +| Mask R-CNN + FPN | ✓ | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | +| Cascade Faster-CNN | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ | ✗ | +| Cascade Mask-CNN | ✓ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ | +| RetinaNet | ✓ | ✗ | ✓ | ✗ | ✗ | ✗ | ✗ | +| YOLOv3 | ✓ | ✗ | ✗ | ✗ | ✓ | ✓ | ✗ | +| SSD | ✗ | ✗ | ✗ | ✗ | ✓ | ✗ | ✓ | + +[1] [ResNet-vd](https://arxiv.org/pdf/1812.01187) 模型提供了较大的精度提高和较少的性能损失。 + +扩展特性: + +- [x] **Synchronized Batch Norm**: 目前在YOLOv3中使用。 +- [x] **Group Norm** +- [x] **Modulated Deformable Convolution** +- [x] **Deformable PSRoI Pooling** + +**注意:** Synchronized batch normalization 只能在多GPU环境下使用,不能在CPU环境或者单GPU环境下使用。 + + +## 使用教程 + +- [安装说明](docs/INSTALL_cn.md) +- [快速开始](docs/QUICK_STARTED_cn.md) +- [训练、评估及参数说明](docs/GETTING_STARTED_cn.md) +- [数据预处理及自定义数据集](docs/DATA_cn.md) +- [配置模块设计和介绍](docs/CONFIG_cn.md) +- [详细的配置信息和参数说明示例](docs/config_example/) +- [IPython Notebook demo](demo/mask_rcnn_demo.ipynb) +- [迁移学习教程](docs/TRANSFER_LEARNING_cn.md) + +## 模型库 + +- [模型库](docs/MODEL_ZOO_cn.md) +- [人脸检测模型](configs/face_detection/README.md) +- [行人检测和车辆检测预训练模型](contrib/README_cn.md) + + +## 模型压缩 +- [量化训练压缩示例](slim/quantization) +- [剪枝压缩示例](slim/prune) + +## 推理部署 + +- [模型导出教程](docs/EXPORT_MODEL.md) +- [C++推理部署](inference/README.md) + +## Benchmark + +- [推理Benchmark](docs/BENCHMARK_INFER_cn.md) + + + +## 版本更新 + +### 10/2019 + +- 
增加人脸检测模型BlazeFace、Faceboxes。 +- 丰富基于COCO的模型,精度高达51.9%。 +- 增加Objects365 2019 Challenge上夺冠的最佳单模型之一CACascade-RCNN。 +- 增加行人检测和车辆检测预训练模型。 +- 支持FP16训练。 +- 增加跨平台的C++推理部署方案。 +- 增加模型压缩示例。 + + +### 2/9/2019 +- 增加GroupNorm模型。 +- 增加CascadeRCNN+Mask模型。 + +#### 5/8/2019 +- 增加Modulated Deformable Convolution系列模型。 + +#### 7/22/2019 + +- 增加检测库中文文档 +- 修复R-CNN系列模型训练同时进行评估的问题 +- 新增ResNext101-vd + Mask R-CNN + FPN模型 +- 新增基于VOC数据集的YOLOv3模型 + +#### 7/3/2019 + +- 首次发布PaddleDetection检测库和检测模型库 +- 模型包括:Faster R-CNN, Mask R-CNN, Faster R-CNN+FPN, Mask + R-CNN+FPN, Cascade-Faster-RCNN+FPN, RetinaNet, YOLOv3, 和SSD. + +## 如何贡献代码 + +我们非常欢迎你可以为PaddleDetection提供代码,也十分感谢你的反馈。 diff --git a/configs/cascade_mask_rcnn_r50_fpn_1x.yml b/configs/cascade_mask_rcnn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..1d17f53c60e66061ab8a21d624f4191a15ee5a01 --- /dev/null +++ b/configs/cascade_mask_rcnn_r50_fpn_1x.yml @@ -0,0 +1,145 @@ +architecture: CascadeMaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 180000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/cascade_mask_rcnn_r50_fpn_1x/model_final/ +num_classes: 81 + +CascadeMaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + mask_assigner: MaskAssigner + mask_head: MaskHead + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 
0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_hi: [0.5, 0.6, 0.7] + bg_thresh_lo: [0.0, 0.0, 0.0] + fg_fraction: 0.25 + fg_thresh: [0.5, 0.6, 0.7] + +MaskAssigner: + resolution: 28 + +CascadeBBoxHead: + head: CascadeTwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeTwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/cascade_rcnn_r50_fpn_1x.yml b/configs/cascade_rcnn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..47c089c1d95a35aacdcc0766670dc984cb18dc32 --- 
/dev/null +++ b/configs/cascade_rcnn_r50_fpn_1x.yml @@ -0,0 +1,137 @@ +architecture: CascadeRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/cascade_rcnn_r50_fpn_1x/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +ResNet: + norm_type: affine_channel + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + variant: b + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeTwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeTwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: 
[60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/cascade_rcnn_r50_fpn_1x_ms_test.yml b/configs/cascade_rcnn_r50_fpn_1x_ms_test.yml new file mode 100644 index 0000000000000000000000000000000000000000..c345aeedbe6f65fbe19aaf87ddbadf5ed567c38d --- /dev/null +++ b/configs/cascade_rcnn_r50_fpn_1x_ms_test.yml @@ -0,0 +1,177 @@ +architecture: CascadeRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/cascade_rcnn_r50_fpn_1x/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +ResNet: + norm_type: affine_channel + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + variant: b + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: 
[0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeTwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeTwoFCHead: + mlp_dim: 1024 + +MultiScaleTEST: + score_thresh: 0.05 + nms_thresh: 0.5 + detections_per_im: 100 + enable_voting: true + vote_thresh: 0.9 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + sample_transforms: + - !DecodeImage + to_rgb: true + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + - !MultiscaleTestResize + 
origin_target_size: 800 + origin_max_size: 1333 + target_size: + - 400 + - 500 + - 600 + - 700 + - 900 + - 1000 + - 1100 + - 1200 + max_size: 2000 + use_flip: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadMSTest + pad_to_stride: 32 + num_scale: 18 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/dcn/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x.yml b/configs/dcn/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x.yml new file mode 100755 index 0000000000000000000000000000000000000000..85e64170df16f242750737cdd70debb40727f300 --- /dev/null +++ b/configs/dcn/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x.yml @@ -0,0 +1,239 @@ +architecture: CascadeMaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 300000 +snapshot_iter: 10 +use_gpu: true +log_iter: 20 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/SENet154_vd_caffe_pretrained.tar +weights: output/cascade_mask_rcnn_dcn_se154_vd_fpn_gn_s1x/model_final/ +metric: COCO +num_classes: 81 + +CascadeMaskRCNN: + backbone: SENet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + mask_assigner: MaskAssigner + mask_head: MaskHead + +SENet: + depth: 152 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: bn + freeze_norm: True + variant: d + dcn_v2_stages: [3, 4, 5] + std_senet: True + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + freeze_norm: False + norm_type: gn + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 
256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + norm_type: gn + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_hi: [0.5, 0.6, 0.7] + bg_thresh_lo: [0.0, 0.0, 0.0] + fg_fraction: 0.25 + fg_thresh: [0.5, 0.6, 0.7] + +MaskAssigner: + resolution: 28 + +CascadeBBoxHead: + head: CascadeXConvNormHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeXConvNormHead: + norm_type: gn + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 280000] + - !LinearWarmup + start_factor: 0.01 + steps: 2000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + sample_transforms: + - !DecodeImage + to_rgb: False + with_mixup: False + - !RandomFlipImage + is_mask_flip: true + is_normalized: false + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: False + mean: + - 102.9801 + - 115.9465 + - 122.7717 + std: + - 1.0 + - 1.0 + - 1.0 + - !ResizeImage + interp: 1 + target_size: + - 416 + - 448 + - 480 + - 512 + - 544 + - 576 + - 608 + - 640 + - 672 + - 704 + - 736 + - 768 + - 800 + - 832 + - 864 + - 896 + - 928 + - 960 + - 992 + - 1024 + - 1056 + - 1088 + - 1120 + - 1152 + - 1184 + - 1216 + - 1248 + 
- 1280 + - 1312 + - 1344 + - 1376 + - 1408 + max_size: 1600 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 8 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + sample_transforms: + - !DecodeImage + to_rgb: False + with_mixup: False + - !NormalizeImage + is_channel_first: false + is_scale: False + mean: + - 102.9801 + - 115.9465 + - 122.7717 + std: + - 1.0 + - 1.0 + - 1.0 + - !ResizeImage + interp: 1 + target_size: + - 800 + max_size: 1333 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x_ms_test.yml b/configs/dcn/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x_ms_test.yml new file mode 100644 index 0000000000000000000000000000000000000000..3c552bf9c5eb2b628850e89c05d7a22986c91223 --- /dev/null +++ b/configs/dcn/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x_ms_test.yml @@ -0,0 +1,255 @@ +architecture: CascadeMaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 300000 +snapshot_iter: 10000 +use_gpu: true +log_iter: 20 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/SENet154_vd_caffe_pretrained.tar +weights: output/cascade_mask_rcnn_dcn_se154_vd_fpn_gn_s1x/model_final/ +metric: COCO +num_classes: 81 + +CascadeMaskRCNN: + backbone: SENet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + mask_assigner: MaskAssigner + mask_head: MaskHead + +SENet: + depth: 152 + 
feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: bn + freeze_norm: True + variant: d + dcn_v2_stages: [3, 4, 5] + std_senet: True + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + freeze_norm: False + norm_type: gn + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + norm_type: gn + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_hi: [0.5, 0.6, 0.7] + bg_thresh_lo: [0.0, 0.0, 0.0] + fg_fraction: 0.25 + fg_thresh: [0.5, 0.6, 0.7] + +MaskAssigner: + resolution: 28 + +CascadeBBoxHead: + head: CascadeXConvNormHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeXConvNormHead: + norm_type: gn + +MultiScaleTEST: + score_thresh: 0.05 + nms_thresh: 0.5 + detections_per_im: 100 + enable_voting: true + vote_thresh: 0.9 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 280000] + - !LinearWarmup + start_factor: 0.01 + steps: 2000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: 
annotations/instances_train2017.json + sample_transforms: + - !DecodeImage + to_rgb: False + with_mixup: False + - !RandomFlipImage + is_mask_flip: true + is_normalized: false + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: False + mean: + - 102.9801 + - 115.9465 + - 122.7717 + std: + - 1.0 + - 1.0 + - 1.0 + - !ResizeImage + interp: 1 + target_size: + - 416 + - 448 + - 480 + - 512 + - 544 + - 576 + - 608 + - 640 + - 672 + - 704 + - 736 + - 768 + - 800 + - 832 + - 864 + - 896 + - 928 + - 960 + - 992 + - 1024 + - 1056 + - 1088 + - 1120 + - 1152 + - 1184 + - 1216 + - 1248 + - 1280 + - 1312 + - 1344 + - 1376 + - 1408 + max_size: 1600 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 8 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + sample_transforms: + - !DecodeImage + to_rgb: False + - !NormalizeImage + is_channel_first: false + is_scale: False + mean: + - 102.9801 + - 115.9465 + - 122.7717 + std: + - 1.0 + - 1.0 + - 1.0 + - !MultiscaleTestResize + origin_target_size: 800 + origin_max_size: 1333 + target_size: + - 400 + - 500 + - 600 + - 700 + - 900 + - 1000 + - 1100 + - 1200 + max_size: 2000 + use_flip: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadMSTest + pad_to_stride: 32 + # num_scale = (len(target_size) + 1) * (1 + use_flip) + num_scale: 18 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/cascade_rcnn_dcn_r101_vd_fpn_1x.yml b/configs/dcn/cascade_rcnn_dcn_r101_vd_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..93373adb3a7f72b64ba45996ef61c2a9d3da8414 --- /dev/null +++ b/configs/dcn/cascade_rcnn_dcn_r101_vd_fpn_1x.yml @@ 
-0,0 +1,139 @@ +architecture: CascadeRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_pretrained.tar +weights: output/cascade_rcnn_dcn_r101_vd_fpn_1x/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +ResNet: + norm_type: bn + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeTwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeTwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - 
!LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x.yml b/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..4c74bd877b644659812bbcab960a4ce4600277ef --- /dev/null +++ b/configs/dcn/cascade_rcnn_dcn_r50_fpn_1x.yml @@ -0,0 +1,139 @@ +architecture: CascadeRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/cascade_rcnn_dcn_r50_fpn_1x/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +ResNet: + norm_type: bn + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + variant: b + dcn_v2_stages: [3, 4, 5] + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: 
[0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeTwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeTwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml 
b/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..dbbe2d8014d716e68ba481f4b66eb7fe50164356 --- /dev/null +++ b/configs/dcn/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml @@ -0,0 +1,141 @@ +architecture: CascadeRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +ResNeXt: + norm_type: bn + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 
0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeTwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeTwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x.yml b/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..d6a949efb1e68ae7c258e13f343d46e81746f77b --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r101_vd_fpn_1x.yml @@ -0,0 +1,139 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_pretrained.tar +weights: output/faster_rcnn_dcn_r101_vd_fpn_1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + 
feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 2 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + 
annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/faster_rcnn_dcn_r50_fpn_1x.yml b/configs/dcn/faster_rcnn_dcn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..2048d61e46d4ce93ffb1c0d3bfd1075be2ea4472 --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r50_fpn_1x.yml @@ -0,0 +1,138 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +use_gpu: true +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/faster_rcnn_dcn_r50_fpn_1x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + norm_type: bn + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + dcn_v2_stages: [3, 4, 5] + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + 
bg_thresh_lo: 0.0 + bg_thresh_hi: 0.5 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x.yml b/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..f0cb0ba27d31247415a479619d63da265b16df27 --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_r50_vd_fpn_2x.yml @@ -0,0 +1,139 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar +weights: output/faster_rcnn_dcn_r50_vd_fpn_2x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + 
+ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 2 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + 
batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml b/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..b6d7b81cc3cc1944e71b0159bec1bc56c35dcf53 --- /dev/null +++ b/configs/dcn/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml @@ -0,0 +1,143 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNeXt: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: bn + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + 
box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + shuffle: true + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + shuffle: false diff --git a/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x.yml b/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..f5a0b7c458a0a0856a98533a047407f58c684adc --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_r101_vd_fpn_1x.yml @@ -0,0 +1,146 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_pretrained.tar +weights: output/mask_rcnn_dcn_r101_vd_fpn_1x/model_final +metric: COCO +num_classes: 81 + 
+MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + 
dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/mask_rcnn_dcn_r50_fpn_1x.yml b/configs/dcn/mask_rcnn_dcn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..04653001e9bc27bc247fe420076bd12923199263 --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_r50_fpn_1x.yml @@ -0,0 +1,145 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 180000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/mask_rcnn_dcn_r50_fpn_1x/model_final/ +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + 
min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x.yml b/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..d008fb84dcd24d0853a53717000a4b6578002564 --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_r50_vd_fpn_2x.yml @@ -0,0 +1,147 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 360000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar +metric: COCO 
+weights: output/mask_rcnn_dcn_r50_vd_fpn_2x/model_final/ +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - 
!PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml b/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..8076d1a5d4c209e174cf02d1a5dd36c3716456f9 --- /dev/null +++ b/configs/dcn/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x.yml @@ -0,0 +1,148 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x/model_final +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNeXt: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: bn + variant: d + dcn_v2_stages: [3, 4, 5] + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + 
post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/face_detection/README.md b/configs/face_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b5e2119f8148dfa3a0ed5930ca285cce50236e2e --- /dev/null +++ b/configs/face_detection/README.md @@ -0,0 +1,261 @@ +English | [简体中文](README_cn.md) + +# FaceDetection +The goal of FaceDetection is to provide efficient and high-speed face detection solutions, +including cutting-edge and 
classic models. + + +
+ +
+ +## Data Pipeline +We use the [WIDER FACE dataset](http://shuoyang1213.me/WIDERFACE/) to carry out the training +and testing of the model; the official website gives a detailed introduction to the data. +- WIDER Face data source: +Loads `wider_face` type dataset with directory structures like this: + + ``` + dataset/wider_face/ + ├── wider_face_split + │ ├── wider_face_train_bbx_gt.txt + │ ├── wider_face_val_bbx_gt.txt + ├── WIDER_train + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_100.jpg + │ │ │ ├── 0_Parade_marchingband_1_381.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ├── WIDER_val + │ ├── images + │ │ ├── 0--Parade + │ │ │ ├── 0_Parade_marchingband_1_1004.jpg + │ │ │ ├── 0_Parade_marchingband_1_1045.jpg + │ │ │ │ ... + │ │ ├── 10--People_Marching + │ │ │ ... + ``` + +- Download dataset manually: +To download the WIDER FACE dataset, run the following commands: +``` +cd dataset/wider_face && ./download.sh +``` + +- Download dataset automatically: +If a training session is started but the dataset is not set up properly +(e.g., not found in dataset/wider_face), PaddleDetection can automatically +download it from [WIDER FACE dataset](http://shuoyang1213.me/WIDERFACE/); +the decompressed dataset will be cached in ~/.cache/paddle/dataset/ and can be discovered +automatically subsequently. + +### Data Augmentation + +- **Data-anchor-sampling:** Randomly transform the scale of the image to a certain range of scales, +greatly enhancing the scale change of the face. The specific operation is to obtain $v=\sqrt{width * height}$ +according to the randomly selected face height and width, and determine which interval of + `[16,32,64,128]` the value of `v` falls in. 
For example, `v=45` falls in the interval `32<v<64`. + +| | Original | Lite [1](#lite) | NAS [2](#nas) | +|:------------------------:|:--------:|:--------------------------:|:------------------------:| +| [BlazeFace](#BlazeFace) | ✓ | ✓ | ✓ | +| [FaceBoxes](#FaceBoxes) | ✓ | ✓ | x | + +[1] `Lite` edition means reducing the number of network layers and channels. +[2] `NAS` edition means using the `Neural Architecture Search` algorithm to +optimize the network structure. + +**Todo List:** +- [ ] HamBox +- [ ] Pyramidbox + +### Model Zoo + +#### mAP in WIDER FACE + +| Architecture | Type | Size | Img/gpu | Lr schd | Easy Set | Medium Set | Hard Set | Download | +|:------------:|:--------:|:----:|:-------:|:-------:|:---------:|:----------:|:---------:|:--------:| +| BlazeFace | Original | 640 | 8 | 32w | **0.915** | **0.892** | **0.797** | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_original.tar) | +| BlazeFace | Lite | 640 | 8 | 32w | 0.909 | 0.885 | 0.781 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_lite.tar) | +| BlazeFace | NAS | 640 | 8 | 32w | 0.837 | 0.807 | 0.658 | [model](https://paddlemodels.bj.bcebos.com/object_detection/blazeface_nas.tar) | +| FaceBoxes | Original | 640 | 8 | 32w | 0.875 | 0.848 | 0.568 | [model](https://paddlemodels.bj.bcebos.com/object_detection/faceboxes_original.tar) | +| FaceBoxes | Lite | 640 | 8 | 32w | 0.898 | 0.872 | 0.752 | [model](https://paddlemodels.bj.bcebos.com/object_detection/faceboxes_lite.tar) | + +**NOTES:** +- Get mAP in `Easy/Medium/Hard Set` by multi-scale evaluation in `tools/face_eval.py`. +For details, refer to [Evaluation](#Evaluate-on-the-WIDER-FACE). +- BlazeFace-Lite training and testing use the [blazeface.yml](../../configs/face_detection/blazeface.yml) +config file with `lite_edition: true` set. 
+ +#### mAP in FDDB + +| Architecture | Type | Size | DistROC | ContROC | +|:------------:|:--------:|:----:|:-------:|:-------:| +| BlazeFace | Original | 640 | **0.992** | **0.762** | +| BlazeFace | Lite | 640 | 0.990 | 0.756 | +| BlazeFace | NAS | 640 | 0.981 | 0.741 | +| FaceBoxes | Original | 640 | 0.985 | 0.731 | +| FaceBoxes | Lite | 640 | 0.987 | 0.741 | + +**NOTES:** +- The results are obtained by multi-scale evaluation on the FDDB dataset. +For details, refer to [Evaluation](#Evaluate-on-the-FDDB). + +#### Infer Time and Model Size comparison + +| Architecture | Type | Size | P4 (ms) | CPU (ms) | ARM (ms) | File size (MB) | Flops | +|:------------:|:--------:|:----:|:---------:|:--------:|:----------:|:--------------:|:---------:| +| BlazeFace | Original | 128 | - | - | - | - | - | +| BlazeFace | Lite | 128 | - | - | - | - | - | +| BlazeFace | NAS | 128 | - | - | - | - | - | +| FaceBoxes | Original | 128 | - | - | - | - | - | +| FaceBoxes | Lite | 128 | - | - | - | - | - | +| BlazeFace | Original | 320 | - | - | - | - | - | +| BlazeFace | Lite | 320 | - | - | - | - | - | +| BlazeFace | NAS | 320 | - | - | - | - | - | +| FaceBoxes | Original | 320 | - | - | - | - | - | +| FaceBoxes | Lite | 320 | - | - | - | - | - | +| BlazeFace | Original | 640 | - | - | - | - | - | +| BlazeFace | Lite | 640 | - | - | - | - | - | +| BlazeFace | NAS | 640 | - | - | - | - | - | +| FaceBoxes | Original | 640 | - | - | - | - | - | +| FaceBoxes | Lite | 640 | - | - | - | - | - | + + +**NOTES:** +- CPU: i5-7360U @ 2.30GHz. Single core and single thread. 
+ + + +## Get Started +For `Training` and `Inference`, please refer to [GETTING_STARTED.md](../../docs/GETTING_STARTED.md) +- **NOTES:** +- `BlazeFace` and `FaceBoxes` are trained on 4 GPUs with `batch_size=8` per GPU (total batch size of 32) +for 320000 iterations. (If your GPU count is not 4, please refer to the rule of training parameters +in the table of [calculation rules](../../docs/GETTING_STARTED.md#faq)) +- Currently, evaluation during training is not supported. + +### Evaluation +``` +export CUDA_VISIBLE_DEVICES=0 +export PYTHONPATH=$PYTHONPATH:. +python tools/face_eval.py -c configs/face_detection/blazeface.yml +``` +- Optional arguments +- `-d` or `--dataset_dir`: Dataset path, same as dataset_dir of configs. For example: `-d dataset/wider_face`. +- `-f` or `--output_eval`: Evaluation file directory, default is `output/pred`. +- `-e` or `--eval_mode`: Evaluation mode, either `widerface` or `fddb`, default is `widerface`. +- `--multi_scale`: If you add this flag to the command, `multi_scale` evaluation is selected. +The default is `False`, which selects `single-scale` evaluation. + +After the evaluation is completed, the test result in txt format will be generated in `output/pred`, +and then mAP will be calculated according to different data sets. If you set `--eval_mode=widerface`, +it will [Evaluate on the WIDER FACE](#Evaluate-on-the-WIDER-FACE). If you set `--eval_mode=fddb`, +it will [Evaluate on the FDDB](#Evaluate-on-the-FDDB). + +#### Evaluate on the WIDER FACE +- Download the official evaluation script to evaluate the AP metrics: +``` +wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/eval_script/eval_tools.zip +unzip eval_tools.zip && rm -f eval_tools.zip +``` +- Modify the result path and the name of the curve to be drawn in `eval_tools/wider_eval.m`: +``` +# Modify the folder name where the result is stored. 
+pred_dir = './pred'; +# Modify the name of the curve to be drawn +legend_name = 'Fluid-BlazeFace'; +``` +- `wider_eval.m` is the main execution program of the evaluation module. The run command is as follows: +``` +matlab -nodesktop -nosplash -nojvm -r "run wider_eval.m;quit;" +``` + +#### Evaluate on the FDDB +For details of the [FDDB dataset](http://vis-www.cs.umass.edu/fddb/), refer to FDDB's official website. +- Download the official dataset and evaluation script to evaluate the ROC metrics: +``` +#external link to the Faces in the Wild data set +wget http://tamaraberg.com/faceDataset/originalPics.tar.gz +#The annotations are split into ten folds. See README for details. +wget http://vis-www.cs.umass.edu/fddb/FDDB-folds.tgz +#information on directory structure and file formats +wget http://vis-www.cs.umass.edu/fddb/README.txt +``` +- Install OpenCV: Requires [OpenCV library](http://sourceforge.net/projects/opencvlibrary/) +If the utility 'pkg-config' is not available for your operating system, +edit the Makefile to manually specify the OpenCV flags as follows: +``` +INCS = -I/usr/local/include/opencv +LIBS = -L/usr/local/lib -lcxcore -lcv -lhighgui -lcvaux -lml +``` + +- Compile FDDB evaluation code: execute `make` in the evaluation folder. + +- Generate the full image path list and groundtruth in FDDB-folds. The run command is as follows: +``` +cat `ls|grep -v "ellipse"` > filePath.txt && cat *ellipse* > fddb_annotFile.txt +``` +- Evaluation +The final evaluation command is: +``` +./evaluate -a ./FDDB/FDDB-folds/fddb_annotFile.txt \ + -d DETECTION_RESULT.txt -f 0 \ + -i ./FDDB -l ./FDDB/FDDB-folds/filePath.txt \ + -r ./OUTPUT_DIR -z .jpg +``` +**NOTES:** A description of the arguments can be obtained with `./evaluate --help`. + +## Algorithm Description + +### BlazeFace +**Introduction:** +[BlazeFace](https://arxiv.org/abs/1907.05047) is a face detection model published by Google Research. +It is lightweight yet performs well, and is tailored for mobile GPU inference. 
It runs at a speed +of 200-1000+ FPS on flagship devices. + +**Particularity:** +- Anchor scheme stops at 8×8 (input 128x128), with 6 anchors per pixel at that resolution. +- 5 single and 6 double BlazeBlocks: 5×5 depthwise convs, same accuracy with fewer layers. +- Replace the non-maximum suppression algorithm with a blending strategy that estimates the +regression parameters of a bounding box as a weighted mean between the overlapping predictions. + +**Edition information:** +- Original: A reproduction of the original paper. +- Lite: Replace 5x5 conv with 3x3 conv, with fewer network layers and conv channels. +- NAS: Use the `Neural Architecture Search` algorithm to optimize the network structure, +with fewer network layers and conv channels than `Lite`. + +### FaceBoxes +**Introduction:** +[FaceBoxes](https://arxiv.org/abs/1708.05234), titled "A CPU Real-time Face Detector +with High Accuracy", is a face detector proposed by Shifeng Zhang, with high performance in +both speed and accuracy. The paper was published at IJCB (2017). + +**Particularity:** +- Anchor scheme stops at 20x20, 10x10, 5x5 for a network input size of 640x640, +including 3, 1, 1 anchors per pixel at each resolution. The corresponding densities +are 1, 2, 4(20x20), 4(10x10) and 4(5x5). +- 2 convs with CReLU, 2 poolings, 3 inceptions and 2 convs with ReLU. +- Use density prior box to improve detection accuracy. + +**Edition information:** +- Original: A reproduction of the original paper. +- Lite: 2 convs with CReLU, 1 pooling, 2 convs with ReLU, 3 inceptions and 2 convs with ReLU. +Anchor scheme stops at 80x80 and 40x40, including 3, 1 anchors per pixel at each resolution. +The corresponding densities are 1, 2, 4(80x80) and 4(40x40), using fewer conv channels than the original edition. + + +## Contributing +Contributions are highly welcome, and we would really appreciate your feedback! 
diff --git a/configs/face_detection/blazeface.yml b/configs/face_detection/blazeface.yml new file mode 100644 index 0000000000000000000000000000000000000000..692f14a7cc8091bc8df1f5edbfbca2a9c59b0073 --- /dev/null +++ b/configs/face_detection/blazeface.yml @@ -0,0 +1,130 @@ +architecture: BlazeFace +max_iters: 320000 +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +pretrain_weights: +use_gpu: true +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: WIDERFACE +save_dir: output +weights: output/blazeface/model_final/ +# 1(label_class) + 1(background) +num_classes: 2 + +BlazeFace: + backbone: BlazeNet + output_decoder: + keep_top_k: 750 + nms_threshold: 0.3 + nms_top_k: 5000 + score_threshold: 0.01 + min_sizes: [[16.,24.], [32., 48., 64., 80., 96., 128.]] + use_density_prior_box: false + +BlazeNet: + with_extra_blocks: true + lite_edition: false + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 300000] + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSPropOptimizer + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + use_process: True + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_train_bbx_gt.txt + image_dir: WIDER_train/images + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + prob: 0.5 + - !CropImageWithDataAchorSampling + anchor_sampler: + - [1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0] + batch_sampler: + - [1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + target_size: 640 + - !RandomInterpImage + 
target_size: 640 + - !RandomFlipImage + is_normalized: true + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDEvalFeed: + batch_size: 1 + use_process: false + fields: ['image', 'im_id', 'gt_box'] + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_val_bbx_gt.txt + image_dir: WIDER_val/images + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDTestFeed: + batch_size: 1 + use_process: false + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] diff --git a/configs/face_detection/blazeface_nas.yml b/configs/face_detection/blazeface_nas.yml new file mode 100644 index 0000000000000000000000000000000000000000..45356bda7998c18b286edaa4e308f21875cce9d3 --- /dev/null +++ b/configs/face_detection/blazeface_nas.yml @@ -0,0 +1,132 @@ +architecture: BlazeFace +max_iters: 320000 +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +pretrain_weights: +use_gpu: true +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: WIDERFACE +save_dir: output +weights: output/blazeface_nas/model_final/ +# 1(label_class) + 1(background) +num_classes: 2 + +BlazeFace: + backbone: BlazeNet + output_decoder: + keep_top_k: 750 + nms_threshold: 0.3 + nms_top_k: 5000 + score_threshold: 0.01 + min_sizes: [[16.,24.], [32., 48., 64., 80., 96., 128.]] + use_density_prior_box: false + +BlazeNet: + 
blaze_filters: [[12, 12], [12, 12, 2], [12, 12]] + double_blaze_filters: [[12, 16, 24, 2], [24, 12, 24], [24, 16, 72, 2], [72, 12, 72]] + with_extra_blocks: true + lite_edition: false + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 300000] + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSPropOptimizer + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + use_process: True + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_train_bbx_gt.txt + image_dir: WIDER_train/images + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + prob: 0.5 + - !CropImageWithDataAchorSampling + anchor_sampler: + - [1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0] + batch_sampler: + - [1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + target_size: 640 + - !RandomInterpImage + target_size: 640 + - !RandomFlipImage + is_normalized: true + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDEvalFeed: + batch_size: 1 + use_process: false + fields: ['image', 'im_id', 'gt_box'] + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_val_bbx_gt.txt + image_dir: WIDER_val/images + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 
127.502231, 127.502231] + +SSDTestFeed: + batch_size: 1 + use_process: false + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] diff --git a/configs/face_detection/faceboxes.yml b/configs/face_detection/faceboxes.yml new file mode 100644 index 0000000000000000000000000000000000000000..b278723292ddf51e7a93a88b59f3b757f5d4455e --- /dev/null +++ b/configs/face_detection/faceboxes.yml @@ -0,0 +1,130 @@ +architecture: FaceBoxes +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +pretrain_weights: +use_gpu: true +max_iters: 320000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: WIDERFACE +save_dir: output +weights: output/faceboxes/model_final/ +# 1(label_class) + 1(background) +num_classes: 2 + +FaceBoxes: + backbone: FaceBoxNet + densities: [[4, 2, 1], [1], [1]] + fixed_sizes: [[32., 64., 128.], [256.], [512.]] + output_decoder: + keep_top_k: 750 + nms_threshold: 0.3 + nms_top_k: 5000 + score_threshold: 0.01 + +FaceBoxNet: + with_extra_blocks: true + lite_edition: false + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 300000] + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSPropOptimizer + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + use_process: True + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_train_bbx_gt.txt + image_dir: WIDER_train/images + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + prob: 0.5 + - 
!CropImageWithDataAchorSampling + anchor_sampler: + - [1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0] + batch_sampler: + - [1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + target_size: 640 + - !RandomInterpImage + target_size: 640 + - !RandomFlipImage + is_normalized: true + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDEvalFeed: + batch_size: 1 + use_process: false + fields: ['image', 'im_id', 'gt_box'] + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_val_bbx_gt.txt + image_dir: WIDER_val/images + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDTestFeed: + batch_size: 1 + use_process: false + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] diff --git a/configs/face_detection/faceboxes_lite.yml b/configs/face_detection/faceboxes_lite.yml new file mode 100644 index 0000000000000000000000000000000000000000..157f0337e4fbda281b3c2fe9cdfd85dd81b51b40 --- /dev/null +++ b/configs/face_detection/faceboxes_lite.yml @@ -0,0 +1,130 @@ +architecture: FaceBoxes +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +pretrain_weights: +use_gpu: true +max_iters: 320000 
+snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: WIDERFACE +save_dir: output +weights: output/faceboxes_lite/model_final/ +# 1(label_class) + 1(background) +num_classes: 2 + +FaceBoxes: + backbone: FaceBoxNet + densities: [[2, 1, 1], [1, 1]] + fixed_sizes: [[16., 32., 64.], [96., 128.]] + output_decoder: + keep_top_k: 750 + nms_threshold: 0.3 + nms_top_k: 5000 + score_threshold: 0.01 + +FaceBoxNet: + with_extra_blocks: true + lite_edition: true + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 300000] + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSPropOptimizer + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + use_process: True + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_train_bbx_gt.txt + image_dir: WIDER_train/images + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + prob: 0.5 + - !CropImageWithDataAchorSampling + anchor_sampler: + - [1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0] + batch_sampler: + - [1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + target_size: 640 + - !RandomInterpImage + target_size: 640 + - !RandomFlipImage + is_normalized: true + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDEvalFeed: + batch_size: 1 + use_process: false + fields: ['image', 'im_id', 'gt_box'] + dataset: + dataset_dir: dataset/wider_face + annotation: wider_face_split/wider_face_val_bbx_gt.txt + image_dir: WIDER_val/images + drop_last: false + 
image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] + +SSDTestFeed: + batch_size: 1 + use_process: false + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 640, 640] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 640 + use_cv2: false + - !Permute {} + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [127.502231, 127.502231, 127.502231] diff --git a/configs/faster_rcnn_r101_1x.yml b/configs/faster_rcnn_r101_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..c72c34d4da8ce429932069d5084f4e710ddba11b --- /dev/null +++ b/configs/faster_rcnn_r101_1x.yml @@ -0,0 +1,115 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +use_gpu: true +max_iters: 180000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 10000 +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r101_1x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + rpn_head: RPNHead + roi_extractor: RoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + norm_type: affine_channel + depth: 101 + feature_maps: 4 + freeze_at: 2 + +ResNetC5: + depth: 101 + norm_type: affine_channel + +RPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + use_random: true + train_proposal: + min_size: 0.0 + 
nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + +RoIAlign: + resolution: 14 + sampling_ratio: 0 + spatial_scale: 0.0625 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: ResNetC5 + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/faster_rcnn_r101_fpn_1x.yml b/configs/faster_rcnn_r101_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..c11d6f2141a9b15c6a3ef2c50055fbc753338d53 --- /dev/null +++ b/configs/faster_rcnn_r101_fpn_1x.yml @@ -0,0 +1,136 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar +weights: output/faster_rcnn_r101_fpn_1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: 
FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: 
+ - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/faster_rcnn_r101_fpn_2x.yml b/configs/faster_rcnn_r101_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..29838c78b60bddb7a92193088354fe0956e2d14b --- /dev/null +++ b/configs/faster_rcnn_r101_fpn_2x.yml @@ -0,0 +1,136 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 360000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar +weights: output/faster_rcnn_r101_fpn_2x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 
512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/faster_rcnn_r101_vd_fpn_1x.yml b/configs/faster_rcnn_r101_vd_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..2ef717ffc846d325a6e3f3c9b78752250d692f9d --- /dev/null +++ b/configs/faster_rcnn_r101_vd_fpn_1x.yml @@ -0,0 +1,137 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_pretrained.tar +weights: output/faster_rcnn_r101_vd_fpn_1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + 
bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 
+ +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/faster_rcnn_r101_vd_fpn_2x.yml b/configs/faster_rcnn_r101_vd_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..763d447c8cfb59c6fcd3045b0e8a34b8da38e73e --- /dev/null +++ b/configs/faster_rcnn_r101_vd_fpn_2x.yml @@ -0,0 +1,137 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 360000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_pretrained.tar +weights: output/faster_rcnn_r101_vd_fpn_2x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: 
[0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/faster_rcnn_r50_1x.yml b/configs/faster_rcnn_r50_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..12d349612b3464cff0b945535a8d81a53b434a98 --- /dev/null +++ b/configs/faster_rcnn_r50_1x.yml @@ -0,0 +1,115 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +use_gpu: true +max_iters: 180000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 10000 +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r50_1x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + rpn_head: RPNHead + roi_extractor: RoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + norm_type: affine_channel + depth: 
50 + feature_maps: 4 + freeze_at: 2 + +ResNetC5: + depth: 50 + norm_type: affine_channel + +RPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + use_random: true + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + +RoIAlign: + resolution: 14 + sampling_ratio: 0 + spatial_scale: 0.0625 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: ResNetC5 + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/faster_rcnn_r50_2x.yml b/configs/faster_rcnn_r50_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..255cd02663845235beac103dad7faaaf03bb90f2 --- /dev/null +++ b/configs/faster_rcnn_r50_2x.yml @@ -0,0 +1,115 @@ 
+architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +use_gpu: true +max_iters: 360000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 10000 +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r50_2x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + rpn_head: RPNHead + roi_extractor: RoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + norm_type: affine_channel + depth: 50 + feature_maps: 4 + freeze_at: 2 + +ResNetC5: + depth: 50 + norm_type: affine_channel + +RPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + use_random: true + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + +RoIAlign: + resolution: 14 + sampling_ratio: 0 + spatial_scale: 0.0625 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: ResNetC5 + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + 
drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/faster_rcnn_r50_fpn_1x.yml b/configs/faster_rcnn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..c719106104f1424008db3a079e2e1ac7a3d742b9 --- /dev/null +++ b/configs/faster_rcnn_r50_fpn_1x.yml @@ -0,0 +1,137 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +use_gpu: true +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r50_fpn_1x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + norm_type: bn + norm_decay: 0. 
+ depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_lo: 0.0 + bg_thresh_hi: 0.5 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: 
dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/faster_rcnn_r50_fpn_2x.yml b/configs/faster_rcnn_r50_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..12ae624f6c642ff439a21a90a4f52d1da046c164 --- /dev/null +++ b/configs/faster_rcnn_r50_fpn_2x.yml @@ -0,0 +1,137 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +use_gpu: true +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r50_fpn_2x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + norm_type: affine_channel + norm_decay: 0. 
+ depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_lo: 0.0 + bg_thresh_hi: 0.5 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: 
dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/faster_rcnn_r50_vd_1x.yml b/configs/faster_rcnn_r50_vd_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..f39a144a431f5998a8178c41c10ade796d270cb6 --- /dev/null +++ b/configs/faster_rcnn_r50_vd_1x.yml @@ -0,0 +1,117 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +use_gpu: true +max_iters: 180000 +log_smooth_window: 20 +save_dir: output/faster-r50-vd-c4-1x +snapshot_iter: 10000 +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r50_vd_1x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + rpn_head: RPNHead + roi_extractor: RoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + norm_type: affine_channel + depth: 50 + feature_maps: 4 + freeze_at: 2 + variant: d + +ResNetC5: + depth: 50 + norm_type: affine_channel + variant: d + +RPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + use_random: true + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + +RoIAlign: + resolution: 14 + sampling_ratio: 0 + spatial_scale: 0.0625 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: ResNetC5 + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +LearningRate:
+ base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + drop_last: false + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/faster_rcnn_r50_vd_fpn_2x.yml b/configs/faster_rcnn_r50_vd_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b944ef9398b3dfebdc4f3731b8ef2522d103e22 --- /dev/null +++ b/configs/faster_rcnn_r50_vd_fpn_2x.yml @@ -0,0 +1,137 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar +weights: output/faster_rcnn_r50_vd_fpn_2x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 
2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 2 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/faster_rcnn_se154_vd_fpn_s1x.yml b/configs/faster_rcnn_se154_vd_fpn_s1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..c3dd761e6637f568c32d30af2300d87fe5f600ac --- /dev/null +++ b/configs/faster_rcnn_se154_vd_fpn_s1x.yml @@ 
-0,0 +1,139 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 260000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/SENet154_vd_pretrained.tar +weights: output/faster_rcnn_se154_vd_fpn_s1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: SENet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +SENet: + depth: 152 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [200000, 240000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + 
optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/faster_rcnn_x101_vd_64x4d_fpn_1x.yml b/configs/faster_rcnn_x101_vd_64x4d_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..adb607b6e022f3c7c66c121922e2d28e4ba3e1d0 --- /dev/null +++ b/configs/faster_rcnn_x101_vd_64x4d_fpn_1x.yml @@ -0,0 +1,142 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/faster_rcnn_x101_vd_64x4d_fpn_1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNeXt: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 
1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + values: null + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + shuffle: true + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + shuffle: false diff --git a/configs/faster_rcnn_x101_vd_64x4d_fpn_2x.yml b/configs/faster_rcnn_x101_vd_64x4d_fpn_2x.yml new file mode 100644 index 
0000000000000000000000000000000000000000..ee36efbe859ab42d7391d2867bf34d45b20b340f --- /dev/null +++ b/configs/faster_rcnn_x101_vd_64x4d_fpn_2x.yml @@ -0,0 +1,141 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 360000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/faster_rcnn_x101_vd_64x4d_fpn_2x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNeXt: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: +
base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + shuffle: true + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + shuffle: false diff --git a/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x.yml b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..52c61ad4b57bf1464fe9e2816cec710563a9d707 --- /dev/null +++ b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x.yml @@ -0,0 +1,147 @@ +architecture: CascadeMaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/cascade_mask_rcnn_r50_fpn_gn_2x/model_final/ +metric: COCO +num_classes: 81 + +CascadeMaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + mask_head: MaskHead + mask_assigner: MaskAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + 
min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + norm_type: gn + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + norm_type: gn + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_hi: [0.5, 0.6, 0.7] + bg_thresh_lo: [0.0, 0.0, 0.0] + fg_fraction: 0.25 + fg_thresh: [0.5, 0.6, 0.7] + +MaskAssigner: + resolution: 28 + +CascadeBBoxHead: + head: CascadeXConvNormHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeXConvNormHead: + norm_type: gn + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + 
batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/gn/faster_rcnn_r50_fpn_gn_2x.yml b/configs/gn/faster_rcnn_r50_fpn_gn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..a86deb4f6efab685cce14a329bd3b79f14ba36d8 --- /dev/null +++ b/configs/gn/faster_rcnn_r50_fpn_gn_2x.yml @@ -0,0 +1,137 @@ +architecture: FasterRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/faster_rcnn_r50_fpn_gn_2x/model_final +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + norm_type: gn + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2]
+ bg_thresh_lo: 0.0 + bg_thresh_hi: 0.5 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: XConvNormHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +XConvNormHead: + norm_type: gn + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 16 + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/gn/mask_rcnn_r50_fpn_gn_2x.yml b/configs/gn/mask_rcnn_r50_fpn_gn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..bffe3ba45a78d5ddcbeb20fa15a85ebd9f034532 --- /dev/null +++ b/configs/gn/mask_rcnn_r50_fpn_gn_2x.yml @@ -0,0 +1,145 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 360000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/mask_rcnn_r50_fpn_gn_2x/model_final/ +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + 
freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + norm_type: gn + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + norm_type: gn + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: XConvNormHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +XConvNormHead: + norm_type: gn + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + 
+MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_r101_fpn_1x.yml b/configs/mask_rcnn_r101_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..12229a074fbcf1549a3d7581fcd91bd6c124d516 --- /dev/null +++ b/configs/mask_rcnn_r101_fpn_1x.yml @@ -0,0 +1,143 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 180000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar +metric: COCO +weights: output/mask_rcnn_r101_fpn_1x/model_final/ +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 
0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_r101_vd_fpn_1x.yml b/configs/mask_rcnn_r101_vd_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..91bb40c2f0d920ca1961e93a5e7adf804ff28d1b --- /dev/null +++ b/configs/mask_rcnn_r101_vd_fpn_1x.yml @@ -0,0 +1,144 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_vd_pretrained.tar +weights: output/mask_rcnn_r101_vd_fpn_1x/model_final +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 101 + 
feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + 
batch_size: 1 + dataset: + annotation: annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_r50_1x.yml b/configs/mask_rcnn_r50_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..6c3dd8418e55c899c1ea0e0bc7db1d39d9a0be9d --- /dev/null +++ b/configs/mask_rcnn_r50_1x.yml @@ -0,0 +1,123 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 180000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/mask_rcnn_r50_1x/model_final +num_classes: 81 + +MaskRCNN: + backbone: ResNet + rpn_head: RPNHead + roi_extractor: RoIAlign + bbox_assigner: BBoxAssigner + bbox_head: BBoxHead + mask_assigner: MaskAssigner + mask_head: MaskHead + +ResNet: + norm_type: affine_channel + norm_decay: 0. 
+ depth: 50 + feature_maps: 4 + freeze_at: 2 + +ResNetC5: + depth: 50 + norm_type: affine_channel + +RPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + +RoIAlign: + resolution: 14 + spatial_scale: 0.0625 + sampling_ratio: 0 + +BBoxHead: + head: ResNetC5 + nms: + keep_top_k: 100 + nms_threshold: 0.5 + normalized: false + score_threshold: 0.05 + +MaskHead: + dilation: 1 + conv_dim: 256 + resolution: 14 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 14 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/mask_rcnn_r50_2x.yml b/configs/mask_rcnn_r50_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..091b0cf89bfe44e74ac807bfb0f1a7a8ea1a6454 --- /dev/null +++ b/configs/mask_rcnn_r50_2x.yml 
@@ -0,0 +1,125 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 360000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/mask_rcnn_r50_2x/model_final/ +num_classes: 81 + +MaskRCNN: + backbone: ResNet + rpn_head: RPNHead + roi_extractor: RoIAlign + bbox_assigner: BBoxAssigner + bbox_head: BBoxHead + mask_assigner: MaskAssigner + mask_head: MaskHead + + +ResNet: + norm_type: affine_channel + norm_decay: 0. + depth: 50 + feature_maps: 4 + freeze_at: 2 + +ResNetC5: + depth: 50 + norm_type: affine_channel + +RPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 12000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 6000 + post_nms_top_n: 1000 + +RoIAlign: + resolution: 14 + spatial_scale: 0.0625 + sampling_ratio: 0 + +BBoxHead: + head: ResNetC5 + nms: + keep_top_k: 100 + nms_threshold: 0.5 + normalized: false + score_threshold: 0.05 + +MaskHead: + dilation: 1 + conv_dim: 256 + resolution: 14 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 14 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + #start the warm up from base_lr * start_factor + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + 
regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/mask_rcnn_r50_fpn_1x.yml b/configs/mask_rcnn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..a889ea283f445974e11d38479162b388d69ec3ad --- /dev/null +++ b/configs/mask_rcnn_r50_fpn_1x.yml @@ -0,0 +1,143 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 180000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/mask_rcnn_r50_fpn_1x/model_final/ +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + 
canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_r50_fpn_2x.yml b/configs/mask_rcnn_r50_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..08977bba39b688920f427442c01510f93efa412b --- /dev/null +++ b/configs/mask_rcnn_r50_fpn_2x.yml @@ -0,0 +1,143 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 360000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: 
https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/mask_rcnn_r50_fpn_2x/model_final/ +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + 
image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_r50_vd_fpn_2x.yml b/configs/mask_rcnn_r50_vd_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..12a5057528eaada329d70058acf3aa9a13727237 --- /dev/null +++ b/configs/mask_rcnn_r50_vd_fpn_2x.yml @@ -0,0 +1,145 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +use_gpu: true +max_iters: 360000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar +metric: COCO +weights: output/mask_rcnn_r50_vd_fpn_2x/model_final/ +num_classes: 81 + +MaskRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + 
pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + # batch size per device + batch_size: 1 + dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_se154_vd_fpn_s1x.yml b/configs/mask_rcnn_se154_vd_fpn_s1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..4430055712504bfe79252c1614792d05fb46b89b --- /dev/null +++ b/configs/mask_rcnn_se154_vd_fpn_s1x.yml @@ -0,0 +1,147 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 260000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 
+save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/SENet154_vd_pretrained.tar +weights: output/mask_rcnn_se154_vd_fpn_s1x/model_final/ +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: SENet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +SENet: + depth: 152 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [200000, 240000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + # batch size per device + batch_size: 1 + 
dataset: + dataset_dir: dataset/coco + image_dir: train2017 + annotation: annotations/instances_train2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_x101_vd_64x4d_fpn_1x.yml b/configs/mask_rcnn_x101_vd_64x4d_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..75653ce2462e91b3be3290c5e6a5a3c12d816d71 --- /dev/null +++ b/configs/mask_rcnn_x101_vd_64x4d_fpn_1x.yml @@ -0,0 +1,146 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed +max_iters: 180000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/mask_rcnn_x101_vd_64x4d_fpn_1x/model_final +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNeXt: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + 
train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/mask_rcnn_x101_vd_64x4d_fpn_2x.yml b/configs/mask_rcnn_x101_vd_64x4d_fpn_2x.yml new file mode 100644 index 0000000000000000000000000000000000000000..c5a711363330092aee30c2b4d9b08dd899f97beb --- /dev/null +++ b/configs/mask_rcnn_x101_vd_64x4d_fpn_2x.yml @@ -0,0 +1,146 @@ +architecture: MaskRCNN +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed 
+test_feed: MaskRCNNTestFeed +max_iters: 360000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/mask_rcnn_x101_vd_64x4d_fpn_2x/model_final +metric: COCO +num_classes: 81 + +MaskRCNN: + backbone: ResNeXt + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNeXt: + depth: 101 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: affine_channel + variant: d + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + sampling_ratio: 2 + box_resolution: 7 + mask_resolution: 14 + +MaskHead: + dilation: 1 + conv_dim: 256 + num_convs: 4 + resolution: 28 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +MaskAssigner: + resolution: 28 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [240000, 320000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + 
type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +MaskRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 + +MaskRCNNTestFeed: + batch_size: 1 + dataset: + annotation: annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 2 diff --git a/configs/obj365/cascade_rcnn_dcnv2_se154_vd_fpn_gn_cas.yml b/configs/obj365/cascade_rcnn_dcnv2_se154_vd_fpn_gn_cas.yml new file mode 100644 index 0000000000000000000000000000000000000000..6ccc5d734a28e2154b9cb4f5c69a0cd44412bc20 --- /dev/null +++ b/configs/obj365/cascade_rcnn_dcnv2_se154_vd_fpn_gn_cas.yml @@ -0,0 +1,207 @@ +architecture: CascadeRCNN +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 500000 +snapshot_iter: 10000 +use_gpu: true +log_iter: 20 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_coco_pretrained.tar +weights: output/cascade_rcnn_dcnv2_se154_vd_fpn_gn_cas/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: ResNet + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +SENet: + depth: 152 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: bn + freeze_norm: True + variant: d + dcn_v2_stages: [3, 4, 5] + std_senet: True + +FPN: + min_level: 2 + max_level: 6 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + freeze_norm: False + norm_type: gn + +FPNRPNHead: + anchor_generator: + 
anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + min_level: 2 + max_level: 6 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 1024 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeXConvNormHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +CascadeXConvNormHead: + norm_type: gn + +CascadeTwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [400000, 460000] + - !LinearWarmup + start_factor: 0.01 + steps: 2000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/objects365 + annotation: annotations/train.json + image_dir: train + sample_transforms: + - !DecodeImage + to_rgb: False + with_mixup: False + - !RandomFlipImage + is_mask_flip: true + is_normalized: false + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: False + mean: + - 102.9801 + - 115.9465 + - 122.7717 + std: + - 1.0 + - 1.0 + - 1.0 + - !ResizeImage + interp: 1 + target_size: + - 416 + - 448 + - 480 + - 512 + - 544 + - 576 + - 608 + - 640 + - 672 + - 704 + - 736 + - 768 + - 800 + - 832 + - 864 + - 896 + - 928 + - 960 + - 992 + - 1024 + - 1056 + - 1088 
+ - 1120 + - 1152 + - 1184 + - 1216 + - 1248 + - 1280 + - 1312 + - 1344 + - 1376 + - 1408 + max_size: 1600 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + num_workers: 4 + class_aware_sampling: true + +FasterRCNNEvalFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/objects365 + annotation: annotations/val.json + image_dir: val + batch_transforms: + - !PadBatch + pad_to_stride: 32 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/obj365/annotations/val.json + batch_transforms: + - !PadBatch + pad_to_stride: 32 + drop_last: false + num_workers: 2 diff --git a/configs/retinanet_r101_fpn_1x.yml b/configs/retinanet_r101_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..1864f379830292f1cc6a659995a71783688f61c3 --- /dev/null +++ b/configs/retinanet_r101_fpn_1x.yml @@ -0,0 +1,105 @@ +architecture: RetinaNet +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +use_gpu: true +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar +weights: output/retinanet_r101_fpn_1x/model_final +log_smooth_window: 20 +snapshot_iter: 10000 +metric: COCO +save_dir: output +num_classes: 81 + +RetinaNet: + backbone: ResNet + fpn: FPN + retina_head: RetinaHead + +ResNet: + norm_type: affine_channel + norm_decay: 0. 
+ depth: 101 + feature_maps: [3, 4, 5] + freeze_at: 2 + +FPN: + max_level: 7 + min_level: 3 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125] + has_extra_convs: true + +RetinaHead: + num_convs_per_octave: 4 + num_chan: 256 + max_level: 7 + min_level: 3 + prior_prob: 0.01 + base_scale: 4 + num_scales_per_octave: 3 + anchor_generator: + aspect_ratios: [1.0, 2.0, 0.5] + variance: [1.0, 1.0, 1.0, 1.0] + target_assign: + positive_overlap: 0.5 + negative_overlap: 0.4 + gamma: 2.0 + alpha: 0.25 + sigma: 3.0151134457776365 + output_decoder: + score_thresh: 0.05 + nms_thresh: 0.5 + pre_nms_top_n: 1000 + detections_per_im: 100 + nms_eta: 1.0 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 2 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + num_workers: 2 diff --git a/configs/retinanet_r50_fpn_1x.yml b/configs/retinanet_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..8c24ef4defd9e947406591a17ae2feb4833c61a1 --- /dev/null +++ b/configs/retinanet_r50_fpn_1x.yml @@ -0,0 +1,105 @@ +architecture: RetinaNet +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 90000 +use_gpu: true +pretrain_weights: 
https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +weights: output/retinanet_r50_fpn_1x/model_final +log_smooth_window: 20 +snapshot_iter: 10000 +metric: COCO +save_dir: output +num_classes: 81 + +RetinaNet: + backbone: ResNet + fpn: FPN + retina_head: RetinaHead + +ResNet: + norm_type: affine_channel + norm_decay: 0. + depth: 50 + feature_maps: [3, 4, 5] + freeze_at: 2 + +FPN: + max_level: 7 + min_level: 3 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125] + has_extra_convs: true + +RetinaHead: + num_convs_per_octave: 4 + num_chan: 256 + max_level: 7 + min_level: 3 + prior_prob: 0.01 + base_scale: 4 + num_scales_per_octave: 3 + anchor_generator: + aspect_ratios: [1.0, 2.0, 0.5] + variance: [1.0, 1.0, 1.0, 1.0] + target_assign: + positive_overlap: 0.5 + negative_overlap: 0.4 + gamma: 2.0 + alpha: 0.25 + sigma: 3.0151134457776365 + output_decoder: + score_thresh: 0.05 + nms_thresh: 0.5 + pre_nms_top_n: 1000 + detections_per_im: 100 + nms_eta: 1.0 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 2 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 128 + num_workers: 2 diff --git a/configs/retinanet_x101_vd_64x4d_fpn_1x.yml 
b/configs/retinanet_x101_vd_64x4d_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..6cc33fafa485c4a64cefef67ec0d6ab0b237db84 --- /dev/null +++ b/configs/retinanet_x101_vd_64x4d_fpn_1x.yml @@ -0,0 +1,108 @@ +architecture: RetinaNet +train_feed: FasterRCNNTrainFeed +eval_feed: FasterRCNNEvalFeed +test_feed: FasterRCNNTestFeed +max_iters: 180000 +use_gpu: true +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNeXt101_vd_64x4d_pretrained.tar +weights: output/retinanet_x101_vd_64x4d_fpn_1x/model_final +log_smooth_window: 20 +log_iter: 20 +snapshot_iter: 30000 +metric: COCO +save_dir: output +num_classes: 81 + +RetinaNet: + backbone: ResNeXt + fpn: FPN + retina_head: RetinaHead + +ResNeXt: + depth: 101 + feature_maps: [3, 4, 5] + freeze_at: 2 + group_width: 4 + groups: 64 + norm_type: bn + variant: d + +FPN: + max_level: 7 + min_level: 3 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125] + has_extra_convs: true + +RetinaHead: + num_convs_per_octave: 4 + num_chan: 256 + max_level: 7 + min_level: 3 + prior_prob: 0.01 + base_scale: 4 + num_scales_per_octave: 3 + anchor_generator: + aspect_ratios: [1.0, 2.0, 0.5] + variance: [1.0, 1.0, 1.0, 1.0] + target_assign: + positive_overlap: 0.5 + negative_overlap: 0.4 + gamma: 2.0 + alpha: 0.25 + sigma: 3.0151134457776365 + output_decoder: + score_thresh: 0.05 + nms_thresh: 0.5 + pre_nms_top_n: 1000 + detections_per_im: 100 + nms_eta: 1.0 + +LearningRate: + base_lr: 0.005 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.1 + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +FasterRCNNTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + num_workers: 2 + +FasterRCNNEvalFeed: + batch_size: 1 + 
dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: + - !PadBatch + pad_to_stride: 128 + num_workers: 2 + +FasterRCNNTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: + - !PadBatch + pad_to_stride: 128 + num_workers: 2 diff --git a/configs/ssd/ssd_mobilenet_v1_voc.yml b/configs/ssd/ssd_mobilenet_v1_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..1d8554643c296738a711d50b3f84eea5e130a96e --- /dev/null +++ b/configs/ssd/ssd_mobilenet_v1_voc.yml @@ -0,0 +1,82 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_coco_pretrained.tar +use_gpu: true +max_iters: 28000 +snapshot_iter: 2000 +log_smooth_window: 1 +metric: VOC +map_type: 11point +save_dir: output +weights: output/ssd_mobilenet_v1_voc/model_final/ +# 20(label_class) + 1(background) +num_classes: 21 + +SSD: + backbone: MobileNet + multi_box_head: MultiBoxHead + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +MobileNet: + norm_decay: 0. 
+ conv_group_scale: 1 + conv_learning_rate: 0.1 + extra_block_filters: [[256, 512], [128, 256], [128, 256], [64, 128]] + with_extra_blocks: true + +MultiBoxHead: + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]] + base_size: 300 + flip: true + max_ratio: 90 + max_sizes: [[], 150.0, 195.0, 240.0, 285.0, 300.0] + min_ratio: 20 + min_sizes: [60.0, 105.0, 150.0, 195.0, 240.0, 285.0] + offset: 0.5 + +LearningRate: + schedulers: + - !PiecewiseDecay + milestones: [10000, 15000, 20000, 25000] + values: [0.001, 0.0005, 0.00025, 0.0001, 0.00001] + +OptimizerBuilder: + optimizer: + momentum: 0.0 + type: RMSPropOptimizer + regularizer: + factor: 0.00005 + type: L2 + +SSDTrainFeed: + batch_size: 32 + use_process: true + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + +SSDEvalFeed: + batch_size: 64 + use_process: true + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + drop_last: false + +SSDTestFeed: + batch_size: 1 + dataset: + use_default_label: true + drop_last: false diff --git a/configs/ssd/ssd_vgg16_300.yml b/configs/ssd/ssd_vgg16_300.yml new file mode 100644 index 0000000000000000000000000000000000000000..f5e987bcb156a3a4973f9ae9d376d38597471ce2 --- /dev/null +++ b/configs/ssd/ssd_vgg16_300.yml @@ -0,0 +1,149 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 400000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_300/model_final +num_classes: 81 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 
+ nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1] + +MultiBoxHead: + base_size: 300 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 15 + max_ratio: 90 + min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0] + max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0] + steps: [8, 16, 32, 64, 100, 300] + offset: 0.5 + flip: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [280000, 360000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [104, 117, 123] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: false + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 16 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + drop_last: false + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true 
+ with_mixup: false + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] diff --git a/configs/ssd/ssd_vgg16_300_voc.yml b/configs/ssd/ssd_vgg16_300_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..5d91ed141c49cddeb925955e0ce1933c698dc6a0 --- /dev/null +++ b/configs/ssd/ssd_vgg16_300_voc.yml @@ -0,0 +1,153 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 120001 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: VOC +map_type: 11point +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_300_voc/model_final/ +# 20(label_class) + 1(background) +num_classes: 21 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1] + +MultiBoxHead: + base_size: 300 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 20 + max_ratio: 90 + min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0] + max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0] + steps: [8, 16, 32, 64, 100, 300] + offset: 0.5 + flip: true + min_max_aspect_ratios_order: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + 
gamma: 0.1 + milestones: [80000, 100000] + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [104, 117, 123] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: False + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 32 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + drop_last: false + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 300 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 300, 300] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 300 + use_cv2: false + - 
!Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] diff --git a/configs/ssd/ssd_vgg16_512.yml b/configs/ssd/ssd_vgg16_512.yml new file mode 100644 index 0000000000000000000000000000000000000000..6214327642bfe3beb09e28cd7dbe891e8d49b848 --- /dev/null +++ b/configs/ssd/ssd_vgg16_512.yml @@ -0,0 +1,151 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed +use_gpu: true +max_iters: 400000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_512/model_final +num_classes: 81 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1, -1] + extra_block_filters: [[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 1, 4]] + + +MultiBoxHead: + base_size: 512 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 15 + max_ratio: 90 + min_sizes: [20.0, 51.0, 133.0, 215.0, 296.0, 378.0, 460.0] + max_sizes: [51.0, 133.0, 215.0, 296.0, 378.0, 460.0, 542.0] + steps: [8, 16, 32, 64, 128, 256, 512] + offset: 0.5 + flip: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [280000, 360000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + image_shape: [3, 512, 512] + sample_transforms: + - 
!DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [104, 117, 123] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + drop_last: false + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [104, 117, 123] + std: [1, 1, 1] diff --git a/configs/ssd/ssd_vgg16_512_voc.yml b/configs/ssd/ssd_vgg16_512_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..fa27465930ec18c05eb38d528b31b2a2935ad2f2 --- /dev/null +++ b/configs/ssd/ssd_vgg16_512_voc.yml @@ -0,0 +1,157 @@ +architecture: SSD +train_feed: SSDTrainFeed +eval_feed: SSDEvalFeed +test_feed: SSDTestFeed 
+use_gpu: true +max_iters: 120000 +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +metric: VOC +map_type: 11point +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +save_dir: output +weights: output/ssd_vgg16_512_voc/model_final/ +# 20(label_class) + 1(background) +num_classes: 21 + +SSD: + backbone: VGG + multi_box_head: MultiBoxHead + output_decoder: + background_label: 0 + keep_top_k: 200 + nms_eta: 1.0 + nms_threshold: 0.45 + nms_top_k: 400 + score_threshold: 0.01 + +VGG: + depth: 16 + with_extra_blocks: true + normalizations: [20., -1, -1, -1, -1, -1, -1] + extra_block_filters: [[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 1, 1, 4]] + + +MultiBoxHead: + base_size: 512 + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + min_ratio: 20 + max_ratio: 90 + min_sizes: [20.0, 51.0, 133.0, 215.0, 296.0, 378.0, 460.0] + max_sizes: [51.0, 133.0, 215.0, 296.0, 378.0, 460.0, 542.0] + steps: [8, 16, 32, 64, 128, 256, 512] + offset: 0.5 + flip: true + kernel_size: 3 + pad: 1 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [80000, 100000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +SSDTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !RandomDistort + brightness_lower: 0.875 + brightness_upper: 1.125 + is_order: true + - !ExpandImage + max_ratio: 4 + mean: [123, 117, 104] + prob: 0.5 + - !CropImage + avoid_no_bbox: true + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 
2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + satisfy_all: false + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !RandomFlipImage + is_normalized: true + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [123, 117, 104] + std: [1, 1, 1] + +SSDEvalFeed: + batch_size: 32 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + drop_last: false + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ResizeImage + interp: 1 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [123, 117, 104] + std: [1, 1, 1] + +SSDTestFeed: + batch_size: 1 + dataset: + use_default_label: true + drop_last: false + image_shape: [3, 512, 512] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 1 + max_size: 0 + target_size: 512 + use_cv2: false + - !Permute + to_bgr: false + - !NormalizeImage + is_scale: false + mean: [123, 117, 104] + std: [1, 1, 1] diff --git a/configs/yolov3_darknet.yml b/configs/yolov3_darknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..9a1c243b8b245c4d0cb60e6ea9f57778e896d263 --- /dev/null +++ b/configs/yolov3_darknet.yml @@ -0,0 +1,82 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 500200 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar +weights: output/yolov3_darknet/model_final +num_classes: 80 + +YOLOv3: + backbone: DarkNet + yolo_head: 
YOLOv3Head + +DarkNet: + norm_type: sync_bn + norm_decay: 0. + depth: 53 + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: true + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 400000 + - 450000 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 8 + bufsize: 128 + use_process: true + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/yolov3_darknet_voc.yml b/configs/yolov3_darknet_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..1f6c10b7d8cd453e9b9c89247eb073d5ad74b4f8 --- /dev/null +++ b/configs/yolov3_darknet_voc.yml @@ -0,0 +1,86 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 70000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: VOC +map_type: 11point +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar +weights: output/yolov3_darknet_voc/model_final +num_classes: 20 + +YOLOv3: + backbone: DarkNet + yolo_head: YOLOv3Head + +DarkNet: + norm_type: sync_bn + norm_decay: 0. 
+ depth: 53 + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: false + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 55000 + - 62000 + - !LinearWarmup + start_factor: 0. + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + num_workers: 8 + bufsize: 128 + use_process: true + mixup_epoch: 250 + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + use_default_label: true diff --git a/configs/yolov3_mobilenet_v1.yml b/configs/yolov3_mobilenet_v1.yml new file mode 100644 index 0000000000000000000000000000000000000000..3e622025b587b56c6e79dd3a1cf1cbba00901406 --- /dev/null +++ b/configs/yolov3_mobilenet_v1.yml @@ -0,0 +1,83 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 500200 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: COCO +pretrain_weights: http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar +weights: output/yolov3_mobilenet_v1/model_final +num_classes: 80 + +YOLOv3: + backbone: MobileNet + yolo_head: YOLOv3Head + +MobileNet: + norm_type: sync_bn + norm_decay: 0. 
+ conv_group_scale: 1 + with_extra_blocks: false + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: true + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 400000 + - 450000 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 8 + bufsize: 128 + use_process: true + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/yolov3_mobilenet_v1_fruit.yml b/configs/yolov3_mobilenet_v1_fruit.yml new file mode 100644 index 0000000000000000000000000000000000000000..b7922489a35ce637f55376c9460e626ff81ed3d2 --- /dev/null +++ b/configs/yolov3_mobilenet_v1_fruit.yml @@ -0,0 +1,125 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 20000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 200 +metric: VOC +map_type: 11point +pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar +weights: output/yolov3_mobilenet_v1_fruit/best_model +num_classes: 3 +finetune_exclude_pretrained_params: ['yolo_output'] + +YOLOv3: + backbone: MobileNet + yolo_head: YOLOv3Head + +MobileNet: + 
norm_type: sync_bn + norm_decay: 0. + conv_group_scale: 1 + with_extra_blocks: false + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: true + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.00001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 15000 + - 18000 + - !LinearWarmup + start_factor: 0. + steps: 100 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 1 + dataset: + dataset_dir: dataset/fruit/fruit-detection + annotation: ./ImageSets/Main/train.txt + image_dir: ./JPEGImages + use_default_label: false + num_workers: 16 + bufsize: 128 + use_process: true + mixup_epoch: -1 + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeBox {} + - !ExpandImage + max_ratio: 4.0 + mean: [123.675, 116.28, 103.53] + prob: 0.5 + - !RandomInterpImage + max_size: 0 + target_size: 608 + - !RandomFlipImage + is_mask_flip: false + is_normalized: true + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: + - 0.485 + - 0.456 + - 0.406 + std: + - 0.229 + - 0.224 + - 0.225 + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !RandomShape + sizes: [608] + with_background: false + +YoloEvalFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/fruit/fruit-detection + annotation: ./ImageSets/Main/val.txt + image_dir: ./JPEGImages + use_default_label: false + + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/fruit/fruit-detection + annotation: ./ImageSets/Main/label_list.txt + use_default_label: false diff --git 
a/configs/yolov3_mobilenet_v1_voc.yml b/configs/yolov3_mobilenet_v1_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..8933773ef9d9d2116d6a2ec61a14027661be1c67 --- /dev/null +++ b/configs/yolov3_mobilenet_v1_voc.yml @@ -0,0 +1,87 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 70000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: VOC +map_type: 11point +pretrain_weights: http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.tar +weights: output/yolov3_mobilenet_v1_voc/model_final +num_classes: 20 + +YOLOv3: + backbone: MobileNet + yolo_head: YOLOv3Head + +MobileNet: + norm_type: sync_bn + norm_decay: 0. + conv_group_scale: 1 + with_extra_blocks: false + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: false + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 55000 + - 62000 + - !LinearWarmup + start_factor: 0. 
+ steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + num_workers: 8 + bufsize: 128 + use_process: true + mixup_epoch: 250 + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + use_default_label: true diff --git a/configs/yolov3_r34.yml b/configs/yolov3_r34.yml new file mode 100644 index 0000000000000000000000000000000000000000..e864f8fd92d2f671c442f3beecd8344171952e48 --- /dev/null +++ b/configs/yolov3_r34.yml @@ -0,0 +1,85 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 500200 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet34_pretrained.tar +weights: output/yolov3_r34/model_final +num_classes: 80 + +YOLOv3: + backbone: ResNet + yolo_head: YOLOv3Head + +ResNet: + norm_type: sync_bn + freeze_at: 0 + freeze_norm: false + norm_decay: 0. + depth: 34 + feature_maps: [3, 4, 5] + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. 
+ ignore_thresh: 0.7 + label_smooth: true + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 400000 + - 450000 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 8 + bufsize: 128 + use_process: true + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + annotation: dataset/coco/annotations/instances_val2017.json diff --git a/configs/yolov3_r34_voc.yml b/configs/yolov3_r34_voc.yml new file mode 100644 index 0000000000000000000000000000000000000000..5bb6a1366aa8fa38f4c4fe1feeb26528579f75c9 --- /dev/null +++ b/configs/yolov3_r34_voc.yml @@ -0,0 +1,89 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 70000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: VOC +map_type: 11point +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet34_pretrained.tar +weights: output/yolov3_r34_voc/model_final +num_classes: 20 + +YOLOv3: + backbone: ResNet + yolo_head: YOLOv3Head + +ResNet: + norm_type: sync_bn + freeze_at: 0 + freeze_norm: false + norm_decay: 0. + depth: 34 + feature_maps: [3, 4, 5] + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. 
+ ignore_thresh: 0.7 + label_smooth: false + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 55000 + - 62000 + - !LinearWarmup + start_factor: 0. + steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + num_workers: 8 + bufsize: 128 + use_process: true + mixup_epoch: 250 + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/voc + annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt + image_dir: VOCdevkit/VOC_all/JPEGImages + use_default_label: true + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + use_default_label: true diff --git a/contrib/PedestrianDetection/demo/001.png b/contrib/PedestrianDetection/demo/001.png new file mode 100644 index 0000000000000000000000000000000000000000..63ae9167fd03e8a95756fe5f6195fc8d741b9cfa Binary files /dev/null and b/contrib/PedestrianDetection/demo/001.png differ diff --git a/contrib/PedestrianDetection/demo/002.png b/contrib/PedestrianDetection/demo/002.png new file mode 100644 index 0000000000000000000000000000000000000000..0de905cf55e6b02487ee1b8220810df8eaa24c2c Binary files /dev/null and b/contrib/PedestrianDetection/demo/002.png differ diff --git a/contrib/PedestrianDetection/demo/003.png b/contrib/PedestrianDetection/demo/003.png new file mode 100644 index 0000000000000000000000000000000000000000..e9026e099df42d4267be07a71401eb5426b47745 Binary files /dev/null and b/contrib/PedestrianDetection/demo/003.png differ diff --git a/contrib/PedestrianDetection/demo/004.png b/contrib/PedestrianDetection/demo/004.png new file 
mode 100644 index 0000000000000000000000000000000000000000..d8118ec3e0ef63bc74e825b5e7638a1886580604 Binary files /dev/null and b/contrib/PedestrianDetection/demo/004.png differ diff --git a/contrib/PedestrianDetection/demo/output/001.png b/contrib/PedestrianDetection/demo/output/001.png new file mode 100644 index 0000000000000000000000000000000000000000..5194d6ff891b9507fedfc53f36de4f00219c7f30 Binary files /dev/null and b/contrib/PedestrianDetection/demo/output/001.png differ diff --git a/contrib/PedestrianDetection/demo/output/004.png b/contrib/PedestrianDetection/demo/output/004.png new file mode 100644 index 0000000000000000000000000000000000000000..7c62be5051f9a47c5f5e98ccd9f45c3fa5f30257 Binary files /dev/null and b/contrib/PedestrianDetection/demo/output/004.png differ diff --git a/contrib/PedestrianDetection/pedestrian.json b/contrib/PedestrianDetection/pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..f72fe6dc65209ab3506d18556fb8b83b6ec832a9 --- /dev/null +++ b/contrib/PedestrianDetection/pedestrian.json @@ -0,0 +1,11 @@ +{ + "images": [], + "annotations": [], + "categories": [ + { + "supercategory": "component", + "id": 1, + "name": "pedestrian" + } + ] +} diff --git a/contrib/PedestrianDetection/pedestrian_yolov3_darknet.yml b/contrib/PedestrianDetection/pedestrian_yolov3_darknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..adc9109aa356e109afc81bea13b856ce0f4be448 --- /dev/null +++ b/contrib/PedestrianDetection/pedestrian_yolov3_darknet.yml @@ -0,0 +1,82 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 200000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 5000 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar +weights: https://paddlemodels.bj.bcebos.com/object_detection/pedestrian_yolov3_darknet.tar +num_classes: 1 + +YOLOv3: + backbone: DarkNet 
+ yolo_head: YOLOv3Head + +DarkNet: + norm_type: sync_bn + norm_decay: 0. + depth: 53 + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: true + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 150000 + - 180000 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/pedestrian + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 8 + bufsize: 128 + use_process: true + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/pedestrian + annotation: annotations/instances_val2017.json + image_dir: val2017 + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + annotation: contrib/PedestrianDetection/pedestrian.json diff --git a/contrib/README.md b/contrib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fbb55512acec372fa31d2955afe65cbc8abf7b2e --- /dev/null +++ b/contrib/README.md @@ -0,0 +1,104 @@ +# PaddleDetection applied for specific scenarios + +We provide some models implemented by PaddlePaddle to detect objects in specific scenarios, users can download the models and use them in these scenarios. 
+ +| Task | Algorithm | Box AP | Download | +|:---------------------|:---------:|:------:| :-------------------------------------------------------------------------------------: | +| Vehicle Detection | YOLOv3 | 54.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/vehicle_yolov3_darknet.tar) | +| Pedestrian Detection | YOLOv3 | 51.8 | [model](https://paddlemodels.bj.bcebos.com/object_detection/pedestrian_yolov3_darknet.tar) | + +## Vehicle Detection + +One of the major applications of vehicle detection is traffic monitoring. In this scenario, vehicles to be detected are mostly captured by the cameras mounted on top of traffic light columns. + +### 1. Network + +The network for detecting vehicles is YOLOv3, the backbone of which is DarkNet53. + +### 2. Configuration for training + +PaddleDetection provides users with a configuration file [yolov3_darknet.yml](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/configs/yolov3_darknet.yml) to train YOLOv3 on the COCO dataset. Compared with this file, we modify the following parameters to conduct the training for vehicle detection: + +* max_iters: 120000 +* num_classes: 6 +* anchors: [[8, 9], [10, 23], [19, 15], [23, 33], [40, 25], [54, 50], [101, 80], [139, 145], [253, 224]] +* label_smooth: false +* nms/nms_top_k: 400 +* nms/score_threshold: 0.005 +* milestones: [60000, 80000] +* dataset_dir: dataset/vehicle + +### 3. Accuracy + +The accuracy of the model trained and evaluated on our private data is shown as follows: + +AP at IoU=.50:.05:.95 is 0.545. + +AP at IoU=.50 is 0.764. + +### 4. Inference + +Users can employ the model to conduct the inference: + +``` +export CUDA_VISIBLE_DEVICES=0 +export PYTHONPATH=$PYTHONPATH:. 
+python -u tools/infer.py -c contrib/VehicleDetection/vehicle_yolov3_darknet.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/vehicle_yolov3_darknet.tar \ + --infer_dir contrib/VehicleDetection/demo \ + --draw_threshold 0.2 \ + --output_dir contrib/VehicleDetection/demo/output + +``` + +Some inference results are visualized below: + +![](VehicleDetection/demo/output/001.jpeg) + +![](VehicleDetection/demo/output/005.png) + +## Pedestrian Detection + +The main applications of pedestrian detection include intelligent monitoring. In this scenario, photos of pedestrians are taken by surveillance cameras in public areas, and pedestrian detection is then conducted on these photos. + +### 1. Network + +The network for detecting pedestrians is YOLOv3, the backbone of which is DarkNet53. + +### 2. Configuration for training + +PaddleDetection provides users with a configuration file [yolov3_darknet.yml](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/configs/yolov3_darknet.yml) to train YOLOv3 on the COCO dataset. Compared with this file, we modify the following parameters to conduct the training for pedestrian detection: + +* max_iters: 200000 +* num_classes: 1 +* snapshot_iter: 5000 +* milestones: [150000, 180000] +* dataset_dir: dataset/pedestrian + +### 3. Accuracy + +The accuracy of the model trained and evaluated on our private data is shown as follows: + +AP at IoU=.50:.05:.95 is 0.518. + +AP at IoU=.50 is 0.792. + +### 4. Inference + +Users can employ the model to conduct the inference: + +``` +export CUDA_VISIBLE_DEVICES=0 +export PYTHONPATH=$PYTHONPATH:. 
+python -u tools/infer.py -c contrib/PedestrianDetection/pedestrian_yolov3_darknet.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/pedestrian_yolov3_darknet.tar \ + --infer_dir contrib/PedestrianDetection/demo \ + --draw_threshold 0.3 \ + --output_dir contrib/PedestrianDetection/demo/output +``` + +Some inference results are visualized below: + +![](PedestrianDetection/demo/output/001.png) + +![](PedestrianDetection/demo/output/004.png) diff --git a/contrib/README_cn.md b/contrib/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..92fc25ece0ca62c0255582208706e8a773f80e5a --- /dev/null +++ b/contrib/README_cn.md @@ -0,0 +1,106 @@ +# PaddleDetection 特色垂类检测模型 + +我们提供了针对不同场景的基于PaddlePaddle的检测模型,用户可以下载模型进行使用。 + +| 任务 | 算法 | 精度(Box AP) | 下载 | +|:---------------------|:---------:|:------:| :---------------------------------------------------------------------------------: | +| 车辆检测 | YOLOv3 | 54.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/vehicle_yolov3_darknet.tar) | +| 行人检测 | YOLOv3 | 51.8 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/pedestrian_yolov3_darknet.tar) | + + +## 车辆检测(Vehicle Detection) + +车辆检测的主要应用之一是交通监控。在这样的监控场景中,待检测的车辆多为道路红绿灯柱上的摄像头拍摄所得。 + +### 1. 模型结构 + +Backbone为DarkNet53的YOLOv3。 + +### 2. 训练参数配置 + +PaddleDetection提供了使用COCO数据集对YOLOv3进行训练的参数配置文件[yolov3_darknet.yml](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/configs/yolov3_darknet.yml),与之相比,在进行车辆检测的模型训练时,我们对以下参数进行了修改: + +* max_iters: 120000 +* num_classes: 6 +* anchors: [[8, 9], [10, 23], [19, 15], [23, 33], [40, 25], [54, 50], [101, 80], [139, 145], [253, 224]] +* label_smooth: false +* nms/nms_top_k: 400 +* nms/score_threshold: 0.005 +* milestones: [60000, 80000] +* dataset_dir: dataset/vehicle + +### 3. 精度指标 + +模型在我们内部数据上的精度指标为: + +IOU=.50:.05:.95时的AP为 0.545。 + +IOU=.5时的AP为 0.764。 + +### 4. 
预测 + +用户可以使用我们训练好的模型进行车辆检测: + +``` +export CUDA_VISIBLE_DEVICES=0 +export PYTHONPATH=$PYTHONPATH:. +python -u tools/infer.py -c contrib/VehicleDetection/vehicle_yolov3_darknet.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/vehicle_yolov3_darknet.tar \ + --infer_dir contrib/VehicleDetection/demo \ + --draw_threshold 0.2 \ + --output_dir contrib/VehicleDetection/demo/output + +``` + +预测结果示例: + +![](VehicleDetection/demo/output/001.jpeg) + +![](VehicleDetection/demo/output/005.png) + +## 行人检测(Pedestrian Detection) + +行人检测的主要应用有智能监控。在监控场景中,大多是从公共区域的监控摄像头视角拍摄行人,获取图像后再进行行人检测。 + +### 1. 模型结构 + +Backbone为DarkNet53的YOLOv3。 + + +### 2. 训练参数配置 + +PaddleDetection提供了使用COCO数据集对YOLOv3进行训练的参数配置文件[yolov3_darknet.yml](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/configs/yolov3_darknet.yml),与之相比,在进行行人检测的模型训练时,我们对以下参数进行了修改: + +* max_iters: 200000 +* num_classes: 1 +* snapshot_iter: 5000 +* milestones: [150000, 180000] +* dataset_dir: dataset/pedestrian + +### 3. 精度指标 + +模型在我们针对监控场景的内部数据上精度指标为: + +IOU=.5时的AP为 0.792。 + +IOU=.5-.95时的AP为 0.518。 + +### 4. 预测 + +用户可以使用我们训练好的模型进行行人检测: + +``` +export CUDA_VISIBLE_DEVICES=0 +export PYTHONPATH=$PYTHONPATH:. 
+python -u tools/infer.py -c contrib/PedestrianDetection/pedestrian_yolov3_darknet.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/pedestrian_yolov3_darknet.tar \ + --infer_dir contrib/PedestrianDetection/demo \ + --draw_threshold 0.3 \ + --output_dir contrib/PedestrianDetection/demo/output +``` + +预测结果示例: + +![](PedestrianDetection/demo/output/001.png) + +![](PedestrianDetection/demo/output/004.png) diff --git a/contrib/VehicleDetection/demo/001.jpeg b/contrib/VehicleDetection/demo/001.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..8786db5eb6773931c363358bb39462b33db55369 Binary files /dev/null and b/contrib/VehicleDetection/demo/001.jpeg differ diff --git a/contrib/VehicleDetection/demo/003.png b/contrib/VehicleDetection/demo/003.png new file mode 100644 index 0000000000000000000000000000000000000000..c01ab4ce769fb3b1c8863093a35d27da0ab10efd Binary files /dev/null and b/contrib/VehicleDetection/demo/003.png differ diff --git a/contrib/VehicleDetection/demo/004.png b/contrib/VehicleDetection/demo/004.png new file mode 100644 index 0000000000000000000000000000000000000000..8907eb8d4d9b82e08ca214509c9fb41ca889db2a Binary files /dev/null and b/contrib/VehicleDetection/demo/004.png differ diff --git a/contrib/VehicleDetection/demo/005.png b/contrib/VehicleDetection/demo/005.png new file mode 100644 index 0000000000000000000000000000000000000000..bf17712809c2fe6fa8e7d4f093ec4ac94523537c Binary files /dev/null and b/contrib/VehicleDetection/demo/005.png differ diff --git a/contrib/VehicleDetection/demo/output/001.jpeg b/contrib/VehicleDetection/demo/output/001.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..aa2b679d4d2a73487edd5f9c67323ab18df93893 Binary files /dev/null and b/contrib/VehicleDetection/demo/output/001.jpeg differ diff --git a/contrib/VehicleDetection/demo/output/005.png b/contrib/VehicleDetection/demo/output/005.png new file mode 100644 index 
0000000000000000000000000000000000000000..57f918a30fcc5bf7bda284c1a1a0304e8822d325 Binary files /dev/null and b/contrib/VehicleDetection/demo/output/005.png differ diff --git a/contrib/VehicleDetection/vehicle.json b/contrib/VehicleDetection/vehicle.json new file mode 100644 index 0000000000000000000000000000000000000000..5863a9a8c9e0d8b4daeff31e7fe7869e084d3fb4 --- /dev/null +++ b/contrib/VehicleDetection/vehicle.json @@ -0,0 +1,36 @@ +{ + "images": [], + "annotations": [], + "categories": [ + { + "supercategory": "component", + "id": 1, + "name": "car" + }, + { + "supercategory": "component", + "id": 2, + "name": "truck" + }, + { + "supercategory": "component", + "id": 3, + "name": "bus" + }, + { + "supercategory": "component", + "id": 4, + "name": "motorbike" + }, + { + "supercategory": "component", + "id": 5, + "name": "tricycle" + }, + { + "supercategory": "component", + "id": 6, + "name": "carplate" + } + ] +} diff --git a/contrib/VehicleDetection/vehicle_yolov3_darknet.yml b/contrib/VehicleDetection/vehicle_yolov3_darknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..6a923a0c109a30ed2e247fc7204e81ef0a82eef4 --- /dev/null +++ b/contrib/VehicleDetection/vehicle_yolov3_darknet.yml @@ -0,0 +1,82 @@ +architecture: YOLOv3 +train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed +use_gpu: true +max_iters: 120000 +log_smooth_window: 20 +save_dir: output +snapshot_iter: 2000 +metric: COCO +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar +weights: https://paddlemodels.bj.bcebos.com/object_detection/vehicle_yolov3_darknet.tar +num_classes: 6 + +YOLOv3: + backbone: DarkNet + yolo_head: YOLOv3Head + +DarkNet: + norm_type: sync_bn + norm_decay: 0. + depth: 53 + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[8, 9], [10, 23], [19, 15], + [23, 33], [40, 25], [54, 50], + [101, 80], [139, 145], [253, 224]] + norm_decay: 0. 
+ ignore_thresh: 0.7 + label_smooth: false + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 400 + normalized: false + score_threshold: 0.005 + +LearningRate: + base_lr: 0.001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 60000 + - 80000 + - !LinearWarmup + start_factor: 0. + steps: 4000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0005 + type: L2 + +YoloTrainFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/vehicle + annotation: annotations/instances_train2017.json + image_dir: train2017 + num_workers: 8 + bufsize: 128 + use_process: true + +YoloEvalFeed: + batch_size: 8 + image_shape: [3, 608, 608] + dataset: + dataset_dir: dataset/vehicle + annotation: annotations/instances_val2017.json + image_dir: val2017 + +YoloTestFeed: + batch_size: 1 + image_shape: [3, 608, 608] + dataset: + annotation: contrib/VehicleDetection/vehicle.json diff --git a/dataset/coco/download_coco.py b/dataset/coco/download_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2b4f7e764e17296ccd8905478bf3ccb3818b909f --- /dev/null +++ b/dataset/coco/download_coco.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import sys +import os.path as osp +import logging + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'coco') diff --git a/dataset/fruit/download_fruit.py b/dataset/fruit/download_fruit.py new file mode 100644 index 0000000000000000000000000000000000000000..5cce18895af3eeb81c4e49f4897cc591b2f40f9b --- /dev/null +++ b/dataset/fruit/download_fruit.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import sys +import os.path as osp +import logging + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'fruit') diff --git a/dataset/voc/download_voc.py b/dataset/voc/download_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..e7f32657f1697bd82f0f7dfbb52a3d1cb987c4bd --- /dev/null +++ b/dataset/voc/download_voc.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import sys +import os.path as osp +import logging + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'voc') diff --git a/dataset/wider_face/download.sh b/dataset/wider_face/download.sh new file mode 100755 index 0000000000000000000000000000000000000000..6c86a22c6826d88846a16fbd43f8b556d8610b8f --- /dev/null +++ b/dataset/wider_face/download.sh @@ -0,0 +1,21 @@ +# All rights `PaddleDetection` reserved +# References: +# @inproceedings{yang2016wider, +# Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou}, +# Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, +# Title = {WIDER FACE: A Face Detection Benchmark}, +# Year = {2016}} + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd "$DIR" + +# Download the data. +echo "Downloading..." +wget https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip +wget https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip +wget https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip +# Extract the data. +echo "Extracting..." 
+unzip WIDER_train.zip +unzip WIDER_val.zip +unzip wider_face_split.zip diff --git a/demo/000000014439.jpg b/demo/000000014439.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0abbdab06eb5950b93908cc91adfa640e8a3ac78 Binary files /dev/null and b/demo/000000014439.jpg differ diff --git a/demo/000000014439_640x640.jpg b/demo/000000014439_640x640.jpg new file mode 100644 index 0000000000000000000000000000000000000000..58e9d3e228af43c9b55d8d0cb385ce82ebb8b996 Binary files /dev/null and b/demo/000000014439_640x640.jpg differ diff --git a/demo/000000087038.jpg b/demo/000000087038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f77f5d5f057b6f92dc096da704ecb8dee99bdf5 Binary files /dev/null and b/demo/000000087038.jpg differ diff --git a/demo/000000570688.jpg b/demo/000000570688.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb304bd56c4010c08611a30dcca58ea9140cea54 Binary files /dev/null and b/demo/000000570688.jpg differ diff --git a/demo/cas.png b/demo/cas.png new file mode 100644 index 0000000000000000000000000000000000000000..a60303c99ce0d1ba52e6f89414df8bd5c90fae62 Binary files /dev/null and b/demo/cas.png differ diff --git a/demo/mask_rcnn_demo.ipynb b/demo/mask_rcnn_demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..860b185043679e3c7bb28c4fdad505c9f16dda56 --- /dev/null +++ b/demo/mask_rcnn_demo.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Change working directory to the project root" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/yang/models/PaddleCV/PaddleDetection\n" + ] + } + ], + "source": [ + 
"%cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Now let's take a look at the input image." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from PIL import Image\n", + "\n", + "image_path = 'demo/000000570688.jpg'\n", + "img = Image.open(image_path)\n", + "img" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "For inference, preprocess only involves decoding, normalization and transposing to CHW.\n", + "\n", + "**NOTE:** in most cases, one should use the configuration based [data feed](../docs/DATA.md) API which greatly simplifies the data pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "from ppdet.data.transform.operators import DecodeImage, NormalizeImage, Permute\n", + "\n", + "sample = {'im_file': image_path}\n", + "decode = DecodeImage(to_rgb=True)\n", + "normalize = NormalizeImage(\n", + " mean=[0.485, 0.456, 0.406],\n", + " std=[0.229, 0.224, 0.225],\n", + " is_scale=True,\n", + " is_channel_first=False)\n", + "permute = Permute(to_bgr=False, channel_first=True)\n", + "\n", + "sample = permute(normalize(decode(sample)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Some extra effort is needed to massage data into the desired format. 
\n", + "\n", + "**NOTE:** Again, if the data feed API is used, these are handled automatically." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "h = sample['h']\n", + "w = sample['w']\n", + "im_info = np.array((h, w, 1), dtype=np.float32)\n", + "\n", + "sample['im_info'] = im_info\n", + "sample['im_shape'] = im_info\n", + "\n", + "# we don't need these\n", + "for key in ['im_file', 'h', 'w']:\n", + " del sample[key]\n", + "\n", + "# batch of a single sample\n", + "sample = {k: v[np.newaxis, ...] for k, v in sample.items()}\n", + "\n", + "feed_var_def = [\n", + " {'name': 'image', 'shape': (h, w, 3)},\n", + " {'name': 'im_info', 'shape': [3]},\n", + " {'name': 'im_shape', 'shape': [3]},\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Next, build the [Mask R-CNN](https://arxiv.org/abs/1703.06870) model and associated fluid programs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "from paddle import fluid\n", + "from ppdet.modeling import (MaskRCNN, ResNet, ResNetC5, RPNHead, RoIAlign,\n", + " BBoxHead, MaskHead, BBoxAssigner, MaskAssigner)\n", + "\n", + "roi_size = 14\n", + "\n", + "model = MaskRCNN(\n", + " ResNet(feature_maps=4),\n", + " RPNHead(),\n", + " BBoxHead(ResNetC5()),\n", + " BBoxAssigner(),\n", + " RoIAlign(resolution=roi_size),\n", + " MaskAssigner(),\n", + " MaskHead())\n", + "\n", + "startup_prog = fluid.Program()\n", + "infer_prog = fluid.Program()\n", + "with fluid.program_guard(infer_prog, startup_prog):\n", + " with fluid.unique_name.guard():\n", 
+ " feed_vars = {\n", + " var['name']: fluid.layers.data(\n", + " name=var['name'],\n", + " shape=var['shape'],\n", + " dtype='float32',\n", + " lod_level=0) for var in feed_var_def\n", + " }\n", + " test_fetches = model.test(feed_vars)\n", + "infer_prog = infer_prog.clone(for_test=True)\n", + "\n", + "# use GPU if available\n", + "if fluid.core.get_cuda_device_count() > 0:\n", + " place = fluid.CUDAPlace(0)\n", + "else:\n", + " place = fluid.CPUPlace()\n", + "\n", + "exe = fluid.Executor(place)\n", + "_ = exe.run(startup_prog)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Load the checkpoint weights, just wait a couple of minutes for it to be downloaded." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 140690/140690 [00:12<00:00, 10843.70KB/s]\n" + ] + } + ], + "source": [ + "from ppdet.utils import checkpoint\n", + "\n", + "ckpt_url = 'https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_1x.tar'\n", + "checkpoint.load_checkpoint(exe, infer_prog, ckpt_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Run the program and fetch the result. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "output = exe.run(infer_prog, feed=sample,\n", + " fetch_list=[t.name for t in test_fetches.values()],\n", + " return_numpy=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Again, we need to massage the result a bit for visualization." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "res = {\n", + " k: (np.array(v), v.recursive_sequence_lengths())\n", + " for k, v in zip(test_fetches.keys(), output)\n", + "}\n", + "# fake image id\n", + "res['im_id'] = [[[0] for _ in range(res['bbox'][1][0][0])]]\n", + "res['im_shape'] = [[im_info]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "source": [ + "Now overlay the bboxes and masks onto the image...\n", + "\n", + "And voila, we've successully built and run the Mask R-CNN inference pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "autoscroll": false, + "ein.hycell": false, + "ein.tags": "worksheet-0", + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ppdet.utils.coco_eval import bbox2out, mask2out, coco17_category_info\n", + "from ppdet.utils.visualizer import visualize_results\n", + "\n", + "cls2cat, cat2name = coco17_category_info()\n", + "bboxes = bbox2out([res], cls2cat)\n", + "masks = mask2out([res], cls2cat, roi_size)\n", + "\n", + "visualize_results(img, 0, cat2name, 0.5, bboxes, masks)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.16" + }, + "name": "mask_rcnn_demo.ipynb" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/demo/obj365_gt.png b/demo/obj365_gt.png new file mode 100644 index 0000000000000000000000000000000000000000..eb69077f6ba1676d9fc0ba3e4cada645c5ab7245 Binary files /dev/null and b/demo/obj365_gt.png differ diff --git a/demo/obj365_pred.png b/demo/obj365_pred.png new file mode 100644 index 0000000000000000000000000000000000000000..d99fb1fd198f1f8b6e003741375e9aa5524f117a Binary files /dev/null and b/demo/obj365_pred.png differ diff --git a/demo/orange_71.jpg b/demo/orange_71.jpg new file mode 100644 index 0000000000000000000000000000000000000000..da7974a1a1371298f1ca5f4ef9c82bd3824d7ac3 Binary files /dev/null and b/demo/orange_71.jpg differ diff --git a/demo/orange_71_detection.jpg b/demo/orange_71_detection.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..88cbf9c97120f79d5ef5f80ccb789a6e3c29bedf Binary files /dev/null and b/demo/orange_71_detection.jpg differ diff --git a/demo/output/000000570688.jpg b/demo/output/000000570688.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8a0f84f38331093a0e1afc52d4b7747535bdbb6d Binary files /dev/null and b/demo/output/000000570688.jpg differ diff --git a/demo/output/12_Group_Group_12_Group_Group_12_935.jpg b/demo/output/12_Group_Group_12_Group_Group_12_935.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2a563361ae03fbe079dba017374eee51ccbd17dd Binary files /dev/null and b/demo/output/12_Group_Group_12_Group_Group_12_935.jpg differ diff --git a/demo/tensorboard_fruit.jpg b/demo/tensorboard_fruit.jpg new file mode 100644 index 0000000000000000000000000000000000000000..44a955fafffb4ab03d911818e20e6f72499f2f4f Binary files /dev/null and b/demo/tensorboard_fruit.jpg differ diff --git a/docs/BENCHMARK_INFER_cn.md b/docs/BENCHMARK_INFER_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..bc4c25fee1555eb124c25edfcd066af0f6c7bfcf --- /dev/null +++ b/docs/BENCHMARK_INFER_cn.md @@ -0,0 +1,89 @@ + + +# 推理Benchmark + + + +- 测试环境: + - CUDA 9.0 + - CUDNN 7.5 + - TensorRT-5.1.2.2 + - PaddlePaddle v1.6 + - GPU分别为: Tesla V100和Tesla P4 +- 测试方式: + - 为了方便比较不同模型的推理速度,输入采用同样大小的图片,为 3x640x640,采用 `demo/000000014439_640x640.jpg` 图片。 + - Batch Size=1 + - 去掉前10轮warmup时间,测试100轮的平均时间,单位ms/image,包括输入数据拷贝至GPU的时间、计算时间、数据拷贝至CPU的时间。 + - 采用Fluid C++预测引擎: 包含Fluid C++预测、Fluid-TensorRT预测,下面同时测试了Float32 (FP32) 和Float16 (FP16)的推理速度。 + - 测试时开启了 FLAGS_cudnn_exhaustive_search=True,使用exhaustive方式搜索卷积计算算法。 + +### 推理速度 + + + + + +| 模型 | Tesla V100 Fluid (ms/image) | Tesla V100 Fluid-TensorRT-FP32 (ms/image) | Tesla V100 Fluid-TensorRT-FP16 (ms/image) | Tesla P4 Fluid (ms/image) | Tesla P4 Fluid-TensorRT-FP32 (ms/image) | +| ------------------------------------- | ----------------------------- | 
------------------------------------------- | ------------------------------------------- | --------------------------- | ----------------------------------------- | +| faster_rcnn_r50_1x | 147.488 | 146.124 | 142.416 | 471.547 | 471.631 | +| faster_rcnn_r50_2x | 147.636 | 147.73 | 141.664 | 471.548 | 472.86 | +| faster_rcnn_r50_vd_1x | 146.588 | 144.767 | 141.208 | 459.357 | 457.852 | +| faster_rcnn_r50_fpn_1x | 25.11 | 24.758 | 20.744 | 59.411 | 57.585 | +| faster_rcnn_r50_fpn_2x | 25.351 | 24.505 | 20.509 | 59.594 | 57.591 | +| faster_rcnn_r50_vd_fpn_2x | 25.514 | 25.292 | 21.097 | 61.026 | 58.377 | +| faster_rcnn_r50_fpn_gn_2x | 36.959 | 36.173 | 32.356 | 101.339 | 101.212 | +| faster_rcnn_dcn_r50_fpn_1x | 28.707 | 28.162 | 27.503 | 68.154 | 67.443 | +| faster_rcnn_dcn_r50_vd_fpn_2x | 28.576 | 28.271 | 27.512 | 68.959 | 68.448 | +| faster_rcnn_r101_1x | 153.267 | 150.985 | 144.849 | 490.104 | 486.836 | +| faster_rcnn_r101_fpn_1x | 30.949 | 30.331 | 24.021 | 73.591 | 69.736 | +| faster_rcnn_r101_fpn_2x | 30.918 | 29.126 | 23.677 | 73.563 | 70.32 | +| faster_rcnn_r101_vd_fpn_1x | 31.144 | 30.202 | 23.57 | 74.767 | 70.773 | +| faster_rcnn_r101_vd_fpn_2x | 30.678 | 29.969 | 23.327 | 74.882 | 70.842 | +| faster_rcnn_x101_vd_64x4d_fpn_1x | 60.36 | 58.461 | 45.172 | 132.178 | 131.734 | +| faster_rcnn_x101_vd_64x4d_fpn_2x | 59.003 | 59.163 | 46.065 | 131.422 | 132.186 | +| faster_rcnn_dcn_r101_vd_fpn_1x | 36.862 | 37.205 | 36.539 | 93.273 | 92.616 | +| faster_rcnn_dcn_x101_vd_64x4d_fpn_1x | 78.476 | 78.335 | 77.559 | 185.976 | 185.996 | +| faster_rcnn_se154_vd_fpn_s1x | 166.282 | 90.508 | 80.738 | 304.653 | 193.234 | +| mask_rcnn_r50_1x | 160.185 | 160.4 | 160.322 | - | - | +| mask_rcnn_r50_2x | 159.821 | 159.527 | 160.41 | - | - | +| mask_rcnn_r50_fpn_1x | 95.72 | 95.719 | 92.455 | 259.8 | 258.04 | +| mask_rcnn_r50_fpn_2x | 84.545 | 83.567 | 79.269 | 227.284 | 222.975 | +| mask_rcnn_r50_vd_fpn_2x | 82.07 | 82.442 | 77.187 | 223.75 | 221.683 | +| 
mask_rcnn_r50_fpn_gn_2x | 94.936 | 94.611 | 91.42 | 265.468 | 263.76 | +| mask_rcnn_dcn_r50_fpn_1x | 97.828 | 97.433 | 93.76 | 256.295 | 258.056 | +| mask_rcnn_dcn_r50_vd_fpn_2x | 77.831 | 79.453 | 76.983 | 205.469 | 204.499 | +| mask_rcnn_r101_fpn_1x | 95.543 | 97.929 | 90.314 | 252.997 | 250.782 | +| mask_rcnn_r101_vd_fpn_1x | 98.046 | 97.647 | 90.272 | 261.286 | 262.108 | +| mask_rcnn_x101_vd_64x4d_fpn_1x | 115.461 | 115.756 | 102.04 | 296.066 | 293.62 | +| mask_rcnn_x101_vd_64x4d_fpn_2x | 107.144 | 107.29 | 97.275 | 267.636 | 267.577 | +| mask_rcnn_dcn_r101_vd_fpn_1x | 85.504 | 84.875 | 84.907 | 225.202 | 226.585 | +| mask_rcnn_dcn_x101_vd_64x4d_fpn_1x | 129.937 | 129.934 | 127.804 | 326.786 | 326.161 | +| mask_rcnn_se154_vd_fpn_s1x | 214.188 | 139.807 | 121.516 | 440.391 | 439.727 | +| cascade_rcnn_r50_fpn_1x | 36.866 | 36.949 | 36.637 | 101.851 | 101.912 | +| cascade_mask_rcnn_r50_fpn_1x | 110.344 | 106.412 | 100.367 | 301.703 | 297.739 | +| cascade_rcnn_dcn_r50_fpn_1x | 40.412 | 39.58 | 39.853 | 110.346 | 110.077 | +| cascade_mask_rcnn_r50_fpn_gn_2x | 170.092 | 168.758 | 163.298 | 527.998 | 529.59 | +| cascade_rcnn_dcn_r101_vd_fpn_1x | 48.414 | 48.849 | 48.701 | 134.9 | 134.846 | +| cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x | 90.062 | 90.218 | 90.009 | 228.67 | 228.396 | +| retinanet_r101_fpn_1x | 55.59 | 54.636 | 48.489 | 90.394 | 83.951 | +| retinanet_r50_fpn_1x | 50.048 | 47.932 | 44.385 | 73.819 | 70.282 | +| retinanet_x101_vd_64x4d_fpn_1x | 83.329 | 83.446 | 70.76 | 145.936 | 146.168 | +| yolov3_darknet | 21.427 | 20.252 | 13.856 | 55.173 | 55.692 | +| yolov3_darknet_voc | 17.58 | 16.241 | 9.473 | 51.049 | 51.249 | +| yolov3_mobilenet_v1 | 12.869 | 11.834 | 9.408 | 24.887 | 21.352 | +| yolov3_mobilenet_v1_voc | 9.118 | 8.146 | 5.575 | 20.787 | 17.169 | +| yolov3_r34 | 14.914 | 14.125 | 11.176 | 20.798 | 20.822 | +| yolov3_r34_voc | 11.288 | 10.73 | 7.7 | 25.874 | 22.399 | +| ssd_mobilenet_v1_voc | 5.763 | 5.854 | 4.589 | 11.75 | 9.485 | +| ssd_vgg16_300 | 
28.722 | 29.644 | 20.399 | 73.707 | 74.531 | +| ssd_vgg16_300_voc | 18.425 | 19.288 | 11.298 | 56.297 | 56.201 | +| ssd_vgg16_512 | 27.471 | 28.328 | 19.328 | 68.685 | 69.808 | +| ssd_vgg16_512_voc | 18.721 | 19.636 | 12.004 | 54.688 | 56.174 | + +1. RCNN系列模型Fluid-TensorRT速度相比Fluid预测没有优势,原因是: TensorRT仅支持定长输入,当前基于ResNet系列的RCNN模型,只有backbone部分采用了TensorRT子图计算,比较耗时的stage-5没有基于TensorRT计算。 Fluid对CNN模型也做了一系列的融合优化。后续TensorRT版本升级、或有其他优化策略时再更新数据。 +2. YOLO v3系列模型,Fluid-TensorRT相比Fluid预测加速5% - 10%不等。 +3. SSD和YOLOv3系列模型 TensorRT-FP16预测速度有一定的优势,加速约20% - 40%不等。具体如下图。 + +
+ +
diff --git a/docs/CACascadeRCNN.md b/docs/CACascadeRCNN.md new file mode 100644 index 0000000000000000000000000000000000000000..8e72b9af30183abf6c028ec2f4ee2ec695a29a6c --- /dev/null +++ b/docs/CACascadeRCNN.md @@ -0,0 +1,51 @@ +# CACascade RCNN +## 简介 +CACascade RCNN是百度视觉技术部在Objects365 2019 Challenge上夺冠的最佳单模型之一,Objects365是在通用物体检测领域的一个全新的数据集,旨在促进对自然场景不同对象的检测研究。Objects365在63万张图像上标注了365个对象类,训练集中共有超过1000万个边界框。这里放出的是Full Track任务中最好的单模型之一。 + +
+ +
+ +## 方法描述 + +针对大规模物体检测算法的特点,我们提出了一种基于图片包含物体类别的数量的采样方式(Class Aware Sampling)。基于这种方式进行训练模型可以在更短的时间使模型收敛到更好的效果。 + +
+ +
+ +本次公布的最好单模型是一个基于Cascade RCNN的两阶段检测模型,在此基础上将Backbone替换为更加强大的SENet154模型,Deformable Conv模块以及更复杂二阶段网络结构,针对BatchSize比较小的情况增加了Group Normalization操作并同时使用了多尺度训练,最终达到了非常理想的效果。预训练模型先后分别在ImageNet和COCO数据集上进行了训练,其中在COCO数据集上训练时增加了Mask分支,其余结构与CACascade RCNN相同, 会在启动训练时自动下载。 + +## 使用方法 + +1.准备数据 + +数据需要通过[Objects365官方网站](https://www.objects365.org/download.html)进行申请下载,数据下载后将数据放置在dataset目录中。 +``` +${THIS REPO ROOT} + \--dataset + \-- objects365 + \-- annotations + |-- train.json + |-- val.json + \-- train + \-- val +``` + +2.启动训练模型 + +```bash +python tools/train.py -c configs/obj365/cascade_rcnn_dcnv2_se154_vd_fpn_gn.yml +``` + +3.模型预测结果 + +| 模型 | 验证集 mAP | 下载链接 | +| :-----------------: | :--------: | :----------------------------------------------------------: | +| CACascadeRCNN SE154 | 31.7 | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcnv2_se154_vd_fpn_gn_cas_obj365.tar) | + +## 模型效果 + +
+ +
diff --git a/docs/CONFIG.md b/docs/CONFIG.md new file mode 100644 index 0000000000000000000000000000000000000000..3cba54eb546cfb648cc7b5bd2e135652a040b309 --- /dev/null +++ b/docs/CONFIG.md @@ -0,0 +1,206 @@ +English | [简体中文](CONFIG_cn.md) + +# Config Pipeline + +## Introduction + +PaddleDetection takes a rather principled approach to configuration management. We aim to automate the configuration workflow and to reduce configuration errors. + + +## Rationale + +Presently, configuration in mainstream frameworks is usually dictionary based: the global config is simply a giant, loosely defined Python dictionary. + +This approach is error prone, e.g., misspelled or displaced keys may lead to serious errors in the training process, causing time loss and wasted resources. + +To avoid the common pitfalls, with automation and static analysis in mind, we propose a configuration design that is user friendly, easy to maintain and extensible. + + +## Design + +The design utilizes some of Python's reflection mechanisms to extract configuration schematics from Python class definitions. + +To be specific, it extracts information from class constructor arguments, including names, docstrings, default values, data types (if type hints are available). + +This approach advocates modular and testable design, leading to a unified and extensible code base. + + +### API + +Most of the functionality is exposed in the `ppdet.core.workspace` module. + +- `register`: This decorator registers a class as a configurable module; it understands several special annotations in the class definition. + - `__category__`: For better organization, modules are classified into categories. + - `__inject__`: A list of constructor arguments that are intended to take module instances as input; the module instances will be created at runtime and injected. 
The corresponding configuration value can be a class name string, a serialized object, a config key pointing to a serialized object, or a dict (in which case the constructor needs to handle it, see example below). + - `__op__`: Shortcut for wrapping PaddlePaddle operators into callable objects; together with `__append_doc__` (extracting docstring from target PaddlePaddle operator automatically), this can be a real time saver. +- `serializable`: This decorator makes a class directly serializable in a YAML config file, by taking advantage of [pyyaml](https://pyyaml.org/wiki/PyYAMLDocumentation)'s serialization mechanism. +- `create`: Constructs a module instance according to global configuration. +- `load_config` and `merge_config`: Load a YAML file and merge config settings from the command line. + + +### Example + +Take the `RPNHead` module for example, it is composed of several PaddlePaddle operators. We first wrap those operators into classes, then pass in instances of these classes when instantiating the `RPNHead` module. + +```python +# excerpt from `ppdet/modeling/ops.py` +from ppdet.core.workspace import register, serializable + +# ... more operators + +@register +@serializable +class GenerateProposals(object): + # NOTE this class simply wraps a PaddlePaddle operator + __op__ = fluid.layers.generate_proposals + # NOTE docstring for args are extracted from PaddlePaddle OP + __append_doc__ = True + + def __init__(self, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=.5, + min_size=.1, + eta=1.): + super(GenerateProposals, self).__init__() + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.nms_thresh = nms_thresh + self.min_size = min_size + self.eta = eta + +# ... 
more operators + +# excerpt from `ppdet/modeling/anchor_heads/rpn_head.py` +from ppdet.core.workspace import register +from ppdet.modeling.ops import AnchorGenerator, RPNTargetAssign, GenerateProposals + +@register +class RPNHead(object): + """ + RPN Head + + Args: + anchor_generator (object): `AnchorGenerator` instance + rpn_target_assign (object): `RPNTargetAssign` instance + train_proposal (object): `GenerateProposals` instance for training + test_proposal (object): `GenerateProposals` instance for testing + """ + __inject__ = [ + 'anchor_generator', 'rpn_target_assign', 'train_proposal', + 'test_proposal' + ] + + def __init__(self, + anchor_generator=AnchorGenerator().__dict__, + rpn_target_assign=RPNTargetAssign().__dict__, + train_proposal=GenerateProposals(12000, 2000).__dict__, + test_proposal=GenerateProposals().__dict__): + super(RPNHead, self).__init__() + self.anchor_generator = anchor_generator + self.rpn_target_assign = rpn_target_assign + self.train_proposal = train_proposal + self.test_proposal = test_proposal + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGenerator(**anchor_generator) + if isinstance(rpn_target_assign, dict): + self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) + if isinstance(train_proposal, dict): + self.train_proposal = GenerateProposals(**train_proposal) + if isinstance(test_proposal, dict): + self.test_proposal = GenerateProposals(**test_proposal) +``` + +The corresponding(generated) YAML snippet is as follows, note this is the configuration in **FULL**, all the default values can be omitted. In case of the above example, all arguments have default value, meaning nothing is required in the config file. + +```yaml +RPNHead: + test_proposal: + eta: 1.0 + min_size: 0.1 + nms_thresh: 0.5 + post_nms_top_n: 1000 + pre_nms_top_n: 6000 + train_proposal: + eta: 1.0 + min_size: 0.1 + nms_thresh: 0.5 + post_nms_top_n: 2000 + pre_nms_top_n: 12000 + anchor_generator: + # ... + rpn_target_assign: + # ... 
+``` + +Example snippet that makes use of the `RPNHead` module. + +```python +from ppdet.core.workspace import load_config, merge_config, create + +load_config('some_config_file.yml') +merge_config(more_config_options_from_command_line) + +rpn_head = create('RPNHead') +# ... code that use the created module! +``` + +A configuration file can also contain serialized objects, denoted with `!`, for example: + +```yaml +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 +``` + +[Complete config files](config_example/) of multiple detection architectures are provided, together with a brief description of each parameter. + +## Requirements + +Two Python packages are used; both are optional. + +- [typeguard](https://github.com/agronholm/typeguard) is used for type checking in Python 3. +- [docstring\_parser](https://github.com/rr-/docstring_parser) is needed for docstring parsing. + +To install them, simply run: + +```shell +pip install typeguard http://github.com/willthefrog/docstring_parser/tarball/master +``` + + +## Tooling + +A small utility (`tools/configure.py`) is included to simplify the configuration process; it provides 4 commands to walk users through the configuration process: + +1. `list`: List currently registered modules by category; one can also specify which category to list with the `--category` flag. +2. `help`: Get help information for a module, including description, options, configuration template and example command line flags. +3. `analyze`: Check configuration file for missing/extraneous options, options with mismatched types (if type hints are given) and missing dependencies; it also highlights user provided values (overridden default values). +4. `generate`: Generate a configuration template for a given list of modules. 
By default it generates a complete configuration file, which can be quite verbose; if a `--minimal` flag is given, it generates a template that contains only the non-optional settings. For example, to generate a configuration for Faster R-CNN architecture with `ResNet` backbone and `FPN`, run: + + ```shell + python tools/configure.py generate FasterRCNN ResNet RPNHead RoIAlign BBoxAssigner BBoxHead FasterRCNNTrainFeed FasterRCNNTestFeed LearningRate OptimizerBuilder + ``` + + For a minimal version, run: + + ```shell + python tools/configure.py --minimal generate FasterRCNN BBoxHead + ``` + + +## FAQ + +**Q:** There are some configuration options that are used by multiple modules (e.g., `num_classes`), how do I avoid duplication in config files? + +**A:** We provide a `__shared__` annotation for exactly this purpose; simply annotate the class like this: `__shared__ = ['num_classes']`. It works as follows: + +1. if `num_classes` is configured for a module in the config file, it takes precedence. +2. if `num_classes` is not configured for a module but is present in the config file as a global key, its value will be used. +3. otherwise, the default value (`81`) will be used. 
diff --git a/docs/CONFIG_cn.md b/docs/CONFIG_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..8b7eaa653a65264db189fa88a125ce10b5a6f667 --- /dev/null +++ b/docs/CONFIG_cn.md @@ -0,0 +1,196 @@ +# 配置模块 + +## 简介 + +为了使配置过程更加自动化并减少配置错误,PaddleDetection的配置管理采取了较为严谨的设计。 + + +## 设计思想 + +目前主流框架全局配置基本是一个Python dict,这种设计对配置的检查并不严格,拼写错误或者遗漏的配置项往往会造成训练过程中的严重错误,进而造成时间及资源的浪费。为了避免这些陷阱,从自动化和静态分析的原则出发,PaddleDetection采用了一种用户友好、 易于维护和扩展的配置设计。 + + +## 基本设计 + +利用Python的反射机制,PaddleDetection的配置系统从Python类的构造函数抽取多种信息 - 如参数名、初始值、参数注释、数据类型(如果给出type hint)- 来作为配置规则。 这种设计便于设计的模块化,提升可测试性及扩展性。 + + +### API + +配置系统的大多数功能由 `ppdet.core.workspace` 模块提供 + +- `register`: 装饰器,将类注册为可配置模块;能够识别类定义中的一些特殊标注。 + - `__category__`: 为便于组织,模块可以分为不同类别。 + - `__inject__`: 如果模块由多个子模块组成,可以将这些子模块实例作为构造函数的参数注入。对应的默认值及配置项可以是类名字符串,yaml序列化的对象,指向序列化对象的配置键值或者Python dict(构造函数需要对其作出处理,参见下面的例子)。 + - `__op__`: 配合 `__append_doc__` (抽取目标OP的注释)使用,可以方便快速地封装PaddlePaddle底层OP。 +- `serializable`: 装饰器,利用 [pyyaml](https://pyyaml.org/wiki/PyYAMLDocumentation) 的序列化机制,可以直接将一个类实例序列化及反序列化。 +- `create`: 根据全局配置构造一个模块实例。 +- `load_config` and `merge_config`: 加载yaml文件,合并命令行提供的配置项。 + + +### 示例 + +以 `RPNHead` 模块为例,该模块包含多个PaddlePaddle OP,先将这些OP封装成类,并将其实例在构造 `RPNHead` 时注入。 + +```python +# excerpt from `ppdet/modeling/ops.py` +from ppdet.core.workspace import register, serializable + +# ... more operators + +@register +@serializable +class GenerateProposals(object): + # NOTE this class simply wraps a PaddlePaddle operator + __op__ = fluid.layers.generate_proposals + # NOTE docstring for args are extracted from PaddlePaddle OP + __append_doc__ = True + + def __init__(self, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=.5, + min_size=.1, + eta=1.): + super(GenerateProposals, self).__init__() + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.nms_thresh = nms_thresh + self.min_size = min_size + self.eta = eta + +# ... 
more operators + +# excerpt from `ppdet/modeling/anchor_heads/rpn_head.py` +from ppdet.core.workspace import register +from ppdet.modeling.ops import AnchorGenerator, RPNTargetAssign, GenerateProposals + +@register +class RPNHead(object): + """ + RPN Head + + Args: + anchor_generator (object): `AnchorGenerator` instance + rpn_target_assign (object): `RPNTargetAssign` instance + train_proposal (object): `GenerateProposals` instance for training + test_proposal (object): `GenerateProposals` instance for testing + """ + __inject__ = [ + 'anchor_generator', 'rpn_target_assign', 'train_proposal', + 'test_proposal' + ] + + def __init__(self, + anchor_generator=AnchorGenerator().__dict__, + rpn_target_assign=RPNTargetAssign().__dict__, + train_proposal=GenerateProposals(12000, 2000).__dict__, + test_proposal=GenerateProposals().__dict__): + super(RPNHead, self).__init__() + self.anchor_generator = anchor_generator + self.rpn_target_assign = rpn_target_assign + self.train_proposal = train_proposal + self.test_proposal = test_proposal + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGenerator(**anchor_generator) + if isinstance(rpn_target_assign, dict): + self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) + if isinstance(train_proposal, dict): + self.train_proposal = GenerateProposals(**train_proposal) + if isinstance(test_proposal, dict): + self.test_proposal = GenerateProposals(**test_proposal) +``` + +对应的yaml配置如下,请注意这里给出的是 **完整** 配置,其中所有默认值配置项都可以省略。上面的例子中的模块所有的构造函数参数都提供了默认值,因此配置文件中可以完全略过其配置。 + +```yaml +RPNHead: + test_proposal: + eta: 1.0 + min_size: 0.1 + nms_thresh: 0.5 + post_nms_top_n: 1000 + pre_nms_top_n: 6000 + train_proposal: + eta: 1.0 + min_size: 0.1 + nms_thresh: 0.5 + post_nms_top_n: 2000 + pre_nms_top_n: 12000 + anchor_generator: + # ... + rpn_target_assign: + # ... 
+``` + +`RPNHead` 模块实际使用代码示例。 + +```python +from ppdet.core.workspace import load_config, merge_config, create + +load_config('some_config_file.yml') +merge_config(more_config_options_from_command_line) + +rpn_head = create('RPNHead') +# ... code that use the created module! +``` + +配置文件中也可以直接使用序列化的模块实例,用 `!` 标示,如 + +```yaml +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 +``` + +[示例配置文件](config_example/)中给出了多种检测结构的完整配置文件,以及其中各个超参的简要说明。 + +## 安装依赖 + +配置系统用到两个Python包,均为可选安装。 + +- [typeguard](https://github.com/agronholm/typeguard) 在Python 3中用来进行数据类型验证。 +- [docstring\_parser](https://github.com/rr-/docstring_parser) 用来解析注释。 + +如需安装,运行下面命令即可。 + +```shell +pip install typeguard http://github.com/willthefrog/docstring_parser/tarball/master +``` + + +## 相关工具 + +为了方便用户配置,PaddleDetection提供了一个工具 (`tools/configure.py`), 共支持四个子命令: + +1. `list`: 列出当前已注册的模块,如需列出具体类别的模块,可以使用 `--category` 指定。 +2. `help`: 显示指定模块的帮助信息,如描述,配置项,配置文件模板及命令行示例。 +3. `analyze`: 检查配置文件中缺少或者多余的配置项以及依赖缺失,如果给出type hint, 还可以检查配置项中错误的数据类型。非默认配置也会高亮显示。 +4. `generate`: 根据给出的模块列表生成配置文件,默认生成完整配置,如果指定 `--minimal` ,生成最小配置,即省略所有默认配置项。例如,执行下列命令可以生成Faster R-CNN (`ResNet` backbone + `FPN`) 架构的配置文件: + + ```shell + python tools/configure.py generate FasterRCNN ResNet RPNHead RoIAlign BBoxAssigner BBoxHead FasterRCNNTrainFeed FasterRCNNTestFeed LearningRate OptimizerBuilder + ``` + + 如需最小配置,运行: + + ```shell + python tools/configure.py --minimal generate FasterRCNN BBoxHead + ``` + + +## FAQ + +**Q:** 某些配置项会在多个模块中用到(如 `num_classes`),如何避免在配置文件中多次重复设置? + +**A:** 框架提供了 `__shared__` 标记来实现配置的共享,用户可以标记参数,如 `__shared__ = ['num_classes']` ,配置数值作用规则如下: + +1. 如果模块配置中提供了 `num_classes` ,会优先使用其数值。 +2. 如果模块配置中未提供 `num_classes` ,但配置文件中存在全局键值,那么会使用全局键值。 +3. 
两者均未配置的情况下,将使用默认值(`81`)。 diff --git a/docs/DATA.md b/docs/DATA.md new file mode 100644 index 0000000000000000000000000000000000000000..be9048c0fb59cda8305462006ad53dff6c039631 --- /dev/null +++ b/docs/DATA.md @@ -0,0 +1,224 @@ +English | [简体中文](DATA_cn.md) + +# Data Pipeline + +## Introduction + +The data pipeline is responsible for loading and converting data. Each +resulting data sample is a tuple of np.ndarrays. +For example, Faster R-CNN training uses samples of this format: `[(im, +im_info, im_id, gt_bbox, gt_class, is_crowd), (...)]`. + +### Implementation + +The data pipeline consists of four sub-systems: data parsing, image +pre-processing, data conversion and data feeding APIs. + +Data samples are collected to form `data.Dataset`s; usually 3 sets are +needed for training, validation, and testing respectively. + +First, `data.source` loads the data files into memory, then +`data.transform` processes them, and lastly, the batched samples +are fetched by `data.Reader`. + +Sub-system details: +1. Data parsing +Parses various data sources and creates `data.Dataset` instances. Currently, +the following data sources are supported: + +- COCO data source +Loads `COCO` type datasets with directory structures like this: + + ``` + dataset/coco/ + ├── annotations + │ ├── instances_train2014.json + │ ├── instances_train2017.json + │ ├── instances_val2014.json + │ ├── instances_val2017.json + | ... + ├── train2017 + │ ├── 000000000009.jpg + │ ├── 000000580008.jpg + | ... + ├── val2017 + │ ├── 000000000139.jpg + │ ├── 000000000285.jpg + | ... + ``` + +- Pascal VOC data source +Loads `Pascal VOC` like datasets with directory structure like this: + + ``` + data/pascalvoc/ + ├──Annotations + │ ├── 000050.xml + │ ├── 003876.xml + | ... + ├── ImageSets + │ ├──Main + └── train.txt + └── val.txt + └── test.txt + └── dog_train.txt + └── dog_trainval.txt + └── dog_val.txt + └── dog_test.txt + └── ... + │ ├──Layout + └──... + │ ├── Segmentation + └──... 
+ ├── JPEGImages + │ ├── 000050.jpg + │ ├── 003876.jpg + | ... + ``` + +- Roidb data source +A generalized data source serialized as pickle files, which have the following +structure: +```python +(records, cname2id) +# `cname2id` is a `dict` which maps category name to class IDs +# and `records` is a list of dict of this structure: +{ + 'im_file': im_fname, # image file name + 'im_id': im_id, # image ID + 'h': im_h, # height of image + 'w': im_w, # width of image + 'is_crowd': is_crowd, # crowd marker + 'gt_class': gt_class, # ground truth class + 'gt_bbox': gt_bbox, # ground truth bounding box + 'gt_poly': gt_poly, # ground truth segmentation +} +``` + +We provide a tool to generate roidb data sources. To convert `COCO` or `VOC` +like dataset, run this command: +```sh +# --type: the type of original data (xml or json) +# --annotation: the path of file, which contains the name of annotation files +# --save-dir: the save path +# --samples: the number of samples (default is -1, which mean all datas in dataset) +python ./ppdet/data/tools/generate_data_for_training.py + --type=json \ + --annotation=./annotations/instances_val2017.json \ + --save-dir=./roidb \ + --samples=-1 +``` + + 2. Image preprocessing +the `data.transform.operator` module provides operations such as image +decoding, expanding, cropping, etc. Multiple operators are combined to form +larger processing pipelines. + + 3. Data transformer +Transform a `data.Dataset` to achieve various desired effects, Notably: the +`data.transform.paralle_map` transformer accelerates image processing with +multi-threads or multi-processes. More transformers can be found in +`data.transform.transformer`. + + 4. Data feeding apis +To facilitate data pipeline building, we combine multiple `data.Dataset` to +form a `data.Reader` which can provide data for training, validation and +testing respectively. Users can simply call `Reader.[train|eval|infer]` to get +the corresponding data stream. 
Many aspect of the `Reader`, such as storage +location, preprocessing pipeline, acceleration mode can be configured with yaml +files. + +### APIs + +The main APIs are as follows: + +1. Data parsing + + - `source/coco_loader.py`: COCO dataset parser. [source](../ppdet/data/source/coco_loader.py) + - `source/voc_loader.py`: Pascal VOC dataset parser. [source](../ppdet/data/source/voc_loader.py) + [Note] To use a non-default label list for VOC datasets, a `label_list.txt` + file is needed, one can use the provided label list + (`data/pascalvoc/ImageSets/Main/label_list.txt`) or generate a custom one (with `tools/generate_data_for_training.py`). Also, `use_default_label` option should + be set to `false` in the configuration file + - `source/loader.py`: Roidb dataset parser. [source](../ppdet/data/source/loader.py) + +2. Operator + `transform/operators.py`: Contains a variety of data augmentation methods, including: +- `DecodeImage`: Read images in RGB format. +- `RandomFlipImage`: Horizontal flip. +- `RandomDistort`: Distort brightness, contrast, saturation, and hue. +- `ResizeImage`: Resize image with interpolation. +- `RandomInterpImage`: Use a random interpolation method to resize the image. +- `CropImage`: Crop image with respect to different scale, aspect ratio, and overlap. +- `ExpandImage`: Pad image to a larger size, padding filled with mean image value. +- `NormalizeImage`: Normalize image pixel values. +- `NormalizeBox`: Normalize the bounding box. +- `Permute`: Arrange the channels of the image and optionally convert image to BGR format. +- `MixupImage`: Mixup two images with given fraction[1](#mix). + +[1] Please refer to [this paper](https://arxiv.org/pdf/1710.09412.pdf)。 + +`transform/arrange_sample.py`: Assemble the data samples needed by different models. +3. 
Transformer +`transform/post_map.py`: Transformations that operates on whole batches, mainly for: +- Padding whole batch to given stride values +- Resize images to Multi-scales +- Randomly adjust the image size of the batch data +`transform/transformer.py`: Data filtering batching. +`transform/parallel_map.py`: Accelerate data processing with multi-threads/multi-processes. +4. Reader +`reader.py`: Combine source and transforms, return batch data according to `max_iter`. +`data_feed.py`: Configure default parameters for `reader.py`. + + +### Usage + +#### Canned Datasets + +Preset for common datasets, e.g., `COCO` and `Pascal Voc` are included. In +most cases, user can simply use these canned dataset as is. Moreover, the +whole data pipeline is fully customizable through the yaml configuration files. + +#### Custom Datasets + +- Option 1: Convert the dataset to COCO or VOC format. +```sh + # a small utility (`tools/labelme2coco.py`) is provided to convert + # Labelme-annotated dataset to COCO format. + python ./ppdet/data/tools/labelme2coco.py --json_input_dir ./labelme_annos/ + --image_input_dir ./labelme_imgs/ + --output_dir ./cocome/ + --train_proportion 0.8 + --val_proportion 0.2 + --test_proportion 0.0 + # --json_input_dir:The path of json files which are annotated by Labelme. + # --image_input_dir:The path of images. + # --output_dir:The path of coverted COCO dataset. + # --train_proportion:The train proportion of annatation data. + # --val_proportion:The validation proportion of annatation data. + # --test_proportion: The inference proportion of annatation data. +``` + +- Option 2: + +1. Add `source/XX_loader.py` and implement the `load` function, following the + example of `source/coco_loader.py` and `source/voc_loader.py`. +2. Modify the `load` function in `source/loader.py` to make use of the newly + added data loader. +3. Modify `/source/__init__.py` accordingly. 
+```python +if data_cf['type'] in ['VOCSource', 'COCOSource', 'RoiDbSource']: + source_type = 'RoiDbSource' +# Replace the above code with the following code: +if data_cf['type'] in ['VOCSource', 'COCOSource', 'RoiDbSource', 'XXSource']: + source_type = 'RoiDbSource' +``` +4. In the configure file, define the `type` of `dataset` as `XXSource`. + +#### How to add data pre-processing? + +- To add pre-processing operation for a single image, refer to the classes in + `transform/operators.py`, and implement the desired transformation with a new + class. +- To add pre-processing for a batch, one needs to modify the `build_post_map` + function in `transform/post_map.py`. diff --git a/docs/DATA_cn.md b/docs/DATA_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..57970169711e5c6d605999f119ae488eeb52f96c --- /dev/null +++ b/docs/DATA_cn.md @@ -0,0 +1,203 @@ +# 数据模块 + +## 介绍 +本模块是一个Python模块,用于加载数据并将其转换成适用于检测模型的训练、验证、测试所需要的格式——由多个np.ndarray组成的tuple数组,例如用于Faster R-CNN模型的训练数据格式为:`[(im, im_info, im_id, gt_bbox, gt_class, is_crowd), (...)]`。 + +### 实现 +该模块内部可分为4个子功能:数据解析、图片预处理、数据转换和数据获取接口。 + +我们采用`data.Dataset`表示一份数据,比如`COCO`数据包含3份数据,分别用于训练、验证和测试。原始数据存储与文件中,通过`data.source`加载到内存,然后使用`data.transform`对数据进行处理转换,最终通过`data.Reader`的接口可以获得用于训练、验证和测试的batch数据。 + +子功能介绍: + +1. 数据解析 + 数据解析得到的是`data.Dataset`,实现逻辑位于`data.source`中。通过它可以实现解析不同格式的数据集,已支持的数据源包括: +- COCO数据源 + 该数据集目前分为COCO2012和COCO2017,主要由json文件和image文件组成,其组织结构如下所示: + + ``` + dataset/coco/ + ├── annotations + │ ├── instances_train2014.json + │ ├── instances_train2017.json + │ ├── instances_val2014.json + │ ├── instances_val2017.json + | ... + ├── train2017 + │ ├── 000000000009.jpg + │ ├── 000000580008.jpg + | ... + ├── val2017 + │ ├── 000000000139.jpg + │ ├── 000000000285.jpg + | ... + ``` + + +- Pascal VOC数据源 + 该数据集目前分为VOC2007和VOC2012,主要由xml文件和image文件组成,其组织结构如下所示: + + + ``` + data/pascalvoc/ + ├──Annotations + │ ├── i000050.jpg + │ ├── 003876.xml + | ... 
+ ├── ImageSets + │ ├──Main + └── train.txt + └── val.txt + └── test.txt + └── dog_train.txt + └── dog_trainval.txt + └── dog_val.txt + └── dog_test.txt + └── ... + │ ├──Layout + └──... + │ ├── Segmentation + └──... + ├── JPEGImages + │ ├── 000050.jpg + │ ├── 003876.jpg + | ... + ``` + + + +- Roidb数据源 + 该数据集主要由COCO数据集和Pascal VOC数据集转换而成的pickle文件,包含一个dict,而dict中只包含一个命名为‘records’的list(可能还有一个命名为‘cname2cid’的字典),其内容如下所示: +```python +(records, catname2clsid) +'records'是一个list并且它的结构如下: +{ + 'im_file': im_fname, # 图像文件名 + 'im_id': im_id, # 图像id + 'h': im_h, # 图像高度 + 'w': im_w, # 图像宽度 + 'is_crowd': is_crowd, # 是否重叠 + 'gt_class': gt_class, # 真实框类别 + 'gt_bbox': gt_bbox, # 真实框坐标 + 'gt_poly': gt_poly, # 多边形坐标 +} +'cname2id'是一个dict,保存了类别名到id的映射 + +``` +我们在`./tools/`中提供了一个生成roidb数据集的代码,可以通过下面命令实现该功能。 +``` +# --type: 原始数据集的类别(只能是xml或者json) +# --annotation: 一个包含所需标注文件名的文件的路径 +# --save-dir: 保存路径 +# --samples: sample的个数(默认是-1,代表使用所有sample) +python ./ppdet/data/tools/generate_data_for_training.py + --type=json \ + --annotation=./annotations/instances_val2017.json \ + --save-dir=./roidb \ + --samples=-1 +``` + 2. 图片预处理 + 图片预处理通过包括图片解码、缩放、裁剪等操作,我们采用`data.transform.operator`算子的方式来统一实现,这样能方便扩展。此外,多个算子还可以组合形成复杂的处理流程, 并被`data.transformer`中的转换器使用,比如多线程完成一个复杂的预处理流程。 + + 3. 数据转换器 + 数据转换器的功能是完成对某个`data.Dataset`进行转换处理,从而得到一个新的`data.Dataset`。我们采用装饰器模式实现各种不同的`data.transform.transformer`。比如用于多进程预处理的`dataset.transform.paralle_map`转换器。 + + 4. 数据获取接口 + 为方便训练时的数据获取,我们将多个`data.Dataset`组合在一起构成一个`data.Reader`为用户提供数据,用户只需要调用`Reader.[train|eval|infer]`即可获得对应的数据流。`Reader`支持yaml文件配置数据地址、预处理过程、加速方式等。 + +### APIs + +主要的APIs如下: + + +1. 
数据解析 + + - `source/coco_loader.py`:用于解析COCO数据集。[详见代码](../ppdet/data/source/coco_loader.py) + - `source/voc_loader.py`:用于解析Pascal VOC数据集。[详见代码](../ppdet/data/source/voc_loader.py) + [注意]在使用VOC数据集时,若不使用默认的label列表,则需要先使用`tools/generate_data_for_training.py`生成`label_list.txt`(使用方式与数据解析中的roidb数据集获取过程一致),或提供`label_list.txt`放置于`data/pascalvoc/ImageSets/Main`中;同时在配置文件中设置参数`use_default_label`为`false`。 + - `source/loader.py`:用于解析Roidb数据集。[详见代码](../ppdet/data/source/loader.py) + +2. 算子 + `transform/operators.py`:包含多种数据增强方式,主要包括: + +``` python +RandomFlipImage:水平翻转。 +RandomDistort:随机扰动图片亮度、对比度、饱和度和色相。 +ResizeImage:根据特定的插值方式调整图像大小。 +RandomInterpImage:使用随机的插值方式调整图像大小。 +CropImage:根据缩放比例、长宽比例两个参数生成若干候选框,再依据这些候选框和标注框的面积交并比(IoU)挑选出符合要求的裁剪结果。 +ExpandImage:将原始图片放进一张使用像素均值填充(随后会在减均值操作中减掉)的扩张图中,再对此图进行裁剪、缩放和翻转。 +DecodeImage:以RGB格式读取图像。 +Permute:对图像的通道进行排列并转为BGR格式。 +NormalizeImage:对图像像素值进行归一化。 +NormalizeBox:对bounding box进行归一化。 +MixupImage:按比例叠加两张图像。 +``` +[注意]:Mixup的操作可参考[论文](https://arxiv.org/pdf/1710.09412.pdf)。 + +`transform/arrange_sample.py`:实现对输入网络数据的排序。 +3. 转换 +`transform/post_map.py`:用于完成批数据的预处理操作,其主要包括: + +``` python +随机调整批数据的图像大小 +多尺度调整图像大小 +padding操作 +``` +`transform/transformer.py`:用于过滤无用的数据,并返回批数据。 +`transform/parallel_map.py`:用于实现加速。 +4. 读取 +`reader.py`:用于组合source和transformer操作,根据`max_iter`返回batch数据。 +`data_feed.py`: 用于配置 `reader.py`中所需的默认参数. + + + + +### 使用 +#### 常规使用 +结合yaml文件中的配置信息,完成本模块的功能。yaml文件的使用可以参见配置文件部分。 + + - 读取用于训练的数据 + +``` python +ccfg = load_cfg('./config.yml') +coco = Reader(ccfg.DATA, ccfg.TRANSFORM, maxiter=-1) +``` +#### 如何使用自定义数据集? 
+ +- 选择1:将数据集转换为VOC格式或者COCO格式。 +``` + # 在./tools/中提供了labelme2coco.py用于将labelme标注的数据集转换为COCO数据集 + python ./ppdet/data/tools/labelme2coco.py --json_input_dir ./labelme_annos/ + --image_input_dir ./labelme_imgs/ + --output_dir ./cocome/ + --train_proportion 0.8 + --val_proportion 0.2 + --test_proportion 0.0 + # --json_input_dir:使用labelme标注的json文件所在文件夹 + # --image_input_dir:图像文件所在文件夹 + # --output_dir:转换后的COCO格式数据集存放位置 + # --train_proportion:标注数据中用于train的比例 + # --val_proportion:标注数据中用于validation的比例 + # --test_proportion: 标注数据中用于infer的比例 +``` +- 选择2: + +1. 仿照`./source/coco_loader.py`和`./source/voc_loader.py`,添加`./source/XX_loader.py`并实现`load`函数。 +2. 在`./source/loader.py`的`load`函数中添加使用`./source/XX_loader.py`的入口。 +3. 修改`./source/__init__.py`: + + +```python +if data_cf['type'] in ['VOCSource', 'COCOSource', 'RoiDbSource']: + source_type = 'RoiDbSource' +# 将上述代码替换为如下代码: +if data_cf['type'] in ['VOCSource', 'COCOSource', 'RoiDbSource', 'XXSource']: + source_type = 'RoiDbSource' +``` + +4. 在配置文件中修改`dataset`下的`type`为`XXSource`。 + +#### 如何增加数据预处理? 
+- 若增加单张图像的增强预处理,可在`transform/operators.py`中参考每个类的代码,新建一个类来实现新的数据增强;同时在配置文件中增加该预处理。 +- 若增加单个batch的图像预处理,可在`transform/post_map.py`中参考`build_post_map`中每个函数的代码,新建一个内部函数来实现新的批数据预处理;同时在配置文件中增加该预处理。 diff --git a/docs/EXPORT_MODEL.md b/docs/EXPORT_MODEL.md new file mode 100644 index 0000000000000000000000000000000000000000..614d87e29c5e5d5685f64c9ee78cf7d1d5192d09 --- /dev/null +++ b/docs/EXPORT_MODEL.md @@ -0,0 +1,48 @@ +# 模型导出 + +训练得到一个满足要求的模型后,如果想要将该模型接入到C++预测库或者Serving服务,需要通过`tools/export_model.py`导出该模型。 + +## 启动参数说明 + +| FLAG | 用途 | 默认值 | 备注 | +|:--------------:|:--------------:|:------------:|:-----------------------------------------:| +| -c | 指定配置文件 | None | | +| --output_dir | 模型保存路径 | `./output` | 模型默认保存在`output/配置文件名/`路径下 | + +## 使用示例 + +使用[训练/评估/推断](GETTING_STARTED_cn.md)中训练得到的模型进行试用,脚本如下 + +```bash +# 导出FasterRCNN模型, 模型中data层默认的shape为3x800x1333 +python tools/export_model.py -c configs/faster_rcnn_r50_1x.yml \ + --output_dir=./inference_model \ + -o weights=output/faster_rcnn_r50_1x/model_final \ + +``` + +预测模型会导出到`inference_model/faster_rcnn_r50_1x`目录下,模型名和参数名分别为`__model__`和`__params__`。 + +## 设置导出模型的输入大小 + +使用Fluid-TensorRT进行预测时,由于<=TensorRT 5.1的版本仅支持定长输入,保存模型的`data`层的图片大小需要和实际输入图片大小一致。而Fluid C++预测引擎没有此限制。可通过设置TestFeed的`image_shape`可以修改保存模型中的输入图片大小。示例如下: + +```bash +# 导出FasterRCNN模型,输入是3x640x640 +python tools/export_model.py -c configs/faster_rcnn_r50_1x.yml \ + --output_dir=./inference_model \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar \ + FasterRCNNTestFeed.image_shape=[3,640,640] + +# 导出YOLOv3模型,输入是3x320x320 +python tools/export_model.py -c configs/yolov3_darknet.yml \ + --output_dir=./inference_model \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar \ + YoloTestFeed.image_shape=[3,320,320] + +# 导出SSD模型,输入是3x300x300 +python tools/export_model.py -c configs/ssd/ssd_mobilenet_v1_voc.yml \ + --output_dir=./inference_model \ + -o weights= 
https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar \ + SSDTestFeed.image_shape=[3,300,300] +``` diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..199b343ed2a80a1bf80fcf3d3206fbdf8413551f --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,187 @@ +English | [简体中文](GETTING_STARTED_cn.md) + +# Getting Started + +For setting up the running environment, please refer to [installation +instructions](INSTALL.md). + + +## Training/Evaluation/Inference + +PaddleDetection provides scripts for training, evaluation and inference, with various features selected through different configurations. + +```bash +# set PYTHONPATH +export PYTHONPATH=$PYTHONPATH:. +# training in single-GPU and multi-GPU. specify different GPU numbers by CUDA_VISIBLE_DEVICES +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python tools/train.py -c configs/faster_rcnn_r50_1x.yml +# GPU evaluation +export CUDA_VISIBLE_DEVICES=0 +python tools/eval.py -c configs/faster_rcnn_r50_1x.yml +# Inference +python tools/infer.py -c configs/faster_rcnn_r50_1x.yml --infer_img=demo/000000570688.jpg +``` + +### Optional argument list + +The list below can be viewed with `--help` + +| FLAG | script supported | description | default | remark | +| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | +| -c | ALL | Select config file | None | **The whole description of configure can refer to [config_example](config_example)** | +| -o | ALL | Set parameters in configure file | None | `-o` has higher priority to file configured by `-c`. 
Such as `-o use_gpu=False max_iter=10000` | +| -r/--resume_checkpoint | train | Checkpoint path for resuming training | None | `-r output/faster_rcnn_r50_1x/10000` | +| --eval | train | Whether to perform evaluation in training | False | | +| --output_eval | train/eval | json path in evalution | current path | `--output_eval ./json_result` | +| -d/--dataset_dir | train/eval | path for dataset, same as dataset_dir in configs | None | `-d dataset/coco` | +| --fp16 | train | Whether to enable mixed precision training | False | GPU training is required | +| --loss_scale | train | Loss scaling factor for mixed precision training | 8.0 | enable when `--fp16` is True | +| --json_eval | eval | Whether to evaluate with already existed bbox.json or mask.json | False | json path is set in `--output_eval` | +| --output_dir | infer | Directory for storing the output visualization files | `./output` | `--output_dir output` | +| --draw_threshold | infer | Threshold to reserve the result for visualization | 0.5 | `--draw_threshold 0.7` | +| --infer_dir | infer | Directory for images to perform inference on | None | | +| --infer_img | infer | Image path | None | higher priority over --infer_dir | +| --use_tb | train/infer | Whether to record the data with [tb-paddle](https://github.com/linshuliang/tb-paddle), so as to display in Tensorboard | False | | +| --tb\_log_dir | train/infer | tb-paddle logging directory for image | train:`tb_log_dir/scalar` infer: `tb_log_dir/image` | | + + +## Examples + +### Training + +- Perform evaluation in training + + ```bash + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml --eval + ``` + + Perform training and evalution alternatively and evaluate at each snapshot_iter. Meanwhile, the best model with highest MAP is saved at each `snapshot_iter` which has the same path as `model_final`. + + If evaluation dataset is large, we suggest decreasing evaluation times or evaluating after training. 
+ +- Fine-tune other task + + When using a pre-trained model to fine-tune another task, two methods can be used: + + 1. The excluded pre-trained parameters can be set by `finetune_exclude_pretrained_params` in YAML config + 2. Set -o finetune\_exclude\_pretrained_params in the arguments. + + ```bash + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ + -o pretrain_weights=output/faster_rcnn_r50_1x/model_final/ \ + finetune_exclude_pretrained_params=['cls_score','bbox_pred'] + ``` + +##### NOTES + +- `CUDA_VISIBLE_DEVICES` can specify different gpu numbers. Such as: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. For GPU calculation rules, refer to the [FAQ](#faq). +- Dataset will be downloaded automatically and cached in `~/.cache/paddle/dataset` if it is not found locally. +- Pretrained model is downloaded automatically and cached in `~/.cache/paddle/weights`. +- Checkpoints are saved in `output` by default, and the location can be changed via `save_dir` in the configuration file. +- Training RCNN models on CPU is not supported on PaddlePaddle<=1.5.1; this will be fixed in a later version. + + +### Mixed Precision Training + +Mixed precision training can be enabled with the `--fp16` flag. Currently Faster-FPN, Mask-FPN and Yolov3 have been verified to work with little to no loss of precision (less than 0.2 mAP). + +To speed up mixed precision training, it is recommended to train in multi-process mode, for example: + +```bash +python -m paddle.distributed.launch --selected_gpus 0,1,2,3,4,5,6,7 tools/train.py --fp16 -c configs/faster_rcnn_r50_fpn_1x.yml +``` + +If the loss becomes `NaN` during training, try tweaking the `--loss_scale` value. Please refer to the Nvidia [documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#mptrain) on mixed precision training for details. + +Also, please note mixed precision training currently requires changing `norm_type` from `affine_channel` to `bn`. 
+ + + +### Evaluation + +- Evaluate by specified weights path and dataset path + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python -u tools/eval.py -c configs/faster_rcnn_r50_1x.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar \ + -d dataset/coco + ``` + + The path of the model to be evaluated can be either a local path or a link listed in [MODEL_ZOO](MODEL_ZOO.md). + +- Evaluate with json + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python tools/eval.py -c configs/faster_rcnn_r50_1x.yml \ + --json_eval \ + --output_eval evaluation/ + ``` + + The json file must be named bbox.json or mask.json, placed in the `evaluation/` directory. + +#### NOTES + +- Multi-GPU evaluation for R-CNN and SSD models is not supported at the +moment, but it is a planned feature + + +### Inference + +- Output specified directory && Set up threshold + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python tools/infer.py -c configs/faster_rcnn_r50_1x.yml \ + --infer_img=demo/000000570688.jpg \ + --output_dir=infer_output/ \ + --draw_threshold=0.5 \ + -o weights=output/faster_rcnn_r50_1x/model_final \ + --use_tb=True + ``` + + `--draw_threshold` is an optional argument. Default is 0.5. + Different thresholds will produce different results depending on the calculation of [NMS](https://ieeexplore.ieee.org/document/1699659). + + +- Export model + + ```bash + python tools/export_model.py -c configs/faster_rcnn_r50_1x.yml \ + --output_dir=inference_model \ + -o weights=output/faster_rcnn_r50_1x/model_final \ + FasterRCNNTestFeed.image_shape=[3,800,1333] + ``` + + Save the inference model with `tools/export_model.py`; the saved model can then be loaded by the PaddlePaddle prediction library. + +## FAQ + +**Q:** Why do I get `NaN` loss values during single GPU training?
+**A:** The default learning rate is tuned for multi-GPU training (8x GPUs), so it must +be adapted for single-GPU training accordingly (e.g., divide it by 8). +The calculation rules are as follows; the settings in each row are equivalent:
+ + +| GPU number | Learning rate | Max_iters | Milestones | +| :---------: | :------------: | :-------: | :--------------: | +| 2 | 0.0025 | 720000 | [480000, 640000] | +| 4 | 0.005 | 360000 | [240000, 320000] | +| 8 | 0.01 | 180000 | [120000, 160000] | + + +**Q:** How to reduce GPU memory usage?
+**A:** Setting the environment variable `FLAGS_conv_workspace_size_limit` to a smaller +number can reduce the GPU memory footprint without affecting training speed. +Taking Mask-RCNN (R50) as an example, by setting `export FLAGS_conv_workspace_size_limit=512`, +the batch size can reach 4 per GPU (Tesla V100 16GB). + + +**Q:** How to change data preprocessing?
+**A:** Set `sample_transform` in the configuration. Note that **the complete list of transforms** needs to be added in the configuration. +For example, `DecodeImage`, `NormalizeImage` and `Permute` in RCNN models. For a detailed description, please refer +to [config_example](config_example). diff --git a/docs/GETTING_STARTED_cn.md b/docs/GETTING_STARTED_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..b5dd6041033e539e18d59dc8669a3658ba395da2 --- /dev/null +++ b/docs/GETTING_STARTED_cn.md @@ -0,0 +1,173 @@ +# 开始 + +关于配置运行环境,请参考[安装指南](INSTALL_cn.md) + + +## 训练/评估/推断 + +PaddleDetection提供了训练/评估/推断三个功能的使用脚本,支持通过不同可选参数实现特定功能 + +```bash +# 设置PYTHONPATH路径 +export PYTHONPATH=$PYTHONPATH:. +# GPU训练 支持单卡,多卡训练,通过CUDA_VISIBLE_DEVICES指定卡号 +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python tools/train.py -c configs/faster_rcnn_r50_1x.yml +# GPU评估 +export CUDA_VISIBLE_DEVICES=0 +python tools/eval.py -c configs/faster_rcnn_r50_1x.yml +# 推断 +python tools/infer.py -c configs/faster_rcnn_r50_1x.yml --infer_img=demo/000000570688.jpg +``` + +### 可选参数列表 + +以下列表可以通过`--help`查看 + +| FLAG | 支持脚本 | 用途 | 默认值 | 备注 | +| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | +| -c | ALL | 指定配置文件 | None | **完整配置说明请参考[配置案例](config_example)** | +| -o | ALL | 设置配置文件里的参数内容 | None | 使用-o配置相较于-c选择的配置文件具有更高的优先级。例如:`-o use_gpu=False max_iter=10000` | +| -r/--resume_checkpoint | train | 从某一检查点恢复训练 | None | `-r output/faster_rcnn_r50_1x/10000` | +| --eval | train | 是否边训练边测试 | False | | +| --output_eval | train/eval | 编辑评测保存json路径 | 当前路径 | `--output_eval ./json_result` | +| -d/--dataset_dir | train/eval | 数据集路径, 同配置文件里的dataset_dir | None | `-d dataset/coco` | +| --fp16 | train | 是否使用混合精度训练模式 | False | 需使用GPU训练 | +| --loss_scale | train | 设置混合精度训练模式中损失值的缩放比例 | 8.0 | 需先开启`--fp16`后使用 | +| --json_eval | eval | 是否通过已存在的bbox.json或者mask.json进行评估 | False | json文件路径在`--output_eval`中设置 | +| --output_dir | infer | 输出推断后可视化文件 | `./output` | `--output_dir output` | 
+| --draw_threshold | infer | 可视化时分数阈值 | 0.5 | `--draw_threshold 0.7` | +| --infer_dir | infer | 用于推断的图片文件夹路径 | None | | +| --infer_img | infer | 用于推断的图片路径 | None | 相较于`--infer_dir`具有更高优先级 | +| --use_tb | train/infer | 是否使用[tb-paddle](https://github.com/linshuliang/tb-paddle)记录数据,进而在TensorBoard中显示 | False | | +| --tb\_log_dir | train/infer | 指定 tb-paddle 记录数据的存储路径 | train:`tb_log_dir/scalar` infer: `tb_log_dir/image` | | + + +## 使用示例 + +### 模型训练 + +- 边训练边测试 + + ```bash + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml --eval -d dataset/coco + ``` + + 在训练中交替执行评估, 评估在每个snapshot\_iter时开始。每次评估后还会评出最佳mAP模型保存到`best_model`文件夹下。 + + 如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + + +- Fine-tune其他任务 + + 使用预训练模型fine-tune其他任务时,可采用如下两种方式: + + 1. 在YAML配置文件中设置`finetune_exclude_pretrained_params` + 2. 在命令行中添加-o finetune\_exclude\_pretrained_params对预训练模型进行选择性加载。 + + ```bash + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ + -o pretrain_weights=output/faster_rcnn_r50_1x/model_final/ \ + finetune_exclude_pretrained_params=['cls_score','bbox_pred'] + ``` + + 详细说明请参考[Transfer Learning](TRANSFER_LEARNING_cn.md) + +#### 提示 + +- `CUDA_VISIBLE_DEVICES` 参数可以指定不同的GPU。例如: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. 
GPU计算规则可以参考 [FAQ](#faq) +- 若本地未找到数据集,将自动下载数据集并保存在`~/.cache/paddle/dataset`中。 +- 预训练模型自动下载并保存在`〜/.cache/paddle/weights`中。 +- 模型checkpoints默认保存在`output`中,可通过修改配置文件中save_dir进行配置。 +- RCNN系列模型CPU训练在PaddlePaddle 1.5.1及以下版本暂不支持。 + +### 混合精度训练 + +通过设置 `--fp16` 命令行选项可以启用混合精度训练。目前混合精度训练已经在Faster-FPN, Mask-FPN 及 Yolov3 上进行验证,几乎没有精度损失(小于0.2 mAP)。 + +建议使用多进程方式来进一步加速混合精度训练。示例如下。 + +```bash +python -m paddle.distributed.launch --selected_gpus 0,1,2,3,4,5,6,7 tools/train.py --fp16 -c configs/faster_rcnn_r50_fpn_1x.yml +``` + +如果训练过程中loss出现`NaN`,请尝试调节`--loss_scale`选项数值,细节请参看混合精度训练相关的[Nvidia文档](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#mptrain)。 + +另外,请注意将配置文件中的 `norm_type` 由 `affine_channel` 改为 `bn`。 + + +### 模型评估 + +- 指定权重和数据集路径 + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python -u tools/eval.py -c configs/faster_rcnn_r50_1x.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar \ + -d dataset/coco + ``` + + 评估模型可以为本地路径,例如`output/faster_rcnn_r50_1x/model_final/`, 也可以为[MODEL_ZOO](MODEL_ZOO_cn.md)中给出的模型链接。 + +- 通过json文件评估 + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python -u tools/eval.py -c configs/faster_rcnn_r50_1x.yml \ + --json_eval \ + --output_eval evaluation/ + ``` + + json文件必须命名为bbox.json或者mask.json,放在`evaluation/`目录下。 + +#### 提示 + +- R-CNN和SSD模型目前暂不支持多GPU评估,将在后续版本支持 + + +### 模型推断 + +- 设置输出路径 && 设置推断阈值 + + ```bash + export CUDA_VISIBLE_DEVICES=0 + python -u tools/infer.py -c configs/faster_rcnn_r50_1x.yml \ + --infer_img=demo/000000570688.jpg \ + --output_dir=infer_output/ \ + --draw_threshold=0.5 \ + -o weights=output/faster_rcnn_r50_1x/model_final \ + ``` + + + `--draw_threshold` 是个可选参数. 根据 [NMS](https://ieeexplore.ieee.org/document/1699659) 的计算, + 不同阈值会产生不同的结果。如果用户需要对自定义路径的模型进行推断,可以设置`-o weights`指定模型路径。 + +## FAQ + +**Q:** 为什么我使用单GPU训练loss会出`NaN`?
+**A:** 默认学习率是适配多GPU训练(8x GPU),若使用单GPU训练,须对应调整学习率(例如,除以8)。 +计算规则表如下所示,它们是等价的:
+ + +| GPU数 | 学习率 | 最大轮数 | 变化节点 | +| :---------: | :------------: | :-------: | :--------------: | +| 2 | 0.0025 | 720000 | [480000, 640000] | +| 4 | 0.005 | 360000 | [240000, 320000] | +| 8 | 0.01 | 180000 | [120000, 160000] | + + +**Q:** 如何减少GPU显存使用率?
+**A:** 可通过设置环境变量`FLAGS_conv_workspace_size_limit`为较小的值来减少显存消耗,并且不 +会影响训练速度。以Mask-RCNN(R50)为例,设置`export FLAGS_conv_workspace_size_limit=512`, +batch size可以达到每GPU 4 (Tesla V100 16GB)。 + + +**Q:** 如何修改数据预处理?
+**A:** 可在配置文件中设置 `sample_transform`。注意需要在配置文件中加入**完整预处理**, +例如RCNN模型中的`DecodeImage`, `NormalizeImage`和`Permute`。更多详细描述请参考[配置案例](config_example)。 + + +**Q:** affine_channel和batch norm是什么关系? +**A:** 在RCNN系列模型加载预训练模型初始化,有时候会固定住batch norm的参数, 使用预训练模型中的全局均值和方差,并且batch norm的scale和bias参数不更新,已发布的大多ResNet系列的RCNN模型采用这种方式。这种情况下可以在config中设置norm_type为bn或affine_channel, freeze_norm为true (默认为true),两种方式等价。affine_channel的计算方式为`scale * x + bias`。只不过设置affine_channel时,内部对batch norm的参数自动做了融合。如果训练使用的affine_channel,用保存的模型做初始化,训练其他任务时,既可使用affine_channel, 也可使用batch norm, 参数均可正确加载。 diff --git a/docs/INSTALL.md b/docs/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..0761a240b51ea22bcfec5a9d6b70560f1ad17de3 --- /dev/null +++ b/docs/INSTALL.md @@ -0,0 +1,142 @@ +English | [简体中文](INSTALL_cn.md) + +# Installation + +--- +## Table of Contents + +- [Introduction](#introduction) +- [PaddlePaddle](#paddlepaddle) +- [Other Dependencies](#other-dependencies) +- [PaddleDetection](#paddledetection) +- [Datasets](#datasets) + + +## Introduction + +This document covers how to install PaddleDetection, its dependencies +(including PaddlePaddle), together with the COCO and Pascal VOC datasets. + +For general information about PaddleDetection, please see [README.md](../README.md). + + +## PaddlePaddle + +Running PaddleDetection requires PaddlePaddle Fluid v1.5 or later. Please follow the instructions in the [installation document](http://www.paddlepaddle.org.cn/). + +Please make sure your PaddlePaddle installation was successful and the version +of your PaddlePaddle is not lower than required. Verify with the following commands. 
+ +``` +# To check PaddlePaddle installation in your Python interpreter +>>> import paddle.fluid as fluid +>>> fluid.install_check.run_check() + +# To check PaddlePaddle version +python -c "import paddle; print(paddle.__version__)" +``` + +### Requirements: + +- Python2 or Python3 (Only support Python3 for windows) +- CUDA >= 8.0 +- cuDNN >= 5.0 +- nccl >= 2.1.2 + + +## Other Dependencies + +[COCO-API](https://github.com/cocodataset/cocoapi): + +COCO-API is needed for running. Installation is as follows: + + git clone https://github.com/cocodataset/cocoapi.git + cd cocoapi/PythonAPI + # if cython is not installed + pip install Cython + # Install into global site-packages + make install + # Alternatively, if you do not have permissions or prefer + # not to install the COCO API into global site-packages + python setup.py install --user + +**Installation of COCO-API in windows:** + + # if cython is not installed + pip install Cython + # Because the origin version of cocoapi does not support windows, another version is used which only supports Python3 + pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI + +## PaddleDetection + +**Clone Paddle models repository:** + +You can clone Paddle models and change working directory to PaddleDetection +with the following commands: + +``` +cd +git clone https://github.com/PaddlePaddle/models +cd models/PaddleCV/PaddleDetection +``` + +**Install Python dependencies:** + +Required python packages are specified in [requirements.txt](../requirements.txt), and can be installed with: + +``` +pip install -r requirements.txt +``` + +**Make sure the tests pass:** + +``` +export PYTHONPATH=`pwd`:$PYTHONPATH +python ppdet/modeling/tests/test_architectures.py +``` + + +## Datasets + +PaddleDetection includes support for [COCO](http://cocodataset.org) and [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) by default, please follow these instructions to set up the dataset. 
+ +**Create symlinks for local datasets:** + +Default dataset path in config files is `dataset/coco` and `dataset/voc`, if the +datasets are already available on disk, you can simply create symlinks to +their directories: + +``` +ln -sf /dataset/coco +ln -sf /dataset/voc +``` + +**Download datasets manually:** + +On the other hand, to download the datasets, run the following commands: + +- COCO + +``` +export PYTHONPATH=$PYTHONPATH:. +python dataset/coco/download_coco.py +``` + +- Pascal VOC + +``` +export PYTHONPATH=$PYTHONPATH:. +python dataset/voc/download_voc.py +``` + +**Download datasets automatically:** + +If a training session is started but the dataset is not setup properly (e.g, +not found in `dataset/coco` or `dataset/voc`), PaddleDetection can automatically +download them from [COCO-2017](http://images.cocodataset.org) and +[VOC2012](http://host.robots.ox.ac.uk/pascal/VOC), the decompressed datasets +will be cached in `~/.cache/paddle/dataset/` and can be discovered automatically +subsequently. + + +**NOTE:** For further informations on the datasets, please see [DATA.md](DATA.md) diff --git a/docs/INSTALL_cn.md b/docs/INSTALL_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..eca4d41b7a79b6ec385df3ab7e95366530705933 --- /dev/null +++ b/docs/INSTALL_cn.md @@ -0,0 +1,137 @@ +# 安装文档 + +--- +## 目录 + +- [简介](#introduction) +- [PaddlePaddle](#paddlepaddle) +- [其他依赖安装](#other-dependencies) +- [PaddleDetection](#paddle-detection) +- [数据集](#datasets) + + +## 简介 + +这份文档介绍了如何安装PaddleDetection及其依赖项(包括PaddlePaddle),以及COCO和Pascal VOC数据集。 + +PaddleDetection的相关信息,请参考[README.md](../README.md). 
+ + +## PaddlePaddle + + +运行PaddleDetection需要PaddlePaddle Fluid v.1.5及更高版本。请按照[安装文档](http://www.paddlepaddle.org.cn/)中的说明进行操作。 + +请确保您的PaddlePaddle安装成功并且版本不低于需求版本。使用以下命令进行验证。 + +``` +# 在您的Python解释器中确认PaddlePaddle安装成功 +>>> import paddle.fluid as fluid +>>> fluid.install_check.run_check() + +# 确认PaddlePaddle版本 +python -c "import paddle; print(paddle.__version__)" +``` + +### 环境需求: + +- Python2 or Python3 (windows系统仅支持Python3) +- CUDA >= 8.0 +- cuDNN >= 5.0 +- nccl >= 2.1.2 + + +## 其他依赖安装 + +[COCO-API](https://github.com/cocodataset/cocoapi): + +运行需要COCO-API,安装方式如下: + + git clone https://github.com/cocodataset/cocoapi.git + cd cocoapi/PythonAPI + # 若Cython未安装,请安装Cython + pip install Cython + # 安装至全局site-packages + make install + # 若您没有权限或更倾向不安装至全局site-packages + python setup.py install --user + +**windows用户安装COCO-API方式:** + + # 若Cython未安装,请安装Cython + pip install Cython + # 由于原版cocoapi不支持windows,采用第三方实现版本,该版本仅支持Python3 + pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI + +## PaddleDetection + +**克隆Paddle models模型库:** + +您可以通过以下命令克隆Paddle models模型库并切换工作目录至PaddleDetection: + +``` +cd +git clone https://github.com/PaddlePaddle/models +cd models/PaddleCV/PaddleDetection +``` + +**安装Python依赖库:** + +Python依赖库在[requirements.txt](../requirements.txt)中给出,可通过如下命令安装: + +``` +pip install -r requirements.txt +``` + +**确认测试通过:** + +``` +export PYTHONPATH=`pwd`:$PYTHONPATH +python ppdet/modeling/tests/test_architectures.py +``` + + +## 数据集 + + +PaddleDetection默认支持[COCO](http://cocodataset.org)和[Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/), +请按照如下步骤设置数据集。 + +**为本地数据集创建软链接:** + + +配置文件中默认的数据集路径是`dataset/coco`和`dataset/voc`,如果您本地磁盘上已有数据集, +只需创建软链接至数据集目录: + +``` +ln -sf /dataset/coco +ln -sf /dataset/voc +``` + +**手动下载数据集:** + +若您本地没有数据集,可通过如下命令下载: + +- COCO + +``` +export PYTHONPATH=$PYTHONPATH:. +python dataset/coco/download_coco.py +``` + +- Pascal VOC + +``` +export PYTHONPATH=$PYTHONPATH:. 
+python dataset/voc/download_voc.py +``` + +**自动下载数据集:** + +若您在数据集未成功设置(例如,在`dataset/coco`或`dataset/voc`中找不到)的情况下开始运行, +PaddleDetection将自动从[COCO-2017](http://images.cocodataset.org)或 +[VOC2012](http://host.robots.ox.ac.uk/pascal/VOC)下载,解压后的数据集将被保存在 +`~/.cache/paddle/dataset/`目录下,下次运行时,也可自动从该目录发现数据集。 + + +**说明:** 更多有关数据集的介绍,请参考[DATA.md](DATA_cn.md) diff --git a/docs/MODEL_ZOO.md b/docs/MODEL_ZOO.md new file mode 100644 index 0000000000000000000000000000000000000000..d6042ada1293ea77a1670871bbff1d6f94f8a163 --- /dev/null +++ b/docs/MODEL_ZOO.md @@ -0,0 +1,165 @@ +English | [简体中文](MODEL_ZOO_cn.md) + +# Model Zoo and Benchmark +## Environment + +- Python 2.7.1 +- PaddlePaddle >=1.5 +- CUDA 9.0 +- cuDNN >=7.4 +- NCCL 2.1.2 + +## Common settings + +- All models below were trained on `coco_2017_train`, and tested on `coco_2017_val`. +- Batch Normalization layers in backbones are replaced by Affine Channel layers. +- Unless otherwise noted, all ResNet backbones adopt the [ResNet-B](https://arxiv.org/pdf/1812.01187) variant. +- For RCNN and RetinaNet models, only horizontal flipping data augmentation was used in the training phase and no augmentations were used in the testing phase. +- **Inf time (fps)**: the inference speed is measured in fps (images/s) on a single GPU (Tesla V100) with cuDNN 7.5 by running 'tools/eval.py' on the whole validation set, including data loading, network forward pass and post-processing. The batch size is 1. + + +## Training Schedules + +- We adopt exactly the same training schedules as [Detectron](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#training-schedules). +- 1x indicates the schedule starts at a LR of 0.02 and is decreased by a factor of 10 after 60k and 80k iterations and eventually terminates at 90k iterations for minibatch size 16. For batch size 8, LR is decreased to 0.01, total training iterations are doubled, and the decay milestones are scaled by 2. 
+- 2x schedule is twice as long as 1x, with the LR milestones scaled accordingly. + +## ImageNet Pretrained Models + +The backbone models pretrained on ImageNet are available. All backbone models are pretrained on standard ImageNet-1k dataset and can be downloaded [here](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#supported-models-and-performances). + +- **Notes:** The ResNet50 model was trained with cosine LR decay schedule and can be downloaded [here](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar). + +## Baselines + +### Faster & Mask R-CNN + +| Backbone | Type | Image/gpu | Lr schd | Inf time (fps) | Box AP | Mask AP | Download | +| :---------------------- | :------------- | :-------: | :-----: | :------------: | :----: | :-----: | :----------------------------------------------------------: | +| ResNet50 | Faster | 1 | 1x | 12.747 | 35.2 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar) | +| ResNet50 | Faster | 1 | 2x | 12.686 | 37.1 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_2x.tar) | +| ResNet50 | Mask | 1 | 1x | 11.615 | 36.5 | 32.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_1x.tar) | +| ResNet50 | Mask | 1 | 2x | 11.494 | 38.2 | 33.4 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_2x.tar) | +| ResNet50-vd | Faster | 1 | 1x | 12.575 | 36.4 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_vd_1x.tar) | +| ResNet50-FPN | Faster | 2 | 1x | 22.273 | 37.2 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_fpn_1x.tar) | +| ResNet50-FPN | Faster | 2 | 2x | 22.297 | 37.7 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_fpn_2x.tar) | +| ResNet50-FPN | Mask | 1 | 1x | 15.184 | 37.9 | 34.2 | 
[model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_fpn_1x.tar) | +| ResNet50-FPN | Mask | 1 | 2x | 15.881 | 38.7 | 34.7 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_fpn_2x.tar) | +| ResNet50-FPN | Cascade Faster | 2 | 1x | 17.507 | 40.9 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_r50_fpn_1x.tar) | +| ResNet50-FPN | Cascade Mask | 1 | 1x | - | 41.3 | 35.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_mask_rcnn_r50_fpn_1x.tar) | +| ResNet50-vd-FPN | Faster | 2 | 2x | 21.847 | 38.9 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_vd_fpn_2x.tar) | +| ResNet50-vd-FPN | Mask | 1 | 2x | 15.825 | 39.8 | 35.4 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_vd_fpn_2x.tar) | +| ResNet101 | Faster | 1 | 1x | 9.316 | 38.3 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_1x.tar) | +| ResNet101-FPN | Faster | 1 | 1x | 17.297 | 38.7 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_fpn_1x.tar) | +| ResNet101-FPN | Faster | 1 | 2x | 17.246 | 39.1 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_fpn_2x.tar) | +| ResNet101-FPN | Mask | 1 | 1x | 12.983 | 39.5 | 35.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r101_fpn_1x.tar) | +| ResNet101-vd-FPN | Faster | 1 | 1x | 17.011 | 40.5 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_vd_fpn_1x.tar) | +| ResNet101-vd-FPN | Faster | 1 | 2x | 16.934 | 40.8 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_vd_fpn_2x.tar) | +| ResNet101-vd-FPN | Mask | 1 | 1x | 13.105 | 41.4 | 36.8 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-64x4d-FPN | Faster | 1 | 1x | 8.815 | 42.2 | - | 
[model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_x101_vd_64x4d_fpn_1x.tar) | +| ResNeXt101-vd-64x4d-FPN | Faster | 1 | 2x | 8.809 | 41.7 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_x101_vd_64x4d_fpn_2x.tar) | +| ResNeXt101-vd-64x4d-FPN | Mask | 1 | 1x | 7.689 | 42.9 | 37.9 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_x101_vd_64x4d_fpn_1x.tar) | +| ResNeXt101-vd-64x4d-FPN | Mask | 1 | 2x | 7.859 | 42.6 | 37.6 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_x101_vd_64x4d_fpn_2x.tar) | +| SENet154-vd-FPN | Faster | 1 | 1.44x | 3.408 | 42.9 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_se154_vd_fpn_s1x.tar) | +| SENet154-vd-FPN | Mask | 1 | 1.44x | 3.233 | 44.0 | 38.7 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_se154_vd_fpn_s1x.tar) | + +### Deformable ConvNets v2 + +| Backbone | Type | Conv | Image/gpu | Lr schd | Inf time (fps) | Box AP | Mask AP | Download | +| :---------------------- | :------------- | :---: | :-------: | :-----: | :------------: | :----: | :-----: | :----------------------------------------------------------: | +| ResNet50-FPN | Faster | c3-c5 | 2 | 1x | 19.978 | 41.0 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_r50_fpn_1x.tar) | +| ResNet50-vd-FPN | Faster | c3-c5 | 2 | 2x | 19.222 | 42.4 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_r50_vd_fpn_2x.tar) | +| ResNet101-vd-FPN | Faster | c3-c5 | 2 | 1x | 14.477 | 44.1 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-64x4d-FPN | Faster | c3-c5 | 1 | 1x | 7.209 | 45.2 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x.tar) | +| ResNet50-FPN | Mask | c3-c5 | 1 | 1x | 14.53 | 41.9 | 37.3 | 
[model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_r50_fpn_1x.tar) | +| ResNet50-vd-FPN | Mask | c3-c5 | 1 | 2x | 14.832 | 42.9 | 38.0 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_r50_vd_fpn_2x.tar) | +| ResNet101-vd-FPN | Mask | c3-c5 | 1 | 1x | 11.546 | 44.6 | 39.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-64x4d-FPN | Mask | c3-c5 | 1 | 1x | 6.45 | 46.2 | 40.4 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x.tar) | +| ResNet50-FPN | Cascade Faster | c3-c5 | 2 | 1x | - | 44.2 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcn_r50_fpn_1x.tar) | +| ResNet101-vd-FPN | Cascade Faster | c3-c5 | 2 | 1x | - | 46.4 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Cascade Faster | c3-c5 | 2 | 1x | - | 47.3 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x.tar) | +| SENet154-vd-FPN | Cascade Mask | c3-c5 | 1 | 1.44x | - | 51.9 | 43.9 | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x.tar) | + +#### Notes: +- Deformable ConvNets v2(dcn_v2) reference from [Deformable ConvNets v2](https://arxiv.org/abs/1811.11168). +- `c3-c5` means adding `dcn` in resnet stage 3 to 5. 
+- Detailed configuration file in [configs/dcn](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleDetection/configs/dcn) + +### Group Normalization +| Backbone | Type | Image/gpu | Lr schd | Box AP | Mask AP | Download | +| :------------------- | :------------- | :-----: | :-----: | :----: | :-----: | :----------------------------------------------------------: | +| ResNet50-FPN | Faster | 2 | 2x | 39.7 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_fpn_gn_2x.tar) | +| ResNet50-FPN | Mask | 1 | 2x | 40.1 | 35.8 | [model](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_fpn_gn_2x.tar) | + +#### Notes: +- Group Normalization reference from [Group Normalization](https://arxiv.org/abs/1803.08494). +- Detailed configuration file in [configs/gn](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleDetection/configs/gn) + +### Yolo v3 + +| Backbone | Size | Image/gpu | Lr schd | Inf time (fps) | Box AP | Download | +| :----------- | :--: | :-------: | :-----: | :------------: | :----: | :----------------------------------------------------------: | +| DarkNet53 | 608 | 8 | 270e | 45.571 | 38.9 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar) | +| DarkNet53 | 416 | 8 | 270e | - | 37.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar) | +| DarkNet53 | 320 | 8 | 270e | - | 34.8 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar) | +| MobileNet-V1 | 608 | 8 | 270e | 78.302 | 29.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) | +| MobileNet-V1 | 416 | 8 | 270e | - | 29.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) | +| MobileNet-V1 | 320 | 8 | 270e | - | 27.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) | +| ResNet34 | 608 | 8 | 270e | 63.356 | 36.2 | 
[model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) | +| ResNet34 | 416 | 8 | 270e | - | 34.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) | +| ResNet34 | 320 | 8 | 270e | - | 31.4 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) | + + +### Yolo v3 on Pascal VOC + +| Backbone | Size | Image/gpu | Lr schd | Inf time (fps) | Box AP | Download | +| :----------- | :--: | :-------: | :-----: | :------------: | :----: | :----------------------------------------------------------: | +| DarkNet53 | 608 | 8 | 270e | 54.977 | 83.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) | +| DarkNet53 | 416 | 8 | 270e | - | 83.6 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) | +| DarkNet53 | 320 | 8 | 270e | - | 82.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) | +| MobileNet-V1 | 608 | 8 | 270e | 104.291 | 76.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) | +| MobileNet-V1 | 416 | 8 | 270e | - | 76.7 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) | +| MobileNet-V1 | 320 | 8 | 270e | - | 75.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) | +| ResNet34 | 608 | 8 | 270e | 82.247 | 82.6 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) | +| ResNet34 | 416 | 8 | 270e | - | 81.9 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) | +| ResNet34 | 320 | 8 | 270e | - | 80.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) | + +**Notes:** Yolo v3 is trained in 8 GPU with total batch size as 64 and trained 270 epoches. 
Yolo v3 training data augmentations: mixup, +random color distortion, random cropping, random expansion, random interpolation method, random flipping. Yolo v3 uses randomly +reshaped minibatches in training, so inference can be performed on different image sizes with the same model weights; we provide evaluation +results for image sizes 608/416/320 above. + +### RetinaNet + +| Backbone | Image/gpu | Lr schd | Box AP | Download | +| :---------------: | :-----: | :-----: | :----: | :-------: | +| ResNet50-FPN | 2 | 1x | 36.0 | [model](https://paddlemodels.bj.bcebos.com/object_detection/retinanet_r50_fpn_1x.tar) | +| ResNet101-FPN | 2 | 1x | 37.3 | [model](https://paddlemodels.bj.bcebos.com/object_detection/retinanet_r101_fpn_1x.tar) | +| ResNeXt101-vd-FPN | 1 | 1x | 40.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/retinanet_x101_vd_64x4d_fpn_1x.tar) | + +**Notes:** In RetinaNet, the base LR is changed to 0.01 for minibatch size 16. + +### SSD + +| Backbone | Size | Image/gpu | Lr schd | Inf time (fps) | Box AP | Download | +| :------: | :--: | :-------: | :-----: | :------------: | :----: | :----------------------------------------------------------: | +| VGG16 | 300 | 8 | 400k | 81.613 | 25.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300.tar) | +| VGG16 | 512 | 8 | 400k | 46.007 | 29.1 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512.tar) | + +**Notes:** VGG-SSD is trained on 4 GPUs with a total batch size of 32 for 400000 iterations. 
+ +### SSD on Pascal VOC + +| Backbone | Size | Image/gpu | Lr schd | Inf time (fps) | Box AP | Download | +| :----------- | :--: | :-------: | :-----: | :------------: | :----: | :----------------------------------------------------------: | +| MobileNet v1 | 300 | 32 | 120e | 159.543 | 73.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar) | +| VGG16 | 300 | 8 | 240e | 117.279 | 77.5 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300_voc.tar) | +| VGG16 | 512 | 8 | 240e | 65.975 | 80.2 | [model](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512_voc.tar) | + +**NOTE**: MobileNet-SSD is trained on 2 GPUs with a total batch size of 64 for 120 epochs. VGG-SSD is trained on 4 GPUs with a total batch size of 32 for 240 epochs. SSD training data augmentations: random color distortion, +random cropping, random expansion, random flipping. + + +## Face Detection + +Please refer to [face detection models](../configs/face_detection) for details. 
diff --git a/docs/MODEL_ZOO_cn.md b/docs/MODEL_ZOO_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..918a9cb249d105d9ed7cd0411ede8c5e8ce0c3d5 --- /dev/null +++ b/docs/MODEL_ZOO_cn.md @@ -0,0 +1,156 @@ +# 模型库和基线 + +## 测试环境 + +- Python 2.7.1 +- PaddlePaddle >=1.5 +- CUDA 9.0 +- cuDNN >=7.4 +- NCCL 2.1.2 + +## 通用设置 + +- 所有模型均在COCO17数据集中训练和测试。 +- 除非特殊说明,所有ResNet骨干网络采用[ResNet-B](https://arxiv.org/pdf/1812.01187)结构。 +- 对于RCNN和RetinaNet系列模型,训练阶段仅使用水平翻转作为数据增强,测试阶段不使用数据增强。 +- **推理时间(fps)**: 推理时间是在一张Tesla V100的GPU上通过'tools/eval.py'测试所有验证集得到,单位是fps(图片数/秒), cuDNN版本是7.5,包括数据加载、网络前向执行和后处理, batch size是1。 + +## 训练策略 + +- 我们采用和[Detectron](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#training-schedules)相同的训练策略。 +- 1x 策略表示:在总batch size为16时,初始学习率为0.02,在6万轮和8万轮后学习率分别下降10倍,最终训练9万轮。在总batch size为8时,初始学习率为0.01,在12万轮和16万轮后学习率分别下降10倍,最终训练18万轮。 +- 2x 策略为1x策略的两倍,同时学习率调整位置也为1x的两倍。 + +## ImageNet预训练模型 + +Paddle提供基于ImageNet的骨架网络预训练模型。所有预训练模型均通过标准的Imagenet-1k数据集训练得到。[下载链接](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#supported-models-and-performances) + +- 注:ResNet50模型通过余弦学习率调整策略训练得到。[下载链接](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar) + +## 基线 + +### Faster & Mask R-CNN + +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 |推理时间(fps) | Box AP | Mask AP | 下载 | +| :------------------- | :------------- | :-----: | :-----: | :------------: | :-----: | :-----: | :-----------------------------------------------------: | +| ResNet50 | Faster | 1 | 1x | 12.747 | 35.2 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar) | +| ResNet50 | Faster | 1 | 2x | 12.686 | 37.1 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_2x.tar) | +| ResNet50 | Mask | 1 | 1x | 11.615 | 36.5 | 32.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_1x.tar) | +| ResNet50 | Mask | 1 | 2x | 11.494 | 38.2 | 33.4 | 
[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_2x.tar) | +| ResNet50-vd | Faster | 1 | 1x | 12.575 | 36.4 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_vd_1x.tar) | +| ResNet50-FPN | Faster | 2 | 1x | 22.273 | 37.2 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_fpn_1x.tar) | +| ResNet50-FPN | Faster | 2 | 2x | 22.297 | 37.7 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_fpn_2x.tar) | +| ResNet50-FPN | Mask | 1 | 1x | 15.184 | 37.9 | 34.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_fpn_1x.tar) | +| ResNet50-FPN | Mask | 1 | 2x | 15.881 | 38.7 | 34.7 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_fpn_2x.tar) | +| ResNet50-FPN | Cascade Faster | 2 | 1x | 17.507 | 40.9 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_r50_fpn_1x.tar) | +| ResNet50-FPN | Cascade Mask | 1 | 1x | - | 41.3 | 35.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/cascade_mask_rcnn_r50_fpn_1x.tar) | +| ResNet50-vd-FPN | Faster | 2 | 2x | 21.847 | 38.9 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_vd_fpn_2x.tar) | +| ResNet50-vd-FPN | Mask | 1 | 2x | 15.825 | 39.8 | 35.4 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_vd_fpn_2x.tar) | +| ResNet101 | Faster | 1 | 1x | 9.316 | 38.3 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_1x.tar) | +| ResNet101-FPN | Faster | 1 | 1x | 17.297 | 38.7 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_fpn_1x.tar) | +| ResNet101-FPN | Faster | 1 | 2x | 17.246 | 39.1 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_fpn_2x.tar) | +| ResNet101-FPN | Mask | 1 | 1x | 12.983 | 39.5 | 35.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r101_fpn_1x.tar) | +| 
ResNet101-vd-FPN | Faster | 1 | 1x | 17.011 | 40.5 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_vd_fpn_1x.tar) | +| ResNet101-vd-FPN | Faster | 1 | 2x | 16.934 | 40.8 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r101_vd_fpn_2x.tar) | +| ResNet101-vd-FPN | Mask | 1 | 1x | 13.105 | 41.4 | 36.8 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Faster | 1 | 1x | 8.815 | 42.2 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_x101_vd_64x4d_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Faster | 1 | 2x | 8.809 | 41.7 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_x101_vd_64x4d_fpn_2x.tar) | +| ResNeXt101-vd-FPN | Mask | 1 | 1x | 7.689 | 42.9 | 37.9 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_x101_vd_64x4d_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Mask | 1 | 2x | 7.859 | 42.6 | 37.6 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_x101_vd_64x4d_fpn_2x.tar) | +| SENet154-vd-FPN | Faster | 1 | 1.44x | 3.408 | 42.9 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_se154_vd_fpn_s1x.tar) | +| SENet154-vd-FPN | Mask | 1 | 1.44x | 3.233 | 44.0 | 38.7 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_se154_vd_fpn_s1x.tar) | + +### Deformable 卷积网络v2 + +| 骨架网络 | 网络类型 | 卷积 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | Mask AP | 下载 | +| :------------------- | :------------- | :-----: |:--------: | :-----: | :-----------: |:----: | :-----: | :----------------------------------------------------------: | +| ResNet50-FPN | Faster | c3-c5 | 2 | 1x | 19.978 | 41.0 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_r50_fpn_1x.tar) | +| ResNet50-vd-FPN | Faster | c3-c5 | 2 | 2x | 19.222 | 42.4 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_r50_vd_fpn_2x.tar) | +| 
ResNet101-vd-FPN | Faster | c3-c5 | 2 | 1x | 14.477 | 44.1 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Faster | c3-c5 | 1 | 1x | 7.209 | 45.2 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_dcn_x101_vd_64x4d_fpn_1x.tar) | +| ResNet50-FPN | Mask | c3-c5 | 1 | 1x | 14.53 | 41.9 | 37.3 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_r50_fpn_1x.tar) | +| ResNet50-vd-FPN | Mask | c3-c5 | 1 | 2x | 14.832 | 42.9 | 38.0 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_r50_vd_fpn_2x.tar) | +| ResNet101-vd-FPN | Mask | c3-c5 | 1 | 1x | 11.546 | 44.6 | 39.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Mask | c3-c5 | 1 | 1x | 6.45 | 46.2 | 40.4 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_dcn_x101_vd_64x4d_fpn_1x.tar) | +| ResNet50-FPN | Cascade Faster | c3-c5 | 2 | 1x | - | 44.2 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcn_r50_fpn_1x.tar) | +| ResNet101-vd-FPN | Cascade Faster | c3-c5 | 2 | 1x | - | 46.4 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcn_r101_vd_fpn_1x.tar) | +| ResNeXt101-vd-FPN | Cascade Faster | c3-c5 | 2 | 1x | - | 47.3 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_dcn_x101_vd_64x4d_fpn_1x.tar) | +| SENet154-vd-FPN | Cascade Mask | c3-c5 | 1 | 1.44x | - | 51.9 | 43.9 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/cascade_mask_rcnn_dcnv2_se154_vd_fpn_gn_s1x.tar) | + +#### 注意事项: +- Deformable卷积网络v2(dcn_v2)参考自论文[Deformable ConvNets v2](https://arxiv.org/abs/1811.11168). +- `c3-c5`意思是在resnet模块的3到5阶段增加`dcn`. 
+- 详细的配置文件在[configs/dcn](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleDetection/configs/dcn) + +### Group Normalization +| 骨架网络 | 网络类型 | 每张GPU图片个数 | 学习率策略 | Box AP | Mask AP | 下载 | +| :------------------- | :------------- |:--------: | :-----: | :----: | :-----: | :----------------------------------------------------------: | +| ResNet50-FPN | Faster | 2 | 2x | 39.7 | - | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_fpn_gn_2x.tar) | +| ResNet50-FPN | Mask | 1 | 2x | 40.1 | 35.8 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/mask_rcnn_r50_fpn_gn_2x.tar) | + +#### 注意事项: +- Group Normalization参考论文[Group Normalization](https://arxiv.org/abs/1803.08494). +- 详细的配置文件在[configs/gn](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleDetection/configs/gn) + +### Yolo v3 + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | 下载 | +| :----------- | :--: | :-----: | :-----: |:------------: |:----: | :-------: | +| DarkNet53 | 608 | 8 | 270e | 45.571 | 38.9 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar) | +| DarkNet53 | 416 | 8 | 270e | - | 37.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar) | +| DarkNet53 | 320 | 8 | 270e | - | 34.8 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet.tar) | +| MobileNet-V1 | 608 | 8 | 270e | 78.302 | 29.3 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) | +| MobileNet-V1 | 416 | 8 | 270e | - | 29.3 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) | +| MobileNet-V1 | 320 | 8 | 270e | - | 27.1 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1.tar) | +| ResNet34 | 608 | 8 | 270e | 63.356 | 36.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) | +| ResNet34 | 416 | 8 | 270e | - | 34.3 | 
[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) | +| ResNet34 | 320 | 8 | 270e | - | 31.4 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34.tar) | + +### Yolo v3 基于Pasacl VOC数据集 + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | 下载 | +| :----------- | :--: | :-----: | :-----: |:------------: |:----: | :-------: | +| DarkNet53 | 608 | 8 | 270e | 54.977 | 83.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) | +| DarkNet53 | 416 | 8 | 270e | - | 83.6 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) | +| DarkNet53 | 320 | 8 | 270e | - | 82.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_darknet_voc.tar) | +| MobileNet-V1 | 608 | 8 | 270e | 104.291 | 76.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) | +| MobileNet-V1 | 416 | 8 | 270e | - | 76.7 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) | +| MobileNet-V1 | 320 | 8 | 270e | - | 75.3 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar) | +| ResNet34 | 608 | 8 | 270e | 82.247 | 82.6 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) | +| ResNet34 | 416 | 8 | 270e | - | 81.9 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) | +| ResNet34 | 320 | 8 | 270e | - | 80.1 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_r34_voc.tar) | + +**注意事项:** Yolo v3在8卡,总batch size为64下训练270轮。数据增强包括:mixup, 随机颜色失真,随机剪裁,随机扩张,随机插值法,随机翻转。Yolo v3在训练阶段对minibatch采用随机reshape,可以采用相同的模型测试不同尺寸图片,我们分别提供了尺寸为608/416/320大小的测试结果。 + +### RetinaNet + +| 骨架网络 | 每张GPU图片个数 | 学习率策略 | Box AP | 下载 | +| :---------------: | :-----: | :-----: | :----: | :-------: | +| ResNet50-FPN | 2 | 1x | 36.0 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/retinanet_r50_fpn_1x.tar) | +| ResNet101-FPN | 2 | 1x | 37.3 
| [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/retinanet_r101_fpn_1x.tar) | +| ResNeXt101-vd-FPN | 1 | 1x | 40.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/retinanet_x101_vd_64x4d_fpn_1x.tar) | + +**注意事项:** RetinaNet系列模型中,在总batch size为16下情况下,初始学习率改为0.01。 + +### SSD + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略|推理时间(fps) | Box AP | 下载 | +| :----------: | :--: | :-----: | :-----: |:------------: |:----: | :-------: | +| VGG16 | 300 | 8 | 40万 | 81.613 | 25.1 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300.tar) | +| VGG16 | 512 | 8 | 40万 | 46.007 | 29.1 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512.tar) | + +**注意事项:** VGG-SSD在总batch size为32下训练40万轮。 + +### SSD 基于Pascal VOC数据集 + +| 骨架网络 | 输入尺寸 | 每张GPU图片个数 | 学习率策略 |推理时间(fps)| Box AP | 下载 | +| :----------- | :--: | :-----: | :-----: | :------------: |:----: | :-------: | +| MobileNet v1 | 300 | 32 | 120e | 159.543 | 73.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_mobilenet_v1_voc.tar) | +| VGG16 | 300 | 8 | 240e | 117.279 | 77.5 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_300_voc.tar) | +| VGG16 | 512 | 8 | 240e | 65.975 | 80.2 | [下载链接](https://paddlemodels.bj.bcebos.com/object_detection/ssd_vgg16_512_voc.tar) | + +**注意事项:** MobileNet-SSD在2卡,总batch size为64下训练120周期。VGG-SSD在总batch size为32下训练240周期。数据增强包括:随机颜色失真,随机剪裁,随机扩张,随机翻转。 + +## 人脸检测 + +详细请参考[人脸检测模型](../configs/face_detection). diff --git a/docs/QUICK_STARTED.md b/docs/QUICK_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..c6649f71f7b41c2fda00141e20c16dffc1f93e40 --- /dev/null +++ b/docs/QUICK_STARTED.md @@ -0,0 +1,67 @@ +English | [简体中文](QUICK_STARTED_cn.md) + +# Quick Start + +This tutorial fine-tunes a tiny dataset by pretrained detection model for users to get a model and learn PaddleDetection quickly. The model can be trained in around 20min with good performance. 
+ +## Data Preparation + +The dataset comes from [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in the training set and 60 images in the test set. The data categories are apple, orange and banana. Download it [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download. The script for data preparation is located at [download_fruit.py](../dataset/fruit/download_fruit.py). The command is as follows: + +```bash +export PYTHONPATH=$PYTHONPATH:. +python dataset/fruit/download_fruit.py +``` + +- **Note: before starting, run the following commands to specify the GPU** + +```bash +export PYTHONPATH=$PYTHONPATH:. +export CUDA_VISIBLE_DEVICES=0 +``` + +Training: + +```bash +python -u tools/train.py -c configs/yolov3_mobilenet_v1_fruit.yml \ + --use_tb=True \ + --tb_log_dir=tb_fruit_dir/scalar \ + --eval +``` + +The `yolov3_mobilenet_v1` model pretrained on the COCO dataset is used for fine-tuning. Meanwhile, loss and mAP can be observed on tensorboard. + +```bash +tensorboard --logdir tb_fruit_dir/scalar/ --host <host_IP> --port <port_num> +``` + +Result on tensorboard is shown below: + +
+ +
+ +Model can be downloaded [here](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_fruit.tar) + +Evaluation: + +```bash +python -u tools/eval.py -c configs/yolov3_mobilenet_v1_fruit.yml +``` + +Inference: + +```bash +python -u tools/infer.py -c configs/yolov3_mobilenet_v1_fruit.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_fruit.tar \ + --infer_img=demo/orange_71.jpg +``` + +Inference images are shown below: + +

+ + +

+ +For detailed information on training and evaluation, please refer to [GETTING_STARTED.md](GETTING_STARTED.md). diff --git a/docs/QUICK_STARTED_cn.md b/docs/QUICK_STARTED_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..8c02ffb798250a0fa29db02ab7e4b38f04e4daac --- /dev/null +++ b/docs/QUICK_STARTED_cn.md @@ -0,0 +1,67 @@ +[English](QUICK_STARTED.md) | 简体中文 + +# 快速开始 + +为了使得用户能够在很短的时间内快速产出模型,掌握PaddleDetection的使用方式,这篇教程通过一个预训练检测模型对小数据集进行finetune。在P40上单卡大约20min即可产出一个效果不错的模型。 + +## 数据准备 + +数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download_fruit.py](../dataset/fruit/download_fruit.py)。下载数据方式如下: + +```bash +export PYTHONPATH=$PYTHONPATH:. +python dataset/fruit/download_fruit.py +``` + +- **注:在开始前,运行如下命令并指定GPU** + +```bash +export PYTHONPATH=$PYTHONPATH:. +export CUDA_VISIBLE_DEVICES=0 +``` + +训练命令如下: + +```bash +python -u tools/train.py -c configs/yolov3_mobilenet_v1_fruit.yml \ + --use_tb=True \ + --tb_log_dir=tb_fruit_dir/scalar \ + --eval +``` + +训练使用`yolov3_mobilenet_v1`基于COCO数据集训练好的模型进行finetune。训练期间可以通过tensorboard实时观察loss和精度值,启动命令如下: + +```bash +tensorboard --logdir tb_fruit_dir/scalar/ --host <host_IP> --port <port> +``` + +tensorboard结果显示如下: + +
+ +
+ +训练模型[下载链接](https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_fruit.tar) + +评估命令如下: + +```bash +python -u tools/eval.py -c configs/yolov3_mobilenet_v1_fruit.yml +``` + +预测命令如下 + +```bash +python -u tools/infer.py -c configs/yolov3_mobilenet_v1_fruit.yml \ + -o weights=https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_fruit.tar \ + --infer_img=demo/orange_71.jpg +``` + +预测图片如下: + +

+ + +

+ +更多训练及评估流程,请参考[GETTING_STARTED_cn.md](GETTING_STARTED_cn.md). diff --git a/docs/TRANSFER_LEARNING.md b/docs/TRANSFER_LEARNING.md new file mode 100644 index 0000000000000000000000000000000000000000..0bc0377acb749ee896050660ba122a3a77ca20b7 --- /dev/null +++ b/docs/TRANSFER_LEARNING.md @@ -0,0 +1,39 @@ +English | [简体中文](TRANSFER_LEARNING_cn.md) + +# Transfer Learning + +Transfer learning aims at learning new knowledge from existing knowledge. For example, a model pretrained on ImageNet can be used to initialize a detection model, or a model pretrained on the COCO dataset can be used to initialize a detection model trained on the PascalVOC dataset. + +In transfer learning, if a different dataset with a different number of classes is used, loading the parameters related to the number of classes fails because of a dimension mismatch. Likewise, if a more complicated model is needed, the open-source model structure has to be modified and its parameters loaded selectively. Thus, PaddleDetection lets users designate parameter fields and skips loading the parameters which match the fields. + +## Transfer Learning in PaddleDetection + +In transfer learning, the pretrained model needs to be loaded selectively. The following two methods can be used: + +1. Set `finetune_exclude_pretrained_params` in YAML configuration files. Please refer to [configure file](../configs/yolov3_mobilenet_v1_fruit.yml#L15) +2. Set -o finetune_exclude_pretrained_params in command line. For example: + +```bash +export PYTHONPATH=$PYTHONPATH:. +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ + -o pretrain_weights=https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar \ + finetune_exclude_pretrained_params=['cls_score','bbox_pred'] +``` + +* Note: + +1. The path in pretrain\_weights is the link to the open-source Faster RCNN model trained on the COCO dataset. For the full list of model links, please refer to [MODEL_ZOO](MODEL_ZOO.md) +2.
The parameter fields are set in finetune\_exclude\_pretrained\_params. If the name of a parameter matches a field (wildcard matching), the parameter is ignored when loading. + +If users want to fine-tune on their own dataset while keeping the model structure unchanged, they only need to ignore the parameters related to the number of classes. PaddleDetection lists the ignored parameter fields corresponding to each model type. The table is shown below:
+ +| model type | ignored parameter fields | +| :----------------: | :---------------------------------------: | +| Faster RCNN | cls\_score, bbox\_pred | +| Cascade RCNN | cls\_score, bbox\_pred | +| Mask RCNN | cls\_score, bbox\_pred, mask\_fcn\_logits | +| Cascade-Mask RCNN | cls\_score, bbox\_pred, mask\_fcn\_logits | +| RetinaNet | retnet\_cls\_pred\_fpn | +| SSD | ^conv2d\_ | +| YOLOv3 | yolo\_output | diff --git a/docs/TRANSFER_LEARNING_cn.md b/docs/TRANSFER_LEARNING_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..a54210d0aa9ef32096620e1830d49a2b2430b189 --- /dev/null +++ b/docs/TRANSFER_LEARNING_cn.md @@ -0,0 +1,37 @@ +# 迁移学习 + +迁移学习为利用已有知识,对新知识进行学习。例如利用ImageNet分类预训练模型做初始化来训练检测模型,利用在COCO数据集上的检测模型做初始化来训练基于PascalVOC数据集的检测模型。 + +在进行迁移学习时,由于会使用不同的数据集,数据类别数与COCO/VOC数据类别不同,导致在加载PaddlePaddle开源模型时,与类别数相关的权重(例如分类模块的fc层)会出现维度不匹配的问题;另外,如果需要结构更加复杂的模型,需要对已有开源模型结构进行调整,对应权重也需要选择性加载。因此,需要检测库能够指定参数字段,在加载模型时不加载匹配的权重。 + +## PaddleDetection进行迁移学习 + +在迁移学习中,对预训练模型进行选择性加载,可通过如下两种方式实现: + +1. 在 YAML 配置文件中通过设置`finetune_exclude_pretrained_params`字段。可参考[配置文件](../configs/yolov3_mobilenet_v1_fruit.yml#L15) +2. 在 train.py的启动参数中设置 -o finetune_exclude_pretrained_params。例如: + +```python +export PYTHONPATH=$PYTHONPATH:. +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ + -o pretrain_weights=https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_1x.tar \ + finetune_exclude_pretrained_params=['cls_score','bbox_pred'] +``` + +* 说明: + +1. pretrain\_weights的路径为COCO数据集上开源的faster RCNN模型链接,完整模型链接可参考[MODEL_ZOO](MODEL_ZOO_cn.md) +2. finetune\_exclude\_pretrained\_params中设置参数字段,如果参数名能够匹配以上参数字段(通配符匹配方式),则在模型加载时忽略该参数。 + +如果用户需要利用自己的数据进行finetune,模型结构不变,只需要忽略与类别数相关的参数。PaddleDetection给出了不同模型类型所对应的忽略参数字段。如下表所示:
+ +| 模型类型 | 忽略参数字段 | +| :----------------: | :---------------------------------------: | +| Faster RCNN | cls\_score, bbox\_pred | +| Cascade RCNN | cls\_score, bbox\_pred | +| Mask RCNN | cls\_score, bbox\_pred, mask\_fcn\_logits | +| Cascade-Mask RCNN | cls\_score, bbox\_pred, mask\_fcn\_logits | +| RetinaNet | retnet\_cls\_pred\_fpn | +| SSD | ^conv2d\_ | +| YOLOv3 | yolo\_output | diff --git a/docs/config_example/mask_rcnn_r50_fpn_1x.yml b/docs/config_example/mask_rcnn_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..50d38777a565e8b58e3d8dd4d40279e733bc3b1a --- /dev/null +++ b/docs/config_example/mask_rcnn_r50_fpn_1x.yml @@ -0,0 +1,498 @@ +# Architecture of detection, which is also the prefix of data feed module +architecture: MaskRCNN + +# Data feed module +train_feed: MaskRCNNTrainFeed +eval_feed: MaskRCNNEvalFeed +test_feed: MaskRCNNTestFeed + +# Use GPU or CPU, true by default +use_gpu: true + +# Maximum number of iteration. +# In rcnn models, max_iters is 180000 if lr schedule is 1x and batch_size is 1. +max_iters: 180000 + +# Snapshot period. If training and test at same time, evaluate model at each snapshot_iter. 10000 by default. +snapshot_iter: 10000 + +# Smooth the log output in specified iterations, 20 by default. +log_smooth_window: 20 + +# The number of iteration interval to display in training log. +log_iter: 20 + +# The directory to save models. +save_dir: output + +# The path of pretrained weights. If url is provided, it will download the pretrain_weights and decompress automatically. +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar + +# Evaluation method, COCO and VOC are available. +metric: COCO + +# The path of final model for evaluation and test.
+weights: output/mask_rcnn_r50_fpn_1x/model_final/ + +# Number of classes, 81 for COCO and 21 for VOC +num_classes: 81 + +# Mask RCNN architecture, see https://arxiv.org/abs/1703.06870 +MaskRCNN: + backbone: ResNet + fpn: FPN + roi_extractor: FPNRoIAlign + rpn_head: FPNRPNHead + bbox_assigner: BBoxAssigner + bbox_head: BBoxHead + mask_assigner: MaskAssigner + mask_head: MaskHead + rpn_only: false + +# Backbone module +ResNet: + # Index of stages using deformable conv v2, [] by default + dcn_v2_stages: [] + # ResNet depth, 50 by default + depth: 50 + # Stage index of returned feature map, [2,3,4,5] by default + feature_maps: + - 2 + - 3 + - 4 + - 5 + # Stage Index of backbone to freeze, 2 by default + freeze_at: 2 + # Whether freeze normalization layers, true by default + freeze_norm: true + # Weight decay for normalization layer weights, 0. by default + norm_decay: 0.0 + # Normalization type, bn/sync_bn/affine_channel, affine_channel by default + norm_type: affine_channel + # ResNet variant, supports 'a', 'b', 'c', 'd' currently, b by default + variant: b + +# FPN module +FPN: + # Whether has extra conv in higher levels, false by default + has_extra_convs: false + # Highest level of the backbone feature map to use, 6 by default + max_level: 6 + # Lowest level of the backbone feature map to use, 2 by default + min_level: 2 + # FPN normalization type, bn/sync_bn/affine_channel, null by default + norm_type: null + # Number of feature channels, 256 by default + num_chan: 256 + # Feature map scaling factors, [0.03125, 0.0625, 0.125, 0.25] by default + spatial_scale: + - 0.03125 + - 0.0625 + - 0.125 + - 0.25 + +# RPN module, if use non-FPN architecture, use RPNHead instead +# Extract proposals according to anchors and assign box targets and +# score targets to selected proposals to compute RPN loss. For FPN +# architecture, RPN is computed at each level and the proposals are collected +# together.
+FPNRPNHead: + # fluid.layers.anchor_generator + # Generate anchors for RCNN models. Each position of input produces + # N anchors. N = anchor_sizes * aspect_ratios. In FPNRPNHead, aspect_ratios + # is provided and anchor_sizes depends on FPN levels and anchor_start_size. + anchor_generator: + aspect_ratios: + - 0.5 + - 1.0 + - 2.0 + variance: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + # fluid.layers.rpn_target_assign + # Assign classification and regression targets to each anchor according + # to Intersection-over-Union(IoU) overlap between anchors and ground + # truth boxes. The classification targets is binary class labels. the + # positive labels are two kinds of anchors: the anchors with the highest + # IoU overlap with a ground-truth box, or an anchor that has an IoU overlap + # higher than rpn_positive_overlap with any ground-truth box. + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + # fluid.layers.generate_proposals in training + # Generate RoIs according to each box with probability to be a foreground + # object. The operation performs following steps: Transposes and resizes + # scores and bbox_deltas; Calculate box locations as proposal candidates; + # Clip boxes to image; Remove predicted boxes with small area; Apply NMS to + # get final proposals as output. 
+ train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + # fluid.layers.generate_proposals in test + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + # Size of anchor at the first scale, 32 by default + anchor_start_size: 32 + # Highest level of FPN output, 6 by default + max_level: 6 + # Lowest level of FPN output, 2 by default + min_level: 2 + # Number of FPN output channels, 256 by default + num_chan: 256 + # Number of classes in RPN output, 1 by default + num_classes: 1 + +# RoI extractor module, if use non-FPN architecture, use RoIAlign instead +# For FPN architecture, proposals are distributed to different levels and +# apply roi align at each level. Then concat the outputs. +FPNRoIAlign: + # The canonical FPN feature map level, 4 by default + canconical_level: 4 + # The canonical FPN feature map size, 224 by default + canonical_size: 224 + # The highest level of FPN layer, 5 by default + max_level: 5 + # The lowest level of FPN layer, 2 by default + min_level: 2 + # Number of sampling points, 0 by default + sampling_ratio: 2 + # Box resolution, 7 by default + box_resolution: 7 + # Mask RoI resolution, 14 by default + mask_resolution: 14 + +# Mask head module +# Generate mask output and compute the mask loss. +MaskHead: + # Number of convolutions, 4 for FPN, 0 otherwise. 0 by default + num_convs: 4 + # Size of the output mask, 14 by default + resolution: 28 + # Dilation rate, 1 by default + dilation: 1 + # Number of channels after first conv, 256 by default + num_chan_reduced: 256 + # Number of output classes, 81 by default + num_classes: 81 + +# fluid.layers.generate_proposal_labels +# Combine boxes and gt_boxes, and sample foreground proposals and background +# proposals. Then assign classification and regression targets to selected RoIs.
+BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: + - 0.1 + - 0.1 + - 0.2 + - 0.2 + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + num_classes: 81 + shuffle_before_sample: true + +# fluid.layers.generate_mask_labels +# For given the RoIs and corresponding labels, sample foreground RoIs. +# Assign mask targets to selected RoIs which are encoded to K binary masks +# of resolution M x M. +MaskAssigner: + resolution: 28 + num_classes: 81 + +# BBox head module +# Faster bbox head following the RoI extractor, and apply post process, such as +# NMS and box coder.. +BBoxHead: + # Head after RoI extractor, ResNetC5/TwoFCHead + head: TwoFCHead + # fluid.layers.multiclass_nms + # Select a subset of detection bounding boxes that have high scores larger + # than score_threshold. Then prune away boxes that have high IoU overlap + # with already selected boxes by nms_threshold. + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + # fluid.layers.box_coder + box_coder: + axis: 1 + box_normalized: false + code_type: decode_center_size + prior_box_var: + - 0.1 + - 0.1 + - 0.2 + - 0.2 + num_classes: 81 + +# RCNN head with two Fully Connected layers +TwoFCHead: + # The number of output channels, 1024 by default + num_chan: 1024 + +# Learning rate configuration +LearningRate: + # Base learning rate, 0.01 by default + base_lr: 0.01 + # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default + schedulers: + # fluid.layers.piecewise_decay + # Values has higher priority and if values is null, learning rate is multipled by gamma at each stage + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 120000 + - 160000 + values: null + # fluid.layers.linear_lr_warmup + # Start learning rate equals to base_lr * start_factor + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +# Optimizer module +OptimizerBuilder: + # fluid.optimizer + optimizer: + momentum: 0.9 + type: Momentum + # fluid.regularizer + regularizer: + 
factor: 0.0001 + type: L2 + +# Data feed module for training +MaskRCNNTrainFeed: + # Batch size per device, 1 by default + batch_size: 1 + # Dataset module + dataset: + # Annotation file path + annotation: annotations/instances_train2017.json + # Dataset directory + dataset_dir: dataset/coco + # Directory where image files are stored + image_dir: train2017 + # List of data fields needed + fields: + - image + - im_info + - im_id + - gt_box + - gt_label + - is_crowd + - gt_mask + # list of image dims + image_shape: + - 3 + - 800 + - 1333 + # List of sample transformations to use + sample_transforms: + # Transform the image data to numpy format. + - !DecodeImage + to_rgb: true # default: true + with_mixup: false # default: false + # Flip images randomly + # Transform the x coordinates of bboxes and segmentations + - !RandomFlipImage + is_mask_flip: true # default: false + # Whether bbox is normalized + is_normalized: false # default: false + prob: 0.5 # default: 0.5 + # Normalize the image + - !NormalizeImage + # The format of image, [H, W, C]/[C, H, W], true by default + is_channel_first: false + # Whether divide by 255, true by default + is_scale: true + # default: [0.485, 0.456, 0.406] + mean: + - 0.485 + - 0.456 + - 0.406 + # default: [1, 1, 1] + std: + - 0.229 + - 0.224 + - 0.225 + # Rescale image to the specified target size, and capped at max_size + - !ResizeImage + # Resize method, cv2.INTER_LINEAR(1) by default + interp: 1 + max_size: 1333 + target_size: 800 + use_cv2: true # default: true + # Change the channel + - !Permute + # The format of image, [H, W, C]/[C, H, W], true by default + channel_first: true + to_bgr: false # default: true + # List of batch transformations to use + batch_transforms: + # Pad a batch of samples to same dimensions + - !PadBatch + pad_to_stride: 32 # default: 32 + # Drop last batch if size is uneven, false by default + drop_last: false + # Number of workers processes(or threads), 2 by default + num_workers: 2 + # Number of 
samples, -1 represents all samples. -1 by default + samples: -1 + # If samples should be shuffled, true by default + shuffle: true + # If update im_info after padding, false by default + use_padded_im_info: false + # If use multi-process, false by default + use_process: false + +# Data feed module for test +MaskRCNNEvalFeed: + # Batch size per device, 1 by default + batch_size: 1 + # Dataset module + dataset: + # Annotation file path + annotation: annotations/instances_val2017.json + # Dataset directory + dataset_dir: dataset/coco + # Directory where image files are stored + image_dir: val2017 + # List of data fields needed + fields: + - image + - im_info + - im_id + - im_shape + # list of image dims + image_shape: + - 3 + - 800 + - 1333 + # List of sample transformations to use + sample_transforms: + # Transform the image data to numpy format. + - !DecodeImage + to_rgb: true # default: true + with_mixup: false # default: false + # Normalize the image + - !NormalizeImage + # The format of image, [H, W, C]/[C, H, W], true by default + is_channel_first: false + # Whether divide by 255, true by default + is_scale: true + # default: [0.485, 0.456, 0.406] + mean: + - 0.485 + - 0.456 + - 0.406 + # default: [1, 1, 1] + std: + - 0.229 + - 0.224 + - 0.225 + # Rescale image to the specified target size, and capped at max_size + - !ResizeImage + # Resize method, cv2.INTER_LINEAR(1) by default + interp: 1 + max_size: 1333 + target_size: 800 + use_cv2: true # default: true + # Change the channel + - !Permute + # The format of image, [H, W, C]/[C, H, W], true by default + channel_first: true + to_bgr: false # default: true + # List of batch transformations to use + batch_transforms: + # Pad a batch of samples to same dimensions + - !PadBatch + pad_to_stride: 32 # default: 32 + # Drop last batch if size is uneven, false by default + drop_last: false + # Number of workers processes(or threads), 2 by default + num_workers: 2 + # Number of samples, -1 represents all samples. 
-1 by default + samples: -1 + # If samples should be shuffled, true by default + shuffle: false + # If update im_info after padding, false by default + use_padded_im_info: true + # If use multi-process, false by default + use_process: false + +# Data feed module for test +MaskRCNNTestFeed: + # Batch size per device, 1 by default + batch_size: 1 + # Dataset module + dataset: + # Annotation file path + annotation: dataset/coco/annotations/instances_val2017.json + # List of data fields needed + fields: + - image + - im_info + - im_id + - im_shape + # list of image dims + image_shape: + - 3 + - 800 + - 1333 + # List of sample transformations to use + sample_transforms: + # Transform the image data to numpy format. + - !DecodeImage + to_rgb: true # default: true + with_mixup: false # default: false + # Normalize the image + - !NormalizeImage + # The format of image, [H, W, C]/[C, H, W], true by default + is_channel_first: false + # Whether divide by 255, true by default + is_scale: true + # default: [0.485, 0.456, 0.406] + mean: + - 0.485 + - 0.456 + - 0.406 + # default: [1, 1, 1] + std: + - 0.229 + - 0.224 + - 0.225 + # Change the channel + - !Permute + # The format of image, [H, W, C]/[C, H, W], true by default + channel_first: true + to_bgr: false # default: true + # List of batch transformations to use + batch_transforms: + # Pad a batch of samples to same dimensions + - !PadBatch + pad_to_stride: 32 # default: 32 + # Drop last batch if size is uneven, false by default + drop_last: false + # Number of workers processes(or threads), 2 by default + num_workers: 2 + # Number of samples, -1 represents all samples. 
-1 by default + samples: -1 + # If samples should be shuffled, true by default + shuffle: false + # If update im_info after padding, false by default + use_padded_im_info: true + # If use multi-process, false by default + use_process: false + + diff --git a/docs/config_example/ssd_vgg16_300.yml b/docs/config_example/ssd_vgg16_300.yml new file mode 100644 index 0000000000000000000000000000000000000000..5a6dc7c210edc0ed2ac550a653463f3142dbc393 --- /dev/null +++ b/docs/config_example/ssd_vgg16_300.yml @@ -0,0 +1,427 @@ +# Architecture of detection, which is also the prefix of data feed module. +architecture: SSD +# Data feed module. +# Data feed in training. +train_feed: SSDTrainFeed +# Data feed in Evaluation. +eval_feed: SSDEvalFeed +# Data feed in infer. +test_feed: SSDTestFeed +# Use GPU or CPU, true by default. +use_gpu: true +# Maximum number of iteration. +max_iters: 400000 +# Snapshot period. If training and test at same time, evaluate model at each snapshot_iter. 10000 by default. +snapshot_iter: 10000 +# Smooth the log output in specified iterations, 20 by default. +log_smooth_window: 20 +# The log in training is displayed once every period. +log_iter: 20 +# Evaluation method, COCO and VOC are available. +metric: COCO +# Evaluation mAP calculation method in VOC metric, 11point and integral are available. +map_type: 11point +# The path of pretrained weights. If a URL is provided, the weights are downloaded and decompressed automatically. +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar +# The directory to save models. +save_dir: output +# The path of final model for evaluation and test. +weights: output/ssd_vgg16_300/model_final +# Number of classes, 81 for COCO and 21 for VOC. +num_classes: 81 + +# SSD architecture, see https://arxiv.org/abs/1512.02325 +SSD: + # backbone instance, defined below. + backbone: VGG + # `MultiBoxHead` instance, defined below. + multi_box_head: MultiBoxHead + + # fluid.layers.detection_output, Detection Output Layer for SSD.
+ # This operation is to get the detection results by performing following two steps: + # 1. Decode input bounding box predictions according to the prior boxes. + # 2. Get the final detection results by applying multi-class non maximum suppression (NMS). + # This operation doesn't clip the final output bounding boxes to the image window. + output_decoder: + # The index of background label, the background label will be ignored. + # If set to -1, then all categories will be considered. + background_label: 0 + # Number of total bboxes to be kept per image after NMS. + keep_top_k: 200 + # The parameter for adaptive NMS. + nms_eta: 1.0 + # The threshold to be used in NMS. + nms_threshold: 0.45 + # Maximum number of detections to be kept according to the confidences + # after filtering detections based on score_threshold. + nms_top_k: 400 + # Threshold to filter out bounding boxes with low confidence score. + # If not provided, consider all boxes. + score_threshold: 0.01 + +# VGG backbone, see https://arxiv.org/abs/1409.1556 +VGG: + # the VGG net depth (16 or 19) + depth: 16 + # whether or not extra blocks should be added + with_extra_blocks: true + # in each extra block, params: + # [in_channel, out_channel, padding_size, stride_size, filter_size] + extra_block_filters: + - [256, 512, 1, 2, 3] + - [128, 256, 1, 2, 3] + - [128, 256, 0, 1, 3] + - [128, 256, 0, 1, 3] + # params list of init scale in l2 norm, skip init scale if param is -1. + normalizations: [20., -1, -1, -1, -1, -1] + +# fluid.layers.multi_box_head, Generate prior boxes for SSD algorithm. +# Generate `prior_box` according to the inputs list and other parameters. +# Each position of the input produces N prior boxes, N is determined by +# the count of min_sizes, max_sizes and aspect_ratios. The size of the box +# is in range(min_size, max_size) interval, which is generated in sequence +# according to the aspect_ratios.
+MultiBoxHead: + # the base_size is used to get min_size and max_size according to min_ratio and max_ratio. + base_size: 300 + # the aspect ratios of generated prior boxes. The length of input and aspect_ratios must be equal. + aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]] + # the min ratio of generated prior boxes. + min_ratio: 15 + # the max ratio of generated prior boxes. + max_ratio: 90 + # If len(inputs) <=2, min_sizes must be set up, and the length of min_sizes + # should equal to the length of inputs. Default: None. + min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0] + # If len(inputs) <=2, max_sizes must be set up, and the length of max_sizes + # should equal to the length of inputs. Default: None. + max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0] + # If step_w and step_h are the same, step_w and step_h can be replaced by steps. + steps: [8, 16, 32, 64, 100, 300] + # Prior boxes center offset. Default: 0.5 + offset: 0.5 + # Whether to flip aspect ratios. Default:False. + flip: true + # The kernel size of conv2d. Default: 1. + kernel_size: 3 + # The padding of conv2d. Default:0. + pad: 1 + +# Learning rate configuration +LearningRate: + # Base learning rate, 0.01 by default + base_lr: 0.001 + # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default + schedulers: + # fluid.layers.piecewise_decay + # Values has higher priority and if values is null, learning rate is multiplied by gamma at each stage + - !PiecewiseDecay + gamma: 0.1 + milestones: [280000, 360000] + # fluid.layers.linear_lr_warmup + # Start learning rate equals to base_lr * start_factor + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +# Optimizer module +OptimizerBuilder: + # fluid.optimizer, training a neural network is in essence an optimization problem. + # With forward computing and back propagation, the optimizer uses back-propagated + # gradients to optimize parameters in a neural network.
+ optimizer: + # Momentum optimizer adds momentum on the basis of SGD, + # reducing the noise problem in the process of stochastic gradient descent. + momentum: 0.9 + type: Momentum + # fluid.regularizer + regularizer: + # implements the L2 Weight Decay Regularization + # Small values of L2 can help prevent overfitting the training data. + factor: 0.0005 + type: L2 + +# Data feed module for training +SSDTrainFeed: + # Batch size per device + batch_size: 16 + # list of batch transformations to use + batch_transforms: [] + # The data buffer size + bufsize: 10 + # Dataset module + dataset: + # Dataset directory + dataset_dir: dataset/coco + # Annotation file path + annotation: annotations/instances_train2017.json + # Directory where image files are stored + image_dir: train2017 + # Drop last batch if size is uneven, false by default + drop_last: true + # List of data fields needed + fields: [image, gt_box, gt_label] + # list of image dims + image_shape: [3, 300, 300] + # number of workers processes (or threads) + num_workers: 8 + # List of sample transformations to use + sample_transforms: + # Transform the image data to numpy format. + - !DecodeImage + # whether to convert BGR to RGB + to_rgb: true # default: true + # whether or not to mixup image and gt_bbox/gt_score + with_mixup: false # default: false + # Transform the bounding box's coordinates to [0,1]. + - !NormalizeBox {} + # Modify image brightness, contrast, saturation and hue, reorder channels, etc.
+ - !RandomDistort + # brightness_lower/ brightness_upper (float): the brightness + # between brightness_lower and brightness_upper + brightness_lower: 0.875 + brightness_upper: 1.125 + # brightness_prob (float): the probability of changing brightness + brightness_prob: 0.5 + # contrast_lower/ contrast_upper (float): the contrast between + # contrast_lower and contrast_upper + contrast_lower: 0.5 + contrast_upper: 1.5 + # contrast_prob (float): the probability of changing contrast + contrast_prob: 0.5 + # count (int): the number of distortion kinds to apply + count: 4 + # hue_lower/ hue_upper (float): the hue between hue_lower and hue_upper + hue_lower: -18 + hue_upper: 18 + # hue_prob (float): the probability of changing hue + hue_prob: 0.5 + # is_order (bool): whether to fix the order of distortion + is_order: true + # saturation_lower/ saturation_upper (float): the saturation + # between saturation_lower and saturation_upper + saturation_lower: 0.5 + saturation_upper: 1.5 + # saturation_prob (float): the probability of changing saturation + saturation_prob: 0.5 + # Expand the image and modify bounding box. + # Operators: + # 1. Scale the image width and height. + # 2. Construct new images with new height and width. + # 3. Fill the new image with the mean. + # 4. Put original image into new image. + # 5. Rescale the bounding box. + # 6. Determine if the new bbox is satisfied in the new image. + - !ExpandImage + # max_ratio (float): the ratio of expanding + max_ratio: 4 + # mean (list): the pixel mean + mean: [104, 117, 123] + # prob (float): the probability of expanding image + prob: 0.5 + # Crop the image and modify bounding box. + # Operators: + # 1. Scale the image width and height. + # 2. Crop the image according to a random sample. + # 3. Rescale the bounding box. + # 4. Determine if the new bbox is satisfied in the new image. + - !CropImage + # avoid_no_bbox (bool): whether to avoid the + # situation where the box does not appear.
+ avoid_no_bbox: false + # batch_sampler (list): Multiple sets of different parameters for cropping. + batch_sampler: + - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0] + - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0] + # satisfy_all (bool): whether all boxes must satisfy. + satisfy_all: false + # Rescale image to the specified target size, and capped at max_size if max_size != 0. + # If target_size is a list, a scale is selected randomly as the specified target size. + - !ResizeImage + # Resize method, cv2.INTER_LINEAR(1) by default + interp: 1 + # max_size (int): the max size of image + max_size: 0 + # target_size (int|list): the target size of image's short side, + # multi-scale training is adopted when type is list. + target_size: 300 + # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method + use_cv2: false + # Flip the image and bounding box. + # Operators: + # 1. Flip the image array. + # 2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!) + # 3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!) + - !RandomFlipImage + # is_mask_flip (bool): whether flip the segmentation + is_mask_flip: false + # is_normalized (bool): whether the bbox scale to [0,1] + is_normalized: true + # prob (float): the probability of flipping image + prob: 0.5 + # Change the channel + - !Permute + # The format of image, [H, W, C]/[C, H, W], true by default + channel_first: true + # to_bgr (bool): confirm whether to convert RGB to BGR + to_bgr: true + # Normalize the image. + # Operators: + # 1.(optional) Scale the image to [0,1] + # 2.
Subtract the mean from each pixel, then divide by std + - !NormalizeImage + # The format of image, [H, W, C]/[C, H, W], true by default + is_channel_first: true + # Whether to divide by 255, true by default + is_scale: false + # mean (list): the pixel mean + mean: [104, 117, 123] + # std (list): the pixel standard deviation + std: [1, 1, 1] + # Number of samples, -1 represents all samples. -1 by default + samples: -1 + # Whether samples should be shuffled, true by default + shuffle: true + # Whether to use multi-process, false by default + use_process: true + +# Data feed module for Eval +SSDEvalFeed: + # Batch size per device + batch_size: 32 + # list of batch transformations to use + batch_transforms: [] + # The data buffer size + bufsize: 10 + # Dataset module + dataset: + # Dataset directory + dataset_dir: dataset/coco + # Annotation file path + annotation: annotations/instances_val2017.json + # Directory where image files are stored + image_dir: val2017 + # Drop last batch if size is uneven, false by default + drop_last: true + # List of data fields needed + fields: [image, im_shape, im_id, gt_box, gt_label, is_difficult] + # list of image dims + image_shape: [3, 300, 300] + # number of worker processes (or threads) + num_workers: 8 + # List of sample transformations to use + sample_transforms: + # Transform the image data to numpy format. + - !DecodeImage + # whether to convert BGR to RGB + to_rgb: true # default: true + # whether or not to mixup image and gt_bbox/gt_score + with_mixup: false # default: false + # Transform the bounding box's coordinates to [0,1]. + - !NormalizeBox {} + # Rescale image to the specified target size, and capped at max_size if max_size != 0. + # If target_size is a list, a scale is randomly selected from it as the target size. 
+ - !ResizeImage + # Resize method, cv2.INTER_LINEAR(1) by default + interp: 1 + # max_size (int): the max size of image + max_size: 0 + # target_size (int|list): the target size of image's short side, + # multi-scale training is adopted when type is list. + target_size: 300 + # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method + use_cv2: false + - !Permute + # The format of image, [H, W, C]/[C, H, W], true by default + channel_first: true + # to_bgr (bool): confirm whether to convert RGB to BGR + to_bgr: true + # Normalize the image. + # Operators: + # 1.(optional) Scale the image to [0,1] + # 2. Each pixel minus mean and is divided by std + - !NormalizeImage + # The format of image, [H, W, C]/[C, H, W], true by default + is_channel_first: true + # Whether divide by 255, true by default + is_scale: false + # mean (list): the pixel mean + mean: [104, 117, 123] + # std (list): the pixel variance + std: [1, 1, 1] + # Number of samples, -1 represents all samples. -1 by default + samples: -1 + # If samples should be shuffled, true by default + shuffle: false + # If use multi-process, false by default + use_process: false + +# Data feed module for test +SSDTestFeed: + # Batch size per device + batch_size: 1 + # list of batch transformations to use + batch_transforms: [] + # The data buffer size + bufsize: 10 + # Dataset module + dataset: + # Annotation file path + annotation: dataset/coco/annotations/instances_val2017.json + # Drop last batch if size is uneven, false by default + drop_last: false + # List of data fields needed + fields: [image, im_id] + # list of image dims + image_shape: [3, 300, 300] + # number of workers processes (or threads) + num_workers: 8 + # List of sample transformations to use + sample_transforms: + # Transform the image data to numpy format. 
+ - !DecodeImage + # whether to convert BGR to RGB + to_rgb: true # default: true + # whether or not to mixup image and gt_bbbox/gt_score + with_mixup: false # default: false + # Rescale image to the specified target size, and capped at max_size if max_size != 0. + # If target_size is list, selected a scale randomly as the specified target size. + - !ResizeImage + # Resize method, cv2.INTER_LINEAR(1) by default + interp: 1 + # max_size (int): the max size of image + max_size: 0 + # target_size (int|list): the target size of image's short side, + # multi-scale training is adopted when type is list. + target_size: 300 + # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method + use_cv2: false + - !Permute + # The format of image, [H, W, C]/[C, H, W], true by default + channel_first: true + # to_bgr (bool): confirm whether to convert RGB to BGR + to_bgr: true + # Normalize the image. + # Operators: + # 1.(optional) Scale the image to [0,1] + # 2. Each pixel minus mean and is divided by std + - !NormalizeImage + # The format of image, [H, W, C]/[C, H, W], true by default + is_channel_first: true + # Whether divide by 255, true by default + is_scale: false + # mean (list): the pixel mean + mean: [104, 117, 123] + # std (list): the pixel variance + std: [1, 1, 1] + # Number of samples, -1 represents all samples. -1 by default + samples: -1 + # If samples should be shuffled, true by default + shuffle: false + # If use multi-process, false by default + use_process: false diff --git a/docs/config_example/yolov3_darknet.yml b/docs/config_example/yolov3_darknet.yml new file mode 100644 index 0000000000000000000000000000000000000000..65f479b72bb2504f84bbed5fed20afd78bad0ea9 --- /dev/null +++ b/docs/config_example/yolov3_darknet.yml @@ -0,0 +1,323 @@ +# Architecture of detection, which is also the prefix of data feed module +architecture: YOLOv3 + +# Data feed module. 
+train_feed: YoloTrainFeed +eval_feed: YoloEvalFeed +test_feed: YoloTestFeed + +# Use GPU or CPU, true by default. +use_gpu: true + +# Maximum number of iterations. +# In YOLOv3 model, default iteration number is to train for 270 epochs. +max_iters: 500200 + +# Smooth the log output in specified iterations, 20 by default. +log_smooth_window: 20 + +# The iteration interval at which to display the training log. +log_iter: 20 + +# The directory to save models. +save_dir: output + +# Snapshot period. If training and testing at the same time, evaluate model at each snapshot_iter. 2000 by default. +snapshot_iter: 2000 + +# Evaluation method, COCO and VOC are available. +metric: COCO + +# The path of pretrained weights. If a URL is provided, it will be downloaded and decompressed automatically. +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar +# The path of final model for evaluation and test. +weights: output/yolov3_darknet/model_final + +# Number of classes, 80 for COCO and 20 for VOC. +num_classes: 80 + + +# YOLOv3 architecture, see https://arxiv.org/abs/1804.02767 +YOLOv3: + backbone: DarkNet + yolo_head: YOLOv3Head + +# Backbone module +DarkNet: + # Batch normalization type in training, sync_bn for synchronized batch normalization + norm_type: sync_bn + # L2 weight decay factor of batch normalization layer + norm_decay: 0. 
+ # Darknet convolution layer number, only 53 is supported currently + depth: 53 + +# YOLOv3 head module +# Generate bbox output in evaluation and calculate loss in training +# fluid.layers.yolov3_loss / fluid.layers.yolo_box +YOLOv3Head: + # anchor mask of 3 yolo_loss/yolo_box layers, each yolo_loss/yolo_box layer has 3 anchors + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + # 9 anchors for 3 yolo_loss/yolo_box layers, generated by performing k-means on COCO gtboxes + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + # L2 weight decay factor of batch normalization layer + norm_decay: 0. + # Ignore threshold for yolo_loss layer, 0.7 by default. + # Objectness loss will be ignored if a prediction bbox overlaps a gtbox over ignore_thresh. + ignore_thresh: 0.7 + # Whether to use label smoothing in yolo_loss layer + # It is recommended to set this to true only when num_classes is very big + label_smooth: true + # fluid.layers.multiclass_nms + # Non-max suppression for output prediction boxes, see multiclass_nms for the following parameters. + # 1. Select detection bounding boxes with high scores larger than score_threshold. + # 2. Select detection bounding boxes with the largest nms_top_k scores. + # 3. Suppress detection bounding boxes which have high IoU overlap with already selected boxes. + # 4. Keep the top keep_top_k detection bounding boxes as output. + nms: + # Which label is regarded as background and will be ignored, -1 for no background label. + background_label: -1 + # Number of total bboxes to be kept per image after NMS step. + keep_top_k: 100 + # IoU threshold for NMS, bbox with IoU over nms_threshold will be suppressed. + nms_threshold: 0.45 + # Maximum number of detections to be kept according to the confidences after filtering detections based on score_threshold. + nms_top_k: 1000 + # Whether detections are normalized. + normalized: false + # Threshold to filter out bounding boxes with low confidence score. 
+ score_threshold: 0.01 + +# Learning rate configuration +LearningRate: + # Base learning rate for training, 1e-3 by default. + base_lr: 0.001 + # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default + schedulers: + # fluid.layers.piecewise_decay + # each milestone stage decay gamma + - !PiecewiseDecay + gamma: 0.1 + milestones: + - 400000 + - 450000 + # fluid.layers.linear_lr_warmup + # Start learning rate equals to base_lr * start_factor + - !LinearWarmup + start_factor: 0. + steps: 4000 + +# Optimizer module +OptimizerBuilder: + # fluid.optimizer + optimizer: + momentum: 0.9 + type: Momentum + # fluid.regularizer + regularizer: + factor: 0.0005 + type: L2 + +# Data feed module for training +YoloTrainFeed: + # Batch size per device, 8 by default + batch_size: 8 + # Dataset module + dataset: + # Dataset directory. + dataset_dir: dataset/coco + # Annotation file path. + annotation: annotations/instances_train2017.json + # Directory where image files are stored. + image_dir: train2017 + # List of data fields needed. + fields: [image, gt_box, gt_label, gt_score] + # List of image dims + image_shape: [3, 608, 608] + # List of sample transformations to use. + sample_transforms: + # read image data and decode to numpy. + - !DecodeImage + to_rgb: true + # YOLOv3 use image mixup in training. + with_mixup: true + # Mixup two images in training, a trick to improve performance. + - !MixupImage + alpha: 1.5 # default: 1.5 + beta: 1.5 # default: 1.5 + # Normalize gtbox to range [0, 1] + - !NormalizeBox {} + # Random color distort: brightness, contrast, hue, saturation. + - !RandomDistort + brightness_lower: 0.5 + brightness_prob: 0.5 + brightness_upper: 1.5 + contrast_lower: 0.5 + contrast_prob: 0.5 + contrast_upper: 1.5 + count: 4 + hue_lower: -18 + hue_prob: 0.5 + hue_upper: 18 + is_order: false + saturation_lower: 0.5 + saturation_prob: 0.5 + saturation_upper: 1.5 + # Random Expand the image and modify bounding box. + # Operators: + # 1. 
Scale the image width and height. + # 2. Construct new images with new height and width. + # 3. Fill the new image with the mean. + # 4. Put original image into new image. + # 5. Rescale the bounding box. + # 6. Determine if the new bbox is satisfied in the new image. + - !ExpandImage + # max expand ratio, default 4.0. + max_ratio: 4.0 + mean: [123.675, 116.28, 103.53] + prob: 0.5 + # Random Crop the image and modify bounding box. + # Operators: + # 1. Scale the image width and height. + # 2. Crop the image according to a random sample. + # 3. Rescale the bounding box. + # 4. Determine if the new bbox is satisfied in the new image. + - !CropImage + # Re-crop the image if there is no bbox in the cropped output image. + avoid_no_bbox: true + batch_sampler: [[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0], + [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]] + # Whether all bboxes should satisfy the IoU constraints. + satisfy_all: false + # Interpolate image to target_size with a randomly chosen interpolation method: + # cv2.INTER_NEAREST, + # cv2.INTER_LINEAR, + # cv2.INTER_AREA, + # cv2.INTER_CUBIC, + # cv2.INTER_LANCZOS4, + - !RandomInterpImage + max_size: 0 + target_size: 608 + # Flip the image and bounding box. + # Operators: + # 1. Flip the image numpy. + # 2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!) + # 3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!) + - !RandomFlipImage + is_mask_flip: false + is_normalized: true + prob: 0.5 + # Normalize the image. + # Operators: + # 1.(optional) Scale the image to [0,1] + # 2. 
Each pixel minus mean and is divided by std + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + # Change data layout to [C, H, W]. + - !Permute + channel_first: true + to_bgr: false + # List of batch transformations to use. + batch_transforms: + # Random reshape images in each mini-batch to different shapes. + - !RandomShape + sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608] + # YOLOv3 read gtbox into zero padded tensor with max box number as 50. + num_max_boxes: 50 + # YOLOv3 read gtlabel without regarding backgroud as label 0. + with_background: false + # Number of samples, -1 represents all samples. -1 by default. + samples: -1 + # Whether samples should be shuffled, true by default. + shuffle: true + # Whether drop last images which less than a batch. + drop_last: true + # Whether use multi-process reader in training. + use_process: true + # Use multi-process reader number. + num_workers: 8 + # Buffer size for reader. + bufsize: 128 + # Mixup image epoch number. + mixup_epoch: 250 + +# Data feed module for evaluation +YoloEvalFeed: + batch_size: 8 + dataset: + dataset_dir: dataset/coco + annotation: annotations/instances_val2017.json + image_dir: val2017 + batch_transforms: [] + fields: [image, im_size, im_id, gt_box, gt_label, is_difficult] + image_shape: [3, 608, 608] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + # Rescale image to the specified target size, and capped at max_size if max_size != 0. + # If target_size is list, selected a scale randomly as the specified target size. 
+ - !ResizeImage + interp: 2 # 2 for cv2.INTER_CUBIC + max_size: 0 + target_size: 608 + use_cv2: true + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - !Permute + channel_first: true + to_bgr: false + num_max_boxes: 50 + samples: -1 + shuffle: false + drop_last: false + # Use multi-thread reader in evaluation mode. + use_process: false + # Thread number for multi-thread reader. + num_workers: 8 + with_background: false + +# Data feed module for test +YoloTestFeed: + batch_size: 1 + dataset: + annotation: dataset/coco/annotations/instances_val2017.json + batch_transforms: [] + fields: [image, im_size, im_id] + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !ResizeImage + interp: 2 + max_size: 0 + target_size: 608 + use_cv2: true + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - !Permute + channel_first: true + to_bgr: false + num_max_boxes: 50 + samples: -1 + shuffle: false + drop_last: false + # Use multi-thread reader in test mode. + use_process: false + num_workers: 8 + with_background: false diff --git a/docs/images/bench_ssd_yolo_infer.png b/docs/images/bench_ssd_yolo_infer.png new file mode 100644 index 0000000000000000000000000000000000000000..f81600b14cbe9af4b12f86c574ce2dffa937774f Binary files /dev/null and b/docs/images/bench_ssd_yolo_infer.png differ diff --git a/inference/CMakeLists.txt b/inference/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed610da047316d0b08d73d51e0223a06180b4026 --- /dev/null +++ b/inference/CMakeLists.txt @@ -0,0 +1,272 @@ +cmake_minimum_required(VERSION 3.0) +project(cpp_inference_demo CXX C) +message("cmake module path: ${CMAKE_MODULE_PATH}") +message("cmake root path: ${CMAKE_ROOT}") +option(WITH_MKL "Compile demo with MKL/OpenBlas support,defaultuseMKL." 
ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use GPU." ON) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) + +SET(PADDLE_DIR "" CACHE PATH "Location of libraries") +SET(OPENCV_DIR "" CACHE PATH "Location of libraries") +SET(CUDA_LIB "" CACHE PATH "Location of libraries") + + +include(external-cmake/yaml-cpp.cmake) + +# Replace /MD with /MT in every CMAKE_CXX_FLAGS* variable listed below. +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if (WITH_MKL) + ADD_DEFINITIONS(-DUSE_MKL) +endif() + +# The expansions below are quoted so that an empty value still forms a valid +# comparison and the FATAL_ERROR hint is actually reached. +if (NOT DEFINED PADDLE_DIR OR "${PADDLE_DIR}" STREQUAL "") + message(FATAL_ERROR "please set PADDLE_DIR with -DPADDLE_DIR=/path/paddle_inference_dir") +endif() + +if (NOT DEFINED OPENCV_DIR OR "${OPENCV_DIR}" STREQUAL "") + message(FATAL_ERROR "please set OPENCV_DIR with -DOPENCV_DIR=/path/opencv") +endif() + +include_directories("${CMAKE_SOURCE_DIR}/") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/src/ext-yaml-cpp/include") +include_directories("${PADDLE_DIR}/") +include_directories("${PADDLE_DIR}/third_party/install/protobuf/include") +include_directories("${PADDLE_DIR}/third_party/install/glog/include") +include_directories("${PADDLE_DIR}/third_party/install/gflags/include") +include_directories("${PADDLE_DIR}/third_party/install/xxhash/include") +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/include") + include_directories("${PADDLE_DIR}/third_party/install/snappy/include") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/include") + include_directories("${PADDLE_DIR}/third_party/install/snappystream/include") +endif() +include_directories("${PADDLE_DIR}/third_party/install/zlib/include") 
+include_directories("${PADDLE_DIR}/third_party/boost") +include_directories("${PADDLE_DIR}/third_party/eigen3") + +if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappy/lib") +endif() +if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + link_directories("${PADDLE_DIR}/third_party/install/snappystream/lib") +endif() + +link_directories("${PADDLE_DIR}/third_party/install/zlib/lib") +link_directories("${PADDLE_DIR}/third_party/install/protobuf/lib") +link_directories("${PADDLE_DIR}/third_party/install/glog/lib") +link_directories("${PADDLE_DIR}/third_party/install/gflags/lib") +link_directories("${PADDLE_DIR}/third_party/install/xxhash/lib") +link_directories("${PADDLE_DIR}/paddle/lib/") +link_directories("${CMAKE_CURRENT_BINARY_DIR}/ext/yaml-cpp/lib") +link_directories("${CMAKE_CURRENT_BINARY_DIR}") +if (WIN32) + include_directories("${PADDLE_DIR}/paddle/fluid/inference") + link_directories("${PADDLE_DIR}/paddle/fluid/inference") + include_directories("${OPENCV_DIR}/build/include") + include_directories("${OPENCV_DIR}/opencv/build/include") + link_directories("${OPENCV_DIR}/build/x64/vc14/lib") +else () + include_directories("${PADDLE_DIR}/paddle/include") + link_directories("${PADDLE_DIR}/paddle/lib") + include_directories("${OPENCV_DIR}/include") + link_directories("${OPENCV_DIR}/lib") +endif () + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + if (WITH_STATIC_LIB) + safe_set_static_flag() + add_definitions(-DSTATIC_LIB) + endif() +else() + # Optimization level 2 needs a capital O; a lowercase -o names an output file instead. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") +endif() + +# TODO let users define cuda lib path +if (WITH_GPU) + if 
(NOT DEFINED CUDA_LIB OR "${CUDA_LIB}" STREQUAL "") + # The expansion is quoted so that an empty CUDA_LIB still forms a valid comparison. + message(FATAL_ERROR "please set CUDA_LIB with -DCUDA_LIB=/path/cuda-8.0/lib64") + endif() + # CUDNN_LIB is only required for non-Windows builds. + if (NOT WIN32) + if (NOT DEFINED CUDNN_LIB) + message(FATAL_ERROR "please set CUDNN_LIB with -DCUDNN_LIB=/path/cudnn_v7.4/cuda/lib64") + endif() + endif(NOT WIN32) +endif() + + +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) + include_directories("${PADDLE_DIR}/third_party/install/tensorrt/include") + link_directories("${PADDLE_DIR}/third_party/install/tensorrt/lib") + endif() +endif(NOT WIN32) + +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_DIR}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_MKL) + include_directories("${PADDLE_DIR}/third_party/install/mklml/include") + if (WIN32) + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.lib + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.lib) + else () + set(MATH_LIB ${PADDLE_DIR}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif () + set(MKLDNN_PATH "${PADDLE_DIR}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + if (WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else () + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif () + endif() +else() + set(MATH_LIB ${PADDLE_DIR}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +if(WITH_STATIC_LIB) + if (WIN32) + set(DEPS + ${PADDLE_DIR}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + else () + set(DEPS + ${PADDLE_DIR}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +else() + 
if (WIN32) + set(DEPS + ${PADDLE_DIR}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + else () + set(DEPS + ${PADDLE_DIR}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf yaml-cpp z xxhash + ${EXTERNAL_LIB}) + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + opencv_world346 glog libyaml-cppmt gflags_static libprotobuf zlibstatic xxhash ${EXTERNAL_LIB}) + set(DEPS ${DEPS} libcmt shlwapi) + if (EXISTS "${PADDLE_DIR}/third_party/install/snappy/lib") + set(DEPS ${DEPS} snappy) + endif() + if(EXISTS "${PADDLE_DIR}/third_party/install/snappystream/lib") + set(DEPS ${DEPS} snappystream) + endif() +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (USE_TENSORRT) + set(DEPS ${DEPS} ${PADDLE_DIR}/third_party/install/tensorrt/lib/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${PADDLE_DIR}/third_party/install/tensorrt/lib/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(OPENCV_LIB_DIR ${OPENCV_DIR}/lib) + if(EXISTS "${OPENCV_LIB_DIR}") + message("OPENCV_LIB:" ${OPENCV_LIB_DIR}) + else() + set(OPENCV_LIB_DIR ${OPENCV_DIR}/lib64) + message("OPENCV_LIB:" ${OPENCV_LIB_DIR}) + endif() + + set(OPENCV_3RD_LIB_DIR ${OPENCV_DIR}/share/OpenCV/3rdparty/lib) + if(EXISTS 
"${OPENCV_3RD_LIB_DIR}") + message("OPENCV_3RD_LIB_DIR:" ${OPENCV_3RD_LIB_DIR}) + else() + set(OPENCV_3RD_LIB_DIR ${OPENCV_DIR}/share/OpenCV/3rdparty/lib64) + message("OPENCV_3RD_LIB_DIR:" ${OPENCV_3RD_LIB_DIR}) + endif() + + set(DEPS ${DEPS} ${OPENCV_LIB_DIR}/libopencv_imgcodecs${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_LIB_DIR}/libopencv_imgproc${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_LIB_DIR}/libopencv_core${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_LIB_DIR}/libopencv_highgui${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/libIlmImf${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/liblibjasper${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/liblibpng${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/liblibtiff${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/libittnotify${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/liblibjpeg-turbo${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/liblibwebp${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/libzlib${CMAKE_STATIC_LIBRARY_SUFFIX}) + if(EXISTS "${OPENCV_3RD_LIB_DIR}/libippiw${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/libippiw${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + if(EXISTS "${OPENCV_3RD_LIB_DIR}/libippicv${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(DEPS ${DEPS} ${OPENCV_3RD_LIB_DIR}/libippicv${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() +# message(${CMAKE_CXX_FLAGS}) +# set(CMAKE_CXX_FLAGS "-g ${CMAKE_CXX_FLAGS}") + +SET(PADDLESEG_INFERENCE_SRCS preprocessor/preprocessor.cpp + preprocessor/preprocessor_detection.cpp predictor/detection_predictor.cpp + utils/detection_result.pb.cc) + +ADD_LIBRARY(libpaddleseg_inference STATIC ${PADDLESEG_INFERENCE_SRCS}) +target_link_libraries(libpaddleseg_inference ${DEPS}) + +add_executable(detection_demo 
detection_demo.cpp) + +ADD_DEPENDENCIES(libpaddleseg_inference ext-yaml-cpp) +ADD_DEPENDENCIES(detection_demo ext-yaml-cpp libpaddleseg_inference) +target_link_libraries(detection_demo ${DEPS} libpaddleseg_inference) + +if (WIN32) + # After building on Windows, copy the mklml, libiomp5md and mkldnn DLLs into both the current directory and ./release. + add_custom_command(TARGET detection_demo POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_DIR}/third_party/install/mkldnn/lib/mkldnn.dll ./release/mkldnn.dll + ) +endif() + +execute_process(COMMAND cp -r ${CMAKE_SOURCE_DIR}/images ${CMAKE_SOURCE_DIR}/conf ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/inference/LICENSE b/inference/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/inference/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000000000000000000000000000000000..302b5fb3818df8c1ca871095ff368129ce1292fd --- /dev/null +++ b/inference/README.md @@ -0,0 +1,171 @@ +# PaddleDetection C++预测部署方案 + +## 本文档结构 + +[1.说明](#1说明) + +[2.主要目录和文件](#2主要目录和文件) + +[3.编译](#3编译) + +[4.预测并可视化结果](#4预测并可视化结果) + + + + +## 1.说明 + +本目录提供一个跨平台的图像检测模型的C++预测部署方案,用户通过一定的配置,加上少量的代码,即可把模型集成到自己的服务中,完成相应的图像检测任务。 + +主要设计的目标包括以下四点: +- 跨平台,支持在 Windows 和 Linux 完成编译、开发和部署 +- 可扩展性,支持用户针对新模型开发自己特殊的数据预处理等逻辑 +- 高性能,除了`PaddlePaddle`自身带来的性能优势,我们还针对图像检测的特点对关键步骤进行了性能优化 +- 支持多种常见的图像检测模型,如YOLOv3, Faster-RCNN, Faster-RCNN+FPN,用户通过少量配置即可加载模型完成常见检测任务 + +## 2.主要目录和文件 + +```bash +deploy +├── detection_demo.cpp # 完成图像检测预测任务C++代码 +│ +├── conf +│ ├── detection_rcnn.yaml #示例faster rcnn 目标检测配置 +│ └── detection_rcnn_fpn.yaml #示例faster rcnn + fpn目标检测配置 +├── images +│ └── detection_rcnn # 示例faster rcnn + fpn目标检测测试图片目录 +├── tools +│ └── vis.py # 示例图像检测结果可视化脚本 +├── docs +│ ├── linux_build.md # Linux 编译指南 +│ ├── windows_vs2015_build.md # windows VS2015编译指南 +│ └── windows_vs2019_build.md # Windows VS2019编译指南 +│ +├── utils # 一些基础公共函数 +│ +├── preprocess # 数据预处理相关代码 +│ +├── predictor # 模型加载和预测相关代码 +│ +├── CMakeList.txt # cmake编译入口文件 +│ +└── external-cmake # 依赖的外部项目cmake(目前仅有yaml-cpp) + +``` + +## 3.编译 +支持在`Windows`和`Linux`平台编译和使用: +- [Linux 编译指南](./docs/linux_build.md) +- [Windows 使用 Visual 
Studio 2019 Community 编译指南](./docs/windows_vs2019_build.md) +- [Windows 使用 Visual Studio 2015 编译指南](./docs/windows_vs2015_build.md) + +`Windows`上推荐使用最新的`Visual Studio 2019 Community`直接编译`CMake`项目。 + +## 4.预测并可视化结果 + +完成编译后,便生成了需要的可执行文件和链接库。这里以我们基于`faster rcnn`检测模型为例,介绍部署图像检测模型的通用流程。 + +### 1. 下载模型文件 +我们提供faster rcnn,faster rcnn+fpn模型用于预测coco17数据集,可在以下链接下载:[faster rcnn示例模型下载地址](https://paddleseg.bj.bcebos.com/inference/faster_rcnn_pp50.zip), + [faster rcnn + fpn示例模型下载地址](https://paddleseg.bj.bcebos.com/inference/faster_rcnn_pp50_fpn.zip)。 + +下载并解压,解压后目录结构如下: +``` +faster_rcnn_pp50/ +├── __model__ # 模型文件 +│ +└── __params__ # 参数文件 +``` +解压后把上述目录拷贝到合适的路径: + +**假设**`Windows`系统上,我们模型和参数文件所在路径为`D:\projects\models\faster_rcnn_pp50`。 + +**假设**`Linux`上对应的路径则为`/root/projects/models/faster_rcnn_pp50/`。 + + +### 2. 修改配置 + +`inference`源代码(即本目录)的`conf`目录下提供了示例基于faster rcnn的配置文件`detection_rcnn.yaml`, 相关的字段含义和说明如下: + +```yaml +DEPLOY: + # 是否使用GPU预测 + USE_GPU: 1 + # 模型和参数文件所在目录路径 + MODEL_PATH: "/root/projects/models/faster_rcnn_pp50" + # 模型文件名 + MODEL_FILENAME: "__model__" + # 参数文件名 + PARAMS_FILENAME: "__params__" + # 预测图片的标准输入,尺寸不一致会resize + EVAL_CROP_SIZE: (608, 608) + # resize方式,支持 UNPADDING和RANGE_SCALING + RESIZE_TYPE: "RANGE_SCALING" + # 短边对齐的长度,仅在RANGE_SCALING下有效 + TARGET_SHORT_SIZE : 800 + # 均值 + MEAN: [0.4647, 0.4647, 0.4647] + # 方差 + STD: [0.0834, 0.0834, 0.0834] + # 图片类型, rgb或者rgba + IMAGE_TYPE: "rgb" + # 像素分类数 + NUM_CLASSES: 1 + # 通道数 + CHANNELS : 3 + # 预处理器, 目前提供图像检测的通用处理类DetectionPreProcessor + PRE_PROCESSOR: "DetectionPreProcessor" + # 预测模式,支持 NATIVE 和 ANALYSIS + PREDICTOR_MODE: "ANALYSIS" + # 每次预测的 batch_size + BATCH_SIZE : 3 + # 长边伸缩的最大长度,-1代表无限制。 + RESIZE_MAX_SIZE: 1333 + # 输入的tensor数量。 + FEEDS_SIZE: 3 + +``` +修改字段`MODEL_PATH`的值为你在**上一步**下载并解压的模型文件所放置的目录即可。更多配置文件字段介绍,请参考文档[预测部署方案配置文件说明](./docs/configuration.md)。 + +### 3. 
执行预测 + +在终端中切换到生成的可执行文件所在目录为当前目录(Windows系统为`cmd`)。 + +`Linux` 系统中执行以下命令: +```shell +./detection_demo --conf=conf/detection_rcnn.yaml --input_dir=images/detection_rcnn +``` +`Windows` 中执行以下命令: +```shell +.\detection_demo.exe --conf=conf\detection_rcnn.yaml --input_dir=images\detection_rcnn\ +``` + + +预测使用的两个命令参数说明如下: + +| 参数 | 含义 | +|-------|----------| +| conf | 模型配置的Yaml文件路径 | +| input_dir | 需要预测的图片目录 | + + +配置文件说明请参考上一步,样例程序会扫描input_dir目录下的所有图片,并为每一张图片生成对应的预测结果,输出到屏幕,并在`X`同一目录下保存到`X.pb`文件(X为对应图片的文件名)。可使用工具脚本vis.py将检测结果可视化。 + +**检测结果可视化** + +运行可视化脚本时,只需输入命令行参数图片路径、检测结果pb文件路径、目标框阈值以及类别-标签映射文件路径即可得到可视化的图片`X.png` (tools目录下提供coco17的类别标签映射文件coco17.json)。 + +```bash +python vis.py --img_path=../build/images/detection_rcnn/000000087038.jpg --img_result_path=../build/images/detection_rcnn/000000087038.jpg.pb --threshold=0.1 --c2l_path=coco17.json +``` + +检测结果(每个图片的结果用空行隔开) + +```原图:``` + +![原图](./demo_images/000000087038.jpg) + +```检测结果图:``` + +![检测结果](./demo_images/000000087038.jpg.png) + diff --git a/inference/conf/detection_rcnn.yaml b/inference/conf/detection_rcnn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50c23fbb3e53ff159844e65da4ed194e169cffb6 --- /dev/null +++ b/inference/conf/detection_rcnn.yaml @@ -0,0 +1,18 @@ +DEPLOY: + USE_GPU: 1 + MODEL_PATH: "/root/projects/models/faster_rcnn_pp50" + MODEL_FILENAME: "__model__" + PARAMS_FILENAME: "__params__" + EVAL_CROP_SIZE: (608, 608) + RESIZE_TYPE: "RANGE_SCALING" + TARGET_SHORT_SIZE : 800 + MEAN: [0.485, 0.456, 0.406] + STD: [0.229, 0.224, 0.225] + IMAGE_TYPE: "rgb" + NUM_CLASSES: 1 + CHANNELS : 3 + PRE_PROCESSOR: "DetectionPreProcessor" + PREDICTOR_MODE: "ANALYSIS" + BATCH_SIZE : 3 + RESIZE_MAX_SIZE: 1333 + FEEDS_SIZE: 3 diff --git a/inference/conf/detection_rcnn_fpn.yaml b/inference/conf/detection_rcnn_fpn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d6635ef8c2b29fb0ca9318d1ec08f1f7be037f7 --- /dev/null +++ b/inference/conf/detection_rcnn_fpn.yaml @@ -0,0
+1,19 @@ +DEPLOY: + USE_GPU: 1 + MODEL_PATH: "/root/projects/models/faster_rcnn_pp50_fpn" + MODEL_FILENAME: "__model__" + PARAMS_FILENAME: "__params__" + EVAL_CROP_SIZE: (608, 608) + RESIZE_TYPE: "RANGE_SCALING" + TARGET_SHORT_SIZE : 800 + MEAN: [0.485, 0.456, 0.406] + STD: [0.229, 0.224, 0.225] + IMAGE_TYPE: "rgb" + NUM_CLASSES: 1 + CHANNELS : 3 + PRE_PROCESSOR: "DetectionPreProcessor" + PREDICTOR_MODE: "ANALYSIS" + BATCH_SIZE : 1 + RESIZE_MAX_SIZE: 1333 + FEEDS_SIZE: 3 + COARSEST_STRIDE: 32 diff --git a/inference/demo_images/000000087038.jpg b/inference/demo_images/000000087038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f77f5d5f057b6f92dc096da704ecb8dee99bdf5 Binary files /dev/null and b/inference/demo_images/000000087038.jpg differ diff --git a/inference/demo_images/000000087038.jpg.png b/inference/demo_images/000000087038.jpg.png new file mode 100644 index 0000000000000000000000000000000000000000..aa2c63d1c3dd1ca08d517239842ce5bd40310d01 Binary files /dev/null and b/inference/demo_images/000000087038.jpg.png differ diff --git a/inference/detection_demo.cpp b/inference/detection_demo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e711ed6970358c528a3198bb6168a871d83d380 --- /dev/null +++ b/inference/detection_demo.cpp @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +DEFINE_string(conf, "", "Configuration File Path"); +DEFINE_string(input_dir, "", "Directory of Input Images"); + +int main(int argc, char** argv) { + // 0. parse args + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_conf.empty() || FLAGS_input_dir.empty()) { + std::cout << "Usage: ./predictor --conf=/config/path/to/your/model --input_dir=/directory/of/your/input/images"; + return -1; + } + // 1. create a predictor and init it with conf + PaddleSolution::DetectionPredictor predictor; + if (predictor.init(FLAGS_conf) != 0) { + LOG(FATAL) << "Fail to init predictor"; + return -1; + } + + // 2. get all the images with extension '.jpeg' at input_dir + auto imgs = PaddleSolution::utils::get_directory_images(FLAGS_input_dir, ".jpeg|.jpg|.JPEG|.JPG|.bmp|.BMP|.png|.PNG"); + + // 3. predict + predictor.predict(imgs); + return 0; +} diff --git a/inference/docs/configuration.md b/inference/docs/configuration.md new file mode 100644 index 0000000000000000000000000000000000000000..cb2f761f792009c1accb52048d6e4f2cdcb6ad29 --- /dev/null +++ b/inference/docs/configuration.md @@ -0,0 +1,75 @@ +# 预测部署方案配置文件说明 +## 基本概念 +预测部署方案的配置文件旨在给用户提供一个预测部署方案定制化接口。用户仅需理解该配置文件相关字段的含义,无需编写任何代码,即可定制化预测部署方案。为了更好地表达每个字段的含义,首先介绍配置文件中字段的类型。 + +### 字段类型 +- **required**: 表明该字段必须显式定义,否则无法正常启动预测部署程序。 +- **optional**: 表明该字段可忽略不写,预测部署系统会提供默认值,相关默认值将在下文介绍。 + +### 字段值类型 +- **int**:表明该字段必须赋予整型类型的值。 +- **string**:表明该字段必须赋予字符串类型的值。 +- **list**:表明该字段必须赋予列表的值。 +- **tuple**: 表明该字段必须赋予双元素元组的值。 + +## 字段介绍 + +```yaml +# 预测部署时所有配置字段需在DEPLOY字段下 +DEPLOY: + # 类型:required int + # 含义:是否使用GPU预测。 0:不使用 1:使用 + USE_GPU: 1 + # 类型:required string + # 含义:模型和参数文件所在目录 + MODEL_PATH: "/path/to/model_directory" + # 类型:required string + # 含义:模型文件名 + MODEL_FILENAME: "__model__" + # 类型:required string + # 含义:参数文件名 + PARAMS_FILENAME: "__params__" + # 类型:optional string + # 含义:图像resize的类型。支持 UNPADDING 和 RANGE_SCALING模式。默认是UNPADDING模式。 + RESIZE_TYPE: "UNPADDING" + # 类型:required tuple + # 
含义:当使用UNPADDING模式时,会将图像直接resize到该尺寸。 + EVAL_CROP_SIZE: (513, 513) + # 类型:optional int + # 含义:当使用RANGE_SCALING模式时,图像短边需要对齐该字段的值,长边会同比例 + # 的缩放,从而在保持图像长宽比例不变的情况下resize到新的尺寸。默认值为0。 + TARGET_SHORT_SIZE: 800 + # 类型:optional int + # 含义: 当使用RANGE_SCALING模式时,长边不能缩放到比该字段的值大。默认值为0。 + RESIZE_MAX_SIZE: 1333 + # 类型:required list + # 含义:图像进行归一化预处理时的均值 + MEAN: [104.008, 116.669, 122.675] + # 类型:required list + # 含义:图像进行归一化预处理时的标准差 + STD: [1.0, 1.0, 1.0] + # 类型:string + # 含义:图片类型, rgb 或者 rgba + IMAGE_TYPE: "rgb" + # 类型:required int + # 含义:图像分类类型数 + NUM_CLASSES: 2 + # 类型:required int + # 含义:图片通道数 + CHANNELS : 3 + # 类型:required string + # 含义:预处理方式,目前提供图像检测的通用预处理类DetectionPreProcessor. + PRE_PROCESSOR: "DetectionPreProcessor" + # 类型:required string + # 含义:预测模式,支持 NATIVE 和 ANALYSIS + PREDICTOR_MODE: "ANALYSIS" + # 类型:required int + # 含义:每次预测的 batch_size + BATCH_SIZE : 3 + # 类型:optional int + # 含义: 输入张量的个数。大部分模型不需要设置。 默认值为1. + FEEDS_SIZE: 2 + # 类型: optional int + # 含义: 将图像的边变为该字段的值的整数倍。默认值为1。 + COARSEST_STRIDE: 32 +``` \ No newline at end of file diff --git a/inference/docs/linux_build.md b/inference/docs/linux_build.md new file mode 100644 index 0000000000000000000000000000000000000000..2ad9e46383123efee47b941f97c8e7690c7b95d6 --- /dev/null +++ b/inference/docs/linux_build.md @@ -0,0 +1,84 @@ +# Linux平台 编译指南 + +## 说明 +本文档在 `Linux`平台使用`GCC 4.8.5` 和 `GCC 4.9.4`测试过,如果需要使用更高G++版本编译使用,则需要重新编译Paddle预测库,请参考: [从源码编译Paddle预测库](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_usage/deploy/inference/build_and_install_lib_cn.html#id15)。 + +## 前置条件 +* G++ 4.8.2 ~ 4.9.4 +* CUDA 8.0/ CUDA 9.0 +* CMake 3.0+ + +请确保系统已经安装好上述基本软件,**下面所有示例以工作目录为 `/root/projects/`演示**。 + +### Step1: 下载代码 + +1. `mkdir -p /root/projects/paddle_models && cd /root/projects/paddle_models` +2.
`git clone https://github.com/PaddlePaddle/models.git` + +`C++`预测代码在`/root/projects/paddle_models/models/PaddleCV/PaddleDetection/inference` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 fluid_inference + +目前仅支持`CUDA 8` 和 `CUDA 9`,请点击 [PaddlePaddle预测库下载地址](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_usage/deploy/inference/build_and_install_lib_cn.html)下载对应的版本(develop版本)。 + + +下载并解压后`/root/projects/fluid_inference`目录包含内容为: +``` +fluid_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +### Step3: 安装配置OpenCV + +```shell +# 0. 切换到/root/projects目录 +cd /root/projects +# 1. 下载OpenCV3.4.6版本源代码 +wget -c https://paddleseg.bj.bcebos.com/inference/opencv-3.4.6.zip +# 2. 解压 +unzip opencv-3.4.6.zip && cd opencv-3.4.6 +# 3. 创建build目录并编译, 这里安装到/root/projects/opencv3目录 +mkdir build && cd build +cmake .. -DCMAKE_INSTALL_PREFIX=/root/projects/opencv3 -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DWITH_IPP=OFF -DBUILD_IPP_IW=OFF -DWITH_LAPACK=OFF -DWITH_EIGEN=OFF -DCMAKE_INSTALL_LIBDIR=lib64 -DWITH_ZLIB=ON -DBUILD_ZLIB=ON -DWITH_JPEG=ON -DBUILD_JPEG=ON -DWITH_PNG=ON -DBUILD_PNG=ON -DWITH_TIFF=ON -DBUILD_TIFF=ON +make -j4 +make install +``` + +**注意:** 上述操作完成后,`opencv` 被安装在 `/root/projects/opencv3` 目录。 + +### Step4: 编译 + +`CMake`编译时,涉及到四个编译参数用于指定核心依赖库的路径, 它们的定义如下: + +| 参数名 | 含义 | +| ---- | ---- | +| CUDA_LIB | cuda的库路径 | +| CUDNN_LIB | cuDnn的库路径| +| OPENCV_DIR | OpenCV的安装路径 | +| PADDLE_DIR | Paddle预测库的路径 | + +执行下列操作时,**注意**把对应的参数改为你的上述依赖库实际路径: + +```shell +cd /root/projects/paddle_models/models/PaddleCV/PaddleDetection/inference + +mkdir build && cd build +cmake ..
-DWITH_GPU=ON -DPADDLE_DIR=/root/projects/fluid_inference -DCUDA_LIB=/usr/local/cuda/lib64/ -DOPENCV_DIR=/root/projects/opencv3/ -DCUDNN_LIB=/usr/local/cuda/lib64/ +make +``` + + +### Step5: 预测及可视化 + +执行命令: + +``` +./detection_demo --conf=/path/to/your/conf --input_dir=/path/to/your/input/data/directory +``` + +更详细说明请参考ReadMe文档: [预测和可视化部分](../README.md) diff --git a/inference/docs/windows_vs2015_build.md b/inference/docs/windows_vs2015_build.md new file mode 100644 index 0000000000000000000000000000000000000000..be1c0289d404c17e561928173b104228ea63dbda --- /dev/null +++ b/inference/docs/windows_vs2015_build.md @@ -0,0 +1,97 @@ +# Windows平台使用 Visual Studio 2015 编译指南 + +本文档步骤,我们同时在`Visual Studio 2015` 和 `Visual Studio 2019 Community` 两个版本进行了测试,我们推荐使用[`Visual Studio 2019`直接编译`CMake`项目](./windows_vs2019_build.md)。 + + +## 前置条件 +* Visual Studio 2015 +* CUDA 8.0/ CUDA 9.0 +* CMake 3.0+ + +请确保系统已经安装好上述基本软件,**下面所有示例以工作目录为 `D:\projects`演示**。 + +### Step1: 下载代码 + +1. 打开`cmd`, 执行 `cd D:\projects\paddle_models` +2. `git clone https://github.com/PaddlePaddle/models.git` + +`C++`预测库代码在`D:\projects\paddle_models\models\PaddleCV\PaddleDetection\inference` 目录,该目录不依赖任何`PaddleDetection`下其他目录。 + + +### Step2: 下载PaddlePaddle C++ 预测库 fluid_inference + +根据Windows环境,下载相应版本的PaddlePaddle预测库,并解压到`D:\projects\`目录 + +| CUDA | GPU | 下载地址 | +|------|------|--------| +| 8.0 | Yes | [fluid_inference.zip](https://bj.bcebos.com/v1/paddleseg/fluid_inference_win.zip) | +| 9.0 | Yes | [fluid_inference_cuda90.zip](https://paddleseg.bj.bcebos.com/fluid_inference_cuda9_cudnn7.zip) | + +解压后`D:\projects\fluid_inference`目录包含内容为: +``` +fluid_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +### Step3: 安装配置OpenCV + +1. 在OpenCV官网下载适用于Windows平台的3.4.6版本, [下载地址](https://sourceforge.net/projects/opencvlibrary/files/3.4.6/opencv-3.4.6-vc14_vc15.exe/download) +2. 运行下载的可执行文件,将OpenCV解压至指定目录,如`D:\projects\opencv` +3. 
配置环境变量,如下流程所示 + - 我的电脑->属性->高级系统设置->环境变量 + - 在系统变量中找到Path(如没有,自行创建),并双击编辑 + - 新建,将opencv路径填入并保存,如`D:\projects\opencv\build\x64\vc14\bin` + +### Step4: 以VS2015为例编译代码 + +以下命令需根据自己系统中各相关依赖的路径进行修改 + +* 调用VS2015, 请根据实际VS安装路径进行调整,打开cmd命令行工具执行以下命令 +* 其他vs版本(比如vs2019),请查找到对应版本的`vcvarsall.bat`路径,替换本命令即可 + +``` +call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 +``` + +* CMAKE编译工程 + * PADDLE_DIR: fluid_inference预测库路径 + * CUDA_LIB: CUDA动态库目录, 请根据实际安装情况调整 + * OPENCV_DIR: OpenCV解压目录 + +``` +# 切换到预测库所在目录 +cd /d D:\projects\paddle_models\models\PaddleCV\PaddleDetection\inference +# 创建构建目录, 重新构建只需要删除该目录即可 +mkdir build +cd build +# cmake构建VS项目 +D:\projects\paddle_models\models\PaddleCV\PaddleDetection\inference\build> cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DPADDLE_DIR=D:\projects\fluid_inference -DCUDA_LIB=D:\projects\cudalib\v9.0\lib\x64 -DOPENCV_DIR=D:\projects\opencv -T host=x64 +``` + +这里的`cmake`参数`-G`, 表示生成对应的VS版本的工程,可以根据自己的`VS`版本调整,具体请参考[cmake文档](https://cmake.org/cmake/help/v3.15/manual/cmake-generators.7.html) + +* 生成可执行文件 + +``` +D:\projects\paddle_models\models\PaddleCV\PaddleDetection\inference\build> msbuild /m /p:Configuration=Release cpp_inference_demo.sln +``` + +### Step5: 预测及可视化 + +上述`Visual Studio 2015`编译产出的可执行文件在`build\release`目录下,切换到该目录: +``` +cd /d D:\projects\paddle_models\models\PaddleCV\PaddleDetection\inference\build\release +``` + +之后执行命令: + +``` +detection_demo.exe --conf=/path/to/your/conf --input_dir=/path/to/your/input/data/directory +``` + +更详细说明请参考ReadMe文档: [预测和可视化部分](../README.md) + diff --git a/inference/docs/windows_vs2019_build.md b/inference/docs/windows_vs2019_build.md new file mode 100644 index 0000000000000000000000000000000000000000..f3f589a9a246e494439b26f516ea319c270ff9ab --- /dev/null +++ b/inference/docs/windows_vs2019_build.md @@ -0,0 +1,102 @@ +# Visual Studio 2019 Community CMake 编译指南 + +Windows 平台下,我们使用`Visual Studio 2015` 和 `Visual Studio 2019 Community` 进行了测试。微软从`Visual Studio 
2017`开始即支持直接管理`CMake`跨平台编译项目,但是直到`2019`才提供了稳定和完全的支持,所以如果你想使用CMake管理项目编译构建,我们推荐你使用`Visual Studio 2019`环境下构建。 + +你也可以使用和`VS2015`一样,通过把`CMake`项目转化成`VS`项目来编译,其中**有差别的部分**在文档中我们有说明,请参考:[使用Visual Studio 2015 编译指南](./windows_vs2015_build.md) + +## 前置条件 +* Visual Studio 2019 +* CUDA 8.0/ CUDA 9.0 +* CMake 3.0+ + +请确保系统已经安装好上述基本软件,我们使用的是`VS2019`的社区版。 + +**下面所有示例以工作目录为 `D:\projects`演示**。 + +### Step1: 下载代码 + +1. 点击下载源代码:[下载地址](https://github.com/PaddlePaddle/models/archive/develop.zip) +2. 解压,解压后目录重命名为`paddle_models` + +以下代码目录路径为`D:\projects\paddle_models` 为例。 + + +### Step2: 下载PaddlePaddle C++ 预测库 fluid_inference + +根据Windows环境,下载相应版本的PaddlePaddle预测库,并解压到`D:\projects\`目录 + +| CUDA | GPU | 下载地址 | +|------|------|--------| +| 8.0 | Yes | [fluid_inference.zip](https://bj.bcebos.com/v1/paddleseg/fluid_inference_win.zip) | +| 9.0 | Yes | [fluid_inference_cuda90.zip](https://paddleseg.bj.bcebos.com/fluid_inference_cuda9_cudnn7.zip) | + +解压后`D:\projects\fluid_inference`目录包含内容为: +``` +fluid_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` +**注意:** `CUDA90`版本解压后目录名称为`fluid_inference_cuda90`。 + +### Step3: 安装配置OpenCV + +1. 在OpenCV官网下载适用于Windows平台的3.4.6版本, [下载地址](https://sourceforge.net/projects/opencvlibrary/files/3.4.6/opencv-3.4.6-vc14_vc15.exe/download) +2. 运行下载的可执行文件,将OpenCV解压至指定目录,如`D:\projects\opencv` +3. 配置环境变量,如下流程所示 + - 我的电脑->属性->高级系统设置->环境变量 + - 在系统变量中找到Path(如没有,自行创建),并双击编辑 + - 新建,将opencv路径填入并保存,如`D:\projects\opencv\build\x64\vc14\bin` + +### Step4: 使用Visual Studio 2019直接编译CMake + +1. 打开Visual Studio 2019 Community,点击`继续但无需代码` +![step2](https://paddleseg.bj.bcebos.com/inference/vs2019_step1.png) +2. 点击: `文件`->`打开`->`CMake` +![step2.1](https://paddleseg.bj.bcebos.com/inference/vs2019_step2.png) + +选择项目代码所在路径,并打开`CMakeList.txt`: + +![step2.2](https://paddleseg.bj.bcebos.com/inference/vs2019_step3.png) + +3. 
点击:`项目`->`cpp_inference_demo的CMake设置` + +![step3](https://paddleseg.bj.bcebos.com/inference/vs2019_step4.png) + +4. 点击`浏览`,分别设置编译选项指定`CUDA`、`OpenCV`、`Paddle预测库`的路径 + +![step4](https://paddleseg.bj.bcebos.com/inference/vs2019_step5.png) + +三个编译参数的含义说明如下: + +| 参数名 | 含义 | +| ---- | ---- | +| CUDA_LIB | cuda的库路径 | +| OPENCV_DIR | OpenCV的安装路径 | +| PADDLE_DIR | Paddle预测库的路径 | + +**设置完成后**, 点击上图中`保存并生成CMake缓存以加载变量`。 + +5. 点击`生成`->`全部生成` + +![step6](https://paddleseg.bj.bcebos.com/inference/vs2019_step6.png) + + +### Step5: 预测及可视化 + +上述`Visual Studio 2019`编译产出的可执行文件在`out\build\x64-Release`目录下,打开`cmd`,并切换到该目录: + +``` +cd /d D:\projects\paddle_models\models\PaddleCV\PaddleDetection\inference\out\build\x64-Release +``` + +之后执行命令: + +``` +detection_demo.exe --conf=/path/to/your/conf --input_dir=/path/to/your/input/data/directory +``` + +更详细说明请参考ReadMe文档: [预测和可视化部分](../README.md) + diff --git a/inference/external-cmake/yaml-cpp.cmake b/inference/external-cmake/yaml-cpp.cmake new file mode 100644 index 0000000000000000000000000000000000000000..15fa2674e00d85f1db7bbdfdceeebadaf0eabf5a --- /dev/null +++ b/inference/external-cmake/yaml-cpp.cmake @@ -0,0 +1,29 @@ + +find_package(Git REQUIRED) + +include(ExternalProject) + +message("${CMAKE_BUILD_TYPE}") + +ExternalProject_Add( + ext-yaml-cpp + GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git + GIT_TAG e0e01d53c27ffee6c86153fa41e7f5e57d3e5c90 + CMAKE_ARGS + -DYAML_CPP_BUILD_TESTS=OFF + -DYAML_CPP_BUILD_TOOLS=OFF + -DYAML_CPP_INSTALL=OFF + -DYAML_CPP_BUILD_CONTRIB=OFF + -DMSVC_SHARED_RT=OFF + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=${CMAKE_BINARY_DIR}/ext/yaml-cpp/lib + -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=${CMAKE_BINARY_DIR}/ext/yaml-cpp/lib + PREFIX "${CMAKE_BINARY_DIR}/ext/yaml-cpp" + # Disable install step + INSTALL_COMMAND "" +
LOG_DOWNLOAD ON +) diff --git a/inference/images/detection_rcnn/000000014439.jpg b/inference/images/detection_rcnn/000000014439.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0abbdab06eb5950b93908cc91adfa640e8a3ac78 Binary files /dev/null and b/inference/images/detection_rcnn/000000014439.jpg differ diff --git a/inference/images/detection_rcnn/000000087038.jpg b/inference/images/detection_rcnn/000000087038.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f77f5d5f057b6f92dc096da704ecb8dee99bdf5 Binary files /dev/null and b/inference/images/detection_rcnn/000000087038.jpg differ diff --git a/inference/images/detection_rcnn/000000570688.jpg b/inference/images/detection_rcnn/000000570688.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb304bd56c4010c08611a30dcca58ea9140cea54 Binary files /dev/null and b/inference/images/detection_rcnn/000000570688.jpg differ diff --git a/inference/predictor/detection_predictor.cpp b/inference/predictor/detection_predictor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba07e3b6c7fb2152bd7825950a3cd94769f36adc --- /dev/null +++ b/inference/predictor/detection_predictor.cpp @@ -0,0 +1,383 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "detection_predictor.h" +#include +#include +#include +#include "utils/detection_result.pb.h" + +namespace PaddleSolution { + /* lod_buffer: every item in lod_buffer is an image matrix after preprocessing + * input_buffer: same data with lod_buffer after flattening to 1-D vector and padding, needed to be empty before using this function + */ + void padding_minibatch(const std::vector> &lod_buffer, std::vector &input_buffer, + std::vector &resize_heights, std::vector &resize_widths, int channels, int coarsest_stride = 1) { + int batch_size = lod_buffer.size(); + int max_h = -1; + int max_w = -1; + for(int i = 0; i < batch_size; ++i) { + max_h = (max_h > resize_heights[i])? max_h:resize_heights[i]; + max_w = (max_w > resize_widths[i])? max_w:resize_widths[i]; + } + max_h = static_cast(ceil(static_cast(max_h) / static_cast(coarsest_stride)) * coarsest_stride); + max_w = static_cast(ceil(static_cast(max_w) / static_cast(coarsest_stride)) * coarsest_stride); + std::cout << "max_w: " << max_w << " max_h: " << max_h << std::endl; + input_buffer.insert(input_buffer.end(), batch_size * channels * max_h * max_w, 0); + // flatten tensor and padding + for(int i = 0; i < lod_buffer.size(); ++i) { + float *input_buffer_ptr = input_buffer.data() + i * channels * max_h * max_w; + const float *lod_ptr = lod_buffer[i].data(); + for(int c = 0; c < channels; ++c) { + for(int h = 0; h < resize_heights[i]; ++h) { + memcpy(input_buffer_ptr, lod_ptr, resize_widths[i] * sizeof(float)); + lod_ptr += resize_widths[i]; + input_buffer_ptr += max_w; + } + input_buffer_ptr += (max_h - resize_heights[i]) * max_w; + } + } + // change resize w, h + for(int i = 0; i < batch_size; ++i){ + resize_widths[i] = max_w; + resize_heights[i] = max_h; + } + } + + void output_detection_result(const float* out_addr, const std::vector> &lod_vector, const std::vector &imgs_batch){ + for(int i = 0; i < lod_vector[0].size() - 1; ++i) { + DetectionResult detection_result; + 
detection_result.set_filename(imgs_batch[i]); + std::cout << imgs_batch[i] << ":" << std::endl; + for (int j = lod_vector[0][i]; j < lod_vector[0][i+1]; ++j) { + DetectionBox *box_ptr = detection_result.add_detection_boxes(); + box_ptr->set_class_(static_cast(round(out_addr[0 + j * 6]))); + box_ptr->set_score(out_addr[1 + j * 6]); + box_ptr->set_left_top_x(out_addr[2 + j * 6]); + box_ptr->set_left_top_y(out_addr[3 + j * 6]); + box_ptr->set_right_bottom_x(out_addr[4 + j * 6]); + box_ptr->set_right_bottom_y(out_addr[5 + j * 6]); + printf("Class %d, score = %f, left top = [%f, %f], right bottom = [%f, %f]\n", + static_cast(round(out_addr[0 + j * 6])), out_addr[1 + j * 6], out_addr[2 + j * 6], + out_addr[3 + j * 6], out_addr[4 + j * 6], out_addr[5 + j * 6]); + } + printf("\n"); + std::ofstream output(imgs_batch[i] + ".pb", std::ios::out | std::ios::trunc | std::ios::binary); + detection_result.SerializeToOstream(&output); + output.close(); + } + } + + int DetectionPredictor::init(const std::string& conf) { + if (!_model_config.load_config(conf)) { + LOG(FATAL) << "Fail to load config file: [" << conf << "]"; + return -1; + } + _preprocessor = PaddleSolution::create_processor(conf); + if (_preprocessor == nullptr) { + LOG(FATAL) << "Failed to create_processor"; + return -1; + } + + bool use_gpu = _model_config._use_gpu; + const auto& model_dir = _model_config._model_path; + const auto& model_filename = _model_config._model_file_name; + const auto& params_filename = _model_config._param_file_name; + + // load paddle model file + if (_model_config._predictor_mode == "NATIVE") { + paddle::NativeConfig config; + auto prog_file = utils::path_join(model_dir, model_filename); + auto param_file = utils::path_join(model_dir, params_filename); + config.prog_file = prog_file; + config.param_file = param_file; + config.fraction_of_gpu_memory = 0; + config.use_gpu = use_gpu; + config.device = 0; + _main_predictor = paddle::CreatePaddlePredictor(config); + } else if 
(_model_config._predictor_mode == "ANALYSIS") { + paddle::AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); + } + auto prog_file = utils::path_join(model_dir, model_filename); + auto param_file = utils::path_join(model_dir, params_filename); + config.SetModel(prog_file, param_file); + config.SwitchUseFeedFetchOps(false); + config.SwitchSpecifyInputNames(true); + config.EnableMemoryOptim(); + _main_predictor = paddle::CreatePaddlePredictor(config); + } else { + return -1; + } + return 0; + + } + + int DetectionPredictor::predict(const std::vector& imgs) { + if (_model_config._predictor_mode == "NATIVE") { + return native_predict(imgs); + } + else if (_model_config._predictor_mode == "ANALYSIS") { + return analysis_predict(imgs); + } + return -1; + } + + int DetectionPredictor::native_predict(const std::vector& imgs) { + int config_batch_size = _model_config._batch_size; + + int channels = _model_config._channels; + int eval_width = _model_config._resize[0]; + int eval_height = _model_config._resize[1]; + std::size_t total_size = imgs.size(); + int default_batch_size = std::min(config_batch_size, (int)total_size); + int batch = total_size / default_batch_size + ((total_size % default_batch_size) != 0); + int batch_buffer_size = default_batch_size * channels * eval_width * eval_height; + + auto& input_buffer = _buffer; + auto& imgs_batch = _imgs_batch; + float sr; + // DetectionResultsContainer result_container; + for (int u = 0; u < batch; ++u) { + int batch_size = default_batch_size; + if (u == (batch - 1) && (total_size % default_batch_size)) { + batch_size = total_size % default_batch_size; + } + + int real_buffer_size = batch_size * channels * eval_width * eval_height; + std::vector feeds; + input_buffer.clear(); + imgs_batch.clear(); + for (int i = 0; i < batch_size; ++i) { + int idx = u * default_batch_size + i; + imgs_batch.push_back(imgs[idx]); + } + std::vector ori_widths; + std::vector ori_heights; + std::vector resize_widths; + 
std::vector resize_heights; + std::vector scale_ratios; + ori_widths.resize(batch_size); + ori_heights.resize(batch_size); + resize_widths.resize(batch_size); + resize_heights.resize(batch_size); + scale_ratios.resize(batch_size); + std::vector> lod_buffer(batch_size); + if (!_preprocessor->batch_process(imgs_batch, lod_buffer, ori_widths.data(), ori_heights.data(), + resize_widths.data(), resize_heights.data(), scale_ratios.data())) { + return -1; + } + // flatten and padding + padding_minibatch(lod_buffer, input_buffer, resize_heights, resize_widths, channels, _model_config._coarsest_stride); + paddle::PaddleTensor im_tensor, im_size_tensor, im_info_tensor; + + im_tensor.name = "image"; + im_tensor.shape = std::vector({ batch_size, channels, resize_heights[0], resize_widths[0] }); + im_tensor.data.Reset(input_buffer.data(), input_buffer.size() * sizeof(float)); + im_tensor.dtype = paddle::PaddleDType::FLOAT32; + + std::vector image_infos; + for(int i = 0; i < batch_size; ++i) { + image_infos.push_back(resize_heights[i]); + image_infos.push_back(resize_widths[i]); + image_infos.push_back(scale_ratios[i]); + } + im_info_tensor.name = "info"; + im_info_tensor.shape = std::vector({batch_size, 3}); + im_info_tensor.data.Reset(image_infos.data(), batch_size * 3 * sizeof(float)); + im_info_tensor.dtype = paddle::PaddleDType::FLOAT32; + + std::vector image_size; + for(int i = 0; i < batch_size; ++i) { + image_size.push_back(ori_heights[i]); + image_size.push_back(ori_widths[i]); + } + + std::vector image_size_f; + for(int i = 0; i < batch_size; ++i) { + image_size_f.push_back(ori_heights[i]); + image_size_f.push_back(ori_widths[i]); + image_size_f.push_back(1.0); + } + + int feeds_size = _model_config._feeds_size; + im_size_tensor.name = "im_size"; + if(feeds_size == 2) { + im_size_tensor.shape = std::vector({ batch_size, 2}); + im_size_tensor.data.Reset(image_size.data(), batch_size * 2 * sizeof(int)); + im_size_tensor.dtype = paddle::PaddleDType::INT32; + } + else 
if(feeds_size == 3) { + im_size_tensor.shape = std::vector({ batch_size, 3}); + im_size_tensor.data.Reset(image_size_f.data(), batch_size * 3 * sizeof(float)); + im_size_tensor.dtype = paddle::PaddleDType::FLOAT32; + } + std::cout << "Feed size = " << feeds_size << std::endl; + feeds.push_back(im_tensor); + if(_model_config._feeds_size > 2) { + feeds.push_back(im_info_tensor); + } + feeds.push_back(im_size_tensor); + _outputs.clear(); + + auto t1 = std::chrono::high_resolution_clock::now(); + if (!_main_predictor->Run(feeds, &_outputs, batch_size)) { + LOG(ERROR) << "Failed: NativePredictor->Run() return false at batch: " << u; + continue; + } + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(t2 - t1).count(); + std::cout << "runtime = " << duration << std::endl; + std::cout << "Number of outputs:" << _outputs.size() << std::endl; + int out_num = 1; + // print shape of first output tensor for debugging + std::cout << "size of outputs[" << 0 << "]: ("; + for (int j = 0; j < _outputs[0].shape.size(); ++j) { + out_num *= _outputs[0].shape[j]; + std::cout << _outputs[0].shape[j] << ","; + } + std::cout << ")" << std::endl; + + // const size_t nums = _outputs.front().data.length() / sizeof(float); + // if (out_num % batch_size != 0 || out_num != nums) { + // LOG(ERROR) << "outputs data size mismatch with shape size."; + // return -1; + // } + float* out_addr = (float *)(_outputs[0].data.data()); + output_detection_result(out_addr, _outputs[0].lod, imgs_batch); + } + return 0; + } + + int DetectionPredictor::analysis_predict(const std::vector& imgs) { + + int config_batch_size = _model_config._batch_size; + int channels = _model_config._channels; + int eval_width = _model_config._resize[0]; + int eval_height = _model_config._resize[1]; + auto total_size = imgs.size(); + int default_batch_size = std::min(config_batch_size, (int)total_size); + int batch = total_size / default_batch_size + ((total_size % default_batch_size) 
!= 0); + int batch_buffer_size = default_batch_size * channels * eval_width * eval_height; + + auto& input_buffer = _buffer; + auto& imgs_batch = _imgs_batch; + //DetectionResultsContainer result_container; + for (int u = 0; u < batch; ++u) { + int batch_size = default_batch_size; + if (u == (batch - 1) && (total_size % default_batch_size)) { + batch_size = total_size % default_batch_size; + } + + int real_buffer_size = batch_size * channels * eval_width * eval_height; + std::vector feeds; + //input_buffer.resize(real_buffer_size); + input_buffer.clear(); + imgs_batch.clear(); + for (int i = 0; i < batch_size; ++i) { + int idx = u * default_batch_size + i; + imgs_batch.push_back(imgs[idx]); + } + + std::vector ori_widths; + std::vector ori_heights; + std::vector resize_widths; + std::vector resize_heights; + std::vector scale_ratios; + ori_widths.resize(batch_size); + ori_heights.resize(batch_size); + resize_widths.resize(batch_size); + resize_heights.resize(batch_size); + scale_ratios.resize(batch_size); + + std::vector> lod_buffer(batch_size); + if (!_preprocessor->batch_process(imgs_batch, lod_buffer, ori_widths.data(), ori_heights.data(), + resize_widths.data(), resize_heights.data(), scale_ratios.data())){ + std::cout << "Failed to preprocess!" 
<< std::endl; + return -1; + } + + //flatten tensor + padding_minibatch(lod_buffer, input_buffer, resize_heights, resize_widths, channels, _model_config._coarsest_stride); + + std::vector input_names = _main_predictor->GetInputNames(); + auto im_tensor = _main_predictor->GetInputTensor(input_names.front()); + im_tensor->Reshape({ batch_size, channels, resize_heights[0], resize_widths[0] }); + im_tensor->copy_from_cpu(input_buffer.data()); + + if(input_names.size() > 2){ + std::vector image_infos; + for(int i = 0; i < batch_size; ++i) { + image_infos.push_back(resize_heights[i]); + image_infos.push_back(resize_widths[i]); + image_infos.push_back(scale_ratios[i]); + } + auto im_info_tensor = _main_predictor->GetInputTensor(input_names[1]); + im_info_tensor->Reshape({batch_size, 3}); + im_info_tensor->copy_from_cpu(image_infos.data()); + } + + std::vector image_size; + for(int i = 0; i < batch_size; ++i) { + image_size.push_back(ori_heights[i]); + image_size.push_back(ori_widths[i]); + } + std::vector image_size_f; + for(int i = 0; i < batch_size; ++i) { + image_size_f.push_back(static_cast(ori_heights[i])); + image_size_f.push_back(static_cast(ori_widths[i])); + image_size_f.push_back(1.0); + } + + auto im_size_tensor = _main_predictor->GetInputTensor(input_names.back()); + if(input_names.size() > 2) { + im_size_tensor->Reshape({batch_size, 3}); + im_size_tensor->copy_from_cpu(image_size_f.data()); + } + else{ + im_size_tensor->Reshape({batch_size, 2}); + im_size_tensor->copy_from_cpu(image_size.data()); + } + + + auto t1 = std::chrono::high_resolution_clock::now(); + _main_predictor->ZeroCopyRun(); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(t2 - t1).count(); + std::cout << "runtime = " << duration << std::endl; + + auto output_names = _main_predictor->GetOutputNames(); + auto output_t = _main_predictor->GetOutputTensor(output_names[0]); + std::vector out_data; + std::vector output_shape = output_t->shape(); + 
+ int out_num = 1; + std::cout << "size of outputs[" << 0 << "]: ("; + for (int j = 0; j < output_shape.size(); ++j) { + out_num *= output_shape[j]; + std::cout << output_shape[j] << ","; + } + std::cout << ")" << std::endl; + + out_data.resize(out_num); + output_t->copy_to_cpu(out_data.data()); + + float* out_addr = (float *)(out_data.data()); + auto lod_vector = output_t->lod(); + output_detection_result(out_addr, lod_vector, imgs_batch); + } + return 0; + } +} diff --git a/inference/predictor/detection_predictor.h b/inference/predictor/detection_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..3bc4cfdd793291d7d89342c7fbccfdd558d1f004 --- /dev/null +++ b/inference/predictor/detection_predictor.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace PaddleSolution { + class DetectionPredictor { + public: + // init a predictor with a yaml config file + int init(const std::string& conf); + // predict api + int predict(const std::vector& imgs); + + private: + int native_predict(const std::vector& imgs); + int analysis_predict(const std::vector& imgs); + private: + std::vector _buffer; + std::vector _imgs_batch; + std::vector _outputs; + + PaddleSolution::PaddleModelConfigPaser _model_config; + std::shared_ptr _preprocessor; + std::unique_ptr _main_predictor; + }; +} diff --git a/inference/preprocessor/preprocessor.cpp b/inference/preprocessor/preprocessor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dbe7bcf624b649c02297bddd593d173b57550f17 --- /dev/null +++ b/inference/preprocessor/preprocessor.cpp @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "preprocessor.h" +#include "preprocessor_detection.h" + +namespace PaddleSolution { + + std::shared_ptr create_processor(const std::string& conf_file) { + + auto config = std::make_shared(); + if (!config->load_config(conf_file)) { + LOG(FATAL) << "fail to laod conf file [" << conf_file << "]"; + return nullptr; + } + + if (config->_pre_processor == "DetectionPreProcessor") { + auto p = std::make_shared(); + if (!p->init(config)) { + return nullptr; + } + return p; + } + + + LOG(FATAL) << "unknown processor_name [" << config->_pre_processor << "]"; + + return nullptr; + } +} diff --git a/inference/preprocessor/preprocessor.h b/inference/preprocessor/preprocessor.h new file mode 100644 index 0000000000000000000000000000000000000000..a3fb2e029c8acf92010a258dd2824b85a0f2f90f --- /dev/null +++ b/inference/preprocessor/preprocessor.h @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +#include +#include +#include + +#include "utils/conf_parser.h" + +namespace PaddleSolution { + +class ImagePreProcessor { +protected: + ImagePreProcessor() {}; + +public: + virtual ~ImagePreProcessor() {} + + virtual bool single_process(const std::string& fname, float* data, int* ori_w, int* ori_h) { + return true; + } + + virtual bool batch_process(const std::vector& imgs, float* data, int* ori_w, int* ori_h) { + return true; + } + + virtual bool single_process(const std::string& fname, float* data) { + return true; + } + + virtual bool batch_process(const std::vector& imgs, float* data) { + return true; + } + + virtual bool single_process(const std::string& fname, std::vector &data, int* ori_w, int* ori_h, int* resize_w, int* resize_h, float* scale_ratio) { + return true; + } + + virtual bool batch_process(const std::vector& imgs, std::vector> &data, int* ori_w, int* ori_h, int* resize_w, int* resize_h, float* scale_ratio) { + return true; + } + +}; // end of class ImagePreProcessor + +std::shared_ptr create_processor(const std::string &config_file); + +} // end of namespace paddle_solution + diff --git a/inference/preprocessor/preprocessor_detection.cpp b/inference/preprocessor/preprocessor_detection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba8fd0e328c5a859e2d4b88adba0e56e5e3a7476 --- /dev/null +++ b/inference/preprocessor/preprocessor_detection.cpp @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +#include "preprocessor_detection.h" +#include "utils/utils.h" + +namespace PaddleSolution { + bool DetectionPreProcessor::single_process(const std::string& fname, std::vector &vec_data, int* ori_w, int* ori_h, int* resize_w, int* resize_h, float* scale_ratio) { + cv::Mat im1 = cv::imread(fname, -1); + cv::Mat im; + if(_config->_feeds_size == 3) { // faster rcnn + im1.convertTo(im, CV_32FC3, 1/255.0); + } + else if(_config->_feeds_size == 2){ //yolo v3 + im = im1; + } + if (im.data == nullptr || im.empty()) { + LOG(ERROR) << "Failed to open image: " << fname; + return false; + } + + int channels = im.channels(); + if (channels == 1) { + cv::cvtColor(im, im, cv::COLOR_GRAY2BGR); + } + channels = im.channels(); + if (channels != 3 && channels != 4) { + LOG(ERROR) << "Only support rgb(gray) and rgba image."; + return false; + } + *ori_w = im.cols; + *ori_h = im.rows; + cv::cvtColor(im, im, cv::COLOR_BGR2RGB); + //channels = im.channels(); + + //resize + int rw = im.cols; + int rh = im.rows; + float im_scale_ratio; + utils::scaling(_config->_resize_type, rw, rh, _config->_resize[0], _config->_resize[1], _config->_target_short_size, _config->_resize_max_size, im_scale_ratio); + cv::Size resize_size(rw, rh); + *resize_w = rw; + *resize_h = rh; + *scale_ratio = im_scale_ratio; + if (*ori_h != rh || *ori_w != rw) { + cv::Mat im_temp; + if(_config->_resize_type == utils::SCALE_TYPE::UNPADDING) { + cv::resize(im, im_temp, resize_size, 0, 0, cv::INTER_LINEAR); + } + else if(_config->_resize_type == utils::SCALE_TYPE::RANGE_SCALING) { + cv::resize(im, im_temp, cv::Size(), im_scale_ratio, im_scale_ratio, cv::INTER_LINEAR); + } + im = im_temp; + } + + vec_data.resize(channels * rw * rh); + float *data = vec_data.data(); + + float* pmean = _config->_mean.data(); + float* pscale = _config->_std.data(); + for (int h = 0; h < rh; ++h) { + 
const uchar* uptr = im.ptr(h); + const float* fptr = im.ptr(h); + int im_index = 0; + for (int w = 0; w < rw; ++w) { + for (int c = 0; c < channels; ++c) { + int top_index = (c * rh + h) * rw + w; + float pixel;// = static_cast(fptr[im_index]);// / 255.0; + if(_config->_feeds_size == 2){ //yolo v3 + pixel = static_cast(uptr[im_index++]) / 255.0; + } + else if(_config->_feeds_size == 3){ + pixel = fptr[im_index++]; + } + pixel = (pixel - pmean[c]) / pscale[c]; + data[top_index] = pixel; + } + } + } + return true; + } + + bool DetectionPreProcessor::batch_process(const std::vector& imgs, std::vector> &data, int* ori_w, int* ori_h, int* resize_w, int* resize_h, float* scale_ratio) { + auto ic = _config->_channels; + auto iw = _config->_resize[0]; + auto ih = _config->_resize[1]; + std::vector threads; + for (int i = 0; i < imgs.size(); ++i) { + std::string path = imgs[i]; + int* width = &ori_w[i]; + int* height = &ori_h[i]; + int* resize_width = &resize_w[i]; + int* resize_height = &resize_h[i]; + float* sr = &scale_ratio[i]; + threads.emplace_back([this, &data, i, path, width, height, resize_width, resize_height, sr] { + std::vector buffer; + single_process(path, buffer, width, height, resize_width, resize_height, sr); + data[i] = buffer; + }); + } + for (auto& t : threads) { + if (t.joinable()) { + t.join(); + } + } + return true; + } + + bool DetectionPreProcessor::init(std::shared_ptr config) { + _config = config; + return true; + } + +} diff --git a/inference/preprocessor/preprocessor_detection.h b/inference/preprocessor/preprocessor_detection.h new file mode 100644 index 0000000000000000000000000000000000000000..731329040423756151a2590d3ed0f46b2800191d --- /dev/null +++ b/inference/preprocessor/preprocessor_detection.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "preprocessor.h" + +namespace PaddleSolution { + + class DetectionPreProcessor : public ImagePreProcessor { + + public: + DetectionPreProcessor() : _config(nullptr) { + }; + + bool init(std::shared_ptr config); + + bool single_process(const std::string& fname, std::vector &data, int* ori_w, int* ori_h, int* resize_w, int* resize_h, float* scale_ratio); + + bool batch_process(const std::vector& imgs, std::vector> &data, int* ori_w, int* ori_h, int* resize_w, int* resize_h, float* scale_ratio); + private: + std::shared_ptr _config; + }; + +} diff --git a/inference/tools/coco17.json b/inference/tools/coco17.json new file mode 100644 index 0000000000000000000000000000000000000000..d3bbbaad038534baacf6f86f78db5d32bce16238 --- /dev/null +++ b/inference/tools/coco17.json @@ -0,0 +1,83 @@ +{ + "0" : "background", + "1" : "person", + "2" : "bicycle", + "3" : "car", + "4" : "motorcycle", + "5" : "airplane", + "6" : "bus", + "7" : "train", + "8" : "truck", + "9" : "boat", + "10" : "traffic light", + "11" : "fire hydrant", + "12" : "stop sign", + "13" : "parking meter", + "14" : "bench", + "15" : "bird", + "16" : "cat", + "17" : "dog", + "18" : "horse", + "19" : "sheep", + "20" : "cow", + "21" : "elephant", + "22" : "bear", + "23" : "zebra", + "24" : "giraffe", + "25" : "backpack", + "26" : "umbrella", + "27" : "handbag", + "28" : "tie", + "29" : "suitcase", + "30" : "frisbee", + "31" : "skis", + "32" : "snowboard", + "33" : "sports ball", + "34" : "kite", + "35" : "baseball bat", + "36" : "baseball glove", + "37" : "skateboard", + 
"38" : "surfboard", + "39" : "tennis racket", + "40" : "bottle", + "41" : "wine glass", + "42" : "cup", + "43" : "fork", + "44" : "knife", + "45" : "spoon", + "46" : "bowl", + "47" : "banana", + "48" : "apple", + "49" : "sandwich", + "50" : "orange", + "51" : "broccoli", + "52" : "carrot", + "53" : "hot dog", + "54" : "pizza", + "55" : "donut", + "56" : "cake", + "57" : "chair", + "58" : "couch", + "59" : "potted plant", + "60" : "bed", + "61" : "dining table", + "62" : "toilet", + "63" : "tv", + "64" : "laptop", + "65" : "mouse", + "66" : "remote", + "67" : "keyboard", + "68" : "cell phone", + "69" : "microwave", + "70" : "oven", + "71" : "toaster", + "72" : "sink", + "73" : "refrigerator", + "74" : "book", + "75" : "clock", + "76" : "vase", + "77" : "scissors", + "78" : "teddy bear", + "79" : "hair drier", + "80" : "toothbrush" +} diff --git a/inference/tools/detection_result_pb2.py b/inference/tools/detection_result_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..3dc66f368b6fea72f70d6a5685b19f23e8021d51 --- /dev/null +++ b/inference/tools/detection_result_pb2.py @@ -0,0 +1,151 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: detection_result.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='detection_result.proto', + package='PaddleSolution', + syntax='proto2', + serialized_pb=_b('\n\x16\x64\x65tection_result.proto\x12\x0ePaddleSolution\"\x84\x01\n\x0c\x44\x65tectionBox\x12\r\n\x05\x63lass\x18\x01 \x01(\x05\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x12\n\nleft_top_x\x18\x03 \x01(\x02\x12\x12\n\nleft_top_y\x18\x04 \x01(\x02\x12\x16\n\x0eright_bottom_x\x18\x05 \x01(\x02\x12\x16\n\x0eright_bottom_y\x18\x06 \x01(\x02\"Z\n\x0f\x44\x65tectionResult\x12\x10\n\x08\x66ilename\x18\x01 \x01(\t\x12\x35\n\x0f\x64\x65tection_boxes\x18\x02 \x03(\x0b\x32\x1c.PaddleSolution.DetectionBox') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_DETECTIONBOX = _descriptor.Descriptor( + name='DetectionBox', + full_name='PaddleSolution.DetectionBox', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='class', full_name='PaddleSolution.DetectionBox.class', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='score', full_name='PaddleSolution.DetectionBox.score', index=1, + number=2, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + 
_descriptor.FieldDescriptor( + name='left_top_x', full_name='PaddleSolution.DetectionBox.left_top_x', index=2, + number=3, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='left_top_y', full_name='PaddleSolution.DetectionBox.left_top_y', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='right_bottom_x', full_name='PaddleSolution.DetectionBox.right_bottom_x', index=4, + number=5, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='right_bottom_y', full_name='PaddleSolution.DetectionBox.right_bottom_y', index=5, + number=6, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=43, + serialized_end=175, +) + + +_DETECTIONRESULT = _descriptor.Descriptor( + name='DetectionResult', + full_name='PaddleSolution.DetectionResult', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='filename', full_name='PaddleSolution.DetectionResult.filename', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='detection_boxes', full_name='PaddleSolution.DetectionResult.detection_boxes', index=1, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=177, + serialized_end=267, +) + +_DETECTIONRESULT.fields_by_name['detection_boxes'].message_type = _DETECTIONBOX +DESCRIPTOR.message_types_by_name['DetectionBox'] = _DETECTIONBOX +DESCRIPTOR.message_types_by_name['DetectionResult'] = _DETECTIONRESULT + +DetectionBox = _reflection.GeneratedProtocolMessageType('DetectionBox', (_message.Message,), dict( + DESCRIPTOR = _DETECTIONBOX, + __module__ = 'detection_result_pb2' + # @@protoc_insertion_point(class_scope:PaddleSolution.DetectionBox) + )) +_sym_db.RegisterMessage(DetectionBox) + +DetectionResult = _reflection.GeneratedProtocolMessageType('DetectionResult', (_message.Message,), dict( + DESCRIPTOR = _DETECTIONRESULT, + __module__ = 'detection_result_pb2' + # @@protoc_insertion_point(class_scope:PaddleSolution.DetectionResult) + )) +_sym_db.RegisterMessage(DetectionResult) + + +# @@protoc_insertion_point(module_scope) diff --git a/inference/tools/vis.py b/inference/tools/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca13bfbaf48669a78bf94344d378c37fe071f1a --- /dev/null +++ b/inference/tools/vis.py @@ -0,0 +1,104 @@ +# coding: utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import detection_result_pb2 +import cv2 +import sys +import gflags +import numpy as np +import json +from PIL import Image, ImageDraw, ImageFont + +Flags = gflags.FLAGS +gflags.DEFINE_string('img_path', 'abc', 'image path') +gflags.DEFINE_string('img_result_path', 'def', 'image result path') +gflags.DEFINE_float('threshold', 0.0, 'threshold of score') +gflags.DEFINE_string('c2l_path', 'ghk', 'class to label path') + +def colormap(rgb=False): + """ + Get colormap + """ + color_list = np.array([ + 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, + 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, + 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, + 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, + 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, + 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, + 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, + 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, + 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, + 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, + 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, + 
1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, + 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, + 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, + 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, + 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 + ]).astype(np.float32) + color_list = color_list.reshape((-1, 3)) * 255 + if not rgb: + color_list = color_list[:, ::-1] + return color_list + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: python vis.py --img_path=/path/to/image --img_result_path=/path/to/image_result.pb --threshold=0.1 --c2l_path=/path/to/class2label.json") + else: + Flags(sys.argv) + color_list = colormap(rgb=True) + text_thickness = 1 + text_scale = 0.3 + with open(Flags.img_result_path, "rb") as f: + detection_result = detection_result_pb2.DetectionResult() + detection_result.ParseFromString(f.read()) + img = cv2.imread(Flags.img_path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + class2LabelMap = dict() + with open(Flags.c2l_path, "r", encoding="utf-8") as json_f: + class2LabelMap = json.load(json_f) + for box in detection_result.detection_boxes: + if box.score >= Flags.threshold: + box_class = getattr(box, 'class') + text_class_score_str = "%s %.2f" % (class2LabelMap.get(str(box_class)), box.score) + text_point = (int(box.left_top_x), int(box.left_top_y)) + + ptLeftTop = (int(box.left_top_x), int(box.left_top_y)) + ptRightBottom = (int(box.right_bottom_x), int(box.right_bottom_y)) + box_thickness = 1 + color = tuple([int(c) for c in color_list[box_class]]) + cv2.rectangle(img, ptLeftTop, ptRightBottom, color, box_thickness, 8) + if text_point[1] < 0: + 
text_point = (int(box.left_top_x), int(box.right_bottom_y)) + WHITE = (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + text_size = cv2.getTextSize(text_class_score_str, font, text_scale, text_thickness) + + text_box_left_top = (text_point[0], text_point[1] - text_size[0][1]) + text_box_right_bottom = (text_point[0] + text_size[0][0], text_point[1]) + + cv2.rectangle(img, text_box_left_top, text_box_right_bottom, color, -1, 8) + cv2.putText(img, text_class_score_str, text_point, font, text_scale, WHITE, text_thickness) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + cv2.imwrite(Flags.img_path + ".png", img) diff --git a/inference/utils/conf_parser.h b/inference/utils/conf_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..21944d032b2c24cdb584dc076a696560d4665ea1 --- /dev/null +++ b/inference/utils/conf_parser.h @@ -0,0 +1,237 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include + +#include +namespace PaddleSolution { + + class PaddleModelConfigPaser { + std::map _scaling_map; + public: + PaddleModelConfigPaser() + :_class_num(0), + _channels(0), + _use_gpu(0), + _batch_size(1), + _target_short_size(0), + _model_file_name("__model__"), + _param_file_name("__params__"), + _scaling_map{{"UNPADDING", 0}, + {"RANGE_SCALING",1}}, + _feeds_size(1), + _coarsest_stride(1) + { + } + ~PaddleModelConfigPaser() { + } + + void reset() { + _crop_size.clear(); + _resize.clear(); + _mean.clear(); + _std.clear(); + _img_type.clear(); + _class_num = 0; + _channels = 0; + _use_gpu = 0; + _target_short_size = 0; + _batch_size = 1; + _model_file_name = "__model__"; + _model_path = "./"; + _param_file_name="__params__"; + _resize_type = 0; + _resize_max_size = 0; + _feeds_size = 1; + _coarsest_stride = 1; + } + + std::string process_parenthesis(const std::string& str) { + if (str.size() < 2) { + return str; + } + std::string nstr(str); + if (str[0] == '(' && str.back() == ')') { + nstr[0] = '['; + nstr[str.size() - 1] = ']'; + } + return nstr; + } + + template + std::vector parse_str_to_vec(const std::string& str) { + std::vector data; + auto node = YAML::Load(str); + for (const auto& item : node) { + data.push_back(item.as()); + } + return data; + } + + bool load_config(const std::string& conf_file) { + + reset(); + + YAML::Node config = YAML::LoadFile(conf_file); + // 1. get resize + auto str = config["DEPLOY"]["EVAL_CROP_SIZE"].as(); + _resize = parse_str_to_vec(process_parenthesis(str)); + + // 0. get crop_size + if(config["DEPLOY"]["CROP_SIZE"].IsDefined()) { + auto crop_str = config["DEPLOY"]["CROP_SIZE"].as(); + _crop_size = parse_str_to_vec(process_parenthesis(crop_str)); + } + else { + _crop_size = _resize; + } + + // 2. get mean + for (const auto& item : config["DEPLOY"]["MEAN"]) { + _mean.push_back(item.as()); + } + + // 3. 
get std + for (const auto& item : config["DEPLOY"]["STD"]) { + _std.push_back(item.as()); + } + + // 4. get image type + _img_type = config["DEPLOY"]["IMAGE_TYPE"].as(); + // 5. get class number + _class_num = config["DEPLOY"]["NUM_CLASSES"].as(); + // 7. set model path + _model_path = config["DEPLOY"]["MODEL_PATH"].as(); + // 8. get model file_name + _model_file_name = config["DEPLOY"]["MODEL_FILENAME"].as(); + // 9. get model param file name + _param_file_name = config["DEPLOY"]["PARAMS_FILENAME"].as(); + // 10. get pre_processor + _pre_processor = config["DEPLOY"]["PRE_PROCESSOR"].as(); + // 11. use_gpu + _use_gpu = config["DEPLOY"]["USE_GPU"].as(); + // 12. predictor_mode + _predictor_mode = config["DEPLOY"]["PREDICTOR_MODE"].as(); + // 13. batch_size + _batch_size = config["DEPLOY"]["BATCH_SIZE"].as(); + // 14. channels + _channels = config["DEPLOY"]["CHANNELS"].as(); + // 15. target_short_size + if(config["DEPLOY"]["TARGET_SHORT_SIZE"].IsDefined()) { + _target_short_size = config["DEPLOY"]["TARGET_SHORT_SIZE"].as(); + } + // 16.resize_type + if(config["DEPLOY"]["RESIZE_TYPE"].IsDefined() && + _scaling_map.find(config["DEPLOY"]["RESIZE_TYPE"].as()) != _scaling_map.end()) { + _resize_type = _scaling_map[config["DEPLOY"]["RESIZE_TYPE"].as()]; + } + else{ + _resize_type = 0; + } + // 17.resize_max_size + if(config["DEPLOY"]["RESIZE_MAX_SIZE"].IsDefined()) { + _resize_max_size = config["DEPLOY"]["RESIZE_MAX_SIZE"].as(); + } + // 18.feeds_size + if(config["DEPLOY"]["FEEDS_SIZE"].IsDefined()){ + _feeds_size = config["DEPLOY"]["FEEDS_SIZE"].as(); + } + // 19. 
coarsest_stride + if(config["DEPLOY"]["COARSEST_STRIDE"].IsDefined()) { + _coarsest_stride = config["DEPLOY"]["COARSEST_STRIDE"].as(); + } + return true; + } + + void debug() const { + + std::cout << "SCALE_RESIZE: (" << _resize[0] << ", " << _resize[1] << ")" << std::endl; + + std::cout << "MEAN: ["; + for (int i = 0; i < _mean.size(); ++i) { + if (i != _mean.size() - 1) { + std::cout << _mean[i] << ", "; + } else { + std::cout << _mean[i]; + } + } + std::cout << "]" << std::endl; + + std::cout << "STD: ["; + for (int i = 0; i < _std.size(); ++i) { + if (i != _std.size() - 1) { + std::cout << _std[i] << ", "; + } + else { + std::cout << _std[i]; + } + } + std::cout << "]" << std::endl; + std::cout << "DEPLOY.TARGET_SHORT_SIZE: " << _target_short_size << std::endl; + std::cout << "DEPLOY.IMAGE_TYPE: " << _img_type << std::endl; + std::cout << "DEPLOY.NUM_CLASSES: " << _class_num << std::endl; + std::cout << "DEPLOY.CHANNELS: " << _channels << std::endl; + std::cout << "DEPLOY.MODEL_PATH: " << _model_path << std::endl; + std::cout << "DEPLOY.MODEL_FILENAME: " << _model_file_name << std::endl; + std::cout << "DEPLOY.PARAMS_FILENAME: " << _param_file_name << std::endl; + std::cout << "DEPLOY.PRE_PROCESSOR: " << _pre_processor << std::endl; + std::cout << "DEPLOY.USE_GPU: " << _use_gpu << std::endl; + std::cout << "DEPLOY.PREDICTOR_MODE: " << _predictor_mode << std::endl; + std::cout << "DEPLOY.BATCH_SIZE: " << _batch_size << std::endl; + } + //DEPLOY.COARSEST_STRIDE + int _coarsest_stride; + // DEPLOY.FEEDS_SIZE + int _feeds_size; + // DEPLOY.RESIZE_TYPE 0:unpadding 1:rangescaling Default:0 + int _resize_type; + // DEPLOY.RESIZE_MAX_SIZE + int _resize_max_size; + // DEPLOY.CROP_SIZE + std::vector _crop_size; + // DEPLOY.SCALE_RESIZE + std::vector _resize; + // DEPLOY.MEAN + std::vector _mean; + // DEPLOY.STD + std::vector _std; + // DEPLOY.IMAGE_TYPE + std::string _img_type; + // DEPLOY.TARGET_SHORT_SIZE + int _target_short_size; + // DEPLOY.NUM_CLASSES + int 
_class_num; + // DEPLOY.CHANNELS + int _channels; + // DEPLOY.MODEL_PATH + std::string _model_path; + // DEPLOY.MODEL_FILENAME + std::string _model_file_name; + // DEPLOY.PARAMS_FILENAME + std::string _param_file_name; + // DEPLOY.PRE_PROCESSOR + std::string _pre_processor; + // DEPLOY.USE_GPU + int _use_gpu; + // DEPLOY.PREDICTOR_MODE + std::string _predictor_mode; + // DEPLOY.BATCH_SIZE + int _batch_size; + }; + +} diff --git a/inference/utils/detection_result.pb.cc b/inference/utils/detection_result.pb.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5cce7317914cf93f99d0d4efa3aee763972cc4e --- /dev/null +++ b/inference/utils/detection_result.pb.cc @@ -0,0 +1,1159 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: detection_result.proto + +#define INTERNAL_SUPPRESS_PROTOBUF_FIELD_DEPRECATION +#include "detection_result.pb.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +// @@protoc_insertion_point(includes) + +namespace PaddleSolution { + +namespace { + +const ::google::protobuf::Descriptor* DetectionBox_descriptor_ = NULL; +const ::google::protobuf::internal::GeneratedMessageReflection* + DetectionBox_reflection_ = NULL; +const ::google::protobuf::Descriptor* DetectionResult_descriptor_ = NULL; +const ::google::protobuf::internal::GeneratedMessageReflection* + DetectionResult_reflection_ = NULL; + +} // namespace + + +void protobuf_AssignDesc_detection_5fresult_2eproto() GOOGLE_ATTRIBUTE_COLD; +void protobuf_AssignDesc_detection_5fresult_2eproto() { + protobuf_AddDesc_detection_5fresult_2eproto(); + const ::google::protobuf::FileDescriptor* file = + ::google::protobuf::DescriptorPool::generated_pool()->FindFileByName( + "detection_result.proto"); + GOOGLE_CHECK(file != NULL); + DetectionBox_descriptor_ = file->message_type(0); + static const int DetectionBox_offsets_[6] = { + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, class__), + 
GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, score_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, left_top_x_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, left_top_y_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, right_bottom_x_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, right_bottom_y_), + }; + DetectionBox_reflection_ = + ::google::protobuf::internal::GeneratedMessageReflection::NewGeneratedMessageReflection( + DetectionBox_descriptor_, + DetectionBox::internal_default_instance(), + DetectionBox_offsets_, + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, _has_bits_), + -1, + -1, + sizeof(DetectionBox), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionBox, _internal_metadata_)); + DetectionResult_descriptor_ = file->message_type(1); + static const int DetectionResult_offsets_[2] = { + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionResult, filename_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionResult, detection_boxes_), + }; + DetectionResult_reflection_ = + ::google::protobuf::internal::GeneratedMessageReflection::NewGeneratedMessageReflection( + DetectionResult_descriptor_, + DetectionResult::internal_default_instance(), + DetectionResult_offsets_, + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionResult, _has_bits_), + -1, + -1, + sizeof(DetectionResult), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(DetectionResult, _internal_metadata_)); +} + +namespace { + +GOOGLE_PROTOBUF_DECLARE_ONCE(protobuf_AssignDescriptors_once_); +void protobuf_AssignDescriptorsOnce() { + ::google::protobuf::GoogleOnceInit(&protobuf_AssignDescriptors_once_, + &protobuf_AssignDesc_detection_5fresult_2eproto); +} + +void protobuf_RegisterTypes(const ::std::string&) GOOGLE_ATTRIBUTE_COLD; +void protobuf_RegisterTypes(const ::std::string&) { + protobuf_AssignDescriptorsOnce(); + 
::google::protobuf::MessageFactory::InternalRegisterGeneratedMessage( + DetectionBox_descriptor_, DetectionBox::internal_default_instance()); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedMessage( + DetectionResult_descriptor_, DetectionResult::internal_default_instance()); +} + +} // namespace + +void protobuf_ShutdownFile_detection_5fresult_2eproto() { + DetectionBox_default_instance_.Shutdown(); + delete DetectionBox_reflection_; + DetectionResult_default_instance_.Shutdown(); + delete DetectionResult_reflection_; +} + +void protobuf_InitDefaults_detection_5fresult_2eproto_impl() { + GOOGLE_PROTOBUF_VERIFY_VERSION; + + DetectionBox_default_instance_.DefaultConstruct(); + ::google::protobuf::internal::GetEmptyString(); + DetectionResult_default_instance_.DefaultConstruct(); + DetectionBox_default_instance_.get_mutable()->InitAsDefaultInstance(); + DetectionResult_default_instance_.get_mutable()->InitAsDefaultInstance(); +} + +GOOGLE_PROTOBUF_DECLARE_ONCE(protobuf_InitDefaults_detection_5fresult_2eproto_once_); +void protobuf_InitDefaults_detection_5fresult_2eproto() { + ::google::protobuf::GoogleOnceInit(&protobuf_InitDefaults_detection_5fresult_2eproto_once_, + &protobuf_InitDefaults_detection_5fresult_2eproto_impl); +} +void protobuf_AddDesc_detection_5fresult_2eproto_impl() { + GOOGLE_PROTOBUF_VERIFY_VERSION; + + protobuf_InitDefaults_detection_5fresult_2eproto(); + ::google::protobuf::DescriptorPool::InternalAddGeneratedFile( + "\n\026detection_result.proto\022\016PaddleSolution" + "\"\204\001\n\014DetectionBox\022\r\n\005class\030\001 \001(\005\022\r\n\005scor" + "e\030\002 \001(\002\022\022\n\nleft_top_x\030\003 \001(\002\022\022\n\nleft_top_" + "y\030\004 \001(\002\022\026\n\016right_bottom_x\030\005 \001(\002\022\026\n\016right" + "_bottom_y\030\006 \001(\002\"Z\n\017DetectionResult\022\020\n\010fi" + "lename\030\001 \001(\t\0225\n\017detection_boxes\030\002 \003(\0132\034." 
+ "PaddleSolution.DetectionBox", 267); + ::google::protobuf::MessageFactory::InternalRegisterGeneratedFile( + "detection_result.proto", &protobuf_RegisterTypes); + ::google::protobuf::internal::OnShutdown(&protobuf_ShutdownFile_detection_5fresult_2eproto); +} + +GOOGLE_PROTOBUF_DECLARE_ONCE(protobuf_AddDesc_detection_5fresult_2eproto_once_); +void protobuf_AddDesc_detection_5fresult_2eproto() { + ::google::protobuf::GoogleOnceInit(&protobuf_AddDesc_detection_5fresult_2eproto_once_, + &protobuf_AddDesc_detection_5fresult_2eproto_impl); +} +// Force AddDescriptors() to be called at static initialization time. +struct StaticDescriptorInitializer_detection_5fresult_2eproto { + StaticDescriptorInitializer_detection_5fresult_2eproto() { + protobuf_AddDesc_detection_5fresult_2eproto(); + } +} static_descriptor_initializer_detection_5fresult_2eproto_; + +namespace { + +static void MergeFromFail(int line) GOOGLE_ATTRIBUTE_COLD GOOGLE_ATTRIBUTE_NORETURN; +static void MergeFromFail(int line) { + ::google::protobuf::internal::MergeFromFail(__FILE__, line); +} + +} // namespace + + +// =================================================================== + +#if !defined(_MSC_VER) || _MSC_VER >= 1900 +const int DetectionBox::kClassFieldNumber; +const int DetectionBox::kScoreFieldNumber; +const int DetectionBox::kLeftTopXFieldNumber; +const int DetectionBox::kLeftTopYFieldNumber; +const int DetectionBox::kRightBottomXFieldNumber; +const int DetectionBox::kRightBottomYFieldNumber; +#endif // !defined(_MSC_VER) || _MSC_VER >= 1900 + +DetectionBox::DetectionBox() + : ::google::protobuf::Message(), _internal_metadata_(NULL) { + if (this != internal_default_instance()) protobuf_InitDefaults_detection_5fresult_2eproto(); + SharedCtor(); + // @@protoc_insertion_point(constructor:PaddleSolution.DetectionBox) +} + +void DetectionBox::InitAsDefaultInstance() { +} + +DetectionBox::DetectionBox(const DetectionBox& from) + : ::google::protobuf::Message(), + _internal_metadata_(NULL) { + 
SharedCtor(); + UnsafeMergeFrom(from); + // @@protoc_insertion_point(copy_constructor:PaddleSolution.DetectionBox) +} + +void DetectionBox::SharedCtor() { + _cached_size_ = 0; + ::memset(&class__, 0, reinterpret_cast(&right_bottom_y_) - + reinterpret_cast(&class__) + sizeof(right_bottom_y_)); +} + +DetectionBox::~DetectionBox() { + // @@protoc_insertion_point(destructor:PaddleSolution.DetectionBox) + SharedDtor(); +} + +void DetectionBox::SharedDtor() { +} + +void DetectionBox::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const ::google::protobuf::Descriptor* DetectionBox::descriptor() { + protobuf_AssignDescriptorsOnce(); + return DetectionBox_descriptor_; +} + +const DetectionBox& DetectionBox::default_instance() { + protobuf_InitDefaults_detection_5fresult_2eproto(); + return *internal_default_instance(); +} + +::google::protobuf::internal::ExplicitlyConstructed DetectionBox_default_instance_; + +DetectionBox* DetectionBox::New(::google::protobuf::Arena* arena) const { + DetectionBox* n = new DetectionBox; + if (arena != NULL) { + arena->Own(n); + } + return n; +} + +void DetectionBox::Clear() { +// @@protoc_insertion_point(message_clear_start:PaddleSolution.DetectionBox) +#if defined(__clang__) +#define ZR_HELPER_(f) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Winvalid-offsetof\"") \ + __builtin_offsetof(DetectionBox, f) \ + _Pragma("clang diagnostic pop") +#else +#define ZR_HELPER_(f) reinterpret_cast(\ + &reinterpret_cast(16)->f) +#endif + +#define ZR_(first, last) do {\ + ::memset(&(first), 0,\ + ZR_HELPER_(last) - ZR_HELPER_(first) + sizeof(last));\ +} while (0) + + ZR_(class__, right_bottom_y_); + +#undef ZR_HELPER_ +#undef ZR_ + + _has_bits_.Clear(); + if (_internal_metadata_.have_unknown_fields()) { + mutable_unknown_fields()->Clear(); + } +} + +bool DetectionBox::MergePartialFromCodedStream( + 
::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure + ::google::protobuf::uint32 tag; + // @@protoc_insertion_point(parse_start:PaddleSolution.DetectionBox) + for (;;) { + ::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoff(127); + tag = p.first; + if (!p.second) goto handle_unusual; + switch (::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // optional int32 class = 1; + case 1: { + if (tag == 8) { + set_has_class_(); + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + ::google::protobuf::int32, ::google::protobuf::internal::WireFormatLite::TYPE_INT32>( + input, &class__))); + } else { + goto handle_unusual; + } + if (input->ExpectTag(21)) goto parse_score; + break; + } + + // optional float score = 2; + case 2: { + if (tag == 21) { + parse_score: + set_has_score(); + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + float, ::google::protobuf::internal::WireFormatLite::TYPE_FLOAT>( + input, &score_))); + } else { + goto handle_unusual; + } + if (input->ExpectTag(29)) goto parse_left_top_x; + break; + } + + // optional float left_top_x = 3; + case 3: { + if (tag == 29) { + parse_left_top_x: + set_has_left_top_x(); + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + float, ::google::protobuf::internal::WireFormatLite::TYPE_FLOAT>( + input, &left_top_x_))); + } else { + goto handle_unusual; + } + if (input->ExpectTag(37)) goto parse_left_top_y; + break; + } + + // optional float left_top_y = 4; + case 4: { + if (tag == 37) { + parse_left_top_y: + set_has_left_top_y(); + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + float, ::google::protobuf::internal::WireFormatLite::TYPE_FLOAT>( + input, &left_top_y_))); + } else { + goto handle_unusual; + } + if (input->ExpectTag(45)) goto parse_right_bottom_x; + break; + } + + // optional float right_bottom_x = 5; + case 5: { + if 
(tag == 45) { + parse_right_bottom_x: + set_has_right_bottom_x(); + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + float, ::google::protobuf::internal::WireFormatLite::TYPE_FLOAT>( + input, &right_bottom_x_))); + } else { + goto handle_unusual; + } + if (input->ExpectTag(53)) goto parse_right_bottom_y; + break; + } + + // optional float right_bottom_y = 6; + case 6: { + if (tag == 53) { + parse_right_bottom_y: + set_has_right_bottom_y(); + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + float, ::google::protobuf::internal::WireFormatLite::TYPE_FLOAT>( + input, &right_bottom_y_))); + } else { + goto handle_unusual; + } + if (input->ExpectAtEnd()) goto success; + break; + } + + default: { + handle_unusual: + if (tag == 0 || + ::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + goto success; + } + DO_(::google::protobuf::internal::WireFormat::SkipField( + input, tag, mutable_unknown_fields())); + break; + } + } + } +success: + // @@protoc_insertion_point(parse_success:PaddleSolution.DetectionBox) + return true; +failure: + // @@protoc_insertion_point(parse_failure:PaddleSolution.DetectionBox) + return false; +#undef DO_ +} + +void DetectionBox::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // @@protoc_insertion_point(serialize_start:PaddleSolution.DetectionBox) + // optional int32 class = 1; + if (has_class_()) { + ::google::protobuf::internal::WireFormatLite::WriteInt32(1, this->class_(), output); + } + + // optional float score = 2; + if (has_score()) { + ::google::protobuf::internal::WireFormatLite::WriteFloat(2, this->score(), output); + } + + // optional float left_top_x = 3; + if (has_left_top_x()) { + ::google::protobuf::internal::WireFormatLite::WriteFloat(3, this->left_top_x(), output); + } + + // optional float left_top_y = 4; + if (has_left_top_y()) { + 
::google::protobuf::internal::WireFormatLite::WriteFloat(4, this->left_top_y(), output); + } + + // optional float right_bottom_x = 5; + if (has_right_bottom_x()) { + ::google::protobuf::internal::WireFormatLite::WriteFloat(5, this->right_bottom_x(), output); + } + + // optional float right_bottom_y = 6; + if (has_right_bottom_y()) { + ::google::protobuf::internal::WireFormatLite::WriteFloat(6, this->right_bottom_y(), output); + } + + if (_internal_metadata_.have_unknown_fields()) { + ::google::protobuf::internal::WireFormat::SerializeUnknownFields( + unknown_fields(), output); + } + // @@protoc_insertion_point(serialize_end:PaddleSolution.DetectionBox) +} + +::google::protobuf::uint8* DetectionBox::InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* target) const { + (void)deterministic; // Unused + // @@protoc_insertion_point(serialize_to_array_start:PaddleSolution.DetectionBox) + // optional int32 class = 1; + if (has_class_()) { + target = ::google::protobuf::internal::WireFormatLite::WriteInt32ToArray(1, this->class_(), target); + } + + // optional float score = 2; + if (has_score()) { + target = ::google::protobuf::internal::WireFormatLite::WriteFloatToArray(2, this->score(), target); + } + + // optional float left_top_x = 3; + if (has_left_top_x()) { + target = ::google::protobuf::internal::WireFormatLite::WriteFloatToArray(3, this->left_top_x(), target); + } + + // optional float left_top_y = 4; + if (has_left_top_y()) { + target = ::google::protobuf::internal::WireFormatLite::WriteFloatToArray(4, this->left_top_y(), target); + } + + // optional float right_bottom_x = 5; + if (has_right_bottom_x()) { + target = ::google::protobuf::internal::WireFormatLite::WriteFloatToArray(5, this->right_bottom_x(), target); + } + + // optional float right_bottom_y = 6; + if (has_right_bottom_y()) { + target = ::google::protobuf::internal::WireFormatLite::WriteFloatToArray(6, this->right_bottom_y(), target); + } + + if 
(_internal_metadata_.have_unknown_fields()) { + target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( + unknown_fields(), target); + } + // @@protoc_insertion_point(serialize_to_array_end:PaddleSolution.DetectionBox) + return target; +} + +size_t DetectionBox::ByteSizeLong() const { +// @@protoc_insertion_point(message_byte_size_start:PaddleSolution.DetectionBox) + size_t total_size = 0; + + if (_has_bits_[0 / 32] & 63u) { + // optional int32 class = 1; + if (has_class_()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::Int32Size( + this->class_()); + } + + // optional float score = 2; + if (has_score()) { + total_size += 1 + 4; + } + + // optional float left_top_x = 3; + if (has_left_top_x()) { + total_size += 1 + 4; + } + + // optional float left_top_y = 4; + if (has_left_top_y()) { + total_size += 1 + 4; + } + + // optional float right_bottom_x = 5; + if (has_right_bottom_x()) { + total_size += 1 + 4; + } + + // optional float right_bottom_y = 6; + if (has_right_bottom_y()) { + total_size += 1 + 4; + } + + } + if (_internal_metadata_.have_unknown_fields()) { + total_size += + ::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize( + unknown_fields()); + } + int cached_size = ::google::protobuf::internal::ToCachedSize(total_size); + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = cached_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +void DetectionBox::MergeFrom(const ::google::protobuf::Message& from) { +// @@protoc_insertion_point(generalized_merge_from_start:PaddleSolution.DetectionBox) + if (GOOGLE_PREDICT_FALSE(&from == this)) MergeFromFail(__LINE__); + const DetectionBox* source = + ::google::protobuf::internal::DynamicCastToGenerated( + &from); + if (source == NULL) { + // @@protoc_insertion_point(generalized_merge_from_cast_fail:PaddleSolution.DetectionBox) + ::google::protobuf::internal::ReflectionOps::Merge(from, this); + } else { + // 
@@protoc_insertion_point(generalized_merge_from_cast_success:PaddleSolution.DetectionBox) + UnsafeMergeFrom(*source); + } +} + +void DetectionBox::MergeFrom(const DetectionBox& from) { +// @@protoc_insertion_point(class_specific_merge_from_start:PaddleSolution.DetectionBox) + if (GOOGLE_PREDICT_TRUE(&from != this)) { + UnsafeMergeFrom(from); + } else { + MergeFromFail(__LINE__); + } +} + +void DetectionBox::UnsafeMergeFrom(const DetectionBox& from) { + GOOGLE_DCHECK(&from != this); + if (from._has_bits_[0 / 32] & (0xffu << (0 % 32))) { + if (from.has_class_()) { + set_class_(from.class_()); + } + if (from.has_score()) { + set_score(from.score()); + } + if (from.has_left_top_x()) { + set_left_top_x(from.left_top_x()); + } + if (from.has_left_top_y()) { + set_left_top_y(from.left_top_y()); + } + if (from.has_right_bottom_x()) { + set_right_bottom_x(from.right_bottom_x()); + } + if (from.has_right_bottom_y()) { + set_right_bottom_y(from.right_bottom_y()); + } + } + if (from._internal_metadata_.have_unknown_fields()) { + ::google::protobuf::UnknownFieldSet::MergeToInternalMetdata( + from.unknown_fields(), &_internal_metadata_); + } +} + +void DetectionBox::CopyFrom(const ::google::protobuf::Message& from) { +// @@protoc_insertion_point(generalized_copy_from_start:PaddleSolution.DetectionBox) + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +void DetectionBox::CopyFrom(const DetectionBox& from) { +// @@protoc_insertion_point(class_specific_copy_from_start:PaddleSolution.DetectionBox) + if (&from == this) return; + Clear(); + UnsafeMergeFrom(from); +} + +bool DetectionBox::IsInitialized() const { + + return true; +} + +void DetectionBox::Swap(DetectionBox* other) { + if (other == this) return; + InternalSwap(other); +} +void DetectionBox::InternalSwap(DetectionBox* other) { + std::swap(class__, other->class__); + std::swap(score_, other->score_); + std::swap(left_top_x_, other->left_top_x_); + std::swap(left_top_y_, other->left_top_y_); + 
std::swap(right_bottom_x_, other->right_bottom_x_); + std::swap(right_bottom_y_, other->right_bottom_y_); + std::swap(_has_bits_[0], other->_has_bits_[0]); + _internal_metadata_.Swap(&other->_internal_metadata_); + std::swap(_cached_size_, other->_cached_size_); +} + +::google::protobuf::Metadata DetectionBox::GetMetadata() const { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::Metadata metadata; + metadata.descriptor = DetectionBox_descriptor_; + metadata.reflection = DetectionBox_reflection_; + return metadata; +} + +#if PROTOBUF_INLINE_NOT_IN_HEADERS +// DetectionBox + +// optional int32 class = 1; +bool DetectionBox::has_class_() const { + return (_has_bits_[0] & 0x00000001u) != 0; +} +void DetectionBox::set_has_class_() { + _has_bits_[0] |= 0x00000001u; +} +void DetectionBox::clear_has_class_() { + _has_bits_[0] &= ~0x00000001u; +} +void DetectionBox::clear_class_() { + class__ = 0; + clear_has_class_(); +} +::google::protobuf::int32 DetectionBox::class_() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.class) + return class__; +} +void DetectionBox::set_class_(::google::protobuf::int32 value) { + set_has_class_(); + class__ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.class) +} + +// optional float score = 2; +bool DetectionBox::has_score() const { + return (_has_bits_[0] & 0x00000002u) != 0; +} +void DetectionBox::set_has_score() { + _has_bits_[0] |= 0x00000002u; +} +void DetectionBox::clear_has_score() { + _has_bits_[0] &= ~0x00000002u; +} +void DetectionBox::clear_score() { + score_ = 0; + clear_has_score(); +} +float DetectionBox::score() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.score) + return score_; +} +void DetectionBox::set_score(float value) { + set_has_score(); + score_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.score) +} + +// optional float left_top_x = 3; +bool DetectionBox::has_left_top_x() const { + return 
(_has_bits_[0] & 0x00000004u) != 0; +} +void DetectionBox::set_has_left_top_x() { + _has_bits_[0] |= 0x00000004u; +} +void DetectionBox::clear_has_left_top_x() { + _has_bits_[0] &= ~0x00000004u; +} +void DetectionBox::clear_left_top_x() { + left_top_x_ = 0; + clear_has_left_top_x(); +} +float DetectionBox::left_top_x() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.left_top_x) + return left_top_x_; +} +void DetectionBox::set_left_top_x(float value) { + set_has_left_top_x(); + left_top_x_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.left_top_x) +} + +// optional float left_top_y = 4; +bool DetectionBox::has_left_top_y() const { + return (_has_bits_[0] & 0x00000008u) != 0; +} +void DetectionBox::set_has_left_top_y() { + _has_bits_[0] |= 0x00000008u; +} +void DetectionBox::clear_has_left_top_y() { + _has_bits_[0] &= ~0x00000008u; +} +void DetectionBox::clear_left_top_y() { + left_top_y_ = 0; + clear_has_left_top_y(); +} +float DetectionBox::left_top_y() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.left_top_y) + return left_top_y_; +} +void DetectionBox::set_left_top_y(float value) { + set_has_left_top_y(); + left_top_y_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.left_top_y) +} + +// optional float right_bottom_x = 5; +bool DetectionBox::has_right_bottom_x() const { + return (_has_bits_[0] & 0x00000010u) != 0; +} +void DetectionBox::set_has_right_bottom_x() { + _has_bits_[0] |= 0x00000010u; +} +void DetectionBox::clear_has_right_bottom_x() { + _has_bits_[0] &= ~0x00000010u; +} +void DetectionBox::clear_right_bottom_x() { + right_bottom_x_ = 0; + clear_has_right_bottom_x(); +} +float DetectionBox::right_bottom_x() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.right_bottom_x) + return right_bottom_x_; +} +void DetectionBox::set_right_bottom_x(float value) { + set_has_right_bottom_x(); + right_bottom_x_ = value; + // 
@@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.right_bottom_x) +} + +// optional float right_bottom_y = 6; +// Presence is tracked by bit 0x00000020u of _has_bits_[0]; clear_right_bottom_y() resets the stored value to 0. +bool DetectionBox::has_right_bottom_y() const { + return (_has_bits_[0] & 0x00000020u) != 0; +} +void DetectionBox::set_has_right_bottom_y() { + _has_bits_[0] |= 0x00000020u; +} +void DetectionBox::clear_has_right_bottom_y() { + _has_bits_[0] &= ~0x00000020u; +} +void DetectionBox::clear_right_bottom_y() { + right_bottom_y_ = 0; + clear_has_right_bottom_y(); +} +float DetectionBox::right_bottom_y() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.right_bottom_y) + return right_bottom_y_; +} +void DetectionBox::set_right_bottom_y(float value) { + set_has_right_bottom_y(); + right_bottom_y_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.right_bottom_y) +} + +inline const DetectionBox* DetectionBox::internal_default_instance() { + return &DetectionBox_default_instance_.get(); +} +#endif // PROTOBUF_INLINE_NOT_IN_HEADERS + +// =================================================================== + +// Out-of-class definitions of the static const field-number members; the guard skips them for _MSC_VER below 1900. +#if !defined(_MSC_VER) || _MSC_VER >= 1900 +const int DetectionResult::kFilenameFieldNumber; +const int DetectionResult::kDetectionBoxesFieldNumber; +#endif // !defined(_MSC_VER) || _MSC_VER >= 1900 + +// Default constructor: runs the file-level InitDefaults unless this object is the default instance itself, then SharedCtor(). +DetectionResult::DetectionResult() + : ::google::protobuf::Message(), _internal_metadata_(NULL) { + if (this != internal_default_instance()) protobuf_InitDefaults_detection_5fresult_2eproto(); + SharedCtor(); + // @@protoc_insertion_point(constructor:PaddleSolution.DetectionResult) +} + +void DetectionResult::InitAsDefaultInstance() { +} + +// Copy constructor: starts from an empty message and merges `from` into it. +DetectionResult::DetectionResult(const DetectionResult& from) + : ::google::protobuf::Message(), + _internal_metadata_(NULL) { + SharedCtor(); + UnsafeMergeFrom(from); + // @@protoc_insertion_point(copy_constructor:PaddleSolution.DetectionResult) +} + +void DetectionResult::SharedCtor() { + _cached_size_ = 0; +
filename_.UnsafeSetDefault(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} + +DetectionResult::~DetectionResult() { + // @@protoc_insertion_point(destructor:PaddleSolution.DetectionResult) + SharedDtor(); +} + +void DetectionResult::SharedDtor() { + filename_.DestroyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} + +void DetectionResult::SetCachedSize(int size) const { + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); +} +const ::google::protobuf::Descriptor* DetectionResult::descriptor() { + protobuf_AssignDescriptorsOnce(); + return DetectionResult_descriptor_; +} + +const DetectionResult& DetectionResult::default_instance() { + protobuf_InitDefaults_detection_5fresult_2eproto(); + return *internal_default_instance(); +} + +// Storage for the default instance; ExplicitlyConstructed is a class template and needs its message type argument. +::google::protobuf::internal::ExplicitlyConstructed<DetectionResult> DetectionResult_default_instance_; + +// Allocates a new DetectionResult with `new`; when an arena is supplied, the object is handed to it via Own(). +DetectionResult* DetectionResult::New(::google::protobuf::Arena* arena) const { + DetectionResult* n = new DetectionResult; + if (arena != NULL) { + arena->Own(n); + } + return n; +} + +// Resets filename (only if set), the detection_boxes list, the has-bits and any unknown fields. +void DetectionResult::Clear() { +// @@protoc_insertion_point(message_clear_start:PaddleSolution.DetectionResult) + if (has_filename()) { + filename_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); + } + detection_boxes_.Clear(); + _has_bits_.Clear(); + if (_internal_metadata_.have_unknown_fields()) { + mutable_unknown_fields()->Clear(); + } +} + +bool DetectionResult::MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input) { +#define DO_(EXPRESSION) if (!GOOGLE_PREDICT_TRUE(EXPRESSION)) goto failure + ::google::protobuf::uint32 tag; + // @@protoc_insertion_point(parse_start:PaddleSolution.DetectionResult) + for (;;) { + ::std::pair< ::google::protobuf::uint32, bool> p = input->ReadTagWithCutoff(127); + tag = p.first; + if (!p.second) goto handle_unusual; + switch
(::google::protobuf::internal::WireFormatLite::GetTagFieldNumber(tag)) { + // optional string filename = 1; + // Tag 10 is field 1 with the length-delimited wire type; any other tag for field 1 is routed to handle_unusual. + case 1: { + if (tag == 10) { + DO_(::google::protobuf::internal::WireFormatLite::ReadString( + input, this->mutable_filename())); + ::google::protobuf::internal::WireFormat::VerifyUTF8StringNamedField( + this->filename().data(), this->filename().length(), + ::google::protobuf::internal::WireFormat::PARSE, + "PaddleSolution.DetectionResult.filename"); + } else { + goto handle_unusual; + } + if (input->ExpectTag(18)) goto parse_detection_boxes; + break; + } + + // repeated .PaddleSolution.DetectionBox detection_boxes = 2; + // Tag 18 is field 2 (length-delimited). Consecutive elements are read in a loop under one recursion-depth increment. + case 2: { + if (tag == 18) { + parse_detection_boxes: + DO_(input->IncrementRecursionDepth()); + parse_loop_detection_boxes: + DO_(::google::protobuf::internal::WireFormatLite::ReadMessageNoVirtualNoRecursionDepth( + input, add_detection_boxes())); + } else { + goto handle_unusual; + } + if (input->ExpectTag(18)) goto parse_loop_detection_boxes; + input->UnsafeDecrementRecursionDepth(); + if (input->ExpectAtEnd()) goto success; + break; + } + + // A zero tag or an END_GROUP wire type ends parsing successfully; other unexpected fields are skipped into the unknown-field set. + default: { + handle_unusual: + if (tag == 0 || + ::google::protobuf::internal::WireFormatLite::GetTagWireType(tag) == + ::google::protobuf::internal::WireFormatLite::WIRETYPE_END_GROUP) { + goto success; + } + DO_(::google::protobuf::internal::WireFormat::SkipField( + input, tag, mutable_unknown_fields())); + break; + } + } + } +success: + // @@protoc_insertion_point(parse_success:PaddleSolution.DetectionResult) + return true; +failure: + // @@protoc_insertion_point(parse_failure:PaddleSolution.DetectionResult) + return false; +#undef DO_ +} + +// Stream-based serialization: writes filename (if set), then each detection_boxes element, then unknown fields. +void DetectionResult::SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const { + // @@protoc_insertion_point(serialize_start:PaddleSolution.DetectionResult) + // optional string filename = 1; + if (has_filename()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8StringNamedField( + this->filename().data(),
this->filename().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE, + "PaddleSolution.DetectionResult.filename"); + ::google::protobuf::internal::WireFormatLite::WriteStringMaybeAliased( + 1, this->filename(), output); + } + + // repeated .PaddleSolution.DetectionBox detection_boxes = 2; + for (unsigned int i = 0, n = this->detection_boxes_size(); i < n; i++) { + ::google::protobuf::internal::WireFormatLite::WriteMessageMaybeToArray( + 2, this->detection_boxes(i), output); + } + + if (_internal_metadata_.have_unknown_fields()) { + ::google::protobuf::internal::WireFormat::SerializeUnknownFields( + unknown_fields(), output); + } + // @@protoc_insertion_point(serialize_end:PaddleSolution.DetectionResult) +} + +// Array-based serialization: writes the same fields in the same order into `target` and returns the advanced pointer. +::google::protobuf::uint8* DetectionResult::InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* target) const { + (void)deterministic; // Unused + // @@protoc_insertion_point(serialize_to_array_start:PaddleSolution.DetectionResult) + // optional string filename = 1; + if (has_filename()) { + ::google::protobuf::internal::WireFormat::VerifyUTF8StringNamedField( + this->filename().data(), this->filename().length(), + ::google::protobuf::internal::WireFormat::SERIALIZE, + "PaddleSolution.DetectionResult.filename"); + target = + ::google::protobuf::internal::WireFormatLite::WriteStringToArray( + 1, this->filename(), target); + } + + // repeated .PaddleSolution.DetectionBox detection_boxes = 2; + // Nested messages are always written with deterministic=false here, regardless of the argument above. + for (unsigned int i = 0, n = this->detection_boxes_size(); i < n; i++) { + target = ::google::protobuf::internal::WireFormatLite:: + InternalWriteMessageNoVirtualToArray( + 2, this->detection_boxes(i), false, target); + } + + if (_internal_metadata_.have_unknown_fields()) { + target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( + unknown_fields(), target); + } + // @@protoc_insertion_point(serialize_to_array_end:PaddleSolution.DetectionResult) + return target; +} + +size_t
DetectionResult::ByteSizeLong() const { +// @@protoc_insertion_point(message_byte_size_start:PaddleSolution.DetectionResult) + size_t total_size = 0; + + // optional string filename = 1; + if (has_filename()) { + total_size += 1 + + ::google::protobuf::internal::WireFormatLite::StringSize( + this->filename()); + } + + // repeated .PaddleSolution.DetectionBox detection_boxes = 2; + // One tag byte per element (1UL * count) plus the encoded size of each element. + { + unsigned int count = this->detection_boxes_size(); + total_size += 1UL * count; + for (unsigned int i = 0; i < count; i++) { + total_size += + ::google::protobuf::internal::WireFormatLite::MessageSizeNoVirtual( + this->detection_boxes(i)); + } + } + + if (_internal_metadata_.have_unknown_fields()) { + total_size += + ::google::protobuf::internal::WireFormat::ComputeUnknownFieldsSize( + unknown_fields()); + } + int cached_size = ::google::protobuf::internal::ToCachedSize(total_size); + GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); + _cached_size_ = cached_size; + GOOGLE_SAFE_CONCURRENT_WRITES_END(); + return total_size; +} + +// Generic merge: uses the typed fast path when `from` is a DetectionResult, otherwise falls back to reflection. +void DetectionResult::MergeFrom(const ::google::protobuf::Message& from) { +// @@protoc_insertion_point(generalized_merge_from_start:PaddleSolution.DetectionResult) + if (GOOGLE_PREDICT_FALSE(&from == this)) MergeFromFail(__LINE__); + const DetectionResult* source = + ::google::protobuf::internal::DynamicCastToGenerated<const DetectionResult>( + &from); + if (source == NULL) { + // @@protoc_insertion_point(generalized_merge_from_cast_fail:PaddleSolution.DetectionResult) + ::google::protobuf::internal::ReflectionOps::Merge(from, this); + } else { + // @@protoc_insertion_point(generalized_merge_from_cast_success:PaddleSolution.DetectionResult) + UnsafeMergeFrom(*source); + } +} + +// Typed merge: merging a message into itself is reported through MergeFromFail(). +void DetectionResult::MergeFrom(const DetectionResult& from) { +// @@protoc_insertion_point(class_specific_merge_from_start:PaddleSolution.DetectionResult) + if (GOOGLE_PREDICT_TRUE(&from != this)) { + UnsafeMergeFrom(from); + } else { + MergeFromFail(__LINE__); + } +} + +void DetectionResult::UnsafeMergeFrom(const
DetectionResult& from) { + GOOGLE_DCHECK(&from != this); + detection_boxes_.MergeFrom(from.detection_boxes_); + if (from._has_bits_[0 / 32] & (0xffu << (0 % 32))) { + if (from.has_filename()) { + set_has_filename(); + filename_.AssignWithDefault(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), from.filename_); + } + } + if (from._internal_metadata_.have_unknown_fields()) { + ::google::protobuf::UnknownFieldSet::MergeToInternalMetdata( + from.unknown_fields(), &_internal_metadata_); + } +} + +// Both CopyFrom overloads return immediately on self-copy; otherwise they Clear() and then merge `from`. +void DetectionResult::CopyFrom(const ::google::protobuf::Message& from) { +// @@protoc_insertion_point(generalized_copy_from_start:PaddleSolution.DetectionResult) + if (&from == this) return; + Clear(); + MergeFrom(from); +} + +void DetectionResult::CopyFrom(const DetectionResult& from) { +// @@protoc_insertion_point(class_specific_copy_from_start:PaddleSolution.DetectionResult) + if (&from == this) return; + Clear(); + UnsafeMergeFrom(from); +} + +// Unconditionally true: no field is checked here. +bool DetectionResult::IsInitialized() const { + + return true; +} + +// Swap() is a no-op when `other` is this object; InternalSwap() exchanges every member pairwise. +void DetectionResult::Swap(DetectionResult* other) { + if (other == this) return; + InternalSwap(other); +} +void DetectionResult::InternalSwap(DetectionResult* other) { + filename_.Swap(&other->filename_); + detection_boxes_.UnsafeArenaSwap(&other->detection_boxes_); + std::swap(_has_bits_[0], other->_has_bits_[0]); + _internal_metadata_.Swap(&other->_internal_metadata_); + std::swap(_cached_size_, other->_cached_size_); +} + +::google::protobuf::Metadata DetectionResult::GetMetadata() const { + protobuf_AssignDescriptorsOnce(); + ::google::protobuf::Metadata metadata; + metadata.descriptor = DetectionResult_descriptor_; + metadata.reflection = DetectionResult_reflection_; + return metadata; +} + +#if PROTOBUF_INLINE_NOT_IN_HEADERS +// DetectionResult + +// optional string filename = 1; +// Presence is tracked by bit 0x00000001u of _has_bits_[0]. +bool DetectionResult::has_filename() const { + return (_has_bits_[0] & 0x00000001u) != 0; +} +void DetectionResult::set_has_filename() { + _has_bits_[0] |=
0x00000001u; +} +void DetectionResult::clear_has_filename() { + _has_bits_[0] &= ~0x00000001u; +} +void DetectionResult::clear_filename() { + filename_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); + clear_has_filename(); +} +const ::std::string& DetectionResult::filename() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionResult.filename) + return filename_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +// The three set_filename overloads all mark the has-bit first and then store a copy of the given text. +void DetectionResult::set_filename(const ::std::string& value) { + set_has_filename(); + filename_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value); + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionResult.filename) +} +void DetectionResult::set_filename(const char* value) { + set_has_filename(); + filename_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value)); + // @@protoc_insertion_point(field_set_char:PaddleSolution.DetectionResult.filename) +} +// Pointer + length overload: builds the string from exactly `size` bytes starting at `value`. +void DetectionResult::set_filename(const char* value, size_t size) { + set_has_filename(); + filename_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + ::std::string(reinterpret_cast<const char*>(value), size)); + // @@protoc_insertion_point(field_set_pointer:PaddleSolution.DetectionResult.filename) +} +// mutable_filename() marks the field as present even if the caller never writes through the pointer. +::std::string* DetectionResult::mutable_filename() { + set_has_filename(); + // @@protoc_insertion_point(field_mutable:PaddleSolution.DetectionResult.filename) + return filename_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +// release_filename() clears the has-bit and returns whatever ReleaseNoArena() yields. +::std::string* DetectionResult::release_filename() { + // @@protoc_insertion_point(field_release:PaddleSolution.DetectionResult.filename) + clear_has_filename(); + return filename_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +void DetectionResult::set_allocated_filename(::std::string* filename) { + if (filename != NULL) { +
set_has_filename(); + } else { + clear_has_filename(); + } + filename_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), filename); + // @@protoc_insertion_point(field_set_allocated:PaddleSolution.DetectionResult.filename) +} + +// repeated .PaddleSolution.DetectionBox detection_boxes = 2; +// No has-bit is involved for this field; the accessors forward to detection_boxes_ (size/Clear/Get/Mutable/Add). +int DetectionResult::detection_boxes_size() const { + return detection_boxes_.size(); +} +void DetectionResult::clear_detection_boxes() { + detection_boxes_.Clear(); +} +const ::PaddleSolution::DetectionBox& DetectionResult::detection_boxes(int index) const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_.Get(index); +} +::PaddleSolution::DetectionBox* DetectionResult::mutable_detection_boxes(int index) { + // @@protoc_insertion_point(field_mutable:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_.Mutable(index); +} +// Appends a new element via detection_boxes_.Add() and returns a pointer to it for the caller to fill in. +::PaddleSolution::DetectionBox* DetectionResult::add_detection_boxes() { + // @@protoc_insertion_point(field_add:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_.Add(); +} +// Whole-list accessors: a mutable pointer to, and a const reference to, the underlying RepeatedPtrField. +::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox >* +DetectionResult::mutable_detection_boxes() { + // @@protoc_insertion_point(field_mutable_list:PaddleSolution.DetectionResult.detection_boxes) + return &detection_boxes_; +} +const ::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox >& +DetectionResult::detection_boxes() const { + // @@protoc_insertion_point(field_list:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_; +} + +inline const DetectionResult* DetectionResult::internal_default_instance() { + return &DetectionResult_default_instance_.get(); +} +#endif // PROTOBUF_INLINE_NOT_IN_HEADERS + +// @@protoc_insertion_point(namespace_scope) + +} // namespace PaddleSolution + +// @@protoc_insertion_point(global_scope) diff --git a/inference/utils/detection_result.pb.h
b/inference/utils/detection_result.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..1b2f89ea9ca13f3f949bd19b097bb514a4afc525 --- /dev/null +++ b/inference/utils/detection_result.pb.h @@ -0,0 +1,563 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: detection_result.proto + +#ifndef PROTOBUF_detection_5fresult_2eproto__INCLUDED +#define PROTOBUF_detection_5fresult_2eproto__INCLUDED + +#include + +#include + +#if GOOGLE_PROTOBUF_VERSION < 3001000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3001000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +// @@protoc_insertion_point(includes) + +namespace PaddleSolution { + +// Internal implementation detail -- do not call these. 
+void protobuf_AddDesc_detection_5fresult_2eproto(); +void protobuf_InitDefaults_detection_5fresult_2eproto(); +void protobuf_AssignDesc_detection_5fresult_2eproto(); +void protobuf_ShutdownFile_detection_5fresult_2eproto(); + +class DetectionBox; +class DetectionResult; + +// =================================================================== + +// Generated message class for PaddleSolution.DetectionBox; copy assignment is implemented through CopyFrom(). +class DetectionBox : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:PaddleSolution.DetectionBox) */ { + public: + DetectionBox(); + virtual ~DetectionBox(); + + DetectionBox(const DetectionBox& from); + + inline DetectionBox& operator=(const DetectionBox& from) { + CopyFrom(from); + return *this; + } + + // Unknown fields are stored in _internal_metadata_; both accessors forward to it. + inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields(); + } + + inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields(); + } + + static const ::google::protobuf::Descriptor* descriptor(); + static const DetectionBox& default_instance(); + + static const DetectionBox* internal_default_instance(); + + void Swap(DetectionBox* other); + + // implements Message ---------------------------------------------- + + // New() without arguments is shorthand for New(NULL), i.e. no arena. + inline DetectionBox* New() const { return New(NULL); } + + DetectionBox* New(::google::protobuf::Arena* arena) const; + void CopyFrom(const ::google::protobuf::Message& from); + void MergeFrom(const ::google::protobuf::Message& from); + void CopyFrom(const DetectionBox& from); + void MergeFrom(const DetectionBox& from); + void Clear(); + bool IsInitialized() const; + + size_t ByteSizeLong() const; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input); + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const; + ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* output) const; + ::google::protobuf::uint8*
SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const { + return InternalSerializeWithCachedSizesToArray(false, output); + } + int GetCachedSize() const { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const; + void InternalSwap(DetectionBox* other); + void UnsafeMergeFrom(const DetectionBox& from); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return _internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + // Every optional scalar field below exposes the same five members: has_*, clear_*, a k*FieldNumber constant, a getter and a setter. + + // optional int32 class = 1; + // The proto field is named `class`; its C++ accessors carry a trailing underscore (class_, set_class_, ...). + bool has_class_() const; + void clear_class_(); + static const int kClassFieldNumber = 1; + ::google::protobuf::int32 class_() const; + void set_class_(::google::protobuf::int32 value); + + // optional float score = 2; + bool has_score() const; + void clear_score(); + static const int kScoreFieldNumber = 2; + float score() const; + void set_score(float value); + + // optional float left_top_x = 3; + bool has_left_top_x() const; + void clear_left_top_x(); + static const int kLeftTopXFieldNumber = 3; + float left_top_x() const; + void set_left_top_x(float value); + + // optional float left_top_y = 4; + bool has_left_top_y() const; + void clear_left_top_y(); + static const int kLeftTopYFieldNumber = 4; + float left_top_y() const; + void set_left_top_y(float value); + + // optional float right_bottom_x = 5; + bool has_right_bottom_x() const; + void clear_right_bottom_x(); + static const int kRightBottomXFieldNumber = 5; + float right_bottom_x() const; + void set_right_bottom_x(float value); + + // optional float right_bottom_y = 6; + bool has_right_bottom_y() const; + void clear_right_bottom_y(); + static const int kRightBottomYFieldNumber = 6; + float
right_bottom_y() const; + void set_right_bottom_y(float value); + + // @@protoc_insertion_point(class_scope:PaddleSolution.DetectionBox) + private: + // Private helpers that set or clear one field's presence bit in _has_bits_. + inline void set_has_class_(); + inline void clear_has_class_(); + inline void set_has_score(); + inline void clear_has_score(); + inline void set_has_left_top_x(); + inline void clear_has_left_top_x(); + inline void set_has_left_top_y(); + inline void clear_has_left_top_y(); + inline void set_has_right_bottom_x(); + inline void clear_has_right_bottom_x(); + inline void set_has_right_bottom_y(); + inline void clear_has_right_bottom_y(); + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + ::google::protobuf::internal::HasBits<1> _has_bits_; + mutable int _cached_size_; + ::google::protobuf::int32 class__; + float score_; + float left_top_x_; + float left_top_y_; + float right_bottom_x_; + float right_bottom_y_; + friend void protobuf_InitDefaults_detection_5fresult_2eproto_impl(); + friend void protobuf_AddDesc_detection_5fresult_2eproto_impl(); + friend void protobuf_AssignDesc_detection_5fresult_2eproto(); + friend void protobuf_ShutdownFile_detection_5fresult_2eproto(); + + void InitAsDefaultInstance(); +}; +// Declaration of the default-instance storage; ExplicitlyConstructed is a class template and needs its message type argument. +extern ::google::protobuf::internal::ExplicitlyConstructed<DetectionBox> DetectionBox_default_instance_; + +// ------------------------------------------------------------------- + +// Generated message class for PaddleSolution.DetectionResult; copy assignment is implemented through CopyFrom(). +class DetectionResult : public ::google::protobuf::Message /* @@protoc_insertion_point(class_definition:PaddleSolution.DetectionResult) */ { + public: + DetectionResult(); + virtual ~DetectionResult(); + + DetectionResult(const DetectionResult& from); + + inline DetectionResult& operator=(const DetectionResult& from) { + CopyFrom(from); + return *this; + } + + inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const { + return _internal_metadata_.unknown_fields(); + } + + inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() { + return _internal_metadata_.mutable_unknown_fields(); + } +
+ static const ::google::protobuf::Descriptor* descriptor(); + static const DetectionResult& default_instance(); + + static const DetectionResult* internal_default_instance(); + + void Swap(DetectionResult* other); + + // implements Message ---------------------------------------------- + + // New() without arguments is shorthand for New(NULL), i.e. no arena. + inline DetectionResult* New() const { return New(NULL); } + + DetectionResult* New(::google::protobuf::Arena* arena) const; + void CopyFrom(const ::google::protobuf::Message& from); + void MergeFrom(const ::google::protobuf::Message& from); + void CopyFrom(const DetectionResult& from); + void MergeFrom(const DetectionResult& from); + void Clear(); + bool IsInitialized() const; + + size_t ByteSizeLong() const; + bool MergePartialFromCodedStream( + ::google::protobuf::io::CodedInputStream* input); + void SerializeWithCachedSizes( + ::google::protobuf::io::CodedOutputStream* output) const; + ::google::protobuf::uint8* InternalSerializeWithCachedSizesToArray( + bool deterministic, ::google::protobuf::uint8* output) const; + // Convenience wrapper: serializes to an array with deterministic=false. + ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const { + return InternalSerializeWithCachedSizesToArray(false, output); + } + // Returns the stored _cached_size_ without recomputing anything. + int GetCachedSize() const { return _cached_size_; } + private: + void SharedCtor(); + void SharedDtor(); + void SetCachedSize(int size) const; + void InternalSwap(DetectionResult* other); + void UnsafeMergeFrom(const DetectionResult& from); + private: + inline ::google::protobuf::Arena* GetArenaNoVirtual() const { + return _internal_metadata_.arena(); + } + inline void* MaybeArenaPtr() const { + return _internal_metadata_.raw_arena_ptr(); + } + public: + + ::google::protobuf::Metadata GetMetadata() const; + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + // optional string filename = 1; + bool has_filename() const; + void clear_filename(); + static const int kFilenameFieldNumber = 1; + const
::std::string& filename() const; + void set_filename(const ::std::string& value); + void set_filename(const char* value); + void set_filename(const char* value, size_t size); + ::std::string* mutable_filename(); + ::std::string* release_filename(); + void set_allocated_filename(::std::string* filename); + + // repeated .PaddleSolution.DetectionBox detection_boxes = 2; + // Per-element access by index, append via add_detection_boxes(), or whole-list access through the RepeatedPtrField overloads. + int detection_boxes_size() const; + void clear_detection_boxes(); + static const int kDetectionBoxesFieldNumber = 2; + const ::PaddleSolution::DetectionBox& detection_boxes(int index) const; + ::PaddleSolution::DetectionBox* mutable_detection_boxes(int index); + ::PaddleSolution::DetectionBox* add_detection_boxes(); + ::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox >* + mutable_detection_boxes(); + const ::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox >& + detection_boxes() const; + + // @@protoc_insertion_point(class_scope:PaddleSolution.DetectionResult) + private: + // Only filename has presence helpers; the repeated field has none. + inline void set_has_filename(); + inline void clear_has_filename(); + + ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; + ::google::protobuf::internal::HasBits<1> _has_bits_; + mutable int _cached_size_; + ::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox > detection_boxes_; + ::google::protobuf::internal::ArenaStringPtr filename_; + friend void protobuf_InitDefaults_detection_5fresult_2eproto_impl(); + friend void protobuf_AddDesc_detection_5fresult_2eproto_impl(); + friend void protobuf_AssignDesc_detection_5fresult_2eproto(); + friend void protobuf_ShutdownFile_detection_5fresult_2eproto(); + + void InitAsDefaultInstance(); +}; +// Declaration of the default-instance storage; ExplicitlyConstructed is a class template and needs its message type argument. +extern ::google::protobuf::internal::ExplicitlyConstructed<DetectionResult> DetectionResult_default_instance_; + +// =================================================================== + + +// =================================================================== + +#if !PROTOBUF_INLINE_NOT_IN_HEADERS +// DetectionBox + +// optional int32
class = 1; +inline bool DetectionBox::has_class_() const { + return (_has_bits_[0] & 0x00000001u) != 0; +} +inline void DetectionBox::set_has_class_() { + _has_bits_[0] |= 0x00000001u; +} +inline void DetectionBox::clear_has_class_() { + _has_bits_[0] &= ~0x00000001u; +} +inline void DetectionBox::clear_class_() { + class__ = 0; + clear_has_class_(); +} +inline ::google::protobuf::int32 DetectionBox::class_() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.class) + return class__; +} +inline void DetectionBox::set_class_(::google::protobuf::int32 value) { + set_has_class_(); + class__ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.class) +} + +// optional float score = 2; +inline bool DetectionBox::has_score() const { + return (_has_bits_[0] & 0x00000002u) != 0; +} +inline void DetectionBox::set_has_score() { + _has_bits_[0] |= 0x00000002u; +} +inline void DetectionBox::clear_has_score() { + _has_bits_[0] &= ~0x00000002u; +} +inline void DetectionBox::clear_score() { + score_ = 0; + clear_has_score(); +} +inline float DetectionBox::score() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.score) + return score_; +} +inline void DetectionBox::set_score(float value) { + set_has_score(); + score_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.score) +} + +// optional float left_top_x = 3; +inline bool DetectionBox::has_left_top_x() const { + return (_has_bits_[0] & 0x00000004u) != 0; +} +inline void DetectionBox::set_has_left_top_x() { + _has_bits_[0] |= 0x00000004u; +} +inline void DetectionBox::clear_has_left_top_x() { + _has_bits_[0] &= ~0x00000004u; +} +inline void DetectionBox::clear_left_top_x() { + left_top_x_ = 0; + clear_has_left_top_x(); +} +inline float DetectionBox::left_top_x() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.left_top_x) + return left_top_x_; +} +inline void DetectionBox::set_left_top_x(float value) 
{ + set_has_left_top_x(); + left_top_x_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.left_top_x) +} + +// optional float left_top_y = 4; +inline bool DetectionBox::has_left_top_y() const { + return (_has_bits_[0] & 0x00000008u) != 0; +} +inline void DetectionBox::set_has_left_top_y() { + _has_bits_[0] |= 0x00000008u; +} +inline void DetectionBox::clear_has_left_top_y() { + _has_bits_[0] &= ~0x00000008u; +} +inline void DetectionBox::clear_left_top_y() { + left_top_y_ = 0; + clear_has_left_top_y(); +} +inline float DetectionBox::left_top_y() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.left_top_y) + return left_top_y_; +} +inline void DetectionBox::set_left_top_y(float value) { + set_has_left_top_y(); + left_top_y_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.left_top_y) +} + +// optional float right_bottom_x = 5; +inline bool DetectionBox::has_right_bottom_x() const { + return (_has_bits_[0] & 0x00000010u) != 0; +} +inline void DetectionBox::set_has_right_bottom_x() { + _has_bits_[0] |= 0x00000010u; +} +inline void DetectionBox::clear_has_right_bottom_x() { + _has_bits_[0] &= ~0x00000010u; +} +inline void DetectionBox::clear_right_bottom_x() { + right_bottom_x_ = 0; + clear_has_right_bottom_x(); +} +inline float DetectionBox::right_bottom_x() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.right_bottom_x) + return right_bottom_x_; +} +inline void DetectionBox::set_right_bottom_x(float value) { + set_has_right_bottom_x(); + right_bottom_x_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.right_bottom_x) +} + +// optional float right_bottom_y = 6; +inline bool DetectionBox::has_right_bottom_y() const { + return (_has_bits_[0] & 0x00000020u) != 0; +} +inline void DetectionBox::set_has_right_bottom_y() { + _has_bits_[0] |= 0x00000020u; +} +inline void DetectionBox::clear_has_right_bottom_y() { + _has_bits_[0] &= 
~0x00000020u; +} +inline void DetectionBox::clear_right_bottom_y() { + right_bottom_y_ = 0; + clear_has_right_bottom_y(); +} +inline float DetectionBox::right_bottom_y() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionBox.right_bottom_y) + return right_bottom_y_; +} +inline void DetectionBox::set_right_bottom_y(float value) { + set_has_right_bottom_y(); + right_bottom_y_ = value; + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionBox.right_bottom_y) +} + +inline const DetectionBox* DetectionBox::internal_default_instance() { + return &DetectionBox_default_instance_.get(); +} +// ------------------------------------------------------------------- + +// DetectionResult + +// optional string filename = 1; +inline bool DetectionResult::has_filename() const { + return (_has_bits_[0] & 0x00000001u) != 0; +} +inline void DetectionResult::set_has_filename() { + _has_bits_[0] |= 0x00000001u; +} +inline void DetectionResult::clear_has_filename() { + _has_bits_[0] &= ~0x00000001u; +} +inline void DetectionResult::clear_filename() { + filename_.ClearToEmptyNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); + clear_has_filename(); +} +inline const ::std::string& DetectionResult::filename() const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionResult.filename) + return filename_.GetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void DetectionResult::set_filename(const ::std::string& value) { + set_has_filename(); + filename_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), value); + // @@protoc_insertion_point(field_set:PaddleSolution.DetectionResult.filename) +} +inline void DetectionResult::set_filename(const char* value) { + set_has_filename(); + filename_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), ::std::string(value)); + // @@protoc_insertion_point(field_set_char:PaddleSolution.DetectionResult.filename) +} 
+inline void DetectionResult::set_filename(const char* value, size_t size) { + set_has_filename(); + filename_.SetNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), + ::std::string(reinterpret_cast(value), size)); + // @@protoc_insertion_point(field_set_pointer:PaddleSolution.DetectionResult.filename) +} +inline ::std::string* DetectionResult::mutable_filename() { + set_has_filename(); + // @@protoc_insertion_point(field_mutable:PaddleSolution.DetectionResult.filename) + return filename_.MutableNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline ::std::string* DetectionResult::release_filename() { + // @@protoc_insertion_point(field_release:PaddleSolution.DetectionResult.filename) + clear_has_filename(); + return filename_.ReleaseNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); +} +inline void DetectionResult::set_allocated_filename(::std::string* filename) { + if (filename != NULL) { + set_has_filename(); + } else { + clear_has_filename(); + } + filename_.SetAllocatedNoArena(&::google::protobuf::internal::GetEmptyStringAlreadyInited(), filename); + // @@protoc_insertion_point(field_set_allocated:PaddleSolution.DetectionResult.filename) +} + +// repeated .PaddleSolution.DetectionBox detection_boxes = 2; +inline int DetectionResult::detection_boxes_size() const { + return detection_boxes_.size(); +} +inline void DetectionResult::clear_detection_boxes() { + detection_boxes_.Clear(); +} +inline const ::PaddleSolution::DetectionBox& DetectionResult::detection_boxes(int index) const { + // @@protoc_insertion_point(field_get:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_.Get(index); +} +inline ::PaddleSolution::DetectionBox* DetectionResult::mutable_detection_boxes(int index) { + // @@protoc_insertion_point(field_mutable:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_.Mutable(index); +} +inline ::PaddleSolution::DetectionBox* 
DetectionResult::add_detection_boxes() { + // @@protoc_insertion_point(field_add:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_.Add(); +} +inline ::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox >* +DetectionResult::mutable_detection_boxes() { + // @@protoc_insertion_point(field_mutable_list:PaddleSolution.DetectionResult.detection_boxes) + return &detection_boxes_; +} +inline const ::google::protobuf::RepeatedPtrField< ::PaddleSolution::DetectionBox >& +DetectionResult::detection_boxes() const { + // @@protoc_insertion_point(field_list:PaddleSolution.DetectionResult.detection_boxes) + return detection_boxes_; +} + +inline const DetectionResult* DetectionResult::internal_default_instance() { + return &DetectionResult_default_instance_.get(); +} +#endif // !PROTOBUF_INLINE_NOT_IN_HEADERS +// ------------------------------------------------------------------- + + +// @@protoc_insertion_point(namespace_scope) + +} // namespace PaddleSolution + +// @@protoc_insertion_point(global_scope) + +#endif // PROTOBUF_detection_5fresult_2eproto__INCLUDED diff --git a/inference/utils/detection_result.proto b/inference/utils/detection_result.proto new file mode 100644 index 0000000000000000000000000000000000000000..2d1cbb2464ac09b0dcea01f8331da5ee7894a4d5 --- /dev/null +++ b/inference/utils/detection_result.proto @@ -0,0 +1,21 @@ +syntax = "proto2"; +package PaddleSolution; + +message DetectionBox { + optional int32 class = 1; + optional float score = 2; + optional float left_top_x = 3; + optional float left_top_y = 4; + optional float right_bottom_x = 5; + optional float right_bottom_y = 6; +} + +message DetectionResult { + optional string filename = 1; + repeated DetectionBox detection_boxes = 2; +} + +//message DetectionResultsContainer { +// repeated DetectionResult result = 1; +//} + diff --git a/inference/utils/utils.h b/inference/utils/utils.h new file mode 100644 index 
#pragma once

// NOTE(review): the header names on the #include lines were missing from the
// reviewed text; this list is reconstructed from the names used below.
// Confirm it against the original file.
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#ifdef _WIN32
#include <filesystem>
#else
#include <dirent.h>
#include <sys/types.h>
#endif

namespace PaddleSolution {
namespace utils {

// Resize strategies understood by scaling().
enum SCALE_TYPE {
    UNPADDING,
    RANGE_SCALING
};

// Join `dir` and `path` with the platform's path separator.
inline std::string path_join(const std::string& dir, const std::string& path) {
    std::string seperator = "/";
    #ifdef _WIN32
    seperator = "\\";
    #endif
    return dir + seperator + path;
}

#ifndef _WIN32
// Scan directory `path` (non-recursive) and return the sorted full paths of
// all entries whose extension (text from the last '.') occurs in `exts`.
// Returns an empty vector when the directory cannot be opened.
inline std::vector<std::string> get_directory_images(const std::string& path,
                                                     const std::string& exts) {
    std::vector<std::string> imgs;
    DIR* dir = opendir(path.c_str());
    if (dir == NULL) {
        // Nothing was opened, so there is nothing to close; passing a null
        // stream to closedir() is not valid.
        return imgs;
    }

    struct dirent* entry;
    while ((entry = readdir(dir)) != NULL) {
        // `ext` points at the last '.', so "." / ".." / "name." all yield ".".
        const char* ext = std::strrchr(entry->d_name, '.');
        if (!ext || std::string(ext) == ".") {
            continue;
        }
        if (exts.find(ext) != std::string::npos) {
            imgs.push_back(path_join(path, entry->d_name));
        }
    }
    // Release the directory stream; the handle would otherwise leak on
    // every call.
    closedir(dir);

    std::sort(imgs.begin(), imgs.end());
    return imgs;
}
#else
// Scan directory `path` (non-recursive) and return the sorted full paths of
// all entries whose extension occurs in `exts`.
inline std::vector<std::string> get_directory_images(const std::string& path,
                                                     const std::string& exts) {
    std::vector<std::string> imgs;
    for (const auto& item :
         std::experimental::filesystem::directory_iterator(path)) {
        auto suffix = item.path().extension().string();
        if (exts.find(suffix) != std::string::npos && suffix.size() > 0) {
            imgs.push_back(item.path().string());
        }
    }
    std::sort(imgs.begin(), imgs.end());
    return imgs;
}
#endif

// Compute the resized image dimensions for the given strategy.
//
//   resize_type     one of SCALE_TYPE
//   w, h            in: current size; out: size after scaling
//   new_w, new_h    fixed output size used by UNPADDING
//   target_size     desired length of the shorter side (RANGE_SCALING)
//   max_size        upper bound for the longer side, ignored when <= 0
//   im_scale_ratio  out: applied ratio (0 for UNPADDING)
//
// Returns 0 on success and -1 when any of w/h/new_w/new_h is not positive.
// Throws the int 0 for an unknown `resize_type`.
inline int scaling(int resize_type, int &w, int &h, int new_w, int new_h,
                   int target_size, int max_size, float &im_scale_ratio) {
    if (w <= 0 || h <= 0 || new_w <= 0 || new_h <= 0) {
        return -1;
    }
    switch (resize_type) {
        case SCALE_TYPE::UNPADDING:
        {
            w = new_w;
            h = new_h;
            im_scale_ratio = 0;
        }
        break;
        case SCALE_TYPE::RANGE_SCALING:
        {
            int im_max_size = std::max(w, h);
            int im_min_size = std::min(w, h);
            float scale_ratio = static_cast<float>(target_size)
                              / static_cast<float>(im_min_size);
            if (max_size > 0) {
                // Shrink the ratio when the longer side would exceed max_size.
                if (std::round(scale_ratio * im_max_size) > max_size) {
                    scale_ratio = static_cast<float>(max_size)
                                / static_cast<float>(im_max_size);
                }
            }
            w = static_cast<int>(std::round(scale_ratio * static_cast<float>(w)));
            h = static_cast<int>(std::round(scale_ratio * static_cast<float>(h)));
            im_scale_ratio = scale_ratio;
        }
        break;
        default :
        {
            std::cout << "Can't support this type of scaling strategy." << std::endl;
            std::cout << "Throw exception at file " << __FILE__ << " on line " << __LINE__ << std::endl;
            throw 0;
        }
    }
    return 0;
}

}  // namespace utils
}  // namespace PaddleSolution
+ +import ppdet.modeling +import ppdet.optimizer +import ppdet.data diff --git a/ppdet/core/config/__init__.py b/ppdet/core/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/ppdet/core/config/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/core/config/schema.py b/ppdet/core/config/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..efba5be1dfd93243ff7a071f0f30accfa513e18f --- /dev/null +++ b/ppdet/core/config/schema.py @@ -0,0 +1,258 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import inspect +import importlib +import re + +try: + from docstring_parser import parse as doc_parse +except Exception: + + def doc_parse(*args): + if not doc_parse.__warning_sent__: + from ppdet.utils.cli import ColorTTY + color_tty = ColorTTY() + message = "docstring_parser is not installed, " \ + + "argument description is not available" + print(color_tty.yellow(message)) + doc_parse.__warning_sent__ = True + + doc_parse.__warning_sent__ = False + +try: + from typeguard import check_type +except Exception: + + def check_type(*args): + if not check_type.__warning_sent__: + from ppdet.utils.cli import ColorTTY + color_tty = ColorTTY() + message = "typeguard is not installed," \ + + "type checking is not available" + print(color_tty.yellow(message)) + check_type.__warning_sent__ = True + + check_type.__warning_sent__ = False + +__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] + + +class SchemaValue(object): + def __init__(self, name, doc='', type=None): + super(SchemaValue, self).__init__() + self.name = name + self.doc = doc + self.type = type + + def set_default(self, value): + self.default = value + + def has_default(self): + return hasattr(self, 'default') + + +class SchemaDict(dict): + def __init__(self, **kwargs): + super(SchemaDict, self).__init__() + self.schema = {} + self.strict = False + self.doc = "" + self.update(kwargs) + + def __setitem__(self, key, value): + # XXX also update regular dict to SchemaDict?? 
+ if isinstance(value, dict) and key in self and isinstance(self[key], + SchemaDict): + self[key].update(value) + else: + super(SchemaDict, self).__setitem__(key, value) + + def __missing__(self, key): + if self.has_default(key): + return self.schema[key].default + elif key in self.schema: + return self.schema[key] + else: + raise KeyError(key) + + def copy(self): + newone = SchemaDict() + newone.__dict__.update(self.__dict__) + newone.update(self) + return newone + + def set_schema(self, key, value): + assert isinstance(value, SchemaValue) + self.schema[key] = value + + def set_strict(self, strict): + self.strict = strict + + def has_default(self, key): + return key in self.schema and self.schema[key].has_default() + + def is_default(self, key): + if not self.has_default(key): + return False + if hasattr(self[key], '__dict__'): + return True + else: + return key not in self or self[key] == self.schema[key].default + + def find_default_keys(self): + return [ + k for k in list(self.keys()) + list(self.schema.keys()) + if self.is_default(k) + ] + + def mandatory(self): + return any([k for k in self.schema.keys() if not self.has_default(k)]) + + def find_missing_keys(self): + missing = [ + k for k in self.schema.keys() + if k not in self and not self.has_default(k) + ] + placeholders = [k for k in self if self[k] in ('', '')] + return missing + placeholders + + def find_extra_keys(self): + return list(set(self.keys()) - set(self.schema.keys())) + + def find_mismatch_keys(self): + mismatch_keys = [] + for arg in self.schema.values(): + if arg.type is not None: + try: + check_type("{}.{}".format(self.name, arg.name), + self[arg.name], arg.type) + except Exception: + mismatch_keys.append(arg.name) + return mismatch_keys + + def validate(self): + missing_keys = self.find_missing_keys() + if missing_keys: + raise ValueError("Missing param for class<{}>: {}".format( + self.name, ", ".join(missing_keys))) + extra_keys = self.find_extra_keys() + if extra_keys and self.strict: 
+ raise ValueError("Extraneous param for class<{}>: {}".format( + self.name, ", ".join(extra_keys))) + mismatch_keys = self.find_mismatch_keys() + if mismatch_keys: + raise TypeError("Wrong param type for class<{}>: {}".format( + self.name, ", ".join(mismatch_keys))) + + +class SharedConfig(object): + """ + Representation class for `__shared__` annotations, which work as follows: + + - if `key` is set for the module in config file, its value will take + precedence + - if `key` is not set for the module but present in the config file, its + value will be used + - otherwise, use the provided `default_value` as fallback + + Args: + key: config[key] will be injected + default_value: fallback value + """ + + def __init__(self, key, default_value=None): + super(SharedConfig, self).__init__() + self.key = key + self.default_value = default_value + + +def extract_schema(cls): + """ + Extract schema from a given class + + Args: + cls (type): Class from which to extract. + + Returns: + schema (SchemaDict): Extracted schema. 
+ """ + ctor = cls.__init__ + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(ctor) + annotations = argspec.annotations + has_kwargs = argspec.varkw is not None + else: + argspec = inspect.getargspec(ctor) + # python 2 type hinting workaround, see pep-3107 + # however, since `typeguard` does not support python 2, type checking + # is still python 3 only for now + annotations = getattr(ctor, '__annotations__', {}) + has_kwargs = argspec.keywords is not None + + names = [arg for arg in argspec.args if arg != 'self'] + defaults = argspec.defaults + num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 + num_required = len(names) - num_defaults + + docs = cls.__doc__ + if docs is None and getattr(cls, '__category__', None) == 'op': + docs = cls.__call__.__doc__ + docstring = doc_parse(docs) + if docstring is None: + comments = {} + else: + comments = {} + for p in docstring.params: + match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) + if match_obj is not None: + comments[match_obj.group(1)] = p.description + + schema = SchemaDict() + schema.name = cls.__name__ + schema.doc = "" + if docs is not None: + start_pos = docs[0] == '\n' and 1 or 0 + schema.doc = docs[start_pos:].split("\n")[0].strip() + # XXX handle paddle's weird doc convention + if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: + schema.doc = schema.doc[2:-2].strip() + schema.category = hasattr(cls, '__category__') and getattr( + cls, '__category__') or 'module' + schema.strict = not has_kwargs + schema.pymodule = importlib.import_module(cls.__module__) + schema.inject = getattr(cls, '__inject__', []) + schema.shared = getattr(cls, '__shared__', []) + for idx, name in enumerate(names): + comment = name in comments and comments[name] or name + if name in schema.inject: + type_ = None + else: + type_ = name in annotations and annotations[name] or None + value_schema = SchemaValue(name, comment, type_) + if name in 
schema.shared: + assert idx >= num_required, "shared config must have default value" + default = defaults[idx - num_required] + value_schema.set_default(SharedConfig(name, default)) + elif idx >= num_required: + default = defaults[idx - num_required] + value_schema.set_default(default) + schema.set_schema(name, value_schema) + + return schema diff --git a/ppdet/core/config/yaml_helpers.py b/ppdet/core/config/yaml_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..8a7738b47f4f86acde78ab8a3bcac590d61615fa --- /dev/null +++ b/ppdet/core/config/yaml_helpers.py @@ -0,0 +1,109 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import inspect + +import yaml +from .schema import SharedConfig + +__all__ = ['serializable', 'Callable'] + + +def _make_python_constructor(cls): + def python_constructor(loader, node): + if isinstance(node, yaml.SequenceNode): + args = loader.construct_sequence(node, deep=True) + return cls(*args) + else: + kwargs = loader.construct_mapping(node, deep=True) + try: + return cls(**kwargs) + except Exception as ex: + print("Error when construct {} instance from yaml config". 
import importlib
import inspect

import yaml
from .schema import SharedConfig

__all__ = ['serializable', 'Callable']


def _make_python_constructor(cls):
    """
    Build a YAML constructor that instantiates `cls` from a tagged node.

    A sequence node is passed as positional arguments, any other node is
    treated as a mapping and passed as keyword arguments.
    """

    def python_constructor(loader, node):
        if isinstance(node, yaml.SequenceNode):
            args = loader.construct_sequence(node, deep=True)
            return cls(*args)
        else:
            kwargs = loader.construct_mapping(node, deep=True)
            try:
                return cls(**kwargs)
            except Exception:
                print("Error when construct {} instance from yaml config".
                      format(cls.__name__))
                # bare raise keeps the original traceback on python 2 too
                raise

    return python_constructor


def _make_python_representer(cls):
    """
    Build a YAML representer that dumps instances of `cls` as `!ClassName`.

    The constructor argument names select which attributes are dumped; a
    class without named constructor arguments is dumped from its `__dict__`.
    """
    # python 2 compatibility
    if hasattr(inspect, 'getfullargspec'):
        argspec = inspect.getfullargspec(cls)
    else:
        argspec = inspect.getargspec(cls.__init__)
    argnames = [arg for arg in argspec.args if arg != 'self']

    def python_representer(dumper, obj):
        if argnames:
            data = {name: getattr(obj, name) for name in argnames}
        else:
            # copy, so that dropping `_id` below does not modify `obj`
            data = dict(obj.__dict__)
        data.pop('_id', None)
        return dumper.represent_mapping(u'!{}'.format(cls.__name__), data)

    return python_representer


def serializable(cls):
    """
    Add loader and dumper for given class, which must be
    "trivially serializable"

    Args:
        cls: class to be serialized

    Returns: cls
    """
    yaml.add_constructor(u'!{}'.format(cls.__name__),
                         _make_python_constructor(cls))
    yaml.add_representer(cls, _make_python_representer(cls))
    return cls


# a shared config value is dumped as its fallback value
yaml.add_representer(SharedConfig,
                     lambda d, o: d.represent_data(o.default_value))


@serializable
class Callable(object):
    """
    Helper to be used in Yaml for creating arbitrary class objects

    Args:
        full_type (str): the full module path to target function
        args (list): positional arguments for the target, defaults to none
        kwargs (dict): keyword arguments for the target, defaults to none
    """

    def __init__(self, full_type, args=None, kwargs=None):
        super(Callable, self).__init__()
        self.full_type = full_type
        # fresh containers per instance; a literal `[]` / `{}` default would
        # be shared by every instance created without these arguments
        self.args = [] if args is None else args
        self.kwargs = {} if kwargs is None else kwargs

    def __call__(self):
        """Resolve `full_type` and call it with the stored arguments."""
        if '.' in self.full_type:
            idx = self.full_type.rfind('.')
            module = importlib.import_module(self.full_type[:idx])
            func_name = self.full_type[idx + 1:]
        else:
            # undotted names are looked up among the builtins
            try:
                module = importlib.import_module('builtins')
            except ImportError:
                # python 2 name of the same module
                module = importlib.import_module('__builtin__')
            func_name = self.full_type

        func = getattr(module, func_name)
        return func(*self.args, **self.kwargs)
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import importlib +import os +import sys + +import yaml +import copy + +from .config.schema import SchemaDict, SharedConfig, extract_schema +from .config.yaml_helpers import serializable + +__all__ = [ + 'global_config', + 'load_config', + 'merge_config', + 'get_registered_modules', + 'create', + 'register', + 'serializable', + 'dump_value', +] + + +def dump_value(value): + # XXX this is hackish, but collections.abc is not available in python 2 + if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): + value = yaml.dump(value, default_flow_style=True) + value = value.replace('\n', '') + value = value.replace('...', '') + return "'{}'".format(value) + else: + # primitive types + return str(value) + + +class AttrDict(dict): + """Single level attribute dict, NOT recursive""" + + def __init__(self, **kwargs): + super(AttrDict, self).__init__() + super(AttrDict, self).update(kwargs) + + def __getattr__(self, key): + if key in self: + return self[key] + raise AttributeError("object has no attribute '{}'".format(key)) + + +global_config = AttrDict() + + +def load_config(file_path): + """ + Load config from file. + + Args: + file_path (str): Path of the config file to be loaded. + + Returns: global config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files for now" + with open(file_path) as f: + merge_config(yaml.load(f, Loader=yaml.Loader)) + return global_config + + +def merge_config(config): + """ + Merge config into global config. + + Args: + config (dict): Config to be merged. 
+ + Returns: global config + """ + for key, value in config.items(): + if isinstance(value, dict) and key in global_config: + global_config[key].update(value) + else: + global_config[key] = value + + +def get_registered_modules(): + return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)} + + +def make_partial(cls): + op_module = importlib.import_module(cls.__op__.__module__) + op = getattr(op_module, cls.__op__.__name__) + cls.__category__ = getattr(cls, '__category__', None) or 'op' + + def partial_apply(self, *args, **kwargs): + kwargs_ = self.__dict__.copy() + kwargs_.update(kwargs) + return op(*args, **kwargs_) + + if getattr(cls, '__append_doc__', True): # XXX should default to True? + if sys.version_info[0] > 2: + cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__) + cls.__init__.__doc__ = op.__doc__ + cls.__call__ = partial_apply + cls.__call__.__doc__ = op.__doc__ + else: + # XXX work around for python 2 + partial_apply.__doc__ = op.__doc__ + cls.__call__ = partial_apply + return cls + + +def register(cls): + """ + Register a given module class. + + Args: + cls (type): Module class to be registered. + + Returns: cls + """ + if cls.__name__ in global_config: + raise ValueError("Module class already registered: {}".format( + cls.__name__)) + if hasattr(cls, '__op__'): + cls = make_partial(cls) + global_config[cls.__name__] = extract_schema(cls) + return cls + + +def create(cls_or_name, **kwargs): + """ + Create an instance of given module class. + + Args: + cls_or_name (type or str): Class of which to create instance. 
+ + Returns: instance of type `cls_or_name` + """ + assert type(cls_or_name) in [type, str + ], "should be a class or name of a class" + name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ + assert name in global_config and \ + isinstance(global_config[name], SchemaDict), \ + "the module {} is not registered".format(name) + config = global_config[name] + config.update(kwargs) + config.validate() + cls = getattr(config.pymodule, name) + + kwargs = {} + kwargs.update(global_config[name]) + + # parse `shared` annoation of registered modules + if getattr(config, 'shared', None): + for k in config.shared: + target_key = config[k] + shared_conf = config.schema[k].default + assert isinstance(shared_conf, SharedConfig) + if target_key is not None and not isinstance(target_key, + SharedConfig): + continue # value is given for the module + elif shared_conf.key in global_config: + # `key` is present in config + kwargs[k] = global_config[shared_conf.key] + else: + kwargs[k] = shared_conf.default_value + + # parse `inject` annoation of registered modules + if getattr(config, 'inject', None): + for k in config.inject: + target_key = config[k] + # optional dependency + if target_key is None: + continue + # also accept dictionaries and serialized objects + if isinstance(target_key, dict) or hasattr(target_key, '__dict__'): + continue + elif isinstance(target_key, str): + if target_key not in global_config: + raise ValueError("Missing injection config:", target_key) + target = global_config[target_key] + if isinstance(target, SchemaDict): + kwargs[k] = create(target_key) + elif hasattr(target, '__dict__'): # serialized object + kwargs[k] = target + else: + raise ValueError("Unsupported injection type:", target_key) + # prevent modification of global config values of reference types + # (e.g., list, dict) from within the created module instances + kwargs = copy.deepcopy(kwargs) + return cls(**kwargs) diff --git a/ppdet/data/README.md b/ppdet/data/README.md new 
file mode 120000 index 0000000000000000000000000000000000000000..238fc99bf487f0505c27541ecaa9a64b0bcd62f7 --- /dev/null +++ b/ppdet/data/README.md @@ -0,0 +1 @@ +docs/DATA.md \ No newline at end of file diff --git a/ppdet/data/README_cn.md b/ppdet/data/README_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..c8e59f3054954c6abe6732b01998a87d6d3074c4 --- /dev/null +++ b/ppdet/data/README_cn.md @@ -0,0 +1 @@ +docs/DATA_cn.md \ No newline at end of file diff --git a/ppdet/data/__init__.py b/ppdet/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1104c33f6ac34b8ec32681f5c4a7fc4d89274bfb --- /dev/null +++ b/ppdet/data/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# function: +# module to prepare data for detection model training +# +# implementation notes: +# - Dataset +# basic interface for accessing data samples in stream mode +# +# - xxxSource (RoiDbSource) +# * subclass of 'Dataset' +# * load data from local files and other source data +# +# - xxxOperator (DecodeImage) +# * subclass of 'BaseOperator' +# * each op can transform a sample, eg: decode/resize/crop image +# * each op must obey basic rules defined in transform.operator.base +# +# - transformer +# * subclass of 'Dataset' +# * 'MappedDataset' accepts a 'xxxSource' and a list of 'xxxOperator' +# to build a transformed 'Dataset' + +from __future__ import absolute_import + +from .dataset import Dataset +from .reader import Reader +import traceback +if traceback.extract_stack()[0][ + 0] == 'ppdet/data/tools/generate_data_for_training.py': + __all__ = ['Dataset', 'Reader'] +else: + from .data_feed import create_reader + __all__ = ['Dataset', 'Reader', 'create_reader'] diff --git a/ppdet/data/data_feed.py b/ppdet/data/data_feed.py new file mode 100644 index 0000000000000000000000000000000000000000..b70f4be2067cd789a52b553eb7d8beb96bc4be94 --- /dev/null +++ b/ppdet/data/data_feed.py @@ -0,0 +1,1068 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import os
import inspect

from ppdet.core.workspace import register, serializable
from ppdet.utils.download import get_dataset_path

from ppdet.data.reader import Reader
# XXX these are for triggering the decorator
from ppdet.data.transform.operators import (
    DecodeImage, MixupImage, NormalizeBox, NormalizeImage, RandomDistort,
    RandomFlipImage, RandomInterpImage, ResizeImage, ExpandImage, CropImage,
    Permute, MultiscaleTestResize)
from ppdet.data.transform.arrange_sample import (
    ArrangeRCNN, ArrangeEvalRCNN, ArrangeTestRCNN, ArrangeSSD, ArrangeEvalSSD,
    ArrangeTestSSD, ArrangeYOLO, ArrangeEvalYOLO, ArrangeTestYOLO)

__all__ = [
    'PadBatch', 'MultiScale', 'RandomShape', 'PadMSTest', 'DataSet',
    'CocoDataSet', 'DataFeed', 'TrainFeed', 'EvalFeed', 'FasterRCNNTrainFeed',
    'MaskRCNNTrainFeed', 'FasterRCNNEvalFeed', 'MaskRCNNEvalFeed',
    'FasterRCNNTestFeed', 'MaskRCNNTestFeed', 'SSDTrainFeed', 'SSDEvalFeed',
    'SSDTestFeed', 'YoloTrainFeed', 'YoloEvalFeed', 'YoloTestFeed',
    'create_reader'
]


def _prepare_data_config(feed, args_path):
    """
    Build the data source config dict for `feed`.

    When a dataset home is known (`args_path`, else `feed.dataset.dataset_dir`)
    the annotation and image paths of `feed.dataset` are rewritten in place to
    be located under the directory returned by `get_dataset_path`.

    Args:
        feed (DataFeed): feed whose dataset and sampling options are read.
        args_path (str): dataset home overriding `feed.dataset.dataset_dir`,
            ignored when empty or None.

    Returns: dict with the upper-case keys consumed by the data source
    """
    # if `DATASET_DIR` does not exists, search ~/.paddle/dataset for a directory
    # named `DATASET_DIR` (e.g., coco, pascal), if not present either, download
    dataset_home = args_path if args_path else feed.dataset.dataset_dir
    if dataset_home:
        annotation = getattr(feed.dataset, 'annotation', None)
        image_dir = getattr(feed.dataset, 'image_dir', None)
        dataset_dir = get_dataset_path(dataset_home, annotation, image_dir)
        if annotation:
            feed.dataset.annotation = os.path.join(dataset_dir, annotation)
        if image_dir:
            feed.dataset.image_dir = os.path.join(dataset_dir, image_dir)

    # -1 is used when the feed does not define a mixup epoch
    mixup_epoch = -1
    if getattr(feed, 'mixup_epoch', None) is not None:
        mixup_epoch = feed.mixup_epoch

    data_config = {
        'ANNO_FILE': feed.dataset.annotation,
        'IMAGE_DIR': feed.dataset.image_dir,
        'USE_DEFAULT_LABEL': feed.dataset.use_default_label,
        'IS_SHUFFLE': feed.shuffle,
        'SAMPLES': feed.samples,
        'WITH_BACKGROUND': feed.with_background,
        'MIXUP_EPOCH': mixup_epoch,
        'TYPE': type(feed.dataset).__source__
    }

    if feed.mode == 'TRAIN':
        data_config['CLASS_AWARE_SAMPLING'] = getattr(
            feed, 'class_aware_sampling', False)

    if len(getattr(feed.dataset, 'images', [])) > 0:
        data_config['IMAGES'] = feed.dataset.images

    return data_config


def create_reader(feed, max_iter=0, args_path=None, my_source=None):
    """
    Return iterable data reader.

    Args:
        feed (DataFeed): data feed describing dataset, transforms and
            batching; its `mode` attribute selects the reader mode.
        max_iter (int): number of iterations.
        args_path (str): dataset home overriding `feed.dataset.dataset_dir`.
        my_source (callable): callable function to create a source iterator
            which is used to provide source data in 'ppdet.data.reader'
    """

    # if `DATASET_DIR` does not exists, search ~/.paddle/dataset for a directory
    # named `DATASET_DIR` (e.g., coco, pascal), if not present either, download
    data_config = _prepare_data_config(feed, args_path)

    bufsize = getattr(feed, 'bufsize', 10)
    use_process = getattr(feed, 'use_process', False)
    memsize = getattr(feed, 'memsize', '3G')
    transform_config = {
        'WORKER_CONF': {
            'bufsize': bufsize,
            'worker_num': feed.num_workers,
            'use_process': use_process,
            'memsize': memsize
        },
        'BATCH_SIZE': feed.batch_size,
        'DROP_LAST': feed.drop_last,
        'USE_PADDED_IM_INFO': feed.use_padded_im_info,
    }

    # `DataFeed` defaults both transform lists to None; treat that as "no
    # transforms" instead of failing while iterating
    batch_transforms = feed.batch_transforms or []
    sample_transforms = feed.sample_transforms or []

    pad = [t for t in batch_transforms if isinstance(t, PadBatch)]
    rand_shape = [t for t in batch_transforms if isinstance(t, RandomShape)]
    multi_scale = [t for t in batch_transforms if isinstance(t, MultiScale)]
    pad_ms_test = [t for t in batch_transforms if isinstance(t, PadMSTest)]

    # only the first transform of each kind is honoured
    if any(pad):
        transform_config['IS_PADDING'] = True
        if pad[0].pad_to_stride != 0:
            transform_config['COARSEST_STRIDE'] = pad[0].pad_to_stride
    if any(rand_shape):
        transform_config['RANDOM_SHAPES'] = rand_shape[0].sizes
    if any(multi_scale):
        transform_config['MULTI_SCALES'] = multi_scale[0].scales
    if any(pad_ms_test):
        transform_config['ENABLE_MULTISCALE_TEST'] = True
        transform_config['NUM_SCALE'] = feed.num_scale
        transform_config['COARSEST_STRIDE'] = pad_ms_test[0].pad_to_stride

    if hasattr(inspect, 'getfullargspec'):
        argspec = inspect.getfullargspec
    else:
        argspec = inspect.getargspec

    # describe every sample op as {'op': class name, <init arg>: value}, keeping
    # only the instance attributes that are also `__init__` arguments
    ops = []
    for op in sample_transforms:
        argnames = [
            arg for arg in argspec(type(op).__init__).args if arg != 'self'
        ]
        op_dict = {k: v for k, v in op.__dict__.items() if k in argnames}
        op_dict['op'] = op.__class__.__name__
        ops.append(op_dict)
    transform_config['OPS'] = ops

    return Reader.create(feed.mode, data_config, transform_config, max_iter,
                         my_source)


# XXX batch transforms are only stubs for now, actually handled by `post_map`
@serializable
class PadBatch(object):
    """
    Pad a batch of samples to same dimensions

    Args:
        pad_to_stride (int): pad to multiple of strides, e.g., 32
    """

    def __init__(self, pad_to_stride=0):
        super(PadBatch, self).__init__()
        self.pad_to_stride = pad_to_stride


@serializable
class MultiScale(object):
    """
    Randomly resize image by scale

    Args:
        scales (list): list of int, randomly resize to one of these scales
    """

    def __init__(self, scales=[]):
        super(MultiScale, self).__init__()
        self.scales = scales


@serializable
class RandomShape(object):
    """
    Randomly reshape a batch

    Args:
        sizes (list): list of int, random choose a size from these
    """

    def __init__(self, sizes=[]):
        super(RandomShape, self).__init__()
        self.sizes = sizes


@serializable
class PadMSTest(object):
    """
    Padding for multi-scale test

    Args:
        pad_to_stride (int): pad to multiple of strides, e.g., 32
    """

    def __init__(self, pad_to_stride=0):
        super(PadMSTest, self).__init__()
        self.pad_to_stride = pad_to_stride


@serializable
class DataSet(object):
    """
    Dataset, e.g., coco, pascal voc

    Args:
        annotation (str): annotation file path
        image_dir (str): directory where image files are stored
        dataset_dir (str): dataset home directory
        use_default_label (bool): forwarded to the data source as
            `USE_DEFAULT_LABEL`
    """
    # name of the data source type, read by `_prepare_data_config`
    __source__ = 'RoiDbSource'

    def __init__(self,
                 annotation,
                 image_dir,
                 dataset_dir=None,
                 use_default_label=None):
        super(DataSet, self).__init__()
        self.dataset_dir = dataset_dir
        self.annotation = annotation
        self.image_dir = image_dir
        self.use_default_label = use_default_label


COCO_DATASET_DIR = 'coco'
COCO_TRAIN_ANNOTATION = 'annotations/instances_train2017.json'
COCO_TRAIN_IMAGE_DIR = 'train2017'
COCO_VAL_ANNOTATION = 'annotations/instances_val2017.json'
COCO_VAL_IMAGE_DIR = 'val2017'


@serializable
class CocoDataSet(DataSet):
    # NOTE: the first positional argument is `dataset_dir`, not `annotation`;
    # pass `annotation` / `image_dir` by keyword
    def __init__(self,
                 dataset_dir=COCO_DATASET_DIR,
                 annotation=COCO_TRAIN_ANNOTATION,
                 image_dir=COCO_TRAIN_IMAGE_DIR):
        super(CocoDataSet, self).__init__(
            dataset_dir=dataset_dir, annotation=annotation, image_dir=image_dir)


VOC_DATASET_DIR = 'pascalvoc'
VOC_TRAIN_ANNOTATION = 'VOCdevkit/VOC_all/ImageSets/Main/train.txt'
VOC_VAL_ANNOTATION = 'VOCdevkit/VOC_all/ImageSets/Main/val.txt'
VOC_TEST_ANNOTATION = 'VOCdevkit/VOC_all/ImageSets/Main/test.txt'
VOC_IMAGE_DIR = 'VOCdevkit/VOC_all/JPEGImages'
VOC_USE_DEFAULT_LABEL = None


@serializable
class VocDataSet(DataSet):
    __source__ = 'VOCSource'

    def __init__(self,
                 dataset_dir=VOC_DATASET_DIR,
                 annotation=VOC_TRAIN_ANNOTATION,
                 image_dir=VOC_IMAGE_DIR,
                 use_default_label=VOC_USE_DEFAULT_LABEL):
        super(VocDataSet, self).__init__(
            dataset_dir=dataset_dir,
            annotation=annotation,
            image_dir=image_dir,
            use_default_label=use_default_label)


@serializable
class SimpleDataSet(DataSet):
    __source__ = 'SimpleSource'

    def __init__(self,
                 dataset_dir=None,
                 annotation=None,
                 image_dir=None,
                 use_default_label=None):
        # forward `use_default_label` too, it used to be silently dropped
        super(SimpleDataSet, self).__init__(
            dataset_dir=dataset_dir,
            annotation=annotation,
            image_dir=image_dir,
            use_default_label=use_default_label)
        # NOTE(review): `images` is not an `__init__` argument, so
        # `SimpleDataSet(**instance.__dict__)` raises TypeError
        self.images = []

    def add_images(self, images):
        """Append the given images to `self.images`."""
        self.images.extend(images)


@serializable
class DataFeed(object):
    """
    DataFeed encompasses all data loading related settings

    Args:
        dataset (object): a `Dataset` instance
        fields (list): list of data fields needed
        image_shape (list): list of image dims (C, MAX_DIM, MIN_DIM)
        sample_transforms (list): list of sample transformations to use
        batch_transforms (list): list of batch transformations to use
        batch_size (int): number of images per device
        shuffle (bool): if samples should be shuffled
        drop_last (bool): drop last batch if size is uneven
        num_workers (int): number of workers processes (or threads)
        bufsize (int): size of queue used to buffer results from workers
        use_process (bool): use process or thread as workers
        memsize (str): size of shared memory used in result queue
                        when 'use_process' is True, default to '3G'
    """
    __category__ = 'data'

    def __init__(self,
                 dataset,
                 fields,
                 image_shape,
                 sample_transforms=None,
                 batch_transforms=None,
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 with_background=True,
                 num_workers=2,
                 bufsize=10,
                 use_process=False,
                 memsize=None,
                 use_padded_im_info=False,
                 class_aware_sampling=False):
        super(DataFeed, self).__init__()
        self.fields = fields
        self.image_shape = image_shape
        self.sample_transforms = sample_transforms
        self.batch_transforms = batch_transforms
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.samples = samples
        self.drop_last = drop_last
        self.with_background = with_background
        self.num_workers = num_workers
        self.bufsize = bufsize
        self.use_process = use_process
        self.memsize = memsize
        self.dataset = dataset
        self.use_padded_im_info = use_padded_im_info
        self.class_aware_sampling = class_aware_sampling
        # a dict (e.g. the `__dict__` of a dataset instance) is turned into a
        # plain `DataSet`
        if isinstance(dataset, dict):
            self.dataset = DataSet(**dataset)


# for custom (i.e., Non-preset) datasets
@register
class TrainFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset,
                 fields,
                 image_shape,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=True,
                 samples=-1,
                 drop_last=False,
                 with_background=True,
                 num_workers=2,
                 bufsize=10,
                 use_process=True,
                 memsize=None):
        super(TrainFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            with_background=with_background,
            num_workers=num_workers,
            bufsize=bufsize,
            use_process=use_process,
            memsize=memsize)
        # `create_reader` reads `feed.mode`; same value as the preset feeds
        self.mode = 'TRAIN'


@register
class EvalFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset,
                 fields,
                 image_shape,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 with_background=True,
                 num_workers=2):
        super(EvalFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            with_background=with_background,
            num_workers=num_workers)
        # `create_reader` reads `feed.mode`; same value as the preset feeds
        self.mode = 'VAL'


@register
class TestFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset,
                 fields,
                 image_shape,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 drop_last=False,
                 with_background=True,
                 num_workers=2):
        super(TestFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            with_background=with_background,
            num_workers=num_workers)
        # `create_reader` reads `feed.mode`; same value as the preset feeds
        self.mode = 'TEST'


# yapf: disable
@register
class FasterRCNNTrainFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset=CocoDataSet().__dict__,
                 fields=[
                     'image', 'im_info', 'im_id', 'gt_box', 'gt_label',
                     'is_crowd'
                 ],
                 image_shape=[3, 800, 1333],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     RandomFlipImage(prob=0.5),
                     NormalizeImage(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225],
                                    is_scale=True,
                                    is_channel_first=False),
                     ResizeImage(target_size=800, max_size=1333, interp=1),
                     Permute(to_bgr=False)
                 ],
                 batch_transforms=[PadBatch()],
                 batch_size=1,
                 shuffle=True,
                 samples=-1,
                 drop_last=False,
                 bufsize=10,
                 num_workers=2,
                 use_process=False,
                 memsize=None,
                 class_aware_sampling=False):
        # XXX this should be handled by the data loader, since `fields` is
        # given, just collect them
        # work on a copy: the default list is created once, appending to it in
        # place would add another arrange op on every instantiation
        sample_transforms = list(sample_transforms) + [ArrangeRCNN()]
        super(FasterRCNNTrainFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            bufsize=bufsize,
            num_workers=num_workers,
            use_process=use_process,
            memsize=memsize,
            class_aware_sampling=class_aware_sampling)
        # XXX these modes should be unified
        self.mode = 'TRAIN'


@register
class FasterRCNNEvalFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keywords are required here: the first positional argument
                 # of `CocoDataSet` is `dataset_dir`
                 dataset=CocoDataSet(annotation=COCO_VAL_ANNOTATION,
                                     image_dir=COCO_VAL_IMAGE_DIR).__dict__,
                 fields=['image', 'im_info', 'im_id', 'im_shape', 'gt_box',
                         'gt_label', 'is_difficult'],
                 image_shape=[3, 800, 1333],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     NormalizeImage(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225],
                                    is_scale=True,
                                    is_channel_first=False),
                     ResizeImage(target_size=800, max_size=1333, interp=1),
                     Permute(to_bgr=False)
                 ],
                 batch_transforms=[PadBatch()],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 num_workers=2,
                 use_padded_im_info=True,
                 enable_multiscale=False,
                 num_scale=1,
                 enable_aug_flip=False):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeEvalRCNN()]
        super(FasterRCNNEvalFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            use_padded_im_info=use_padded_im_info)
        self.mode = 'VAL'
        self.enable_multiscale = enable_multiscale
        self.num_scale = num_scale
        self.enable_aug_flip = enable_aug_flip


@register
class FasterRCNNTestFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keywords are required here: the first positional argument
                 # of `SimpleDataSet` is `dataset_dir`
                 dataset=SimpleDataSet(annotation=COCO_VAL_ANNOTATION,
                                       image_dir=COCO_VAL_IMAGE_DIR).__dict__,
                 fields=['image', 'im_info', 'im_id', 'im_shape'],
                 image_shape=[3, 800, 1333],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     NormalizeImage(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225],
                                    is_scale=True,
                                    is_channel_first=False),
                     Permute(to_bgr=False)
                 ],
                 batch_transforms=[PadBatch()],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 num_workers=2,
                 use_padded_im_info=True):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeTestRCNN()]
        if isinstance(dataset, dict):
            dataset = SimpleDataSet(**dataset)
        super(FasterRCNNTestFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            use_padded_im_info=use_padded_im_info)
        self.mode = 'TEST'


# XXX currently use two presets, in the future, these should be combined into a
# single `RCNNTrainFeed`.
# Mask (and keypoint) should be processed
# automatically if `gt_mask` (or `gt_keypoints`) is in the required fields
@register
class MaskRCNNTrainFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset=CocoDataSet().__dict__,
                 fields=[
                     'image', 'im_info', 'im_id', 'gt_box', 'gt_label',
                     'is_crowd', 'gt_mask'
                 ],
                 image_shape=[3, 800, 1333],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     RandomFlipImage(prob=0.5, is_mask_flip=True),
                     NormalizeImage(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225],
                                    is_scale=True,
                                    is_channel_first=False),
                     ResizeImage(target_size=800,
                                 max_size=1333,
                                 interp=1,
                                 use_cv2=True),
                     Permute(to_bgr=False, channel_first=True)
                 ],
                 batch_transforms=[PadBatch()],
                 batch_size=1,
                 shuffle=True,
                 samples=-1,
                 drop_last=False,
                 num_workers=2,
                 use_process=False,
                 use_padded_im_info=False):
        # work on a copy: the default list is created once, appending to it in
        # place would add another arrange op on every instantiation
        sample_transforms = list(sample_transforms) + [
            ArrangeRCNN(is_mask=True)
        ]
        super(MaskRCNNTrainFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            use_process=use_process,
            # was accepted but never forwarded
            use_padded_im_info=use_padded_im_info)
        self.mode = 'TRAIN'


@register
class MaskRCNNEvalFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keywords are required here: the first positional argument
                 # of `CocoDataSet` is `dataset_dir`
                 dataset=CocoDataSet(annotation=COCO_VAL_ANNOTATION,
                                     image_dir=COCO_VAL_IMAGE_DIR).__dict__,
                 fields=['image', 'im_info', 'im_id', 'im_shape'],
                 image_shape=[3, 800, 1333],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     NormalizeImage(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225],
                                    is_scale=True,
                                    is_channel_first=False),
                     ResizeImage(target_size=800,
                                 max_size=1333,
                                 interp=1,
                                 use_cv2=True),
                     Permute(to_bgr=False, channel_first=True)
                 ],
                 batch_transforms=[PadBatch()],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 num_workers=2,
                 use_process=False,
                 use_padded_im_info=True,
                 enable_multiscale=False,
                 num_scale=1,
                 enable_aug_flip=False):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeTestRCNN()]
        super(MaskRCNNEvalFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            use_process=use_process,
            use_padded_im_info=use_padded_im_info)
        self.mode = 'VAL'
        self.enable_multiscale = enable_multiscale
        self.num_scale = num_scale
        self.enable_aug_flip = enable_aug_flip


@register
class MaskRCNNTestFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keywords are required here: the first positional argument
                 # of `SimpleDataSet` is `dataset_dir`
                 dataset=SimpleDataSet(annotation=COCO_VAL_ANNOTATION,
                                       image_dir=COCO_VAL_IMAGE_DIR).__dict__,
                 fields=['image', 'im_info', 'im_id', 'im_shape'],
                 image_shape=[3, 800, 1333],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     NormalizeImage(
                         mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225],
                         is_scale=True,
                         is_channel_first=False),
                     Permute(to_bgr=False, channel_first=True)
                 ],
                 batch_transforms=[PadBatch()],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 num_workers=2,
                 use_process=False,
                 use_padded_im_info=True):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeTestRCNN()]
        if isinstance(dataset, dict):
            dataset = SimpleDataSet(**dataset)
        super(MaskRCNNTestFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            use_process=use_process,
            use_padded_im_info=use_padded_im_info)
        self.mode = 'TEST'


@register
class SSDTrainFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset=VocDataSet().__dict__,
                 fields=['image', 'gt_box', 'gt_label'],
                 image_shape=[3, 300, 300],
                 sample_transforms=[
                     DecodeImage(to_rgb=True, with_mixup=False),
                     NormalizeBox(),
                     RandomDistort(brightness_lower=0.875,
                                   brightness_upper=1.125,
                                   is_order=True),
                     ExpandImage(max_ratio=4, prob=0.5),
                     CropImage(batch_sampler=[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]],
                               satisfy_all=False, avoid_no_bbox=False),
                     ResizeImage(target_size=300, use_cv2=False, interp=1),
                     RandomFlipImage(is_normalized=True),
                     Permute(),
                     NormalizeImage(mean=[127.5, 127.5, 127.5],
                                    std=[127.502231, 127.502231, 127.502231],
                                    is_scale=False)
                 ],
                 batch_transforms=[],
                 batch_size=32,
                 shuffle=True,
                 samples=-1,
                 drop_last=True,
                 num_workers=8,
                 bufsize=10,
                 use_process=True,
                 memsize=None):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeSSD()]
        super(SSDTrainFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            bufsize=bufsize,
            use_process=use_process,
            # forward the caller's value (a hard-coded None was passed before)
            memsize=memsize)
        self.mode = 'TRAIN'


@register
class SSDEvalFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(
            self,
            # keyword is required here: the first positional argument of
            # `VocDataSet` is `dataset_dir`
            dataset=VocDataSet(annotation=VOC_VAL_ANNOTATION).__dict__,
            fields=['image', 'im_shape', 'im_id', 'gt_box',
                         'gt_label', 'is_difficult'],
            image_shape=[3, 300, 300],
            sample_transforms=[
                DecodeImage(to_rgb=True, with_mixup=False),
                NormalizeBox(),
                ResizeImage(target_size=300, use_cv2=False, interp=1),
                Permute(),
                NormalizeImage(
                    mean=[127.5, 127.5, 127.5],
                    std=[127.502231, 127.502231, 127.502231],
                    is_scale=False)
            ],
            batch_transforms=[],
            batch_size=64,
            shuffle=False,
            samples=-1,
            drop_last=True,
            num_workers=8,
            bufsize=10,
            use_process=False,
            memsize=None):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeEvalSSD(fields)]
        super(SSDEvalFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            bufsize=bufsize,
            use_process=use_process,
            memsize=memsize)
        self.mode = 'VAL'


@register
class SSDTestFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keyword is required here: the first positional argument of
                 # `SimpleDataSet` is `dataset_dir`
                 dataset=SimpleDataSet(
                     annotation=VOC_TEST_ANNOTATION).__dict__,
                 fields=['image', 'im_id', 'im_shape'],
                 image_shape=[3, 300, 300],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     ResizeImage(target_size=300, use_cv2=False, interp=1),
                     Permute(),
                     NormalizeImage(
                         mean=[127.5, 127.5, 127.5],
                         std=[127.502231, 127.502231, 127.502231],
                         is_scale=False)
                 ],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 num_workers=8,
                 bufsize=10,
                 use_process=False,
                 memsize=None):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeTestSSD()]
        if isinstance(dataset, dict):
            dataset = SimpleDataSet(**dataset)
        super(SSDTestFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            num_workers=num_workers,
            bufsize=bufsize,
            use_process=use_process,
            memsize=memsize)
        self.mode = 'TEST'


@register
class YoloTrainFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 dataset=CocoDataSet().__dict__,
                 fields=['image', 'gt_box', 'gt_label', 'gt_score'],
                 image_shape=[3, 608, 608],
                 sample_transforms=[
                     DecodeImage(to_rgb=True, with_mixup=True),
                     MixupImage(alpha=1.5, beta=1.5),
                     NormalizeBox(),
                     RandomDistort(),
                     ExpandImage(max_ratio=4., prob=.5,
                                 mean=[123.675, 116.28, 103.53]),
                     CropImage([[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
                                [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]),
                     RandomInterpImage(target_size=608),
                     RandomFlipImage(is_normalized=True),
                     NormalizeImage(
                         mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225],
                         is_scale=True,
                         is_channel_first=False),
                     Permute(to_bgr=False),
                 ],
                 batch_transforms=[
                     RandomShape(sizes=[
                         320, 352, 384, 416, 448, 480, 512, 544, 576, 608
                     ])
                 ],
                 batch_size=8,
                 shuffle=True,
                 samples=-1,
                 drop_last=True,
                 with_background=False,
                 num_workers=8,
                 bufsize=128,
                 use_process=True,
                 memsize=None,
                 num_max_boxes=50,
                 mixup_epoch=250,
                 class_aware_sampling=False):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeYOLO()]
        super(YoloTrainFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            with_background=with_background,
            num_workers=num_workers,
            bufsize=bufsize,
            use_process=use_process,
            memsize=memsize,
            class_aware_sampling=class_aware_sampling)
        self.num_max_boxes = num_max_boxes
        self.mixup_epoch = mixup_epoch
        self.mode = 'TRAIN'


@register
class YoloEvalFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keywords are required here: the first positional argument
                 # of `CocoDataSet` is `dataset_dir`
                 dataset=CocoDataSet(annotation=COCO_VAL_ANNOTATION,
                                     image_dir=COCO_VAL_IMAGE_DIR).__dict__,
                 fields=['image', 'im_size', 'im_id', 'gt_box',
                         'gt_label', 'is_difficult'],
                 image_shape=[3, 608, 608],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     ResizeImage(target_size=608, interp=2),
                     NormalizeImage(
                         mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225],
                         is_scale=True,
                         is_channel_first=False),
                     Permute(to_bgr=False),
                 ],
                 batch_transforms=[],
                 batch_size=8,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 with_background=False,
                 num_workers=8,
                 num_max_boxes=50,
                 use_process=False,
                 memsize=None):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeEvalYOLO()]
        super(YoloEvalFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            with_background=with_background,
            num_workers=num_workers,
            use_process=use_process,
            memsize=memsize)
        self.num_max_boxes = num_max_boxes
        self.mode = 'VAL'
        self.bufsize = 128

        # support image shape config, resize image with image_shape
        # (`sample_transforms` is the very list stored on the feed, so the
        # replacement below is visible through `self.sample_transforms`)
        for i, trans in enumerate(sample_transforms):
            if isinstance(trans, ResizeImage):
                sample_transforms[i] = ResizeImage(
                    target_size=self.image_shape[-1],
                    interp=trans.interp)


@register
class YoloTestFeed(DataFeed):
    __doc__ = DataFeed.__doc__

    def __init__(self,
                 # keywords are required here: the first positional argument
                 # of `SimpleDataSet` is `dataset_dir`
                 dataset=SimpleDataSet(annotation=COCO_VAL_ANNOTATION,
                                       image_dir=COCO_VAL_IMAGE_DIR).__dict__,
                 fields=['image', 'im_size', 'im_id'],
                 image_shape=[3, 608, 608],
                 sample_transforms=[
                     DecodeImage(to_rgb=True),
                     ResizeImage(target_size=608, interp=2),
                     NormalizeImage(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225],
                                    is_scale=True,
                                    is_channel_first=False),
                     Permute(to_bgr=False),
                 ],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 samples=-1,
                 drop_last=False,
                 with_background=False,
                 num_workers=8,
                 num_max_boxes=50,
                 use_process=False,
                 memsize=None):
        # copy instead of appending to the (shared) default list
        sample_transforms = list(sample_transforms) + [ArrangeTestYOLO()]
        if isinstance(dataset, dict):
            dataset = SimpleDataSet(**dataset)
        super(YoloTestFeed, self).__init__(
            dataset,
            fields,
            image_shape,
            sample_transforms,
            batch_transforms,
            batch_size=batch_size,
            shuffle=shuffle,
            samples=samples,
            drop_last=drop_last,
            with_background=with_background,
            num_workers=num_workers,
            use_process=use_process,
            memsize=memsize)
        self.mode = 'TEST'
        self.bufsize = 128

        # support image shape config, resize image with image_shape
        # (`sample_transforms` is the very list stored on the feed, so the
        # replacement below is visible through `self.sample_transforms`)
        for i, trans in enumerate(sample_transforms):
            if isinstance(trans, ResizeImage):
                sample_transforms[i] = ResizeImage(
                    target_size=self.image_shape[-1],
                    interp=trans.interp)
# yapf: enable
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# function:
#   interface for accessing data samples in stream

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


class Dataset(object):
    """interface to access a stream of data samples

    Subclasses implement `next`, `reset`, `size`, `drained` and `epoch_id`;
    the base implementations raise NotImplementedError. Instances are their
    own iterators: `__next__` delegates to `next`.
    """

    def __init__(self):
        self._epoch = -1

    def __next__(self):
        return self.next()

    def __iter__(self):
        return self

    def __str__(self):
        # `_fname` and `_pos` are not set by this base class and `size` is
        # abstract; fall back to placeholders so that `str()` never raises
        try:
            size = self.size()
        except NotImplementedError:
            size = -1
        return "{}(fname:{}, epoch:{:d}, size:{:d}, pos:{:d})".format(
            type(self).__name__,
            getattr(self, '_fname', None), self._epoch, size,
            getattr(self, '_pos', -1))

    def next(self):
        """get next sample"""
        raise NotImplementedError('%s.next not available' %
                                  (self.__class__.__name__))

    def reset(self):
        """reset to initial status and begins a new epoch"""
        raise NotImplementedError('%s.reset not available' %
                                  (self.__class__.__name__))

    def size(self):
        """get number of samples in this dataset"""
        raise NotImplementedError('%s.size not available' %
                                  (self.__class__.__name__))

    def drained(self):
        """whether all samples have been read out for this epoch"""
        raise NotImplementedError('%s.drained not available' %
                                  (self.__class__.__name__))

    def epoch_id(self):
        """return epoch id for latest sample"""
        raise NotImplementedError('%s.epoch_id not available' %
                                  (self.__class__.__name__))
class Reader(object):
    """Interface to make readers for training or evaluation.

    Args:
        data_cf (dict): maps a mode name ('TRAIN', 'VAL', 'TEST') to the
            data-source config handed to ``build_source``
        trans_conf (dict): maps a mode name to its transform config; the
            keys read here are 'OPS', 'BATCH_SIZE', and optionally
            'DROP_LAST', 'WORKER_CONF' and the batch-level options listed
            in ``_make_reader``
        maxiter (int): number of batches a reader yields before stopping;
            a value <= 0 means a single pass over the data
    """

    def __init__(self, data_cf, trans_conf, maxiter=-1):
        assert isinstance(maxiter, Integral), "maxiter should be int"
        self._data_cf = data_cf
        self._trans_conf = trans_conf
        self._maxiter = maxiter
        # Filled from the source built in 'train' mode and passed on to the
        # sources built afterwards.
        self._cname2cid = None

    def _make_reader(self, mode, my_source=None):
        """Build reader for training or validation.

        Args:
            mode (str): key into the data and transform configs
            my_source: an already-built data source; when None the source
                is built from ``data_cf[mode]``

        Returns:
            a generator function yielding batches; it carries an
            ``imid2path`` attribute when the source provides
            ``get_imid2path``
        """
        # 1, Build data source
        if my_source is None:
            file_conf = self._data_cf[mode]
            sc_conf = {'data_cf': file_conf, 'cname2cid': self._cname2cid}
            sc = build_source(sc_conf)
        else:
            sc = my_source

        # 2, Build a transformed dataset
        mode_conf = self._trans_conf[mode]
        ops = mode_conf['OPS']
        batchsize = mode_conf['BATCH_SIZE']
        drop_last = mode_conf.get('DROP_LAST', False)

        mapper = build_mapper(ops, {'is_train': mode == 'TRAIN'})

        worker_args = None
        if 'WORKER_CONF' in mode_conf:
            worker_args = {
                k.lower(): v
                for k, v in mode_conf['WORKER_CONF'].items()
            }

        mapped_ds = map(sc, mapper, worker_args)
        # In VAL mode, gt_bbox, gt_label can be empty, and should
        # not be dropped
        batched_ds = batch(
            mapped_ds, batchsize, drop_last, drop_empty=(mode != "VAL"))

        # Only these (lower-cased) options are forwarded to batch_map.
        need_keys = {
            'is_padding',
            'coarsest_stride',
            'random_shapes',
            'multi_scales',
            'use_padded_im_info',
            'enable_multiscale_test',
            'num_scale',
        }
        bm_config = {
            key.lower(): value
            for key, value in mode_conf.items() if key.lower() in need_keys
        }

        batched_ds = batch_map(batched_ds, bm_config)

        batched_ds.reset()
        if mode.lower() == 'train':
            if self._cname2cid is not None:
                logger.warning('cname2cid already set, it will be overridden')
            self._cname2cid = getattr(sc, 'cname2cid', None)

        # 3, Build a reader
        maxit = -1 if self._maxiter <= 0 else self._maxiter

        def _reader():
            n = 0
            while True:
                produced = False
                for _batch in batched_ds:
                    produced = True
                    yield _batch
                    n += 1
                    if maxit > 0 and n == maxit:
                        return
                batched_ds.reset()
                # Without an iteration budget one pass is enough. With a
                # budget, stop as well when a whole pass produced nothing,
                # otherwise this loop would spin forever on an empty dataset.
                if maxit <= 0 or not produced:
                    return

        if hasattr(sc, 'get_imid2path'):
            _reader.imid2path = sc.get_imid2path()

        return _reader

    def train(self):
        """Build reader for training"""
        return self._make_reader('TRAIN')

    def val(self):
        """Build reader for validation"""
        return self._make_reader('VAL')

    def test(self):
        """Build reader for inference"""
        return self._make_reader('TEST')

    @classmethod
    def create(cls,
               mode,
               data_config,
               transform_config,
               max_iter=-1,
               my_source=None,
               ret_iter=True):
        """Create a specific reader.

        Args:
            mode (str): mode name used as the key for both configs
            data_config (dict): data-source config for this mode
            transform_config (dict): transform config for this mode
            max_iter (int): see ``maxiter`` of the constructor
            my_source: optional pre-built data source
            ret_iter (bool): return the generator function when True,
                otherwise the reader object itself

        Returns:
            the generator function from ``_make_reader`` or an instance of
            ``cls``
        """
        # Use cls so subclasses get an instance of their own type.
        reader = cls({mode: data_config}, {mode: transform_config}, max_iter)
        if ret_iter:
            return reader._make_reader(mode, my_source)
        else:
            return reader
def build_source(config):
    """
    Build dataset from source data, default source type is 'RoiDbSource'

    Args:
        config (dict): should have following structure:
            {
                data_cf (dict):
                    anno_file (str): label file or image list file path
                    image_dir (str): root directory for images
                    samples (int): number of samples to load, -1 means all
                    is_shuffle (bool): should samples be shuffled
                    load_img (bool): should images be loaded
                    mixup_epoch (int): parse mixup in first n epoch
                    with_background (bool): whether load background as a class
                cname2cid (dict): the label name to id dictionary
            }
            A dict without the 'data_cf' key is treated as the data config
            itself. Keys are matched case-insensitively.

    Returns:
        a RoiDbSource, SimpleSource or ClassAwareSamplingRoiDbSource

    Raises:
        ValueError: if the requested source type is not supported
    """
    if 'data_cf' in config:
        # Shallow-copy before adding 'cname2cid' so the caller's config
        # dict is left untouched.
        data_cf = dict(config['data_cf'])
        data_cf['cname2cid'] = config.get('cname2cid')
    else:
        data_cf = config

    data_cf = {k.lower(): v for k, v in data_cf.items()}

    # Constructor arguments; 'type' and 'class_aware_sampling' only steer
    # the dispatch below and are stripped before the call.
    args = copy.deepcopy(data_cf)
    # default type is 'RoiDbSource'
    source_type = 'RoiDbSource'
    if 'type' in data_cf:
        if data_cf['type'] in ['VOCSource', 'COCOSource', 'RoiDbSource']:
            # All three names are served by the roidb-based sources.
            if args.pop('class_aware_sampling', None):
                source_type = 'ClassAwareSamplingRoiDbSource'
            else:
                source_type = 'RoiDbSource'
        else:
            source_type = data_cf['type']
        del args['type']

    if source_type == 'RoiDbSource':
        return RoiDbSource(**args)
    elif source_type == 'SimpleSource':
        return SimpleSource(**args)
    elif source_type == 'ClassAwareSamplingRoiDbSource':
        return ClassAwareSamplingRoiDbSource(**args)
    else:
        raise ValueError('source type not supported: ' + source_type)
class ClassAwareSamplingRoiDbSource(RoiDbSource):
    """ interface to load class aware sampling roidb data from files

    Every call to ``next`` draws one record at random, with a probability
    that grows with the rarity of the classes present in that record.
    """

    def __init__(self,
                 anno_file,
                 image_dir=None,
                 samples=-1,
                 is_shuffle=True,
                 load_img=False,
                 cname2cid=None,
                 use_default_label=None,
                 mixup_epoch=-1,
                 with_background=True):
        """ Init

        Args:
            anno_file (str): label file path
            image_dir (str): root dir for images
            samples (int): samples to load, -1 means all
            is_shuffle (bool): whether to shuffle samples
            load_img (bool): whether load data in this class
            cname2cid (dict): the label name to id dictionary
            use_default_label (bool):whether use the default mapping of label to id
            mixup_epoch (int): parse mixup in first n epoch
            with_background (bool): whether load background
                                    as a class
        """
        super(ClassAwareSamplingRoiDbSource, self).__init__(
            anno_file=anno_file,
            image_dir=image_dir,
            samples=samples,
            is_shuffle=is_shuffle,
            load_img=load_img,
            cname2cid=cname2cid,
            use_default_label=use_default_label,
            mixup_epoch=mixup_epoch,
            with_background=with_background)
        # Per-record sampling probabilities, computed on the first reset().
        self._img_weights = None

    def __str__(self):
        return 'ClassAwareSamplingRoidbSource(fname:%s,epoch:%d,size:%d)' \
            % (self._fname, self._epoch, self.size())

    def next(self):
        """ load next sample

        Returns:
            a deep copy of one randomly drawn record; 'image' holds the
            value returned by ``_load_image`` when ``load_img`` is set,
            otherwise 'im_file' is prefixed with ``image_dir``
        """
        if self._epoch < 0:
            self.reset()

        _pos = np.random.choice(
            self._samples, 1, replace=False, p=self._img_weights)[0]
        sample = copy.deepcopy(self._roidb[_pos])

        if self._load_img:
            sample['image'] = self._load_image(sample['im_file'])
        elif self._image_dir is not None:
            # image_dir defaults to None; joining with None would raise
            # TypeError, so the stored file name is kept as is in that case.
            sample['im_file'] = os.path.join(self._image_dir, sample['im_file'])

        return sample

    def _calc_img_weights(self):
        """ calculate the probabilities of each sample

        A record's weight is the sum over its distinct classes of
        1 / (number of records containing that class); the weights are then
        normalized to sum to 1.
        """
        imgs_cls = []
        num_per_cls = {}
        for roidb in self._roidb:
            # 'gt_class' is iterated as rows of class ids; flatten it into
            # the set of distinct ids present in this record.
            img_cls = set(k for row in roidb['gt_class'] for k in row)
            imgs_cls.append(img_cls)
            for c in img_cls:
                num_per_cls[c] = num_per_cls.get(c, 0) + 1

        # 1.0 keeps this a true division even without the file-level
        # __future__ import.
        img_weights = [
            sum(1.0 / num_per_cls[c] for c in img_cls) for img_cls in imgs_cls
        ]
        # Probabilities sum to 1
        img_weights = img_weights / np.sum(img_weights)
        return img_weights

    def reset(self):
        """ implementation of Dataset.reset
        """
        if self._roidb is None:
            self._roidb = self._load()

        if self._img_weights is None:
            self._img_weights = self._calc_img_weights()

        self._samples = len(self._roidb)

        # Sampling is with replacement across calls, so only the very first
        # reset moves the epoch counter.
        if self._epoch < 0:
            self._epoch = 0
def load(anno_path, sample_num=-1, with_background=True):
    """
    Load COCO records with annotations in json file 'anno_path'

    Args:
        anno_path (str): json file path
        sample_num (int): number of samples to load, -1 means all
        with_background (bool): whether load background as a class.
                                if True, total class number will
                                be 81. default True

    Returns:
        (records, cname2cid)
        'records' is list of dict whose structure is:
        {
            'im_file': im_fname, # image file name
            'im_id': img_id, # image id
            'h': im_h, # height of image
            'w': im_w, # width
            'is_crowd': is_crowd,
            'gt_score': gt_score,
            'gt_class': gt_class,
            'gt_bbox': gt_bbox,
            'gt_poly': gt_poly,
            'difficult': difficult,
        }
        'cname2cid' is a dict used to map category name to class id

    Raises:
        AssertionError: if 'anno_path' is not a '.json' path or no record
            could be loaded
    """
    assert anno_path.endswith('.json'), 'invalid coco annotation file: ' \
        + anno_path
    coco = COCO(anno_path)
    img_ids = coco.getImgIds()
    cat_ids = coco.getCatIds()
    records = []

    # when with_background = True, mapping category to classid, like:
    #   background:0, first_class:1, second_class:2, ...
    catid2clsid = {
        catid: i + int(with_background)
        for i, catid in enumerate(cat_ids)
    }
    cname2cid = {
        coco.loadCats(catid)[0]['name']: clsid
        for catid, clsid in catid2clsid.items()
    }

    for img_id in img_ids:
        img_anno = coco.loadImgs(img_id)[0]
        im_fname = img_anno['file_name']
        im_w = float(img_anno['width'])
        im_h = float(img_anno['height'])

        ins_anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
        instances = coco.loadAnns(ins_anno_ids)

        # Convert [x, y, w, h] boxes to [x1, y1, x2, y2] clipped to the
        # image, and keep only boxes that are still valid afterwards.
        bboxes = []
        for inst in instances:
            x, y, box_w, box_h = inst['bbox']
            x1 = max(0, x)
            y1 = max(0, y)
            x2 = min(im_w - 1, x1 + max(0, box_w - 1))
            y2 = min(im_h - 1, y1 + max(0, box_h - 1))
            if inst['area'] > 0 and x2 >= x1 and y2 >= y1:
                inst['clean_bbox'] = [x1, y1, x2, y2]
                bboxes.append(inst)
            else:
                logger.warning(
                    'Found an invalid bbox in annotations: im_id: {}, area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.
                    format(img_id, float(inst['area']), x1, y1, x2, y2))
        num_bbox = len(bboxes)

        gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
        gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
        gt_score = np.ones((num_bbox, 1), dtype=np.float32)
        is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
        difficult = np.zeros((num_bbox, 1), dtype=np.int32)
        gt_poly = [None] * num_bbox

        for i, box in enumerate(bboxes):
            gt_class[i][0] = catid2clsid[box['category_id']]
            gt_bbox[i, :] = box['clean_bbox']
            is_crowd[i][0] = box['iscrowd']
            if 'segmentation' in box:
                gt_poly[i] = box['segmentation']

        coco_rec = {
            'im_file': im_fname,
            'im_id': np.array([img_id]),
            'h': im_h,
            'w': im_w,
            'is_crowd': is_crowd,
            'gt_class': gt_class,
            'gt_bbox': gt_bbox,
            'gt_score': gt_score,
            'gt_poly': gt_poly,
            'difficult': difficult
        }

        logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
            im_fname, img_id, im_h, im_w))
        records.append(coco_rec)
        # Every image yields exactly one record, so the record count is the
        # number of images processed so far.
        if sample_num > 0 and len(records) >= sample_num:
            break
    assert len(records) > 0, 'not found any coco record in %s' % (anno_path)
    logger.info('{} samples in file {}'.format(len(records), anno_path))
    return records, cname2cid
class IteratorSource(Dataset):
    """
    Load data samples from iterator in stream mode

    Args:
        iter_maker (callable): callable function to generate a iter
        samples (int): number of samples to load, -1 means all
    """

    def __init__(self,
                 iter_maker,
                 samples=-1,
                 **kwargs):
        super(IteratorSource, self).__init__()
        self._epoch = -1
        # There is no backing file, but the printable summary used in the
        # StopIteration messages below reads this attribute.
        self._fname = None

        self._iter_maker = iter_maker
        self._data_iter = None
        self._pos = -1
        self._drained = False
        self._samples = samples
        # Number of samples seen in the last finished epoch, -1 while unknown.
        self._sample_num = -1

    def _finish_epoch(self):
        """Record the epoch's sample count and mark the source as drained."""
        if self._sample_num <= 0:
            self._sample_num = self._pos
        elif self._sample_num != self._pos:
            logger.info('num of loaded samples is different '
                        'with previouse setting[prev:%d,now:%d]' % (self._sample_num, self._pos))
            self._sample_num = self._pos

        self._data_iter = None
        self._drained = True

    def next(self):
        """Return the next sample of the current epoch.

        Raises:
            StopIteration: when the iterator is exhausted or ``samples``
                samples have already been returned in this epoch
        """
        if self._epoch < 0:
            self.reset()

        if self._data_iter is None:
            raise StopIteration("no more data in " + str(self))

        # Enforce the limit before fetching, so that exactly `samples`
        # samples are returned and none is pulled from the iterator only to
        # be thrown away.
        if self._samples > 0 and self._pos >= self._samples:
            self._finish_epoch()
            raise StopIteration("no more data in " + str(self))

        try:
            sample = next(self._data_iter)
        except StopIteration:
            self._finish_epoch()
            raise
        self._pos += 1
        return sample

    def reset(self):
        """Start a new epoch.

        NOTE(review): a fresh iterator is only created when the previous one
        was exhausted; a reset in the middle of an epoch keeps reading from
        the current iterator -- confirm this is intended.
        """
        if self._data_iter is None:
            self._data_iter = self._iter_maker()

        if self._epoch < 0:
            self._epoch = 0
        else:
            self._epoch += 1

        self._pos = 0
        self._drained = False

    def size(self):
        """Number of samples in the last finished epoch, -1 if unknown."""
        return self._sample_num

    def drained(self):
        """Whether all samples of the current epoch have been read out."""
        assert self._epoch >= 0, "the first epoch has not started yet"
        # While the size is still unknown (-1) a comparison against it would
        # always be true, so rely on the flag set at exhaustion instead.
        if self._drained:
            return True
        return self._sample_num > 0 and self._pos >= self._sample_num

    def epoch_id(self):
        """Return the id of the current epoch."""
        return self._epoch
def check_records(records):
    """ check the fields of 'records' must contains some keys

    Args:
        records (list): list of dict records

    Raises:
        AssertionError: if a record lacks one of the required fields
    """
    needed_fields = [
        'im_file', 'im_id', 'h', 'w', 'is_crowd', 'gt_class', 'gt_bbox',
        'gt_poly'
    ]

    for i, rec in enumerate(records):
        for k in needed_fields:
            assert k in rec, 'not found field[%s] in record[%d]' % (k, i)


def load_roidb(anno_file, sample_num=-1):
    """ load normalized data records from file
        'anno_file' which is a pickled file.
        And the records should has a structure:
        {
            'im_file': str, # image file name
            'im_id': int, # image id
            'h': int, # height of image
            'w': int, # width of image
            'is_crowd': bool,
            'gt_class': list of np.ndarray, # classids info
            'gt_bbox': list of np.ndarray, # bounding box info
            'gt_poly': list of int, # poly info
        }

    Args:
        anno_file (str): file name for picked records
        sample_num (int): number of samples to load

    Returns:
        (records, cname2cid): list of records for detection model training
        and the category-name-to-id mapping stored in the file
    """

    assert anno_file.endswith('.roidb'), 'invalid roidb file[%s]' % (anno_file)
    with open(anno_file, 'rb') as f:
        roidb = f.read()

    # NOTE(review): unpickling executes code embedded in the file; only load
    # roidb files from a trusted origin.
    # for support python3 and python2
    try:
        records, cname2cid = pkl.loads(roidb, encoding='bytes')
    except TypeError:
        # Python 2's pickle.loads has no 'encoding' keyword. Catching only
        # TypeError lets real unpickling errors surface instead of being
        # retried and masked.
        records, cname2cid = pkl.loads(roidb)

    assert isinstance(records, list), 'invalid data type from roidb'

    if sample_num > 0 and sample_num < len(records):
        records = records[:sample_num]

    return records, cname2cid


def load(fname,
         samples=-1,
         with_background=True,
         with_cat2id=False,
         use_default_label=None,
         cname2cid=None):
    """ Load data records from 'fnames'

    The loader is picked from the file name: '*.roidb' (pickled records),
    '*.json' (COCO), a path containing 'wider_face', or any other existing
    file (VOC image-set list).

    Args:
        fnames (str): file name for data record, eg:
            instances_val2017.json or COCO17_val2017.roidb
        samples (int): number of samples to load, default to all
        with_background (bool): whether load background as a class.
                                default True.
        with_cat2id (bool): whether return cname2cid info out
        use_default_label (bool): whether use the default mapping of label to id
        cname2cid (dict): the mapping of category name to id

    Returns:
        list of loaded records whose structure is:
        {
            'im_file': str, # image file name
            'im_id': int, # image id
            'h': int, # height of image
            'w': int, # width of image
            'is_crowd': bool,
            'gt_class': list of np.ndarray, # classids info
            'gt_bbox': list of np.ndarray, # bounding box info
            'gt_poly': list of int, # poly info
        }
        When 'with_cat2id' is True, a (records, cname2cid) pair instead.

    Raises:
        ValueError: if no loader matches 'fname'
    """

    if fname.endswith('.roidb'):
        records, cname2cid = load_roidb(fname, samples)
    elif fname.endswith('.json'):
        from . import coco_loader
        records, cname2cid = coco_loader.load(fname, samples, with_background)
    elif "wider_face" in fname:
        from . import widerface_loader
        # NOTE(review): this branch returns the loader result as is, without
        # check_records and regardless of 'with_cat2id' -- confirm what
        # widerface_loader.load returns before relying on it.
        records = widerface_loader.load(fname, samples)
        return records
    elif os.path.isfile(fname):
        from . import voc_loader
        if use_default_label is None or cname2cid is not None:
            records, cname2cid = voc_loader.get_roidb(fname, samples, cname2cid,
                                                      with_background=with_background)
        else:
            records, cname2cid = voc_loader.load(fname, samples,
                                                 use_default_label,
                                                 with_background=with_background)
    else:
        raise ValueError('invalid file type when load data from file[%s]' %
                         (fname))
    check_records(records)
    if with_cat2id:
        return records, cname2cid
    else:
        return records
class RoiDbSource(Dataset):
    """ interface to load roidb data from files
    """

    def __init__(self,
                 anno_file,
                 image_dir=None,
                 samples=-1,
                 is_shuffle=True,
                 load_img=False,
                 cname2cid=None,
                 use_default_label=None,
                 mixup_epoch=-1,
                 with_background=True):
        """ Init

        Args:
            anno_file (str): label file path
            image_dir (str): root dir for images
            samples (int): samples to load, -1 means all
            is_shuffle (bool): whether to shuffle samples
            load_img (bool): whether load data in this class
            cname2cid (dict): the label name to id dictionary
            use_default_label (bool):whether use the default mapping of label to id
            mixup_epoch (int): parse mixup in first n epoch
            with_background (bool): whether load background
                                    as a class
        """
        super(RoiDbSource, self).__init__()
        self._epoch = -1
        assert os.path.isfile(anno_file) or os.path.isdir(anno_file), \
            'anno_file {} is not a file or a directory'.format(anno_file)
        self._fname = anno_file
        self._image_dir = image_dir
        if image_dir is not None:
            assert os.path.isdir(image_dir), \
                'image_dir {} is not a directory'.format(image_dir)
        # Records are loaded lazily on the first reset().
        self._roidb = None
        self._pos = -1
        self._drained = False
        self._samples = samples
        self._is_shuffle = is_shuffle
        self._load_img = load_img
        self.use_default_label = use_default_label
        self._mixup_epoch = mixup_epoch
        self._with_background = with_background
        self.cname2cid = cname2cid
        self._imid2path = None

    def __str__(self):
        return 'RoiDbSource(fname:%s,epoch:%d,size:%d,pos:%d)' \
            % (self._fname, self._epoch, self.size(), self._pos)

    def _full_path(self, name):
        """Prefix 'name' with image_dir, or return it as is without one."""
        # image_dir defaults to None and os.path.join(None, ...) raises
        # TypeError, so fall back to the name stored in the record.
        if self._image_dir is None:
            return name
        return os.path.join(self._image_dir, name)

    def _prepare(self, record):
        """Deep-copy 'record' and attach image bytes or the full image path."""
        sample = copy.deepcopy(record)
        if self._load_img:
            sample['image'] = self._load_image(sample['im_file'])
        else:
            sample['im_file'] = self._full_path(sample['im_file'])
        return sample

    def next(self):
        """ load next sample

        During the first 'mixup_epoch' epochs the sample also carries a
        second, randomly chosen record under the 'mixup' key.

        Raises:
            StopIteration: when all samples of this epoch have been read
        """
        if self._epoch < 0:
            self.reset()
        if self._pos >= self._samples:
            self._drained = True
            raise StopIteration('%s no more data' % (str(self)))
        sample = self._prepare(self._roidb[self._pos])

        # A mixup partner must be a different record, so there is nothing to
        # mix with when only one sample exists (randint(1, 0) would raise).
        if self._epoch < self._mixup_epoch and self._samples > 1:
            mix_idx = random.randint(1, self._samples - 1)
            mix_pos = (mix_idx + self._pos) % self._samples
            sample['mixup'] = self._prepare(self._roidb[mix_pos])
        self._pos += 1
        return sample

    def _load(self):
        """ load data from file
        """
        from . import loader
        records, cname2cid = loader.load(self._fname, self._samples,
                                         self._with_background, True,
                                         self.use_default_label, self.cname2cid)
        self.cname2cid = cname2cid
        return records

    def _load_image(self, where):
        """Return the raw bytes of image 'where' (relative to image_dir)."""
        fn = self._full_path(where)
        with open(fn, 'rb') as f:
            return f.read()

    def reset(self):
        """ implementation of Dataset.reset
        """
        if self._roidb is None:
            self._roidb = self._load()

        self._samples = len(self._roidb)
        if self._is_shuffle:
            random.shuffle(self._roidb)

        if self._epoch < 0:
            self._epoch = 0
        else:
            self._epoch += 1

        self._pos = 0
        self._drained = False

    def size(self):
        """ implementation of Dataset.size
        """
        # 0 before the records are loaded, so printing a fresh source works.
        return 0 if self._roidb is None else len(self._roidb)

    def drained(self):
        """ implementation of Dataset.drained
        """
        assert self._epoch >= 0, 'The first epoch has not begin!'
        return self._pos >= self.size()

    def epoch_id(self):
        """ return epoch id for latest sample
        """
        return self._epoch

    def get_imid2path(self):
        """return image id to image path map"""
        if self._imid2path is None:
            if self._roidb is None:
                # Not reset yet: load the records now rather than failing on
                # iterating None.
                self._roidb = self._load()
            self._imid2path = {}
            for record in self._roidb:
                im_id = record['im_id']
                # im_id may be stored as a one-element array.
                im_id = im_id if isinstance(im_id, int) else im_id[0]
                self._imid2path[im_id] = self._full_path(record['im_file'])
        return self._imid2path
class SimpleSource(Dataset):
    """
    Load image files for testing purpose

    Args:
        images (list): list of path of images
        samples (int): number of samples to load, -1 means all
        load_img (bool): should images be loaded
    """

    def __init__(self,
                 images=[],
                 samples=-1,
                 load_img=True,
                 **kwargs):
        super(SimpleSource, self).__init__()
        self._epoch = -1
        # Keep a private copy: the default list is shared between calls and a
        # caller-owned list may change after construction.
        images = list(images)
        for image in images:
            assert image != '' and os.path.isfile(image), \
                "Image {} not found".format(image)
        self._images = images
        self._fname = None
        # Records are built lazily on the first reset().
        self._simple = None
        self._pos = -1
        self._drained = False
        self._samples = samples
        self._load_img = load_img
        self._imid2path = {}

    def next(self):
        """Return the next record, with image bytes when 'load_img' is set.

        Raises:
            StopIteration: when all records of this epoch have been read
        """
        if self._epoch < 0:
            self.reset()

        if self._pos >= self.size():
            self._drained = True
            raise StopIteration("no more data in " + str(self))

        sample = copy.deepcopy(self._simple[self._pos])
        if self._load_img:
            sample['image'] = self._load_image(sample['im_file'])

        self._pos += 1
        return sample

    def _load(self):
        """Build one record per image path, honoring the 'samples' limit."""
        records = []
        for ct, image in enumerate(self._images):
            if self._samples > 0 and ct >= self._samples:
                break
            # The image id is simply the position in the input list.
            records.append({'im_id': np.array([ct]), 'im_file': image})
            self._imid2path[ct] = image
        assert len(records) > 0, "no image file found"
        return records

    def _load_image(self, where):
        """Return the raw bytes of the file at 'where'."""
        with open(where, 'rb') as f:
            return f.read()

    def reset(self):
        """Start a new epoch, building the records on first use."""
        if self._simple is None:
            self._simple = self._load()

        if self._epoch < 0:
            self._epoch = 0
        else:
            self._epoch += 1

        self._pos = 0
        self._drained = False

    def size(self):
        """Number of records, 0 before the first reset()."""
        # Guarding against None keeps the inherited __str__ usable on a
        # freshly constructed source.
        return 0 if self._simple is None else len(self._simple)

    def drained(self):
        """Whether all records of the current epoch have been read out."""
        assert self._epoch >= 0, "the first epoch has not started yet"
        return self._pos >= self.size()

    def epoch_id(self):
        """Return the id of the current epoch."""
        return self._epoch

    def get_imid2path(self):
        """return image id to image path map"""
        return self._imid2path
import os
import numpy as np

import xml.etree.ElementTree as ET


def _resolve_voc_dirs(anno_path):
    """Return (dataset_root, xml_dir) derived from the image-set list file.

    'anno_path' is expected to look like
    ``<root>/ImageSets/Main/<split>.txt``; everything before the first
    'ImageSets' is taken as the dataset root and the xml files are looked
    up in ``<root>/Annotations``.

    Raises:
        AssertionError: if the list file or the Annotations dir is missing.
    """
    root_dir = anno_path.split('ImageSets')[0]
    xml_dir = os.path.join(root_dir, 'Annotations')
    assert os.path.isfile(anno_path) and \
        os.path.isdir(xml_dir), 'invalid xml path'
    return root_dir, xml_dir


def _make_cid_lookup(cname2cid, with_background, allow_new):
    """Build a function mapping a category name to its class id.

    When 'allow_new' is True unseen names are appended to 'cname2cid'
    (ids start at 1 if 'with_background' else 0); otherwise an unseen name
    raises KeyError.
    """

    def _lookup(cname):
        if cname not in cname2cid:
            if not allow_new:
                raise KeyError(
                    'Not found cname[%s] in cname2cid when map it to cid.' %
                    (cname))
            # the background's id is 0, so need to add 1.
            cname2cid[cname] = len(cname2cid) + int(with_background)
        return cname2cid[cname]

    return _lookup


def _parse_voc_xml(xml_file, default_id, cname_to_cid):
    """Parse one VOC annotation xml file into a record dict.

    Args:
        xml_file (str): path of the annotation file
        default_id (int): image id to use when the xml has no <id> tag
        cname_to_cid (callable): maps a category name to a class id

    Returns:
        dict with keys 'im_file', 'im_id', 'h', 'w', 'is_crowd',
        'gt_class', 'gt_score', 'gt_bbox', 'gt_poly' and 'difficult'.
    """
    tree = ET.parse(xml_file)
    im_fname = tree.find('filename').text
    id_node = tree.find('id')
    if id_node is None:
        im_id = np.array([default_id])
    else:
        im_id = np.array([int(id_node.text)])

    objs = tree.findall('object')
    size_node = tree.find('size')
    im_w = float(size_node.find('width').text)
    im_h = float(size_node.find('height').text)
    num_objs = len(objs)
    gt_bbox = np.zeros((num_objs, 4), dtype=np.float32)
    gt_class = np.zeros((num_objs, 1), dtype=np.int32)
    gt_score = np.ones((num_objs, 1), dtype=np.float32)
    is_crowd = np.zeros((num_objs, 1), dtype=np.int32)
    difficult = np.zeros((num_objs, 1), dtype=np.int32)
    for i, obj in enumerate(objs):
        gt_class[i][0] = cname_to_cid(obj.find('name').text)
        # <difficult> is optional in many VOC-style datasets; a missing or
        # empty tag is treated as "not difficult" instead of crashing.
        diff_node = obj.find('difficult')
        if diff_node is not None and diff_node.text:
            difficult[i][0] = int(diff_node.text)
        box = obj.find('bndbox')
        # clip the box to the image area
        x1 = max(0, float(box.find('xmin').text))
        y1 = max(0, float(box.find('ymin').text))
        x2 = min(im_w - 1, float(box.find('xmax').text))
        y2 = min(im_h - 1, float(box.find('ymax').text))
        gt_bbox[i] = [x1, y1, x2, y2]

    return {
        'im_file': im_fname,
        'im_id': im_id,
        'h': im_h,
        'w': im_w,
        'is_crowd': is_crowd,
        'gt_class': gt_class,
        'gt_score': gt_score,
        'gt_bbox': gt_bbox,
        'gt_poly': [],
        'difficult': difficult
    }


def _load_voc_records(anno_path, xml_dir, sample_num, cname_to_cid):
    """Read every xml listed in 'anno_path' and return the record list.

    Entries whose xml file does not exist are skipped; records without any
    object are parsed (and counted towards 'sample_num') but not returned.

    Raises:
        AssertionError: if no record with at least one object was found.
    """
    records = []
    ct = 0
    with open(anno_path, 'r') as fr:
        for line in fr:
            xml_file = os.path.join(xml_dir, line.strip() + '.xml')
            if not os.path.isfile(xml_file):
                continue
            voc_rec = _parse_voc_xml(xml_file, ct, cname_to_cid)
            if len(voc_rec['gt_bbox']) != 0:
                records.append(voc_rec)

            ct += 1
            if sample_num > 0 and ct >= sample_num:
                break
    assert len(records) > 0, 'not found any voc record in %s' % (anno_path)
    return records


def get_roidb(anno_path,
              sample_num=-1,
              cname2cid=None,
              with_background=True):
    """
    Load VOC records with annotations in xml directory 'anno_path'

    Notes:
        ${anno_path}/ImageSets/Main/train.txt must contain xml file names for annotations
        ${anno_path}/Annotations/xxx.xml must contain annotation info for one record

    Args:
        anno_path (str): path of the image-set list file
            (e.g. <root>/ImageSets/Main/train.txt)
        sample_num (int): number of samples to load, -1 means all
        cname2cid (dict): the label name to id dictionary. If None, the
            mapping is built on the fly in order of first appearance;
            otherwise an unknown label raises KeyError.
        with_background (bool): whether load background as a class.
                                if True, class ids start at 1 (0 is
                                background), i.e. 21 classes for the
                                standard 20 VOC categories. default True

    Returns:
        [records, cname2cid]
        'records' is list of dict whose structure is:
        {
            'im_file': im_fname, # image file name
            'im_id': im_id, # image id
            'h': im_h, # height of image
            'w': im_w, # width
            'is_crowd': is_crowd,
            'gt_class': gt_class,
            'gt_score': gt_score,
            'gt_bbox': gt_bbox,
            'gt_poly': gt_poly,
            'difficult': difficult,
        }
        'cname2cid' is a dict to map category name to class id
    """
    _, xml_dir = _resolve_voc_dirs(anno_path)

    # mapping category name to class id
    # background:0, first_class:1, second_class:2, ...
    allow_new = cname2cid is None
    if cname2cid is None:
        cname2cid = {}
    lookup = _make_cid_lookup(cname2cid, with_background, allow_new)

    records = _load_voc_records(anno_path, xml_dir, sample_num, lookup)
    return [records, cname2cid]


def load(anno_path,
         sample_num=-1,
         use_default_label=True,
         with_background=True):
    """
    Load VOC records with annotations in
    xml directory 'anno_path'

    Notes:
        ${anno_path}/ImageSets/Main/train.txt must contain xml file names for annotations
        ${anno_path}/Annotations/xxx.xml must contain annotation info for one record

    Args:
        anno_path (str): path of the image-set list file
            (e.g. <root>/ImageSets/Main/train.txt)
        sample_num (int): number of samples to load, -1 means all
        use_default_label (bool): whether use the default mapping of label
            to id (see 'pascalvoc_label'); if False the mapping is read
            from <root>/ImageSets/Main/label_list.txt, one label per line
        with_background (bool): whether load background as a class.
                                if True, class ids start at 1 (0 is
                                background), i.e. 21 classes for the
                                standard 20 VOC categories. default True

    Returns:
        [records, cname2cid]
        'records' is list of dict whose structure is:
        {
            'im_file': im_fname, # image file name
            'im_id': im_id, # image id
            'h': im_h, # height of image
            'w': im_w, # width
            'is_crowd': is_crowd,
            'gt_class': gt_class,
            'gt_score': gt_score,
            'gt_bbox': gt_bbox,
            'gt_poly': gt_poly,
            'difficult': difficult,
        }
        'cname2cid' is a dict to map category name to class id

    Raises:
        KeyError: if an annotation uses a label missing from the mapping.
    """
    root_dir, xml_dir = _resolve_voc_dirs(anno_path)

    # mapping category name to class id
    # if with_background is True:
    #   background:0, first_class:1, second_class:2, ...
    # if with_background is False:
    #   first_class:0, second_class:1, ...
    if not use_default_label:
        cname2cid = {}
        label_path = os.path.join(root_dir, 'ImageSets/Main/label_list.txt')
        with open(label_path, 'r') as fr:
            for label_id, line in enumerate(fr, int(with_background)):
                cname2cid[line.strip()] = label_id
    else:
        cname2cid = pascalvoc_label(with_background)
    lookup = _make_cid_lookup(cname2cid, with_background, allow_new=False)

    records = _load_voc_records(anno_path, xml_dir, sample_num, lookup)
    return [records, cname2cid]


def pascalvoc_label(with_background=True):
    """Return the default PASCAL VOC label-name to class-id mapping.

    Ids run from 1 to 20 when 'with_background' is True (0 is reserved for
    background) and from 0 to 19 otherwise.
    """
    labels_map = {
        'aeroplane': 1,
        'bicycle': 2,
        'bird': 3,
        'boat': 4,
        'bottle': 5,
        'bus': 6,
        'car': 7,
        'cat': 8,
        'chair': 9,
        'cow': 10,
        'diningtable': 11,
        'dog': 12,
        'horse': 13,
        'motorbike': 14,
        'person': 15,
        'pottedplant': 16,
        'sheep': 17,
        'sofa': 18,
        'train': 19,
        'tvmonitor': 20
    }
    if not with_background:
        labels_map = {k: v - 1 for k, v in labels_map.items()}
    return labels_map
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import logging +logger = logging.getLogger(__name__) + + +def load(anno_path, + sample_num=-1, + cname2cid=None, + with_background=True): + """ + Load WiderFace records with 'anno_path' + + Args: + anno_path (str): root directory for voc annotation data + sample_num (int): number of samples to load, -1 means all + with_background (bool): whether load background as a class. + if True, total class number will + be 2. default True + + Returns: + (records, catname2clsid) + 'records' is list of dict whose structure is: + { + 'im_file': im_fname, # image file name + 'im_id': im_id, # image id + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + } + 'cname2id' is a dict to map category name to class id + """ + + txt_file = anno_path + + records = [] + ct = 0 + file_lists = _load_file_list(txt_file) + cname2cid = widerface_label(with_background) + + for item in file_lists: + im_fname = item[0] + im_id = np.array([ct]) + gt_bbox = np.zeros((len(item) - 2, 4), dtype=np.float32) + gt_class = np.ones((len(item) - 2, 1), dtype=np.int32) + for index_box in range(len(item)): + if index_box >= 2: + temp_info_box = item[index_box].split(' ') + xmin = float(temp_info_box[0]) + ymin = float(temp_info_box[1]) + w = float(temp_info_box[2]) + h = float(temp_info_box[3]) + # Filter out wrong labels + if w < 0 or h < 0: + continue + xmin = max(0, xmin) + ymin = max(0, ymin) + xmax = xmin + w + ymax = ymin + h + gt_bbox[index_box - 2] = [xmin, ymin, xmax, ymax] + + widerface_rec = { + 'im_file': im_fname, + 'im_id': im_id, + 'gt_bbox': gt_bbox, + 
'gt_class': gt_class, + } + # logger.debug + if len(item) != 0: + records.append(widerface_rec) + + ct += 1 + if sample_num > 0 and ct >= sample_num: + break + assert len(records) > 0, 'not found any widerface in %s' % (anno_path) + logger.info('{} samples in file {}'.format(ct, anno_path)) + return records, cname2cid + + +def _load_file_list(input_txt): + with open(input_txt, 'r') as f_dir: + lines_input_txt = f_dir.readlines() + + file_dict = {} + num_class = 0 + for i in range(len(lines_input_txt)): + line_txt = lines_input_txt[i].strip('\n\t\r') + if '.jpg' in line_txt: + if i != 0: + num_class += 1 + file_dict[num_class] = [] + file_dict[num_class].append(line_txt) + if '.jpg' not in line_txt: + if len(line_txt) > 6: + split_str = line_txt.split(' ') + x1_min = float(split_str[0]) + y1_min = float(split_str[1]) + x2_max = float(split_str[2]) + y2_max = float(split_str[3]) + line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str( + x2_max) + ' ' + str(y2_max) + file_dict[num_class].append(line_txt) + else: + file_dict[num_class].append(line_txt) + + return list(file_dict.values()) + + +def widerface_label(with_background=True): + labels_map = { + 'face': 1 + } + if not with_background: + labels_map = {k: v - 1 for k, v in labels_map.items()} + return labels_map diff --git a/ppdet/data/tests/000012.jpg b/ppdet/data/tests/000012.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b829107b842f6f15706744fdcbea05ec7341b311 Binary files /dev/null and b/ppdet/data/tests/000012.jpg differ diff --git a/ppdet/data/tests/coco.yml b/ppdet/data/tests/coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..80ae7ed9e424601082e233e7158d9c0a28f1727b --- /dev/null +++ b/ppdet/data/tests/coco.yml @@ -0,0 +1,48 @@ +DATA: + TRAIN: + ANNO_FILE: data/coco.test/train2017.roidb + IMAGE_DIR: data/coco.test/train2017 + SAMPLES: 10 + TYPE: RoiDbSource + VAL: + ANNO_FILE: data/coco.test/val2017.roidb + IMAGE_DIR: data/coco.test/val2017 + SAMPLES: 
#!/bin/bash

# function:
#   Download and unpack the small COCO subset used by the data tests, then
#   expose it as ./coco.test next to this script.
# usage:
#   prepare_data.sh [python2]
#   Any argument other than "python2" selects the python3 archive.

# Directory that holds this script; all work happens relative to it.
root=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")
cwd=$(pwd)

if [[ "$cwd" != "$root" ]]; then
    pushd "$root" >/dev/null 2>&1
fi

test_coco_python2_url="http://filecenter.matrix.baidu.com/api/v1/file/wanglong03/coco.test.python2.zip/20190603095315/download"
test_coco_python3_url="http://filecenter.matrix.baidu.com/api/v1/file/wanglong03/coco.test.python3.zip/20190603095447/download"

if [[ $1 = "python2" ]]; then
    test_coco_data_url=${test_coco_python2_url}
    coco_zip_file="coco.test.python2.zip"
else
    test_coco_data_url=${test_coco_python3_url}
    coco_zip_file="coco.test.python3.zip"
fi
echo "download testing coco from url[${test_coco_data_url}]"
coco_root_dir=${coco_zip_file/.zip/}

# clear already exist file or directory
rm -rf "${coco_root_dir}" "${coco_zip_file}"

# "wget -O" creates the output file even when the download fails, so the
# exit status (not the file's existence) decides whether to unzip.
if wget "${test_coco_data_url}" -O "${coco_zip_file}"; then
    echo "succeed to download ${coco_zip_file}, so unzip it"
    unzip "${coco_zip_file}" >/dev/null 2>&1
fi

if [ -e "${coco_root_dir}" ]; then
    rm -rf coco.test
    ln -s "${coco_root_dir}" coco.test
    echo "succeed to generate coco data in[${coco_root_dir}] for testing"
    exit 0
else
    echo "failed to generate coco data"
    exit 1
fi
#!/usr/bin/python
#-*-coding:utf-8-*-
"""Run all tests

Collects the test cases of every test module in this directory into one
suite, runs it verbosely and exits with status 0 on success, 1 otherwise.
"""

import sys
import unittest
import test_loader
import test_operator
import test_roidb_source
import test_iterator_source
import test_transformer
import test_reader

if __name__ == '__main__':
    test_cases = [
        test_loader.TestLoader,
        test_operator.TestBase,
        test_roidb_source.TestRoiDbSource,
        test_iterator_source.TestIteratorSource,
        test_transformer.TestTransformer,
        test_reader.TestReader,
    ]
    loader = unittest.TestLoader()
    alltests = unittest.TestSuite(
        [loader.loadTestsFromTestCase(case) for case in test_cases])

    was_succ = unittest.TextTestRunner(verbosity=2).run(
        alltests).wasSuccessful()

    # sys.exit rather than the builtin exit(): the latter is injected by
    # the 'site' module and is missing when the interpreter runs with -S.
    sys.exit(0 if was_succ else 1)
import sys
import os
import subprocess
import six
import logging

import matplotlib
# select the non-interactive backend so the tests can run without a display
matplotlib.use('Agg', force=False)

# directory that contains this file (and the 'data' sub-directory)
prefix = os.path.dirname(os.path.abspath(__file__))

#coco data for testing
version = 'python3' if six.PY3 else 'python2'

data_root = os.path.join(prefix, 'data/coco.test.%s' % (version))

# coco data for testing
coco_data = {
    'TRAIN': {
        'ANNO_FILE': os.path.join(data_root, 'train2017.roidb'),
        'IMAGE_DIR': os.path.join(data_root, 'train2017')
    },
    'VAL': {
        'ANNO_FILE': os.path.join(data_root, 'val2017.roidb'),
        'IMAGE_DIR': os.path.join(data_root, 'val2017')
    }
}

# absolute path of the download helper, independent of the current dir
script = os.path.join(prefix, 'data/prepare_data.sh')

if not os.path.exists(data_root):
    # Pass an argument list instead of a formatted shell string so that
    # paths containing spaces or shell metacharacters are handled safely.
    ret = subprocess.call(['bash', script, version])
    if ret != 0:
        logging.error('not found file[%s], you should manually prepare '
                      'your data using "data/prepare_data.sh"' % (data_root))
        sys.exit(1)
import os
import time
import unittest
import sys
import logging

import set_env
from ppdet.data.source import IteratorSource


def _generate_iter_maker(num=10):
    """Return a zero-argument generator function yielding 'num' fake samples.

    Each sample is a dict with an 'image' name and an integer 'label'.
    """

    def _reader():
        for idx in range(num):
            fake_sample = {'image': 'image_' + str(idx), 'label': idx}
            yield fake_sample

    return _reader


class TestIteratorSource(unittest.TestCase):
    """Test cases for IteratorSource built from a plain generator function
    """

    @classmethod
    def setUpClass(cls):
        """ nothing to prepare """

    @classmethod
    def tearDownClass(cls):
        """ nothing to clean up """

    def test_basic(self):
        """ iterate the whole source and check 'size/drained'
        """
        source = IteratorSource(_generate_iter_maker())
        for last_idx, item in enumerate(source):
            self.assertIn('image', item)
            self.assertGreater(len(item['image']), 0)
        self.assertTrue(source.drained())
        self.assertEqual(last_idx + 1, source.size())

    def test_reset(self):
        """ 'reset' must start a new epoch that can be read again
        """
        source = IteratorSource(_generate_iter_maker())

        self.assertIsNotNone(source.next())
        self.assertEqual(source.epoch_id(), 0)

        source.reset()

        self.assertEqual(source.epoch_id(), 1)
        self.assertIsNotNone(source.next())


if __name__ == '__main__':
    unittest.main()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import os +import time +import unittest +import sys +import logging +import numpy as np + +import set_env + + +class TestLoader(unittest.TestCase): + """Test cases for dataset.source.loader + """ + + @classmethod + def setUpClass(cls): + """ setup + """ + cls.prefix = os.path.dirname(os.path.abspath(__file__)) + # json data + cls.anno_path = os.path.join(cls.prefix, + 'data/coco/instances_val2017.json') + cls.image_dir = os.path.join(cls.prefix, 'data/coco/val2017') + cls.anno_path1 = os.path.join(cls.prefix, + "data/voc/ImageSets/Main/train.txt") + cls.image_dir1 = os.path.join(cls.prefix, "data/voc/JPEGImages") + + @classmethod + def tearDownClass(cls): + """ tearDownClass """ + pass + + def test_load_coco_in_json(self): + """ test loading COCO data in json file + """ + from ppdet.data.source.coco_loader import load + if not os.path.exists(self.anno_path): + logging.warn('not found %s, so skip this test' % (self.anno_path)) + return + samples = 10 + records, cname2id = load(self.anno_path, samples) + self.assertEqual(len(records), samples) + self.assertGreater(len(cname2id), 0) + + def test_load_coco_in_roidb(self): + """ test loading COCO data in pickled records + """ + anno_path = os.path.join(self.prefix, + 'data/roidbs/instances_val2017.roidb') + + if not os.path.exists(anno_path): + logging.warn('not found %s, so skip this test' % (anno_path)) + return + + samples = 10 + from 
ppdet.data.source.loader import load_roidb + records, cname2cid = load_roidb(anno_path, samples) + self.assertEqual(len(records), samples) + self.assertGreater(len(cname2cid), 0) + + def test_load_voc_in_xml(self): + """ test loading VOC data in xml files + """ + from ppdet.data.source.voc_loader import load + if not os.path.exists(self.anno_path1): + logging.warn('not found %s, so skip this test' % (self.anno_path1)) + return + samples = 3 + records, cname2cid = load(self.anno_path1, samples) + self.assertEqual(len(records), samples) + self.assertGreater(len(cname2cid), 0) + + def test_load_voc_in_roidb(self): + """ test loading VOC data in pickled records + """ + anno_path = os.path.join(self.prefix, 'data/roidbs/train.roidb') + + if not os.path.exists(anno_path): + logging.warn('not found %s, so skip this test' % (anno_path)) + return + + samples = 3 + from ppdet.data.source.loader import load_roidb + records, cname2cid = load_roidb(anno_path, samples) + self.assertEqual(len(records), samples) + self.assertGreater(len(cname2cid), 0) + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/data/tests/test_operator.py b/ppdet/data/tests/test_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..85d5b229d48fab968f38639d5f7533e5646f25c3 --- /dev/null +++ b/ppdet/data/tests/test_operator.py @@ -0,0 +1,156 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os
import unittest
import logging
import numpy as np
import set_env
import ppdet.data.transform as tf
logging.basicConfig(level=logging.INFO)


class TestBase(unittest.TestCase):
    """Test cases for dataset.transform.operator
    """

    @classmethod
    def setUpClass(cls, with_mixup=False):
        """ load the first roidb record (and its image bytes) as the sample

        Args:
            with_mixup (bool): also attach the second record, with its own
                image bytes, under the 'mixup' key of the sample.
        """
        roidb_fname = set_env.coco_data['TRAIN']['ANNO_FILE']
        image_dir = set_env.coco_data['TRAIN']['IMAGE_DIR']
        import pickle as pkl
        # NOTE: unpickling executes arbitrary code; only use roidb files
        # coming from a trusted source.
        with open(roidb_fname, 'rb') as f:
            roidb = pkl.load(f)
        records = roidb[0]
        fn = os.path.join(image_dir, records[0]['im_file'])
        with open(fn, 'rb') as f:
            records[0]['image'] = f.read()
        if with_mixup:
            mixup_fn = os.path.join(image_dir, records[1]['im_file'])
            records[0]['mixup'] = records[1]
            # read the mixup record's own image, not the base image again
            with open(mixup_fn, 'rb') as f:
                records[0]['mixup']['image'] = f.read()
        cls.sample = records[0]

    @classmethod
    def tearDownClass(cls):
        """ tearDownClass """
        pass

    def test_ops_all(self):
        """ test operators
        """
        # ResizeImage
        ops_conf = [{
            'op': 'DecodeImage'
        }, {
            'op': 'ResizeImage',
            'target_size': 300,
            'max_size': 1333
        }]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        data = self.sample.copy()
        result0 = mapper(data)
        self.assertIsNotNone(result0['image'])
        self.assertEqual(len(result0['image'].shape), 3)
        # RandFlipImage
        ops_conf = [{'op': 'RandomFlipImage'}]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        result1 = mapper(result0)
        self.assertEqual(result1['image'].shape, result0['image'].shape)
        self.assertEqual(result1['gt_bbox'].shape, result0['gt_bbox'].shape)
        # NormalizeImage
        ops_conf = [{'op': 'NormalizeImage', 'is_channel_first': False}]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        result2 = mapper(result1)
        im1 = result1['image']
        # indices (along the first axis) of every element that is <= 1
        count = np.where(im1 <= 1)[0]
        if im1.dtype == 'float64':
            # Every element is expected to be <= 1. The previous call
            # passed shape[2] as the failure *message* and compared an
            # array with an int.
            # NOTE(review): confirm that 'all values <= 1' really holds
            # for the normalized image before relying on this branch.
            self.assertEqual(count.size, im1.size)
        # ArrangeSample
        ops_conf = [{'op': 'ArrangeRCNN'}]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        result3 = mapper(result2)
        self.assertEqual(type(result3), tuple)

    def test_ops_part1(self):
        """test Crop and Resize
        """
        ops_conf = [{
            'op': 'DecodeImage'
        }, {
            'op': 'NormalizeBox'
        }, {
            'op': 'CropImage',
            'batch_sampler': [[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
                              [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0],
                              [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0],
                              [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0],
                              [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0],
                              [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0],
                              [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
        }]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        data = self.sample.copy()
        result = mapper(data)
        self.assertEqual(len(result['image'].shape), 3)

    def test_ops_part2(self):
        """test Expand and RandomDistort
        """
        ops_conf = [{
            'op': 'DecodeImage'
        }, {
            'op': 'NormalizeBox'
        }, {
            'op': 'ExpandImage',
            'max_ratio': 1.5,
            'prob': 1
        }]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        data = self.sample.copy()
        result = mapper(data)
        self.assertEqual(len(result['image'].shape), 3)
        self.assertGreater(result['gt_bbox'].shape[0], 0)

    def test_ops_part3(self):
        """test Mixup and RandomInterp
        """
        ops_conf = [{
            'op': 'DecodeImage',
            'with_mixup': True,
        }, {
            'op': 'MixupImage',
        }, {
            'op': 'RandomInterpImage',
            'target_size': 608
        }]
        mapper = tf.build_mapper(ops_conf)
        self.assertTrue(mapper is not None)
        data = self.sample.copy()
        result = mapper(data)
        self.assertEqual(len(result['image'].shape), 3)
        self.assertGreater(result['gt_bbox'].shape[0], 0)


if __name__ == '__main__':
    unittest.main()
import os
import time
import unittest
import sys
import logging
import numpy as np
import yaml

import set_env
from ppdet.data.reader import Reader
from ppdet.data.source import build_source
from ppdet.data.source import IteratorSource


def _load_yaml(path):
    """Parse the yaml file at 'path' with the safe loader.

    'yaml.load' without an explicit Loader can construct arbitrary python
    objects and is rejected by PyYAML >= 6; plain configs only need
    'safe_load'.
    """
    with open(path, 'rb') as f:
        return yaml.safe_load(f)


class TestReader(unittest.TestCase):
    """Test cases for dataset.reader
    """

    @classmethod
    def setUpClass(cls):
        """ load both yaml configs and point them at the test COCO data
        """
        prefix = os.path.dirname(os.path.abspath(__file__))
        cls.coco_conf = _load_yaml(os.path.join(prefix, 'coco.yml'))
        cls.coco_conf['DATA']['TRAIN'] = set_env.coco_data['TRAIN']
        cls.coco_conf['DATA']['VAL'] = set_env.coco_data['VAL']

        cls.rcnn_conf = _load_yaml(os.path.join(prefix, 'rcnn_dataset.yml'))
        cls.rcnn_conf['DATA']['TRAIN'] = set_env.coco_data['TRAIN']
        cls.rcnn_conf['DATA']['VAL'] = set_env.coco_data['VAL']

    @classmethod
    def tearDownClass(cls):
        """ tearDownClass """
        pass

    def _check_rcnn_sample(self, out):
        """Check the layout of the first entry of an arranged RCNN batch."""
        self.assertEqual(out[0][0].shape[0], 3)
        self.assertEqual(out[0][1].shape[0], 3)
        self.assertEqual(out[0][3].shape[1], 4)
        self.assertEqual(out[0][4].shape[1], 1)
        self.assertEqual(out[0][5].shape[1], 1)

    def test_train(self):
        """ Test reader for training
        """
        coco = Reader(
            self.coco_conf['DATA'], self.coco_conf['TRANSFORM'], maxiter=1000)
        train_rd = coco.train()
        self.assertTrue(train_rd is not None)

        ct = 0
        total = 0
        nbytes = 0  # was named 'bytes', which shadowed the builtin
        prev_ts = None
        for sample in train_rd():
            if prev_ts is None:
                start_ts = time.time()
                prev_ts = start_ts

            ct += 1
            nbytes += 4 * sample[0][0].size * len(sample[0])
            self.assertTrue(sample is not None)
            cost = time.time() - prev_ts
            # report throughput roughly once per second
            if cost >= 1.0:
                total += ct
                qps = total / (time.time() - start_ts)
                bps = nbytes / (time.time() - start_ts)

                logging.info('got %d/%d samples in %.3fsec with qps:%d bps:%d' %
                             (ct, total, cost, qps, bps))
                nbytes = 0
                ct = 0
                prev_ts = time.time()

        total += ct
        self.assertEqual(total, coco._maxiter)

    def test_val(self):
        """ Test reader for validation
        """
        coco = Reader(self.coco_conf['DATA'], self.coco_conf['TRANSFORM'], 10)
        val_rd = coco.val()
        self.assertTrue(val_rd is not None)

        # test 3 epoches
        for _ in range(3):
            ct = 0
            for sample in val_rd():
                ct += 1
                self.assertTrue(sample is not None)
            self.assertGreaterEqual(ct, coco._maxiter)

    def test_rcnn_train(self):
        """ Test reader for training
        """
        anno = self.rcnn_conf['DATA']['TRAIN']['ANNO_FILE']
        if not os.path.exists(anno):
            logging.error('exit test_rcnn for not found file[%s]' % (anno))
            return

        rcnn = Reader(self.rcnn_conf['DATA'], self.rcnn_conf['TRANSFORM'], 10)
        rcnn_rd = rcnn.train()
        self.assertTrue(rcnn_rd is not None)

        ct = 0
        out = None
        for sample in rcnn_rd():
            out = sample
            ct += 1
            self.assertTrue(sample is not None)
        self._check_rcnn_sample(out)
        self.assertGreaterEqual(ct, rcnn._maxiter)

    def test_create(self):
        """ Test create a reader using my source
        """

        def _my_data_reader():
            mydata = build_source(self.rcnn_conf['DATA']['TRAIN'])
            for sample in mydata:
                yield sample

        my_source = IteratorSource(_my_data_reader)
        mode = 'TRAIN'
        train_rd = Reader.create(mode,
                                 self.rcnn_conf['DATA'][mode],
                                 self.rcnn_conf['TRANSFORM'][mode],
                                 max_iter=10,
                                 my_source=my_source)

        out = None
        for sample in train_rd():
            out = sample
            self.assertTrue(sample is not None)
        self._check_rcnn_sample(out)


if __name__ == '__main__':
    unittest.main()
class TestRoiDbSource(unittest.TestCase):
    """Unit tests covering dataset.source.roidb_source."""

    @classmethod
    def setUpClass(cls):
        """Assemble the source config shared by every test in this class."""
        train_conf = set_env.coco_data['TRAIN']
        data_cf = dict(
            anno_file=train_conf['ANNO_FILE'],
            image_dir=train_conf['IMAGE_DIR'],
            samples=100,
            load_img=True)
        cls.config = dict(data_cf=data_cf, cname2cid=None)

    @classmethod
    def tearDownClass(cls):
        """Nothing was acquired in setUpClass, so nothing is released."""

    def test_basic(self):
        """Exercise the basic apis 'next/size/drained' by a full iteration."""
        source = build_source(self.config)
        for idx, record in enumerate(source):
            self.assertTrue('image' in record)
            self.assertGreater(len(record['image']), 0)
        self.assertTrue(source.drained())
        seen = idx + 1
        self.assertEqual(seen, source.size())

    def test_reset(self):
        """Exercise 'reset/epoch_id': the epoch id moves from 0 to 1."""
        source = build_source(self.config)

        first = source.next()
        self.assertTrue(first is not None)
        self.assertEqual(source.epoch_id(), 0)

        source.reset()

        self.assertEqual(source.epoch_id(), 1)
        after_reset = source.next()
        self.assertTrue(after_reset is not None)


if __name__ == '__main__':
    unittest.main()
logger = logging.getLogger(__name__)

logging.basicConfig(level=logging.INFO)


class TestTransformer(unittest.TestCase):
    """Unit tests covering dataset.transform.transformer."""

    @classmethod
    def setUpClass(cls):
        """Prepare the source config and the operator list used by all tests."""
        prefix = os.path.dirname(os.path.abspath(__file__))
        # json data
        train_conf = set_env.coco_data['TRAIN']
        cls.sc_config = dict(
            anno_file=train_conf['ANNO_FILE'],
            image_dir=train_conf['IMAGE_DIR'],
            samples=200)

        decode_op = {'op': 'DecodeImage', 'to_rgb': True}
        resize_op = {'op': 'ResizeImage', 'target_size': 800, 'max_size': 1333}
        arrange_op = {'op': 'ArrangeRCNN', 'is_mask': False}
        cls.ops = [decode_op, resize_op, arrange_op]

    @classmethod
    def tearDownClass(cls):
        """Nothing was acquired in setUpClass, so nothing is released."""

    def _count_arrays(self, dataset):
        """Walk `dataset`, checking each sample starts with an ndarray.

        Returns the number of samples that were yielded.
        """
        seen = 0
        for sample in dataset:
            self.assertTrue(type(sample[0]) is np.ndarray)
            seen += 1
        return seen

    def test_map(self):
        """Check transformer.map without concurrent workers."""
        mapper = tf.build_mapper(self.ops)
        source = build_source(self.sc_config)
        mapped = tf.map(source, mapper)

        seen = self._count_arrays(mapped)
        self.assertEqual(seen, mapped.size())

    def test_parallel_map(self):
        """Check transformer.map with concurrent workers, across a reset."""
        mapper = tf.build_mapper(self.ops)
        source = build_source(self.sc_config)
        mapped = tf.map(source, mapper,
                        {'WORKER_NUM': 2,
                         'use_process': True})

        first_pass = self._count_arrays(mapped)
        self.assertTrue(mapped.drained())
        self.assertEqual(first_pass, mapped.size())
        mapped.reset()

        second_pass = self._count_arrays(mapped)
        self.assertEqual(second_pass, mapped.size())

    def test_batch(self):
        """Check that every batch of the batched dataset is full."""
        batchsize = 2
        mapper = tf.build_mapper(self.ops)
        source = build_source(self.sc_config)
        mapped = tf.map(source, mapper)
        # third positional argument of tf.batch is drop_last
        for one_batch in tf.batch(mapped, batchsize, True):
            self.assertEqual(len(one_batch), batchsize)


if __name__ == '__main__':
    unittest.main()
def parse_args():
    """Parse the command-line arguments of this conversion tool.

    Returns:
        argparse.Namespace with the attributes `type`, `annotation`,
        `save_dir` and `samples`.
    """
    parser = argparse.ArgumentParser(
        description='Generate Standard Dataset for PPdetection')

    parser.add_argument(
        '--type',
        type=str,
        default='json',
        help='file format of label file, eg: json for COCO and xml for VOC')
    parser.add_argument(
        '--annotation',
        type=str,
        help='label file name for COCO or VOC dataset, '
        'eg: instances_val2017.json or train.txt')
    parser.add_argument(
        '--save-dir',
        type=str,
        default='roidb',
        help='directory to save roidb file which contains pickled samples')
    parser.add_argument(
        '--samples',
        type=int,
        default=-1,
        help='number of samples to dump, default to all')

    return parser.parse_args()


def dump_coco_as_pickle(args):
    """Load COCO data and save it as a pickled `(roidb, cat2id)` file.

    The output is written to `<save_dir>/<annotation base name>.roidb`.

    Args:
        args: namespace providing `samples`, `save_dir` and `annotation`
            (path of the COCO json label file).

    Notes:
        label file of COCO contains a json which consists
        of label info for each sample
    """
    samples = args.samples
    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    anno_path = args.annotation
    roidb, cat2id = loader.load(anno_path, samples, with_cat2id=True)
    samples = len(roidb)

    # Drop only a literal '.json' suffix.  str.rstrip('.json') strips any
    # trailing run of the characters '.', 'j', 's', 'o', 'n', which turns
    # e.g. 'person.json' into 'per'.
    dsname = os.path.basename(anno_path)
    if dsname.endswith('.json'):
        dsname = dsname[:-len('.json')]

    roidb_fname = os.path.join(save_dir, '%s.roidb' % (dsname))
    with open(roidb_fname, "wb") as fout:
        pkl.dump((roidb, cat2id), fout)

    logging.info('dumped %d samples to file[%s]' % (samples, roidb_fname))


def dump_voc_as_pickle(args):
    """Load VOC data and save it as a pickled `(roidb, cat2id)` file.

    Besides the pickle, a `label_list.txt` holding one category name per
    line is written next to the annotation list file.

    Args:
        args: namespace providing `samples`, `save_dir` and `annotation`
            (path of the list file, `~` is expanded).

    Notes:
        we assume label file of VOC contains lines
        each of which corresponds to a xml file
        that contains it's label info
    """
    samples = args.samples
    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    anno_path = os.path.expanduser(args.annotation)
    roidb, cat2id = loader.load(
        anno_path, samples, with_cat2id=True, use_default_label=None)
    samples = len(roidb)

    # The dataset name is taken from the 4th path component from the end,
    # e.g. <name>/ImageSets/Main/train.txt
    part = anno_path.split('/')
    dsname = part[-4]
    roidb_fname = os.path.join(save_dir, '%s.roidb' % (dsname))
    with open(roidb_fname, "wb") as fout:
        pkl.dump((roidb, cat2id), fout)

    # Use the directory of the list file whatever the file is called; the
    # previous split on the literal '/train.txt' produced a bogus path for
    # any other list name such as val.txt.
    label_list_path = os.path.join(
        os.path.dirname(anno_path), 'label_list.txt')
    with open(label_list_path, 'w') as fw:
        for key in cat2id:
            fw.write(key + '\n')
    logging.info('dumped %d samples to file[%s]' % (samples, roidb_fname))


if __name__ == "__main__":
    # Make sure you have already downloaded original COCO or VOC data,
    # then you can convert it using this tool.
    #
    # Usage:
    #     python generate_data_for_training.py --type=json
    #         --annotation=./annotations/instances_val2017.json
    #         --save-dir=./roidb --samples=100
    args = parse_args()

    # VOC data are organized in xml files
    if args.type == 'xml':
        dump_voc_as_pickle(args)
    # COCO data are organized in json file
    elif args.type == 'json':
        dump_coco_as_pickle(args)
    else:
        # the exception used to be constructed but never raised, so an
        # unsupported --type silently did nothing
        raise TypeError('Can\'t deal with {} type. '
                        'Only xml or json file format supported'.format(
                            args.type))
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes numpy scalars and arrays."""

    def default(self, obj):
        """Convert numpy integers/floats/arrays to plain Python values."""
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)


def getbbox(self, points):
    # NOTE(review): takes `self` although it is defined at module level and
    # nothing in this file calls it; kept unchanged for compatibility.
    polygons = points
    mask = self.polygons_to_mask([self.height, self.width], polygons)
    return self.mask2box(mask)


def images(data, num):
    """Build the COCO `images` entry for one labelme json.

    Args:
        data (dict): parsed labelme json; `imageHeight`, `imageWidth` and
            `imagePath` are read.
        num (int): zero-based image index; the COCO id is `num + 1`.

    Returns:
        dict with keys `height`, `width`, `id` and `file_name`.
    """
    image = {}
    image['height'] = data['imageHeight']
    image['width'] = data['imageWidth']
    image['id'] = num + 1
    # imagePath may use '\\' as separator when the json was written on
    # Windows; keep only the base name in both cases.
    image['file_name'] = data['imagePath'].replace('\\', '/').split('/')[-1]
    return image


def categories(label, labels_list):
    """Build the COCO `categories` entry for a label not seen before.

    Args:
        label (str): category name.
        labels_list (list): labels registered so far; the new id is
            `len(labels_list) + 1`.
    """
    category = {}
    category['supercategory'] = 'component'
    category['id'] = len(labels_list) + 1
    category['name'] = label
    return category


def annotations_rectangle(points, label, image_num, object_num, label_to_num):
    """Build the COCO annotation of a rectangle shape.

    Args:
        points (list): four `[x, y]` corners ordered as
            `[p0, p1, [x0, y1], [x1, y0]]`, where p0/p1 are the two corners
            stored by labelme.
        label (str): category name, looked up in `label_to_num`.
        image_num (int): zero-based image index.
        object_num (int): zero-based annotation index.
        label_to_num (dict): category name -> category id.

    Returns:
        dict in COCO annotation layout, bbox is `[x, y, w, h]`.
    """
    annotation = {}
    corners = np.asarray(points)
    seg_points = corners.copy()
    # swap rows 1 and 2 so that the corners follow the outline in order
    seg_points[1, :] = corners[2, :]
    seg_points[2, :] = corners[1, :]
    annotation['segmentation'] = [list(seg_points.flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(
        map(float, [
            points[0][0], points[0][1], points[1][0] - points[0][0],
            points[1][1] - points[0][1]
        ]))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def annotations_polygon(height, width, points, label, image_num, object_num,
                        label_to_num):
    """Build the COCO annotation of a polygon shape.

    The bbox is derived by rasterizing the polygon (see `get_bbox`); the
    parameters otherwise match `annotations_rectangle`, with `height` and
    `width` being the image size in pixels.
    """
    annotation = {}
    annotation['segmentation'] = [list(np.asarray(points).flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def get_bbox(height, width, points):
    """Return `[x, y, w, h]` of the polygon `points` drawn on an image.

    The polygon is filled into a `height` x `width` mask and the extent of
    the set pixels is measured.
    """
    # Local imports: the module only imports PIL.ImageDraw at the top, which
    # makes PIL.Image available merely as a side effect.
    import PIL.Image
    import PIL.ImageDraw

    mask = PIL.Image.fromarray(np.zeros([height, width], dtype=np.uint8))
    xy = list(map(tuple, points))
    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)
    mask = np.array(mask, dtype=bool)
    index = np.argwhere(mask == 1)
    rows = index[:, 0]
    cols = index[:, 1]
    left_top_r = np.min(rows)
    left_top_c = np.min(cols)
    right_bottom_r = np.max(rows)
    right_bottom_c = np.max(cols)
    return [
        left_top_c, left_top_r, right_bottom_c - left_top_c,
        right_bottom_r - left_top_r
    ]


def deal_json(img_path, json_path):
    """Convert the labelme jsons of all images in `img_path` to COCO.

    Args:
        img_path (str): directory listing the images of one split.
        json_path (str): directory with one `<image stem>.json` per image.

    Returns:
        dict with the keys `images`, `categories` and `annotations`.
    """
    data_coco = {}
    label_to_num = {}
    images_list = []
    categories_list = []
    annotations_list = []
    labels_list = []
    # Annotation ids must be unique in the whole dataset, so this counter
    # runs across images (it used to restart at every image, producing
    # duplicate ids).
    object_num = -1
    for image_num, img_file in enumerate(os.listdir(img_path)):
        # splitext keeps dots inside the stem ('a.b.jpg' -> 'a.b')
        img_label = osp.splitext(img_file)[0]
        label_file = osp.join(json_path, img_label + '.json')
        print('Generating dataset from:', label_file)
        with open(label_file) as f:
            data = json.load(f)
        images_list.append(images(data, image_num))
        for shapes in data['shapes']:
            object_num = object_num + 1
            label = shapes['label']
            if label not in labels_list:
                categories_list.append(categories(label, labels_list))
                labels_list.append(label)
                label_to_num[label] = len(labels_list)
            points = shapes['points']
            p_type = shapes['shape_type']
            if p_type == 'polygon':
                annotations_list.append(
                    annotations_polygon(data['imageHeight'], data[
                        'imageWidth'], points, label, image_num, object_num,
                                        label_to_num))
            elif p_type == 'rectangle':
                # complete the two stored corners to four
                points.append([points[0][0], points[1][1]])
                points.append([points[1][0], points[0][1]])
                annotations_list.append(
                    annotations_rectangle(points, label, image_num,
                                          object_num, label_to_num))
    data_coco['images'] = images_list
    data_coco['categories'] = categories_list
    data_coco['annotations'] = annotations_list
    return data_coco


def _ensure_dir(path):
    """Create `path` (with parents) unless it already exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def _dump_split(image_dir, json_input_dir, json_path):
    """Convert one split directory to COCO and write it to `json_path`."""
    data_coco = deal_json(image_dir, json_input_dir)
    with open(json_path, 'w') as fout:
        json.dump(data_coco, fout, indent=4, cls=MyEncoder)


def main():
    """Split a labelme dataset into train/val/test and write COCO jsons.

    Images are copied to `<output_dir>/{train,val,test}` in directory
    listing order and the annotations go to `<output_dir>/annotations`.
    Exits with status 1 when the arguments are invalid.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--json_input_dir', help='input annotated directory')
    parser.add_argument('--image_input_dir', help='image directory')
    parser.add_argument(
        '--output_dir', help='output dataset directory', default='../../../')
    parser.add_argument(
        '--train_proportion',
        help='the proportion of train dataset',
        type=float,
        default=1.0)
    parser.add_argument(
        '--val_proportion',
        help='the proportion of validation dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--test_proportion',
        help='the proportion of test dataset',
        type=float,
        default=0.0)
    args = parser.parse_args()

    # Explicit checks instead of assert: asserts vanish under `python -O`.
    if args.json_input_dir is None or not os.path.exists(args.json_input_dir):
        print('The json folder does not exist!')
        sys.exit(1)
    if args.image_input_dir is None or not os.path.exists(
            args.image_input_dir):
        print('The image folder does not exist!')
        sys.exit(1)
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 != 1.0 in binary floats.
    proportion_sum = (args.train_proportion + args.val_proportion +
                      args.test_proportion)
    if abs(proportion_sum - 1.0) > 1e-6:
        print('The sum of proportion of training, validation and test '
              'dataset must be 1!')
        sys.exit(1)

    # Allocate the dataset.
    total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))
    if args.train_proportion != 0:
        train_num = int(total_num * args.train_proportion)
        _ensure_dir(args.output_dir + '/train')
    else:
        train_num = 0
    if args.val_proportion == 0.0:
        val_num = 0
    else:
        val_num = int(total_num * args.val_proportion)
        _ensure_dir(args.output_dir + '/val')
    if args.test_proportion != 0.0:
        _ensure_dir(args.output_dir + '/test')

    for count, img_name in enumerate(os.listdir(args.image_input_dir), 1):
        if count <= train_num:
            split = '/train/'
        elif count <= train_num + val_num:
            split = '/val/'
        else:
            split = '/test/'
        shutil.copyfile(
            osp.join(args.image_input_dir, img_name),
            osp.join(args.output_dir + split, img_name))

    # Deal with the json files.
    anno_dir = args.output_dir + '/annotations'
    _ensure_dir(anno_dir)
    if args.train_proportion != 0:
        _dump_split(args.output_dir + '/train', args.json_input_dir,
                    osp.join(anno_dir, 'instance_train.json'))
    if args.val_proportion != 0:
        _dump_split(args.output_dir + '/val', args.json_input_dir,
                    osp.join(anno_dir, 'instance_val.json'))
    if args.test_proportion != 0:
        _dump_split(args.output_dir + '/test', args.json_input_dir,
                    osp.join(anno_dir, 'instance_test.json'))


if __name__ == '__main__':
    main()
__all__ = ['build_mapper', 'map', 'batch', 'batch_map']

logger = logging.getLogger(__name__)


def build_mapper(ops, context=None):
    """
    Build a mapper for operators in 'ops'

    Args:
        ops (list of operator.BaseOperator or list of op dict):
            configs for operators. Dict keys are matched case-insensitively
            and two layouts are accepted, eg:
            [{'op': 'DecodeImage', 'to_rgb': True},
             {'name': 'DecodeImage', 'params': {'to_rgb': True}}]
        context (dict): a context object for mapper, deep-copied for
            every sample

    Returns:
        a mapper function which accept one argument 'sample' and
        return the processed result; its attribute 'ops' holds a string
        describing the operators
    """
    op_funcs = []
    op_repr = []
    for op in ops:
        if isinstance(op, BaseOperator):
            # Already an operator instance.  Key normalization is applied
            # to dict configs only; it used to be applied to every entry,
            # which failed on instances because they have no 'items()'.
            o = op
        else:
            conf = {key.lower(): value for key, value in op.items()}
            if 'op' in conf:
                # flat layout: every key but 'op' is a constructor argument
                params = copy.deepcopy(conf)
                op_name = params.pop('op')
            else:
                op_name = conf['name']
                params = {} if 'params' not in conf else conf['params']
            o = getattr(BaseOperator, op_name)(**params)
        op_funcs.append(o)
        op_repr.append('{{{}}}'.format(str(o)))
    op_repr = '[{}]'.format(','.join(op_repr))

    def _mapper(sample):
        ctx = {} if context is None else copy.deepcopy(context)
        for f in op_funcs:
            try:
                sample = f(sample, ctx)
            except Exception as e:
                stack_info = traceback.format_exc()
                logger.warning(
                    "fail to map op [{}] with error: {} and stack:\n{}".
                    format(f, e, str(stack_info)))
                # bare raise keeps the original traceback
                raise

        # Return the running value so that an empty operator list yields
        # the sample unchanged instead of hitting an unbound local.
        return sample

    _mapper.ops = op_repr
    return _mapper


def map(ds, mapper, worker_args=None):
    """
    Apply 'mapper' to 'ds'

    Args:
        ds (instance of Dataset): dataset to be mapped
        mapper (function): action to be executed for every data sample
        worker_args (dict): configs for concurrent mapper; when None the
            mapping is done by a plain MappedDataset
    Returns:
        a mapped dataset
    """

    if worker_args is not None:
        return ParallelMappedDataset(ds, mapper, worker_args)
    else:
        return MappedDataset(ds, mapper)


def batch(ds, batchsize, drop_last=False, drop_empty=True):
    """
    Batch data samples to batches
    Args:
        ds (instance of Dataset): dataset to be batched
        batchsize (int): number of samples for a batch
        drop_last (bool): drop last few samples if not enough for a batch
        drop_empty (bool): forwarded unchanged to BatchedDataset

    Returns:
        a batched dataset
    """

    return BatchedDataset(
        ds, batchsize, drop_last=drop_last, drop_empty=drop_empty)


def batch_map(ds, config):
    """
    Post process the batches.

    Args:
        ds (instance of Dataset): dataset to be mapped
        config (dict): keyword arguments passed to build_post_map, whose
            result is executed for every batch
    Returns:
        a batched dataset which is processed
    """

    mapper = build_post_map(**config)
    return MappedDataset(ds, mapper)


# Re-export every registered operator class from this package.
for nm in registered_ops:
    op = getattr(BaseOperator, nm)
    globals()[nm] = op

__all__ += registered_ops
logger = logging.getLogger(__name__)


@register_op
class ArrangeRCNN(BaseOperator):
    """
    Transform dict to tuple format needed for training.

    Args:
        is_mask (bool): whether to append mask (polygon) data to the output
    """

    def __init__(self, is_mask=False):
        super(ArrangeRCNN, self).__init__()
        self.is_mask = is_mask
        assert isinstance(self.is_mask, bool), "wrong type for is_mask"

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing following items
                (image, im_info, im_id, gt_bbox, gt_class, is_crowd);
                gt_masks is appended as a 7th item only when is_mask is
                set and sample['gt_poly'] is not empty.
        Raises:
            KeyError: if 'is_crowd' or 'im_info' is missing in sample.
        """
        im = sample['image']
        gt_bbox = sample['gt_bbox']
        gt_class = sample['gt_class']
        keys = list(sample.keys())
        if 'is_crowd' in keys:
            is_crowd = sample['is_crowd']
        else:
            raise KeyError("The dataset doesn't have 'is_crowd' key.")
        if 'im_info' in keys:
            im_info = sample['im_info']
        else:
            raise KeyError("The dataset doesn't have 'im_info' key.")
        im_id = sample['im_id']

        outs = (im, im_info, im_id, gt_bbox, gt_class, is_crowd)
        gt_masks = []
        if self.is_mask and len(sample['gt_poly']) != 0 \
                and 'is_crowd' in keys:
            valid = True
            segms = sample['gt_poly']
            assert len(segms) == is_crowd.shape[0]
            for i in range(len(sample['gt_poly'])):
                segm, iscrowd = segms[i], is_crowd[i]
                gt_segm = []
                if iscrowd:
                    # crowd instances get a single placeholder point
                    gt_segm.append([[0, 0]])
                else:
                    for poly in segm:
                        if len(poly) == 0:
                            valid = False
                            break
                        # flat coordinate list -> rows of 2 values
                        gt_segm.append(np.array(poly).reshape(-1, 2))
                # stop collecting at the first invalid or empty instance;
                # masks gathered so far are still returned
                if (not valid) or len(gt_segm) == 0:
                    break
                gt_masks.append(gt_segm)
            outs = outs + (gt_masks, )
        return outs


@register_op
class ArrangeEvalRCNN(BaseOperator):
    """
    Transform dict to the tuple format needed for evaluation.
    """

    def __init__(self):
        super(ArrangeEvalRCNN, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_info, im_id, im_shape, gt_bbox,
                    gt_class, difficult)
                    where every value whose key contains 'image' is
                    emitted first, in sorted key order.
        Raises:
            KeyError: if 'im_info' is missing in sample.
        """
        ims = []
        keys = sorted(list(sample.keys()))
        for k in keys:
            if 'image' in k:
                ims.append(sample[k])
        if 'im_info' in keys:
            im_info = sample['im_info']
        else:
            raise KeyError("The dataset doesn't have 'im_info' key.")
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        # For rcnn models in eval and infer stage, original image size
        # is needed to clip the bounding boxes. And box clip op in
        # bbox prediction needs im_info as input in format of [N, 3],
        # so im_shape is appended by 1 to match dimension.
        im_shape = np.array((h, w, 1), dtype=np.float32)
        gt_bbox = sample['gt_bbox']
        gt_class = sample['gt_class']
        difficult = sample['difficult']
        remain_list = [im_info, im_id, im_shape, gt_bbox, gt_class, difficult]
        ims.extend(remain_list)
        outs = tuple(ims)
        return outs


@register_op
class ArrangeTestRCNN(BaseOperator):
    """
    Transform dict to the tuple format needed for inference.
    """

    def __init__(self):
        super(ArrangeTestRCNN, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_info, im_id, im_shape)
                    where every value whose key contains 'image' is
                    emitted first, in sorted key order.
        Raises:
            KeyError: if 'im_info' is missing in sample.
        """
        ims = []
        keys = sorted(list(sample.keys()))
        for k in keys:
            if 'image' in k:
                ims.append(sample[k])
        if 'im_info' in keys:
            im_info = sample['im_info']
        else:
            raise KeyError("The dataset doesn't have 'im_info' key.")
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        # For rcnn models in eval and infer stage, original image size
        # is needed to clip the bounding boxes. And box clip op in
        # bbox prediction needs im_info as input in format of [N, 3],
        # so im_shape is appended by 1 to match dimension.
        im_shape = np.array((h, w, 1), dtype=np.float32)
        remain_list = [im_info, im_id, im_shape]
        ims.extend(remain_list)
        outs = tuple(ims)
        return outs


@register_op
class ArrangeSSD(BaseOperator):
    """
    Transform dict to tuple format needed for training.
    """

    def __init__(self):
        super(ArrangeSSD, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, gt_bbox, gt_class)
        """
        im = sample['image']
        gt_bbox = sample['gt_bbox']
        gt_class = sample['gt_class']
        outs = (im, gt_bbox, gt_class)
        return outs


@register_op
class ArrangeEvalSSD(BaseOperator):
    """
    Transform dict to tuple format needed for evaluation.

    Args:
        fields (list): names of the items to emit, in output order
    """

    def __init__(self, fields):
        super(ArrangeEvalSSD, self).__init__()
        self.fields = fields

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple with one item per entry of self.fields.
                    'im_shape' yields np.array((h, w)), 'is_difficult',
                    'gt_box' and 'gt_label' are read from
                    sample['difficult'], sample['gt_bbox'] and
                    sample['gt_class']; any other name is read from
                    sample directly.
        Raises:
            ValueError: if gt_bbox and gt_class differ in length.
        """
        outs = []
        if len(sample['gt_bbox']) != len(sample['gt_class']):
            raise ValueError("gt num mismatch: bbox and class.")
        for field in self.fields:
            if field == 'im_shape':
                h = sample['h']
                w = sample['w']
                im_shape = np.array((h, w))
                outs.append(im_shape)
            elif field == 'is_difficult':
                outs.append(sample['difficult'])
            elif field == 'gt_box':
                outs.append(sample['gt_bbox'])
            elif field == 'gt_label':
                outs.append(sample['gt_class'])
            else:
                outs.append(sample[field])

        outs = tuple(outs)

        return outs


@register_op
class ArrangeTestSSD(BaseOperator):
    """
    Transform dict to tuple format needed for inference.
    """

    def __init__(self):
        super(ArrangeTestSSD, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_id, im_shape)
        """
        im = sample['image']
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        im_shape = np.array((h, w))
        outs = (im, im_id, im_shape)
        return outs


@register_op
class ArrangeYOLO(BaseOperator):
    """
    Transform dict to the tuple format needed for training.
    """

    def __init__(self):
        super(ArrangeYOLO, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, gt_bbox, gt_class, gt_score)
                    The gt arrays always have 50 rows: at most the first
                    50 ground truths are kept and the rest is zero filled.
                    gt_bbox rows are [center x, center y, w, h].
        Raises:
            ValueError: if gt_bbox differs in length from gt_class or
                    gt_score.
        """
        im = sample['image']
        if len(sample['gt_bbox']) != len(sample['gt_class']):
            raise ValueError("gt num mismatch: bbox and class.")
        if len(sample['gt_bbox']) != len(sample['gt_score']):
            raise ValueError("gt num mismatch: bbox and score.")
        gt_bbox = np.zeros((50, 4), dtype=im.dtype)
        gt_class = np.zeros((50, ), dtype=np.int32)
        gt_score = np.zeros((50, ), dtype=im.dtype)
        gt_num = min(50, len(sample['gt_bbox']))
        if gt_num > 0:
            gt_bbox[:gt_num, :] = sample['gt_bbox'][:gt_num, :]
            gt_class[:gt_num] = sample['gt_class'][:gt_num, 0]
            gt_score[:gt_num] = sample['gt_score'][:gt_num, 0]
        # parse [x1, y1, x2, y2] to [x, y, w, h]
        gt_bbox[:, 2:4] = gt_bbox[:, 2:4] - gt_bbox[:, :2]
        gt_bbox[:, :2] = gt_bbox[:, :2] + gt_bbox[:, 2:4] / 2.
        outs = (im, gt_bbox, gt_class, gt_score)
        return outs


@register_op
class ArrangeEvalYOLO(BaseOperator):
    """
    Transform dict to the tuple format needed for evaluation.
    """

    def __init__(self):
        super(ArrangeEvalYOLO, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_shape, im_id, gt_bbox, gt_class,
                    difficult)
                    The gt arrays always have 50 rows: at most the first
                    50 ground truths are kept and the rest is zero filled.
        Raises:
            ValueError: if gt_bbox and gt_class differ in length.
        """
        im = sample['image']
        if len(sample['gt_bbox']) != len(sample['gt_class']):
            raise ValueError("gt num mismatch: bbox and class.")
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        im_shape = np.array((h, w))
        gt_bbox = np.zeros((50, 4), dtype=im.dtype)
        gt_class = np.zeros((50, ), dtype=np.int32)
        difficult = np.zeros((50, ), dtype=np.int32)
        gt_num = min(50, len(sample['gt_bbox']))
        if gt_num > 0:
            gt_bbox[:gt_num, :] = sample['gt_bbox'][:gt_num, :]
            gt_class[:gt_num] = sample['gt_class'][:gt_num, 0]
            difficult[:gt_num] = sample['difficult'][:gt_num, 0]
        outs = (im, im_shape, im_id, gt_bbox, gt_class, difficult)
        return outs


@register_op
class ArrangeTestYOLO(BaseOperator):
    """
    Transform dict to the tuple format needed for inference.
    """

    def __init__(self):
        super(ArrangeTestYOLO, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_shape, im_id)
        """
        im = sample['image']
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        im_shape = np.array((h, w))
        outs = (im, im_shape, im_id)
        return outs
def clip_bbox(src_bbox):
    """Clamp the four coordinates of a box to the range [0, 1].

    The box is modified in place and the same object is returned.

    Args:
        src_bbox: an indexable, mutable box [xmin, ymin, xmax, ymax].
    Returns:
        src_bbox itself, with entries 0..3 clamped.
    """
    for coord in range(4):
        src_bbox[coord] = max(min(src_bbox[coord], 1.0), 0.0)
    return src_bbox
def bbox_area_sampling(bboxes, labels, scores, target_size, min_size):
    """Drop boxes whose area, scaled by target_size, is below min_size**2.

    Args:
        bboxes: sequence of boxes [xmin, ymin, xmax, ymax].
        labels: per-box labels, indexed like bboxes.
        scores: per-box scores as a numpy array, or None.
        target_size: factor applied to both box width and height.
        min_size: minimum side length; boxes with scaled area smaller than
            min_size * min_size are removed.
    Returns:
        (bboxes, labels, scores) as numpy arrays holding only the kept
        entries. scores is an empty array when the input scores is None
        or has size 0.
    """
    kept = [
        idx for idx, box in enumerate(bboxes)
        if not (float((box[2] - box[0]) * target_size) *
                float((box[3] - box[1]) * target_size) <
                float(min_size * min_size))
    ]
    kept_bboxes = [bboxes[idx] for idx in kept]
    kept_labels = [labels[idx] for idx in kept]
    kept_scores = [
        scores[idx] for idx in kept
        if scores is not None and scores.size != 0
    ]
    return np.array(kept_bboxes), np.array(kept_labels), np.array(kept_scores)
def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
                         resize_width):
    """Sample a square crop box around one randomly chosen ground-truth box.

    A ground-truth box is picked at random, a target scale is drawn from
    scale_array (limited by the scale bucket the box area falls in), and a
    square region of side ``wid * resize_width / scale`` pixels is placed
    at a random offset.

    Args:
        bbox_labels: sequence of boxes whose first four entries are
            normalized [xmin, ymin, xmax, ymax].
        image_width: image width in pixels.
        image_height: image height in pixels.
        scale_array: increasing sequence of anchor scales (side lengths).
        resize_width: factor used to turn the chosen scale into a crop size.
            NOTE(review): presumably the network input width; confirm
            against the caller.
    Returns:
        [xmin, ymin, xmax, ymax] of the sampled region, normalized by the
        image size (it may extend outside [0, 1]), or the integer 0 when
        bbox_labels is empty.
    """
    num_gt = len(bbox_labels)
    if num_gt == 0:
        return 0

    # np.random.randint range: [low, high)
    chosen = bbox_labels[np.random.randint(0, num_gt)]
    norm_xmin, norm_ymin = chosen[0], chosen[1]
    norm_xmax, norm_ymax = chosen[2], chosen[3]

    xmin = norm_xmin * image_width
    ymin = norm_ymin * image_height
    wid = image_width * (norm_xmax - norm_xmin)
    hei = image_height * (norm_ymax - norm_ymin)
    area = wid * hei

    # Locate the first bucket (scale[i]^2, scale[i+1]^2) containing the area.
    range_size = 0
    for bucket in range(len(scale_array) - 1):
        if scale_array[bucket]**2 < area < scale_array[bucket + 1]**2:
            range_size = bucket + 1
            break

    # Boxes larger than the second-to-last scale are capped at that index.
    last_usable = len(scale_array) - 2
    if area > scale_array[last_usable]**2:
        range_size = last_usable

    if range_size == 0:
        rand_idx_size = 0
    else:
        # Drawn from [0, range_size]; the value needs no further reduction.
        rand_idx_size = np.random.randint(0, range_size + 1)

    base_scale = scale_array[rand_idx_size]
    min_resize_val = base_scale / 2.0
    max_resize_val = 2.0 * base_scale
    if rand_idx_size == range_size:
        # For the largest admissible scale, also bound by the box size.
        max_resize_val = min(max_resize_val, 2 * math.sqrt(wid * hei))
    scale_choose = random.uniform(min_resize_val, max_resize_val)

    sample_bbox_size = wid * resize_width / scale_choose

    if sample_bbox_size < max(image_height, image_width):
        if wid <= sample_bbox_size:
            w_bounds = (xmin + wid - sample_bbox_size, xmin)
        else:
            w_bounds = (xmin, xmin + wid - sample_bbox_size)
        if hei <= sample_bbox_size:
            h_bounds = (ymin + hei - sample_bbox_size, ymin)
        else:
            h_bounds = (ymin, ymin + hei - sample_bbox_size)
    else:
        w_bounds = (image_width - sample_bbox_size, 0.0)
        h_bounds = (image_height - sample_bbox_size, 0.0)

    # Horizontal offset is drawn before the vertical one.
    w_off_orig = math.floor(np.random.uniform(w_bounds[0], w_bounds[1]))
    h_off_orig = math.floor(np.random.uniform(h_bounds[0], h_bounds[1]))

    # Figure out top left coordinates.
    w_off = float(w_off_orig / image_width)
    h_off = float(h_off_orig / image_height)

    return [
        w_off, h_off, w_off + float(sample_bbox_size / image_width),
        h_off + float(sample_bbox_size / image_height)
    ]
def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):
    """Check a sampled box against the sampler's overlap/coverage limits.

    The sample is accepted when at least one ground-truth box satisfies
    every configured constraint. A bound equal to 0 is treated as "not
    set"; when all four bounds are 0 the sample is accepted unconditionally.

    Args:
        sampler: sequence where [6]/[7] are the min/max Jaccard overlap and
            [8]/[9] are the min/max object coverage.
        sample_bbox: the candidate box [xmin, ymin, xmax, ymax].
        gt_bboxes: sequence of ground-truth boxes in the same format.
    Returns:
        bool: True if the sample is acceptable, False otherwise (including
        when constraints are set but gt_bboxes is empty).
    """
    min_overlap, max_overlap = sampler[6], sampler[7]
    min_coverage, max_coverage = sampler[8], sampler[9]
    has_jaccard_overlap = not (min_overlap == 0 and max_overlap == 0)
    has_object_coverage = not (min_coverage == 0 and max_coverage == 0)

    if not has_jaccard_overlap and not has_object_coverage:
        return True

    for gt_bbox in gt_bboxes:
        object_bbox = [gt_bbox[0], gt_bbox[1], gt_bbox[2], gt_bbox[3]]
        if has_jaccard_overlap:
            overlap = jaccard_overlap(sample_bbox, object_bbox)
            if min_overlap != 0 and overlap < min_overlap:
                continue
            if max_overlap != 0 and overlap > max_overlap:
                continue
        if has_object_coverage:
            object_coverage = bbox_coverage(object_bbox, sample_bbox)
            if min_coverage != 0 and object_coverage < min_coverage:
                continue
            if max_coverage != 0 and object_coverage > max_coverage:
                continue
        # Reaching this point means this box passed every configured test.
        # Returning here (rather than tracking a flag across iterations)
        # prevents a box that passed only the overlap test from making the
        # whole sample look acceptable.
        return True
    return False
+ ymin = int(sample_bbox[1] * image_height) + ymax = int(sample_bbox[3] * image_height) + + w_off = xmin + h_off = ymin + width = xmax - xmin + height = ymax - ymin + cross_xmin = max(0.0, float(w_off)) + cross_ymin = max(0.0, float(h_off)) + cross_xmax = min(float(w_off + width - 1.0), float(image_width)) + cross_ymax = min(float(h_off + height - 1.0), float(image_height)) + cross_width = cross_xmax - cross_xmin + cross_height = cross_ymax - cross_ymin + + roi_xmin = 0 if w_off >= 0 else abs(w_off) + roi_ymin = 0 if h_off >= 0 else abs(h_off) + roi_width = cross_width + roi_height = cross_height + + roi_y1 = int(roi_ymin) + roi_y2 = int(roi_ymin + roi_height) + roi_x1 = int(roi_xmin) + roi_x2 = int(roi_xmin + roi_width) + + cross_y1 = int(cross_ymin) + cross_y2 = int(cross_ymin + cross_height) + cross_x1 = int(cross_xmin) + cross_x2 = int(cross_xmin + cross_width) + + sample_img = np.zeros((height, width, 3)) + sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ + img[cross_y1: cross_y2, cross_x1: cross_x2] + + sample_img = cv2.resize( + sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) + + return sample_img diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py new file mode 100644 index 0000000000000000000000000000000000000000..b09998120ffc1e76e42299489f16bacc6424454c --- /dev/null +++ b/ppdet/data/transform/operators.py @@ -0,0 +1,991 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class BaseOperator(object):
    """Base class of the sample operators; by default it is a no-op."""

    def __init__(self, name=None):
        """
        Args:
            name (str): prefix of the operator id; defaults to the class
                name. Six characters of a random UUID are appended.
        """
        prefix = self.__class__.__name__ if name is None else name
        suffix = str(uuid.uuid4())[-6:]
        self._id = prefix + '_' + suffix

    def __call__(self, sample, context=None):
        """ Process a sample.
        Args:
            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
            context (dict): info about this sample processing
        Returns:
            result (dict): a processed sample (here: the input, untouched)
        """
        return sample

    def __str__(self):
        return str(self._id)
+ + Args: + to_rgb (bool): whether to convert BGR to RGB + with_mixup (bool): whether or not to mixup image and gt_bbbox/gt_score + """ + + super(DecodeImage, self).__init__() + self.to_rgb = to_rgb + self.with_mixup = with_mixup + if not isinstance(self.to_rgb, bool): + raise TypeError("{}: input type is invalid.".format(self)) + if not isinstance(self.with_mixup, bool): + raise TypeError("{}: input type is invalid.".format(self)) + + def __call__(self, sample, context=None): + """ load image if 'im_file' field is not empty but 'image' is""" + if 'image' not in sample: + with open(sample['im_file'], 'rb') as f: + sample['image'] = f.read() + + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + if self.to_rgb: + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + sample['image'] = im + + if 'h' not in sample: + sample['h'] = im.shape[0] + if 'w' not in sample: + sample['w'] = im.shape[1] + # make default im_info with [h, w, 1] + sample['im_info'] = np.array( + [im.shape[0], im.shape[1], 1.], dtype=np.float32) + # decode mixup image + if self.with_mixup and 'mixup' in sample: + self.__call__(sample['mixup'], context) + return sample + + +@register_op +class MultiscaleTestResize(BaseOperator): + def __init__(self, + origin_target_size=800, + origin_max_size=1333, + target_size=[], + max_size=2000, + interp=cv2.INTER_LINEAR, + use_flip=True): + """ + Rescale image to the each size in target size, and capped at max_size. + + Args: + origin_target_size(int): original target size of image's short side. + origin_max_size(int): original max size of image. + target_size (list): A list of target sizes of image's short side. + max_size (int): the max size of image. + interp (int): the interpolation method. + use_flip (bool): whether use flip augmentation. 
+ """ + super(MultiscaleTestResize, self).__init__() + self.origin_target_size = int(origin_target_size) + self.origin_max_size = int(origin_max_size) + self.max_size = int(max_size) + self.interp = int(interp) + self.use_flip = use_flip + + if not isinstance(target_size, list): + raise TypeError( + "Type of target_size is invalid. Must be List, now is {}". + format(type(target_size))) + self.target_size = target_size + if not (isinstance(self.origin_target_size, int) and isinstance( + self.origin_max_size, int) and isinstance(self.max_size, int) + and isinstance(self.interp, int)): + raise TypeError("{}: input type is invalid.".format(self)) + + def __call__(self, sample, context=None): + """ Resize the image numpy for multi-scale test. + """ + origin_ims = {} + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + if float(im_size_min) == 0: + raise ZeroDivisionError('{}: min size of image is 0'.format(self)) + base_name_list = ['image'] + origin_ims['image'] = im + if self.use_flip: + sample['flip_image'] = im[:, ::-1, :] + base_name_list.append('flip_image') + origin_ims['flip_image'] = sample['flip_image'] + im_info = [] + for base_name in base_name_list: + im_scale = float(self.origin_target_size) / float(im_size_min) + # Prevent the biggest axis from being more than max_size + if np.round(im_scale * im_size_max) > self.origin_max_size: + im_scale = float(self.origin_max_size) / float(im_size_max) + im_scale_x = im_scale + im_scale_y = im_scale + + resize_w = np.round(im_scale_x * float(im_shape[1])) + resize_h = np.round(im_scale_y * float(im_shape[0])) + im_resize = cv2.resize( + origin_ims[base_name], + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + 
@register_op
class ResizeImage(BaseOperator):
    def __init__(self,
                 target_size=0,
                 max_size=0,
                 interp=cv2.INTER_LINEAR,
                 use_cv2=True):
        """
        Rescale image to the specified target size, and capped at max_size
        if max_size != 0.
        If target_size is a list, a scale is selected randomly from it for
        every sample.

        Args:
            target_size (int|list): the target size of image's short side,
                multi-scale training is adopted when type is list.
            max_size (int): the max size of image; 0 means the image is
                resized to target_size x target_size without keeping the
                aspect ratio.
            interp (int): the interpolation method
            use_cv2 (bool): use the cv2 interpolation method or use PIL
                interpolation method
        Raises:
            TypeError: if target_size is neither an int nor a list.
        """
        super(ResizeImage, self).__init__()
        # int() either succeeds or raises, so no further type check is
        # needed for these two values.
        self.max_size = int(max_size)
        self.interp = int(interp)
        self.use_cv2 = use_cv2
        if not isinstance(target_size, (int, list)):
            raise TypeError(
                "Type of target_size is invalid. Must be Integer or List, now is {}".
                format(type(target_size)))
        self.target_size = target_size

    def __call__(self, sample, context=None):
        """ Resize the image numpy.

        Args:
            sample (dict): must provide 'image' as a 3-dimensional ndarray.
            context (dict): info about this sample processing (unused).
        Returns:
            sample (dict): 'image' is replaced by the resized image; when
            max_size != 0, 'im_info' is set to (or extended with)
            [resize_h, resize_w, im_scale].
        Raises:
            TypeError: if the image is not a numpy array.
            ImageError: if the image is not 3-dimensional.
            ZeroDivisionError: if the shorter image side is 0.
        """
        im = sample['image']
        if not isinstance(im, np.ndarray):
            raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ImageError('{}: image is not 3-dimensional.'.format(self))
        im_shape = im.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])
        if isinstance(self.target_size, list):
            # Case for multi-scale training
            selected_size = random.choice(self.target_size)
        else:
            selected_size = self.target_size
        if float(im_size_min) == 0:
            raise ZeroDivisionError('{}: min size of image is 0'.format(self))
        if self.max_size != 0:
            im_scale = float(selected_size) / float(im_size_min)
            # Prevent the biggest axis from being more than max_size
            if np.round(im_scale * im_size_max) > self.max_size:
                im_scale = float(self.max_size) / float(im_size_max)
            im_scale_x = im_scale
            im_scale_y = im_scale

            resize_w = np.round(im_scale_x * float(im_shape[1]))
            resize_h = np.round(im_scale_y * float(im_shape[0]))
            im_info = [resize_h, resize_w, im_scale]
            # A scale other than 1 means an earlier resize already wrote
            # im_info, so append instead of overwriting.
            if 'im_info' in sample and sample['im_info'][2] != 1.:
                sample['im_info'] = np.append(
                    list(sample['im_info']), im_info).astype(np.float32)
            else:
                sample['im_info'] = np.array(im_info).astype(np.float32)
        else:
            im_scale_x = float(selected_size) / float(im_shape[1])
            im_scale_y = float(selected_size) / float(im_shape[0])

            resize_w = selected_size
            resize_h = selected_size

        if self.use_cv2:
            im = cv2.resize(
                im,
                None,
                None,
                fx=im_scale_x,
                fy=im_scale_y,
                interpolation=self.interp)
        else:
            im = Image.fromarray(im)
            # np.round returns floats, but PIL requires an integer size.
            im = im.resize((int(resize_w), int(resize_h)), self.interp)
            im = np.array(im)

        sample['image'] = im
        return sample
segmentation + """ + super(RandomFlipImage, self).__init__() + self.prob = prob + self.is_normalized = is_normalized + self.is_mask_flip = is_mask_flip + if not (isinstance(self.prob, float) and + isinstance(self.is_normalized, bool) and + isinstance(self.is_mask_flip, bool)): + raise TypeError("{}: input type is invalid.".format(self)) + + def flip_segms(self, segms, height, width): + def _flip_poly(poly, width): + flipped_poly = np.array(poly) + flipped_poly[0::2] = width - np.array(poly[0::2]) - 1 + return flipped_poly.tolist() + + def _flip_rle(rle, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects([rle], height, width) + mask = mask_util.decode(rle) + mask = mask[:, ::-1, :] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + def is_poly(segm): + assert isinstance(segm, (list, dict)), \ + "Invalid segm type: {}".format(type(segm)) + return isinstance(segm, list) + + flipped_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + flipped_segms.append([_flip_poly(poly, width) for poly in segm]) + else: + # RLE format + import pycocotools.mask as mask_util + flipped_segms.append(_flip_rle(segm, height, width)) + return flipped_segms + + def __call__(self, sample, context=None): + """Filp the image and bounding box. + Operators: + 1. Flip the image numpy. + 2. Transform the bboxes' x coordinates. + (Must judge whether the coordinates are normalized!) + 3. Transform the segmentations' x coordinates. + (Must judge whether the coordinates are normalized!) + Output: + sample: the image, bounding box and segmentation part + in sample are flipped. 
+ """ + gt_bbox = sample['gt_bbox'] + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image is not a numpy array.".format(self)) + if len(im.shape) != 3: + raise ImageError("{}: image is not 3-dimensional.".format(self)) + height, width, _ = im.shape + if np.random.uniform(0, 1) < self.prob: + im = im[:, ::-1, :] + if gt_bbox.shape[0] == 0: + return sample + oldx1 = gt_bbox[:, 0].copy() + oldx2 = gt_bbox[:, 2].copy() + if self.is_normalized: + gt_bbox[:, 0] = 1 - oldx2 + gt_bbox[:, 2] = 1 - oldx1 + else: + gt_bbox[:, 0] = width - oldx2 - 1 + gt_bbox[:, 2] = width - oldx1 - 1 + if gt_bbox.shape[0] != 0 and (gt_bbox[:, 2] < gt_bbox[:, 0]).all(): + m = "{}: invalid box, x2 should be greater than x1".format(self) + raise BboxError(m) + sample['gt_bbox'] = gt_bbox + if self.is_mask_flip and len(sample['gt_poly']) != 0: + sample['gt_poly'] = self.flip_segms(sample['gt_poly'], height, + width) + sample['flipped'] = True + sample['image'] = im + return sample + + +@register_op +class NormalizeImage(BaseOperator): + def __init__(self, + mean=[0.485, 0.456, 0.406], + std=[1, 1, 1], + is_scale=True, + is_channel_first=True): + """ + Args: + mean (list): the pixel mean + std (list): the pixel variance + """ + super(NormalizeImage, self).__init__() + self.mean = mean + self.std = std + self.is_scale = is_scale + self.is_channel_first = is_channel_first + if not (isinstance(self.mean, list) and isinstance(self.std, list) and + isinstance(self.is_scale, bool)): + raise TypeError("{}: input type is invalid.".format(self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def __call__(self, sample, context=None): + """Normalize the image. + Operators: + 1.(optional) Scale the image to [0,1] + 2. 
@register_op
class RandomDistort(BaseOperator):
    def __init__(self,
                 brightness_lower=0.5,
                 brightness_upper=1.5,
                 contrast_lower=0.5,
                 contrast_upper=1.5,
                 saturation_lower=0.5,
                 saturation_upper=1.5,
                 hue_lower=-18,
                 hue_upper=18,
                 brightness_prob=0.5,
                 contrast_prob=0.5,
                 saturation_prob=0.5,
                 hue_prob=0.5,
                 count=4,
                 is_order=False):
        """
        Args:
            brightness_lower/ brightness_upper (float): the brightness
                between brightness_lower and brightness_upper
            contrast_lower/ contrast_upper (float): the contrast between
                contrast_lower and contrast_upper
            saturation_lower/ saturation_upper (float): the saturation
                between saturation_lower and saturation_upper
            hue_lower/ hue_upper (float): the hue between
                hue_lower and hue_upper
            brightness_prob (float): the probability of changing brightness
            contrast_prob (float): the probability of changing contrast
            saturation_prob (float): the probability of changing saturation
            hue_prob (float): the probability of changing hue
            count (int): the number of distortions to apply
            is_order (bool): if True, use one of two fixed orders (chosen
                with equal probability); if False, sample `count`
                distortions in random order
        """
        super(RandomDistort, self).__init__()
        self.brightness_lower = brightness_lower
        self.brightness_upper = brightness_upper
        self.contrast_lower = contrast_lower
        self.contrast_upper = contrast_upper
        self.saturation_lower = saturation_lower
        self.saturation_upper = saturation_upper
        self.hue_lower = hue_lower
        self.hue_upper = hue_upper
        self.brightness_prob = brightness_prob
        self.contrast_prob = contrast_prob
        self.saturation_prob = saturation_prob
        self.hue_prob = hue_prob
        self.count = count
        self.is_order = is_order

    def random_brightness(self, img):
        """Scale the brightness of a PIL image with brightness_prob."""
        brightness_delta = np.random.uniform(self.brightness_lower,
                                             self.brightness_upper)
        prob = np.random.uniform(0, 1)
        if prob < self.brightness_prob:
            img = ImageEnhance.Brightness(img).enhance(brightness_delta)
        return img

    def random_contrast(self, img):
        """Scale the contrast of a PIL image with contrast_prob."""
        contrast_delta = np.random.uniform(self.contrast_lower,
                                           self.contrast_upper)
        prob = np.random.uniform(0, 1)
        if prob < self.contrast_prob:
            img = ImageEnhance.Contrast(img).enhance(contrast_delta)
        return img

    def random_saturation(self, img):
        """Scale the saturation of a PIL image with saturation_prob."""
        saturation_delta = np.random.uniform(self.saturation_lower,
                                             self.saturation_upper)
        prob = np.random.uniform(0, 1)
        if prob < self.saturation_prob:
            img = ImageEnhance.Color(img).enhance(saturation_delta)
        return img

    def random_hue(self, img):
        """Shift the hue channel of a PIL image with hue_prob."""
        hue_delta = np.random.uniform(self.hue_lower, self.hue_upper)
        prob = np.random.uniform(0, 1)
        if prob < self.hue_prob:
            img = np.array(img.convert('HSV'))
            # NOTE(review): the float sum is cast back into the HSV array on
            # assignment; values outside the channel range rely on that
            # cast - confirm the wrap-around is intended.
            img[:, :, 0] = img[:, :, 0] + hue_delta
            img = Image.fromarray(img, mode='HSV').convert('RGB')
        return img

    def __call__(self, sample, context=None):
        """random distort the image

        Args:
            sample (dict): must provide 'image' as an array accepted by
                PIL.Image.fromarray.
            context (dict): info about this sample processing (unused).
        Returns:
            sample (dict): 'image' is replaced by the distorted image.
        """
        ops = [
            self.random_brightness, self.random_contrast,
            self.random_saturation, self.random_hue
        ]
        if self.is_order:
            prob = np.random.uniform(0, 1)
            if prob < 0.5:
                ops = [
                    self.random_brightness,
                    self.random_saturation,
                    self.random_hue,
                    self.random_contrast,
                ]
        else:
            ops = random.sample(ops, self.count)
        assert 'image' in sample, "image data not found"
        im = sample['image']
        im = Image.fromarray(im)
        # Only the first `count` operations of the chosen order are applied.
        for op_idx in range(self.count):
            im = ops[op_idx](im)
        im = np.asarray(im)
        sample['image'] = im
        return sample
@register_op
class CropImage(BaseOperator):
    """Crop the image to a randomly sampled box and keep matching gt boxes."""

    def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):
        """
        Args:
            batch_sampler (list): Multiple sets of different
                                  parameters for cropping, each laid out as
                [max sample, max trial, min scale, max scale,
                 min aspect ratio, max aspect ratio,
                 min overlap, max overlap]
                e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
            satisfy_all (bool): whether all boxes must satisfy.
            avoid_no_bbox (bool): whether to avoid the
                                  situation where the box does not appear.
        """
        super(CropImage, self).__init__()
        self.batch_sampler = batch_sampler
        self.satisfy_all = satisfy_all
        self.avoid_no_bbox = avoid_no_bbox

    def __call__(self, sample, context):
        """
        Crop the image and modify bounding box.

        Candidate boxes are generated per sampler (at most `sampler[1]`
        trials, stopping once `sampler[0]` candidates were accepted), then
        candidates are tried in random order until one is usable.

        Returns:
            sample: the image, gt_bbox, gt_class and gt_score entries are
                replaced when a crop is applied; otherwise the sample is
                returned unchanged.
        """
        assert 'image' in sample, "image data not found"
        image = sample['image']
        boxes = sample['gt_bbox']
        labels = sample['gt_class']
        width = sample['w']
        height = sample['h']
        scores = sample['gt_score'] if 'gt_score' in sample else None
        candidates = []
        boxes = boxes.tolist()
        for sampler in self.batch_sampler:
            accepted = 0
            for _ in range(sampler[1]):
                if accepted >= sampler[0]:
                    break
                candidate = generate_sample_bbox(sampler)
                if not satisfy_sample_constraint(sampler, candidate, boxes,
                                                 self.satisfy_all):
                    continue
                candidates.append(candidate)
                accepted += 1
        image = np.array(image)
        while candidates:
            pick = int(np.random.uniform(0, len(candidates)))
            region = clip_bbox(candidates.pop(pick))
            crop_bbox, crop_class, crop_score = filter_and_process(
                region, boxes, labels, scores)
            if self.avoid_no_bbox and len(crop_bbox) < 1:
                # this candidate contains no box, try the next one
                continue
            left = int(region[0] * width)
            right = int(region[2] * width)
            top = int(region[1] * height)
            bottom = int(region[3] * height)
            sample['image'] = image[top:bottom, left:right]
            sample['gt_bbox'] = crop_bbox
            sample['gt_class'] = crop_class
            sample['gt_score'] = crop_score
            return sample
        return sample
+ e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] + [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] + [max sample, max trial, min scale, max scale, + min aspect ratio, max aspect ratio, + min overlap, max overlap, min coverage, max coverage] + target_size (bool): target image size. + das_anchor_scales (list[float]): a list of anchor scales in data + anchor smapling. + min_size (float): minimum size of sampled bbox. + avoid_no_bbox (bool): whether to to avoid the + situation where the box does not appear. + """ + super(CropImageWithDataAchorSampling, self).__init__() + self.anchor_sampler = anchor_sampler + self.batch_sampler = batch_sampler + self.target_size = target_size + self.sampling_prob = sampling_prob + self.min_size = min_size + self.avoid_no_bbox = avoid_no_bbox + self.das_anchor_scales = np.array(das_anchor_scales) + + def __call__(self, sample, context): + """ + Crop the image and modify bounding box. + Operators: + 1. Scale the image width and height. + 2. Crop the image according to a radom sample. + 3. Rescale the bounding box. + 4. Determine if the new bbox is satisfied in the new image. + Returns: + sample: the image, bounding box are replaced. + """ + assert 'image' in sample, "image data not found" + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + image_width = sample['w'] + image_height = sample['h'] + gt_score = None + if 'gt_score' in sample: + gt_score = sample['gt_score'] + sampled_bbox = [] + gt_bbox = gt_bbox.tolist() + + prob = np.random.uniform(0., 1.) 
+ if prob > self.sampling_prob: # anchor sampling + assert self.anchor_sampler + for sampler in self.anchor_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = data_anchor_sampling( + gt_bbox, image_width, image_height, + self.das_anchor_scales, self.target_size) + if sample_bbox == 0: + break + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, gt_score) + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + im = crop_image_sampling(im, sample_bbox, image_width, + image_height, self.target_size) + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + sample['gt_score'] = crop_score + return sample + return sample + + else: + for sampler in self.batch_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = generate_sample_bbox_square( + sampler, image_width, image_height) + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + sample_bbox = clip_bbox(sample_bbox) + + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, gt_score) + # sampling bbox according the bbox area + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + 
@register_op
class Permute(BaseOperator):
    """Reorder image axes and/or colour channels of every image in a sample."""

    def __init__(self, to_bgr=True, channel_first=True):
        """
        Change the channel.

        Args:
            to_bgr (bool): confirm whether to convert RGB to BGR
            channel_first (bool): confirm whether to change channel

        Raises:
            TypeError: if `to_bgr` or `channel_first` is not a bool.
        """
        super(Permute, self).__init__()
        self.to_bgr = to_bgr
        self.channel_first = channel_first
        if not (isinstance(self.to_bgr, bool) and
                isinstance(self.channel_first, bool)):
            raise TypeError("{}: input type is invalid.".format(self))

    def __call__(self, sample, context=None):
        """Permute every entry of `sample` whose key contains 'image'.

        Args:
            sample (dict): record with at least the key 'image'; matching
                entries are expected to be 3-d arrays indexed as
                (height, width, channel).
            context: not used by this operator.

        Returns:
            dict: the same `sample` with the permuted images written back.
        """
        assert 'image' in sample, "image data not found"
        for k in sample.keys():
            if 'image' in k:
                im = sample[k]
                if self.channel_first:
                    # (h, w, c) -> (h, c, w) -> (c, h, w)
                    im = np.swapaxes(im, 1, 2)
                    im = np.swapaxes(im, 1, 0)
                if self.to_bgr:
                    # Reverse the channel axis, wherever it currently is.
                    # Indexing axis 0 unconditionally would pick image rows
                    # when the layout is still (h, w, c).
                    if self.channel_first:
                        im = im[[2, 1, 0], :, :]
                    else:
                        im = im[:, :, [2, 1, 0]]
                sample[k] = im
        return sample
@register_op
class RandomInterpImage(BaseOperator):
    """Resize an image with an interpolation method picked at random."""

    def __init__(self, target_size=0, max_size=0):
        """
        Random resize image by multiple interpolation methods.

        One `ResizeImage` operator is built per interpolation method.

        Args:
            target_size (int): the target size of image's short side
            max_size (int): the max size of image

        Raises:
            TypeError: if `target_size` or `max_size` is not an int.
        """
        super(RandomInterpImage, self).__init__()
        self.target_size = target_size
        self.max_size = max_size
        sizes_are_int = (isinstance(self.target_size, int) and
                         isinstance(self.max_size, int))
        if not sizes_are_int:
            raise TypeError('{}: input type is invalid.'.format(self))
        methods = (
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4, )
        self.resizers = [
            ResizeImage(target_size, max_size, method) for method in methods
        ]

    def __call__(self, sample, context=None):
        """Resize the image numpy by a randomly chosen resizer."""
        chosen = random.choice(self.resizers)
        return chosen(sample, context)
class ParallelMappedDataset(ProxiedDataset):
    """
    Transform samples to mapped samples which is similar to
    'basic.MappedDataset', but multiple workers (threads or processes)
    will be used

    Notes:
        this class is not thread-safe
    """

    def __init__(self, source, mapper, worker_args):
        """
        Args:
            source: dataset to fetch raw samples from; its 'next', 'reset'
                and 'drained' methods are used here
            mapper (callable): function applied to every sample
            worker_args (dict): worker options, keys are case-insensitive:
                bufsize (int): size of input/output queues, default 100
                worker_num (int): number of consumers, default 8
                use_process (bool): use processes instead of threads,
                    default False
                memsize (str|int): shared memory size used when
                    'use_process' is True; a string must end with
                    'G' or 'g', default '3G'
        """
        super(ParallelMappedDataset, self).__init__(source)
        worker_args = {k.lower(): v for k, v in worker_args.items()}

        args = {
            'bufsize': 100,
            'worker_num': 8,
            'use_process': False,
            'memsize': '3G'
        }
        args.update(worker_args)
        if args['use_process'] and type(args['memsize']) is str:
            assert args['memsize'][-1].lower() == 'g', \
                "invalid param for memsize[%s], should be ended with 'G' or 'g'" % (args['memsize'])
            gb = args['memsize'][:-1]
            args['memsize'] = int(gb) * 1024**3

        self._worker_args = args
        self._started = False
        self._source = source
        self._mapper = mapper
        self._exit = False
        self._setup()

    def _setup(self):
        """setup input/output queues and workers """
        use_process = self._worker_args.get('use_process', False)
        if use_process and sys.platform == "win32":
            logger.info("Use multi-thread reader instead of "
                        "multi-process reader on Windows.")
            use_process = False

        bufsize = self._worker_args['bufsize']
        if use_process:
            from .shared_queue import SharedQueue as Queue
            from multiprocessing import Process as Worker
            from multiprocessing import Event
            memsize = self._worker_args['memsize']
            self._inq = Queue(bufsize, memsize=memsize)
            self._outq = Queue(bufsize, memsize=memsize)
        else:
            if six.PY3:
                from queue import Queue
            else:
                from Queue import Queue
            from threading import Thread as Worker
            from threading import Event
            self._inq = Queue(bufsize)
            self._outq = Queue(bufsize)

        consumer_num = self._worker_args['worker_num']
        # short random tag used to tell worker groups apart in the logs
        tag = str(uuid.uuid4())[-3:]
        # the producer is always a thread, even when consumers are processes
        self._producer = threading.Thread(
            target=self._produce,
            args=('producer-' + tag, self._source, self._inq))
        self._producer.daemon = True

        self._consumers = []
        for i in range(consumer_num):
            p = Worker(
                target=self._consume,
                args=('consumer-' + tag + '_' + str(i), self._inq,
                      self._outq, self._mapper))
            self._consumers.append(p)
            p.daemon = True

        self._epoch = -1
        self._feeding_ev = Event()
        self._produced = 0  # produced sample in self._produce
        self._consumed = 0  # consumed sample in self.next
        self._stopped_consumers = 0

    def _produce(self, id, source, inq):
        """Fetch data from source and feed it to 'inq' queue"""
        while True:
            self._feeding_ev.wait()
            if self._exit:
                break
            try:
                inq.put(source.next())
                self._produced += 1
            except StopIteration:
                self._feeding_ev.clear()
                self._feeding_ev.wait()  # wait other guy to wake up me
                logger.debug("producer[{}] starts new epoch".format(id))
            except Exception as e:
                msg = "producer[{}] failed with error: {}".format(id, str(e))
                inq.put(EndSignal(-1, msg))
                break

        logger.debug("producer[{}] exits".format(id))

    def _consume(self, id, inq, outq, mapper):
        """Fetch data from 'inq', process it and put result to 'outq'"""
        while True:
            sample = inq.get()
            if isinstance(sample, EndSignal):
                sample.errmsg += "[consumer[{}] exits]".format(id)
                outq.put(sample)
                logger.debug("end signal received, " +
                             "consumer[{}] exits".format(id))
                break

            try:
                result = mapper(sample)
                outq.put(result)
            except Exception as e:
                # both the consumer id and the error go through `format`
                # (the previous message mixed '%s' and '{}' placeholders)
                msg = 'failed to map consumer[{}], error: {}'.format(id,
                                                                     str(e))
                outq.put(EndSignal(-1, msg))
                break

    def drained(self):
        """ whether the source is drained and every produced sample
            has been returned by 'next'
        """
        assert self._epoch >= 0, "first epoch has not started yet"
        return self._source.drained() and self._produced == self._consumed

    def stop(self):
        """ notify to exit
        """
        self._exit = True
        self._feeding_ev.set()
        for _ in range(len(self._consumers)):
            self._inq.put(EndSignal(0, "notify consumers to exit"))

    def next(self):
        """ get next transformed sample

        Raises:
            StopIteration: when the current epoch is drained
            ValueError: when all consumers have exited
        """
        if self._epoch < 0:
            self.reset()

        if self.drained():
            raise StopIteration()

        while True:
            sample = self._outq.get()
            if isinstance(sample, EndSignal):
                self._stopped_consumers += 1
                if sample.errno != 0:
                    logger.warning("consumer failed with error: {}".format(
                        sample.errmsg))

                if self._stopped_consumers < len(self._consumers):
                    # pass the end signal on so remaining consumers stop too
                    self._inq.put(sample)
                else:
                    raise ValueError("all consumers exited, no more samples")
            else:
                self._consumed += 1
                return sample

    def reset(self):
        """ reset for a new epoch of samples
        """
        if self._epoch < 0:
            self._epoch = 0
            for p in self._consumers:
                p.start()
            self._producer.start()
        else:
            if not self.drained():
                logger.warning("do not reset before epoch[{}] finishes".format(
                    self._epoch))
                # samples still in flight are carried over to the new epoch
                self._produced = self._produced - self._consumed
            else:
                self._produced = 0

            self._epoch += 1

        assert self._stopped_consumers == 0, "some consumers already exited," \
            + " cannot start another epoch"

        self._source.reset()
        self._consumed = 0
        self._feeding_ev.set()
def build_post_map(coarsest_stride=1,
                   is_padding=False,
                   random_shapes=[],
                   multi_scales=[],
                   use_padded_im_info=False,
                   enable_multiscale_test=False,
                   num_scale=1):
    """
    Build a mapper for post-processing batches

    Args:
        coarsest_stride (int): stride of the coarsest FPN level
        is_padding (bool): whether to padding in minibatch
        random_shapes (list of int): resize to image to random shapes,
            [] for not resize.
        multi_scales (list of int): resize image by random scales,
            [] for not resize.
        use_padded_im_info (bool): whether to update im_info after padding
        enable_multiscale_test (bool): whether to use multiscale test.
        num_scale (int) : the number of scales for multiscale test.
    Returns:
        a mapper function which accept one argument 'batch' and
        return the processed result; any exception raised while
        processing is logged and re-raised

    Note:
        the list defaults are only read (never mutated) here.
    """

    def _round_up(length):
        """Round `length` up to a multiple of `coarsest_stride`."""
        return int(np.ceil(length / coarsest_stride) * coarsest_stride)

    def padding_minibatch(batch_data):
        """Zero-pad every image (first field) to the batch's max shape."""
        if len(batch_data) == 1 and coarsest_stride == 1:
            return batch_data
        max_shape = np.array([data[0].shape for data in batch_data]).max(axis=0)
        if coarsest_stride > 1:
            max_shape[1] = _round_up(max_shape[1])
            max_shape[2] = _round_up(max_shape[2])
        padding_batch = []
        for data in batch_data:
            im_c, im_h, im_w = data[0].shape[:]
            padding_im = np.zeros(
                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
            padding_im[:, :im_h, :im_w] = data[0]
            if use_padded_im_info:
                # second field is updated in place with the padded h, w
                data[1][:2] = max_shape[1:3]
            padding_batch.append((padding_im, ) + data[1:])
        return padding_batch

    def padding_multiscale_test(batch_data):
        """Pad the first `num_scale` fields of the single sample."""
        if len(batch_data) != 1:
            raise NotImplementedError(
                "Batch size must be 1 when using multiscale test, but now batch size is {}".
                format(len(batch_data)))
        if coarsest_stride > 1:
            padding_batch = []
            data = batch_data[0]
            for i, item in enumerate(data):
                if i < num_scale:
                    im_c, im_h, im_w = item.shape
                    max_h = _round_up(im_h)
                    max_w = _round_up(im_w)
                    padding_im = np.zeros(
                        (im_c, max_h, max_w), dtype=np.float32)
                    padding_im[:, :im_h, :im_w] = item
                    # field `num_scale` holds 3 values per scale; the first
                    # two are overwritten with the padded height and width
                    data[num_scale][3 * i:3 * i + 2] = [max_h, max_w]
                    padding_batch.append(padding_im)
                else:
                    padding_batch.append(item)
            return [tuple(padding_batch)]
        # no need to padding
        return batch_data

    def random_shape(batch_data):
        """Resize all images to one square size drawn from random_shapes."""
        # For YOLO: gt_bbox is normalized, is scale invariant.
        shape = np.random.choice(random_shapes)
        scaled_batch = []
        h, w = batch_data[0][0].shape[1:3]
        scale_x = float(shape) / w
        scale_y = float(shape) / h
        for data in batch_data:
            im = cv2.resize(
                data[0].transpose((1, 2, 0)),
                None,
                None,
                fx=scale_x,
                fy=scale_y,
                interpolation=cv2.INTER_NEAREST)
            scaled_batch.append((im.transpose(2, 0, 1), ) + data[1:])
        return scaled_batch

    def multi_scale_resize(batch_data):
        """Resize all images by one scale drawn from multi_scales."""
        # For RCNN: image shape in record in im_info.
        scale = np.random.choice(multi_scales)
        scaled_batch = []
        for data in batch_data:
            im = cv2.resize(
                data[0].transpose((1, 2, 0)),
                None,
                None,
                fx=scale,
                fy=scale,
                interpolation=cv2.INTER_NEAREST)
            im_info = [im.shape[:2], scale]
            scaled_batch.append((im.transpose(2, 0, 1), im_info) + data[2:])
        return scaled_batch

    def _mapper(batch_data):
        try:
            if is_padding:
                batch_data = padding_minibatch(batch_data)
            if len(random_shapes) > 0:
                batch_data = random_shape(batch_data)
            if len(multi_scales) > 0:
                batch_data = multi_scale_resize(batch_data)
            if enable_multiscale_test:
                batch_data = padding_multiscale_test(batch_data)
        except Exception as e:
            errmsg = "post-process failed with error: " + str(e)
            # `Logger.warn` is deprecated; use `warning`
            logger.warning(errmsg)
            # bare raise keeps the original traceback
            raise

        return batch_data

    return _mapper
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +__all__ = ['SharedBuffer', 'SharedMemoryMgr', 'SharedQueue'] + +from .sharedmemory import SharedBuffer +from .sharedmemory import SharedMemoryMgr +from .sharedmemory import SharedMemoryError +from .queue import SharedQueue diff --git a/ppdet/data/transform/shared_queue/queue.py b/ppdet/data/transform/shared_queue/queue.py new file mode 100644 index 0000000000000000000000000000000000000000..0bd44d3e9e0b34cefa3adfd8008a90e0963c5b62 --- /dev/null +++ b/ppdet/data/transform/shared_queue/queue.py @@ -0,0 +1,102 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class SharedQueue(Queue):
    """ a Queue based on shared memory to communicate data between Process,
        and its interface is compatible with 'multiprocessing.queues.Queue'
    """

    def __init__(self, maxsize=0, mem_mgr=None, memsize=None, pagesize=None):
        """ init

        Args:
            maxsize (int): max number of items, passed to the base Queue
            mem_mgr: shared memory manager to allocate buffers from;
                a new 'SharedMemoryMgr' is created when it's None
            memsize: capacity passed to the newly created manager
            pagesize: pagesize passed to the newly created manager
        """
        if six.PY3:
            super(SharedQueue, self).__init__(maxsize, ctx=mp.get_context())
        else:
            super(SharedQueue, self).__init__(maxsize)

        if mem_mgr is not None:
            self._shared_mem = mem_mgr
        else:
            self._shared_mem = SharedMemoryMgr(
                capacity=memsize, pagesize=pagesize)

    def put(self, obj, **kwargs):
        """ put an object to this queue

        The object is pickled into a buffer allocated from shared memory and
        only the buffer handle goes through the underlying queue. On failure
        the buffer is freed and the exception is re-raised.
        """
        obj = pickle.dumps(obj, -1)
        buff = None
        try:
            buff = self._shared_mem.malloc(len(obj))
            buff.put(obj)
            super(SharedQueue, self).put(buff, **kwargs)
        except Exception:
            stack_info = traceback.format_exc()
            err_msg = 'failed to put a element to SharedQueue '\
                'with stack info[%s]' % (stack_info)
            # `Logger.warn` is deprecated; use `warning`
            logger.warning(err_msg)

            if buff is not None:
                buff.free()
            # bare raise keeps the original traceback
            raise

    def get(self, **kwargs):
        """ get an object from this queue

        The buffer taken from the underlying queue is always freed, whether
        or not unpickling succeeds; failures are logged and re-raised.
        """
        buff = None
        try:
            buff = super(SharedQueue, self).get(**kwargs)
            data = buff.get()
            return pickle.load(StringIO(data))
        except Exception:
            stack_info = traceback.format_exc()
            err_msg = 'failed to get element from SharedQueue '\
                'with stack info[%s]' % (stack_info)
            logger.warning(err_msg)
            raise
        finally:
            if buff is not None:
                buff.free()

    def release(self):
        """ release the shared memory manager held by this queue
        """
        self._shared_mem.release()
        self._shared_mem = None
class SharedBuffer(object):
    """ Handle of a buffer allocated from SharedMemoryMgr; the data itself
        is read and written through the owning manager

    note that:
        every instance of this should be freed explicitly by calling
        'self.free'
    """

    def __init__(self, owner, capacity, pos, size=0, alloc_status=''):
        """ Init

        Args:
            owner (str): manager to own this buffer
            capacity (int): capacity in bytes for this buffer
            pos (int): page position in shared memory
            size (int): bytes already used
            alloc_status (str): debug info about allocator when allocate this
        """
        self._owner = owner
        self._cap = capacity
        self._pos = pos
        self._size = size
        self._alloc_status = alloc_status
        assert self._pos >= 0 and self._cap > 0, \
            "invalid params[%d:%d] to construct SharedBuffer" \
            % (self._pos, self._cap)

    def owner(self):
        """ look up the manager which owns this buffer
        """
        return SharedMemoryMgr.get_mgr(self._owner)

    def put(self, data, override=False):
        """ store 'data' in this buffer

        Args:
            data (str|bytes): data to be stored in this buffer
            override (bool): allow replacing data stored before

        Returns:
            None

        Raises:
            SharedBufferError when data was already stored and 'override'
            is False, or when 'data' does not fit in this buffer
        """
        assert type(data) in (str, bytes), \
            'invalid type[%s] for SharedBuffer::put' % (str(type(data)))
        already_filled = self._size > 0
        if already_filled and not override:
            raise SharedBufferError('already has already been setted before')

        nbytes = len(data)
        if self.capacity() < nbytes:
            raise SharedBufferError('data[%d] is larger than size of buffer[%s]'\
                % (len(data), str(self)))

        manager = self.owner()
        manager.put_data(self, data)
        self._size = len(data)

    def get(self, offset=0, size=None, no_copy=True):
        """ fetch the data stored in this buffer

        Args:
            offset (int): start position; a negative value counts back
                from the used size
            size (int): bytes to fetch, the used size when None
            no_copy (bool): forwarded to the manager's 'get_data'

        Returns:
            whatever the manager's 'get_data' returns for this range,
            or None when nothing is stored in this buffer
        """
        if not offset >= 0:
            offset = self._size + offset
        if self._size <= 0:
            return None

        if size is None:
            size = self._size
        assert offset + size <= self._cap, 'invalid offset[%d] '\
            'or size[%d] for capacity[%d]' % (offset, size, self._cap)
        manager = self.owner()
        return manager.get_data(self, offset, size, no_copy=no_copy)

    def size(self):
        """ number of bytes in use
        """
        return self._size

    def resize(self, size):
        """ set the used size to 'size', which must lie in [0, capacity]
        """
        assert size >= 0 and size <= self._cap, \
            "invalid size[%d] for resize" % (size)

        self._size = size

    def capacity(self):
        """ number of bytes allocated
        """
        return self._cap

    def __str__(self):
        """ human readable description, including the current pid
        """
        fields = (str(self._owner), self._pos, self._size,
                  self._cap, self._alloc_status, os.getpid())
        return "SharedBuffer(owner:%s, pos:%d, size:%d, "\
            "capacity:%d, alloc_status:[%s], pid:%d)" % fields

    def free(self):
        """ give this buffer back to its owner

        Returns:
            True if the buffer was freed by this call,
            False if it has no owner any more
        """
        if self._owner is None:
            return False

        self.owner().free(self)
        self._owner = None
        self._cap = 0
        self._pos = -1
        self._size = 0
        return True
+ str(uuid.uuid4())[:6] + with open(fname, 'wb') as f: + f.write(pickle.dumps(info, -1)) + logger.warn('dump alloc info to file[%s]' % (fname)) + + def _reset(self): + alloc_page_pos = self._header_pages + used_pages = self._header_pages + header_info = struct.pack( + str('III'), self._magic_num, alloc_page_pos, used_pages) + assert len(header_info) == self.s_allocator_header, \ + 'invalid size of header_info' + + memcopy(self._base[0:self.s_allocator_header], header_info) + self.set_page_status(0, self._header_pages, '1') + self.set_page_status(self._header_pages, self._free_pages, '0') + + def header(self): + """ get header info of this allocator + """ + header_str = self._base[0:self.s_allocator_header].tostring() + magic, pos, used = struct.unpack(str('III'), header_str) + + assert magic == self._magic_num, \ + 'invalid header magic[%d] in shared memory' % (magic) + return self._header_pages, self._total_pages, pos, used + + def empty(self): + """ are all allocatable pages available + """ + header_pages, pages, pos, used = self.header() + return header_pages == used + + def full(self): + """ are all allocatable pages used + """ + header_pages, pages, pos, used = self.header() + return header_pages + used == pages + + def __str__(self): + header_pages, pages, pos, used = self.header() + desc = '{page_info[magic:%d,total:%d,used:%d,header:%d,alloc_pos:%d,pagesize:%d]}' \ + % (self._magic_num, pages, used, header_pages, pos, self._page_size) + return 'PageAllocator:%s' % (desc) + + def set_alloc_info(self, alloc_pos, used_pages): + """ set allocating position to new value + """ + memcopy(self._base[4:12], struct.pack(str('II'), alloc_pos, used_pages)) + + def set_page_status(self, start, page_num, status): + """ set pages from 'start' to 'end' with new same status 'status' + """ + assert status in ['0', '1'], 'invalid status[%s] for page status '\ + 'in allocator[%s]' % (status, str(self)) + start += self.s_allocator_header + end = start + page_num + assert start 
>= 0 and end <= self._header_size, 'invalid end[%d] of pages '\ + 'in allocator[%s]' % (end, str(self)) + memcopy(self._base[start:end], str(status * page_num)) + + def get_page_status(self, start, page_num, ret_flag=False): + start += self.s_allocator_header + end = start + page_num + assert start >= 0 and end <= self._header_size, 'invalid end[%d] of pages '\ + 'in allocator[%s]' % (end, str(self)) + status = self._base[start:end].tostring().decode() + if ret_flag: + return status + + zero_num = status.count('0') + if zero_num == 0: + return (page_num, 1) + else: + return (zero_num, 0) + + def malloc_page(self, page_num): + header_pages, pages, pos, used = self.header() + end = pos + page_num + if end > pages: + pos = self._header_pages + end = pos + page_num + + start_pos = pos + flags = '' + while True: + # maybe flags already has some '0' pages, + # so just check 'page_num - len(flags)' pages + flags = self.get_page_status( + pos, page_num, ret_flag=True) + + if flags.count('0') == page_num: + break + + # not found enough pages, so shift to next few pages + free_pos = flags.rfind('1') + 1 + pos += free_pos + end = pos + page_num + if end > pages: + pos = self._header_pages + end = pos + page_num + flags = '' + + # not found available pages after scan all pages + if pos <= start_pos and end >= start_pos: + logger.debug('not found available pages after scan all pages') + break + + page_status = (flags.count('0'), 0) + if page_status != (page_num, 0): + free_pages = self._total_pages - used + if free_pages == 0: + err_msg = 'all pages have been used:%s' % (str(self)) + else: + err_msg = 'not found available pages with page_status[%s] '\ + 'and %d free pages' % (str(page_status), free_pages) + err_msg = 'failed to malloc %d pages at pos[%d] for reason[%s] and allocator status[%s]' \ + % (page_num, pos, err_msg, str(self)) + raise MemoryFullError(err_msg) + + self.set_page_status(pos, page_num, '1') + used += page_num + self.set_alloc_info(end, used) + return pos + 
+ def free_page(self, start, page_num): + """ free 'page_num' pages start from 'start' + """ + page_status = self.get_page_status(start, page_num) + assert page_status == (page_num, 1), \ + 'invalid status[%s] when free [%d, %d]' \ + % (str(page_status), start, page_num) + self.set_page_status(start, page_num, '0') + _, _, pos, used = self.header() + used -= page_num + self.set_alloc_info(pos, used) + + +DEFAULT_SHARED_MEMORY_SIZE = 1024 * 1024 * 1024 + + +class SharedMemoryMgr(object): + """ manage a continouse block of memory, provide + 'malloc' to allocate new buffer, and 'free' to free buffer + """ + s_memory_mgrs = weakref.WeakValueDictionary() + s_mgr_num = 0 + s_log_statis = False + + @classmethod + def get_mgr(cls, id): + """ get a SharedMemoryMgr with size of 'capacity' + """ + assert id in cls.s_memory_mgrs, 'invalid id[%s] for memory managers' % ( + id) + return cls.s_memory_mgrs[id] + + def __init__(self, capacity=None, pagesize=None): + """ init + """ + logger.debug('create SharedMemoryMgr') + + pagesize = 64 * 1024 if pagesize is None else pagesize + assert type(pagesize) is int, "invalid type of pagesize[%s]" \ + % (str(pagesize)) + + capacity = DEFAULT_SHARED_MEMORY_SIZE if capacity is None else capacity + assert type(capacity) is int, "invalid type of capacity[%s]" \ + % (str(capacity)) + + assert capacity > 0, '"size of shared memory should be greater than 0' + self._released = False + self._cap = capacity + self._page_size = pagesize + + assert self._cap % self._page_size == 0, \ + "capacity[%d] and pagesize[%d] are not consistent" \ + % (self._cap, self._page_size) + self._total_pages = self._cap // self._page_size + + self._pid = os.getpid() + SharedMemoryMgr.s_mgr_num += 1 + self._id = self._pid * 100 + SharedMemoryMgr.s_mgr_num + SharedMemoryMgr.s_memory_mgrs[self._id] = self + self._locker = Lock() + self._setup() + + def _setup(self): + self._shared_mem = RawArray('c', self._cap) + self._base = np.frombuffer( + self._shared_mem, 
dtype='uint8', count=self._cap) + self._locker.acquire() + try: + self._allocator = PageAllocator(self._base, self._total_pages, + self._page_size) + finally: + self._locker.release() + + def malloc(self, size, wait=True): + """ malloc a new SharedBuffer + + Args: + size (int): buffer size to be malloc + wait (bool): whether to wait when no enough memory + + Returns: + SharedBuffer + + Raises: + SharedMemoryError when not found available memory + """ + page_num = int(math.ceil(size / self._page_size)) + size = page_num * self._page_size + + start = None + ct = 0 + errmsg = '' + while True: + self._locker.acquire() + try: + start = self._allocator.malloc_page(page_num) + alloc_status = str(self._allocator) + except MemoryFullError as e: + start = None + errmsg = e.errmsg + if not wait: + raise e + finally: + self._locker.release() + + if start is None: + time.sleep(0.1) + if ct % 100 == 0: + logger.warn('not enough space for reason[%s]' % (errmsg)) + + ct += 1 + else: + break + + return SharedBuffer(self._id, size, start, alloc_status=alloc_status) + + def free(self, shared_buf): + """ free a SharedBuffer + + Args: + shared_buf (SharedBuffer): buffer to be freed + + Returns: + None + + Raises: + SharedMemoryError when failed to release this buffer + """ + assert shared_buf._owner == self._id, "invalid shared_buf[%s] "\ + "for it's not allocated from me[%s]" % (str(shared_buf), str(self)) + cap = shared_buf.capacity() + start_page = shared_buf._pos + page_num = cap // self._page_size + + #maybe we don't need this lock here + self._locker.acquire() + try: + self._allocator.free_page(start_page, page_num) + finally: + self._locker.release() + + def put_data(self, shared_buf, data): + """ fill 'data' into 'shared_buf' + """ + assert len(data) <= shared_buf.capacity(), 'too large data[%d] '\ + 'for this buffer[%s]' % (len(data), str(shared_buf)) + start = shared_buf._pos * self._page_size + end = start + len(data) + assert start >= 0 and end <= self._cap, "invalid start 
"\ + "position[%d] when put data to buff:%s" % (start, str(shared_buf)) + self._base[start:end] = np.frombuffer(data, 'uint8', len(data)) + + def get_data(self, shared_buf, offset, size, no_copy=True): + """ extract 'data' from 'shared_buf' in range [offset, offset + size) + """ + start = shared_buf._pos * self._page_size + start += offset + if no_copy: + return self._base[start:start + size] + else: + return self._base[start:start + size].tostring() + + def __str__(self): + return 'SharedMemoryMgr:{id:%d, %s}' % (self._id, str(self._allocator)) + + def __del__(self): + if SharedMemoryMgr.s_log_statis: + logger.info('destroy [%s]' % (self)) + + if not self._released and not self._allocator.empty(): + logger.debug('not empty when delete this SharedMemoryMgr[%s]' % + (self)) + else: + self._released = True + + if self._id in SharedMemoryMgr.s_memory_mgrs: + del SharedMemoryMgr.s_memory_mgrs[self._id] + SharedMemoryMgr.s_mgr_num -= 1 diff --git a/ppdet/data/transform/transformer.py b/ppdet/data/transform/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8845677db7cc17b5212876762a9b697344ba29 --- /dev/null +++ b/ppdet/data/transform/transformer.py @@ -0,0 +1,108 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import functools +import collections +from ..dataset import Dataset + + +class ProxiedDataset(Dataset): + """proxy method called to 'self._ds' when if not defined""" + + def __init__(self, ds): + super(ProxiedDataset, self).__init__() + self._ds = ds + methods = filter(lambda k: not k.startswith('_'), + Dataset.__dict__.keys()) + for m in methods: + func = functools.partial(self._proxy_method, getattr(self, m)) + setattr(self, m, func) + + def _proxy_method(self, func, *args, **kwargs): + """ + proxy call to 'func', if not available then call self._ds.xxx + whose name is the same with func.__name__ + """ + method = func.__name__ + try: + return func(*args, **kwargs) + except NotImplementedError: + ds_func = getattr(self._ds, method) + return ds_func(*args, **kwargs) + + +class MappedDataset(ProxiedDataset): + def __init__(self, ds, mapper): + super(MappedDataset, self).__init__(ds) + self._ds = ds + self._mapper = mapper + + def next(self): + sample = self._ds.next() + return self._mapper(sample) + + +class BatchedDataset(ProxiedDataset): + """ + Batching samples + + Args: + ds (instance of Dataset): dataset to be batched + batchsize (int): sample number for each batch + drop_last (bool): drop last samples when not enough for one batch + drop_empty (bool): drop samples which have empty field + """ + + def __init__(self, ds, batchsize, drop_last=False, drop_empty=True): + super(BatchedDataset, self).__init__(ds) + self._batchsz = batchsize + self._drop_last = drop_last + self._drop_empty = drop_empty + + def next(self): + """proxy to self._ds.next""" + + def empty(x): + if isinstance(x, np.ndarray) and x.size == 0: + return True + elif isinstance(x, collections.Sequence) and len(x) == 0: + return True + else: + return False + + def has_empty(items): + if any(x is None for x in items): + return True + if any(empty(x) for x in 
items): + return True + return False + + batch = [] + for _ in range(self._batchsz): + try: + out = self._ds.next() + while self._drop_empty and has_empty(out): + out = self._ds.next() + batch.append(out) + except StopIteration: + if not self._drop_last and len(batch) > 0: + return batch + else: + raise StopIteration + return batch diff --git a/ppdet/experimental/__init__.py b/ppdet/experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f70396193135830ef2d1de8e357842ea1ef0eea2 --- /dev/null +++ b/ppdet/experimental/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from .mixed_precision import * +from . import mixed_precision + +__all__ = mixed_precision.__all__ diff --git a/ppdet/experimental/mixed_precision.py b/ppdet/experimental/mixed_precision.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c333bf4b99b51857e47779b2f8998758cd0318 --- /dev/null +++ b/ppdet/experimental/mixed_precision.py @@ -0,0 +1,338 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import print_function + +import six +from paddle.fluid.framework import Parameter +from paddle.fluid import layers +from paddle.fluid import core +from paddle.fluid import unique_name +import paddle.fluid.layer_helper_base as lhb +import paddle.fluid.optimizer as optim + +__all__ = ['mixed_precision_global_state', 'mixed_precision_context', + 'StaticLossScale', 'DynamicLossScale'] + +_mixed_precision_global_state = None + + +def mixed_precision_global_state(): + return _mixed_precision_global_state + + +class LossScale(object): + def __init__(self): + super(LossScale, self).__init__() + + def get_loss_scale_var(self): + return self.scale + + def increment(self): + raise NotImplementedError() + + def decrement(self): + raise NotImplementedError() + + +class StaticLossScale(LossScale): + """ + Static (fixed) loss scale manager. + + Args: + init_loss_scale (float): initial loss scale value. + + Examples: + + .. code-block:: python + + from paddle import fluid + from ppdet.experimental import (mixed_precision_context, + StaticLossScale) + + with mixed_precision_context(StaticLossScale(8.), True) as ctx: + # ... + # scale loss + loss_scale = ctx.get_loss_scale_var() + + """ + + def __init__(self, init_loss_scale=1.): + super(StaticLossScale, self).__init__() + self.scale = layers.create_global_var( + name=unique_name.generate("loss_scale"), + shape=[1], + value=init_loss_scale, + dtype='float32', + persistable=True) + + +class DynamicLossScale(LossScale): + """ + Dynamic loss scale manager. 
it works as follows: + if gradients is valid for `increment_every` steps, loss scale values is + increased by `factor`, otherwise loss scale values is decreased by `factor` + + Args: + init_loss_scale (float): initial loss scale value. + increment_every (int): minimum 'good' steps before loss scale increase. + factor (float): increase/decrease loss scale by this much. + + Examples: + + .. code-block:: python + + from paddle import fluid + from ppdet.experimental import (mixed_precision_context, + DynamicLossScale) + + loss_scale = DynamicLossScale(8., 1000, 4.) + with mixed_precision_context(loss_scale, True) as ctx: + # ... + # scale loss + loss_scale = ctx.get_loss_scale_var() + + """ + + def __init__(self, init_loss_scale=2**15, increment_every=2000, factor=2.): + super(DynamicLossScale, self).__init__() + self.scale = layers.create_global_var( + name=unique_name.generate("loss_scale"), + shape=[1], + value=init_loss_scale, + dtype='float32', + persistable=True) + self.good_steps = layers.create_global_var( + name=unique_name.generate("good_steps"), + shape=[1], + value=0, + dtype='int32', + persistable=True) + self.increment_every = layers.fill_constant( + shape=[1], dtype='int32', value=increment_every) + self.factor = factor + + def increment(self): + enough_steps = layers.less_than(self.increment_every, + self.good_steps + 1) + with layers.Switch() as switch: + with switch.case(enough_steps): + new_scale = self.scale * self.factor + scale_valid = layers.isfinite(new_scale) + with layers.Switch() as switch2: + with switch2.case(scale_valid): + layers.assign(new_scale, self.scale) + layers.assign(layers.zeros_like(self.good_steps), + self.good_steps) + with switch2.default(): + layers.increment(self.good_steps) + with switch.default(): + layers.increment(self.good_steps) + + def decrement(self): + new_scale = self.scale / self.factor + one = layers.fill_constant(shape=[1], dtype='float32', value=1.0) + less_than_one = layers.less_than(new_scale, one) + with 
layers.Switch() as switch: + with switch.case(less_than_one): + layers.assign(one, self.scale) + with switch.default(): + layers.assign(new_scale, self.scale) + + layers.assign(layers.zeros_like(self.good_steps), + self.good_steps) + + +class mixed_precision_context(object): + """ + Context manager for mixed precision training. + + Args: + loss_scale (float, str or obj): loss scale settings, can be: + 1. an number: use fixed loss scale. + 2. 'dynamic': use a default `DynamicLossScale`. + 3. `DynamicLossScale` or `StaticLossScale` instance. + enabled (bool): enable mixed precision training. + + Examples: + + .. code-block:: python + + from paddle import fluid + from ppdet.experimental import mixed_precision_context + + with mixed_precision_context('dynamic', True) as ctx: + # cast inputs to float16 + inputs = fluid.layers.cast(inputs, "float16") + # build model here + logits = model(inputs) + # use float32 for softmax + logits = fluid.layers.cast(logits, "float32") + softmax = fluid.layers.softmax(logits) + loss = fluid.layers.cross_entropy(input=softmax, label=label) + avg_loss = fluid.layers.mean(loss) + # scale loss + loss_scale = ctx.get_loss_scale_var() + avg_loss *= loss_scale + optimizer = fluid.optimizer.Momentum(...) 
+ optimizer.minimize(avg_loss) + + """ + + def __init__(self, loss_scale=1., enabled=True): + super(mixed_precision_context, self).__init__() + self.enabled = enabled + if not enabled: + return + monkey_patch() + if isinstance(loss_scale, six.integer_types + (float,)): + self.loss_scale = StaticLossScale(loss_scale) + elif loss_scale == 'dynamic': + self.loss_scale = DynamicLossScale() + else: + assert isinstance(loss_scale, LossScale), \ + "Invalid loss scale argument" + self.loss_scale = loss_scale + + @property + def dynamic_scaling(self): + return isinstance(self.loss_scale, DynamicLossScale) + + def __getattr__(self, attr): + if attr in ['get_loss_scale_var', 'increment', 'decrement']: + return getattr(self.loss_scale, attr) + + def __enter__(self): + if not self.enabled: + return + global _mixed_precision_global_state + _mixed_precision_global_state = self + return mixed_precision_global_state() + + def __exit__(self, *args): + if not self.enabled: + return + global _mixed_precision_global_state + _mixed_precision_global_state = None + return mixed_precision_global_state() + + +def create_parameter(self, + attr, + shape, + dtype, + is_bias=False, + default_initializer=None): + mp_state = mixed_precision_global_state() + is_half = (isinstance(dtype, str) and dtype == 'float16') \ + or (isinstance(dtype, core.VarDesc.VarType) + and dtype == core.VarDesc.VarType.FP16) + + if is_half and mp_state is not None: + dtype = 'float32' + + param = self._create_parameter(attr, shape, dtype, + is_bias, default_initializer) + if not is_half or mp_state is None: + return param + + param16 = self.main_program.current_block().create_var( + name=param.name + '.fp16', + dtype='float16', + type=param.type, + persistable=False) + self.append_op( + type='cast', + inputs={'X': [param]}, + outputs={'Out': [param16]}, + attrs={'in_dtype': param.dtype, + 'out_dtype': param16.dtype}) + return param16 + + +def scale_gradient(block, context): + state = mixed_precision_global_state() + if 
state is None: + return + scale = state.get_loss_scale_var() + op_desc = block.desc.op(block.desc.op_size() - 1) + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + bwd_role = core.op_proto_and_checker_maker.OpRole.Backward + for name in [n for n in op_desc.output_arg_names() if n in context]: + fwd_var = block._var_recursive(context[name]) + if not isinstance(fwd_var, Parameter): + continue # TODO verify all use cases + clip_op_desc = block.desc.append_op() + clip_op_desc.set_type("elementwise_div") + clip_op_desc.set_input("X", [name]) + clip_op_desc.set_input("Y", [scale.name]) + clip_op_desc.set_output("Out", [name]) + clip_op_desc._set_attr(op_role_attr_name, bwd_role) + + +def update_loss_scale(grads): + state = mixed_precision_global_state() + if state is None or not state.dynamic_scaling: + return + per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads]) + grad_valid = layers.isfinite(per_grad_check) + + with layers.Switch() as switch: + with switch.case(grad_valid): + state.increment() + with switch.default(): + state.decrement() + return grad_valid + + +def backward(self, loss, **kwargs): + state = mixed_precision_global_state() + callbacks = 'callbacks' in kwargs and kwargs['callbacks'] or None + if callbacks is None: + from paddle.fluid.clip import error_clip_callback + callbacks = [error_clip_callback] # XXX what if gradient is zero? 
+ if state is not None: + kwargs['callbacks'] = [scale_gradient] + callbacks + else: + kwargs['callbacks'] = callbacks + param_grads = self._backward(loss, **kwargs) + if state is not None: + grad_valid = update_loss_scale(v for k, v in param_grads) + if state.dynamic_scaling: + with layers.Switch() as switch: + with switch.case(grad_valid): + pass + with switch.default(): + for _, g in param_grads: + layers.assign(layers.zeros_like(g), g) + + return param_grads + + +mixed_precision_patched = False + + +# XXX this is a temporary measure, until thoroughly evaluated +def monkey_patch(): + global mixed_precision_patched + if mixed_precision_patched: + return + create_parameter_orig = lhb.LayerHelperBase.create_parameter + lhb.LayerHelperBase.create_parameter = create_parameter + lhb.LayerHelperBase._create_parameter = create_parameter_orig + backward_orig = optim.Optimizer.backward + optim.Optimizer.backward = backward + optim.Optimizer._backward = backward_orig + mixed_precision_patched = True diff --git a/ppdet/modeling/__init__.py b/ppdet/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f9491d771626848a631804ac3bf663f2ee6ec82c --- /dev/null +++ b/ppdet/modeling/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +# XXX for triggering decorators +from . import anchor_heads +from . 
import architectures +from . import backbones +from . import roi_extractors +from . import roi_heads +from . import ops +from . import target_assigners + +from .anchor_heads import * +from .architectures import * +from .backbones import * +from .roi_extractors import * +from .roi_heads import * +from .ops import * +from .target_assigners import * diff --git a/ppdet/modeling/anchor_heads/__init__.py b/ppdet/modeling/anchor_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ed22160d23cc5c7437507350037e71134ee8824 --- /dev/null +++ b/ppdet/modeling/anchor_heads/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from . import rpn_head +from . import yolo_head +from . import retina_head + +from .rpn_head import * +from .yolo_head import * +from .retina_head import * diff --git a/ppdet/modeling/anchor_heads/retina_head.py b/ppdet/modeling/anchor_heads/retina_head.py new file mode 100644 index 0000000000000000000000000000000000000000..41246e8b6267881f62386653841ecfb525a254e1 --- /dev/null +++ b/ppdet/modeling/anchor_heads/retina_head.py @@ -0,0 +1,407 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import Normal, Constant +from paddle.fluid.regularizer import L2Decay +from ppdet.modeling.ops import (AnchorGenerator, RetinaTargetAssign, + RetinaOutputDecoder) + +from ppdet.core.workspace import register + +__all__ = ['RetinaHead'] + + +@register +class RetinaHead(object): + """ + Retina Head + + Args: + anchor_generator (object): `AnchorGenerator` instance + target_assign (object): `RetinaTargetAssign` instance + output_decoder (object): `RetinaOutputDecoder` instance + num_convs_per_octave (int): Number of convolution layers in each octave + num_chan (int): Number of octave output channels + max_level (int): Highest level of FPN output + min_level (int): Lowest level of FPN output + prior_prob (float): Used to set the bias init for the class prediction layer + base_scale (int): Anchors are generated based on this scale + num_scales_per_octave (int): Number of anchor scales per octave + num_classes (int): Number of classes + gamma (float): The parameter in focal loss + alpha (float): The parameter in focal loss + sigma (float): The parameter in smooth l1 loss + """ + __inject__ = ['anchor_generator', 'target_assign', 'output_decoder'] + __shared__ = ['num_classes'] + + def __init__(self, + anchor_generator=AnchorGenerator().__dict__, + target_assign=RetinaTargetAssign().__dict__, + 
output_decoder=RetinaOutputDecoder().__dict__, + num_convs_per_octave=4, + num_chan=256, + max_level=7, + min_level=3, + prior_prob=0.01, + base_scale=4, + num_scales_per_octave=3, + num_classes=81, + gamma=2.0, + alpha=0.25, + sigma=3.0151134457776365): + self.anchor_generator = anchor_generator + self.target_assign = target_assign + self.output_decoder = output_decoder + self.num_convs_per_octave = num_convs_per_octave + self.num_chan = num_chan + self.max_level = max_level + self.min_level = min_level + self.prior_prob = prior_prob + self.base_scale = base_scale + self.num_scales_per_octave = num_scales_per_octave + self.num_classes = num_classes + self.gamma = gamma + self.alpha = alpha + self.sigma = sigma + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGenerator(**anchor_generator) + if isinstance(target_assign, dict): + self.target_assign = RetinaTargetAssign(**target_assign) + if isinstance(output_decoder, dict): + self.output_decoder = RetinaOutputDecoder(**output_decoder) + + def _class_subnet(self, body_feats, spatial_scale): + """ + Get class predictions of all level FPN level. + + Args: + fpn_dict(dict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. + + Returns: + cls_pred_input(list): Class prediction of all input fpn levels. 
+ """ + assert len(body_feats) == self.max_level - self.min_level + 1 + fpn_name_list = list(body_feats.keys()) + cls_pred_list = [] + for lvl in range(self.min_level, self.max_level + 1): + fpn_name = fpn_name_list[self.max_level - lvl] + subnet_blob = body_feats[fpn_name] + for i in range(self.num_convs_per_octave): + conv_name = 'retnet_cls_conv_n{}_fpn{}'.format(i, lvl) + conv_share_name = 'retnet_cls_conv_n{}_fpn{}'.format( + i, self.min_level) + subnet_blob_in = subnet_blob + subnet_blob = fluid.layers.conv2d( + input=subnet_blob_in, + num_filters=self.num_chan, + filter_size=3, + stride=1, + padding=1, + act='relu', + name=conv_name, + param_attr=ParamAttr( + name=conv_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=conv_share_name + '_b', + learning_rate=2., + regularizer=L2Decay(0.))) + + # class prediction + cls_name = 'retnet_cls_pred_fpn{}'.format(lvl) + cls_share_name = 'retnet_cls_pred_fpn{}'.format(self.min_level) + num_anchors = self.num_scales_per_octave * len( + self.anchor_generator.aspect_ratios) + cls_dim = num_anchors * (self.num_classes - 1) + # bias initialization: b = -log((1 - pai) / pai) + bias_init = float(-np.log((1 - self.prior_prob) / self.prior_prob)) + out_cls = fluid.layers.conv2d( + input=subnet_blob, + num_filters=cls_dim, + filter_size=3, + stride=1, + padding=1, + act=None, + name=cls_name, + param_attr=ParamAttr( + name=cls_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=cls_share_name + '_b', + initializer=Constant(value=bias_init), + learning_rate=2., + regularizer=L2Decay(0.))) + cls_pred_list.append(out_cls) + + return cls_pred_list + + def _bbox_subnet(self, body_feats, spatial_scale): + """ + Get bounding box predictions of all level FPN level. + + Args: + fpn_dict(dict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. 
+ + Returns: + bbox_pred_input(list): Bounding box prediction of all input fpn + levels. + """ + assert len(body_feats) == self.max_level - self.min_level + 1 + fpn_name_list = list(body_feats.keys()) + bbox_pred_list = [] + for lvl in range(self.min_level, self.max_level + 1): + fpn_name = fpn_name_list[self.max_level - lvl] + subnet_blob = body_feats[fpn_name] + for i in range(self.num_convs_per_octave): + conv_name = 'retnet_bbox_conv_n{}_fpn{}'.format(i, lvl) + conv_share_name = 'retnet_bbox_conv_n{}_fpn{}'.format( + i, self.min_level) + subnet_blob_in = subnet_blob + subnet_blob = fluid.layers.conv2d( + input=subnet_blob_in, + num_filters=self.num_chan, + filter_size=3, + stride=1, + padding=1, + act='relu', + name=conv_name, + param_attr=ParamAttr( + name=conv_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=conv_share_name + '_b', + learning_rate=2., + regularizer=L2Decay(0.))) + + # bbox prediction + bbox_name = 'retnet_bbox_pred_fpn{}'.format(lvl) + bbox_share_name = 'retnet_bbox_pred_fpn{}'.format(self.min_level) + num_anchors = self.num_scales_per_octave * len( + self.anchor_generator.aspect_ratios) + bbox_dim = num_anchors * 4 + out_bbox = fluid.layers.conv2d( + input=subnet_blob, + num_filters=bbox_dim, + filter_size=3, + stride=1, + padding=1, + act=None, + name=bbox_name, + param_attr=ParamAttr( + name=bbox_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=bbox_share_name + '_b', + learning_rate=2., + regularizer=L2Decay(0.))) + bbox_pred_list.append(out_bbox) + return bbox_pred_list + + def _anchor_generate(self, body_feats, spatial_scale): + """ + Get anchor boxes of all level FPN level. + + Args: + fpn_dict(dict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. + + Return: + anchor_input(list): Anchors of all input fpn levels with shape of. 
+ anchor_var_input(list): Anchor variance of all input fpn levels with + shape. + """ + assert len(body_feats) == self.max_level - self.min_level + 1 + fpn_name_list = list(body_feats.keys()) + anchor_list = [] + anchor_var_list = [] + for lvl in range(self.min_level, self.max_level + 1): + anchor_sizes = [] + stride = int(1 / spatial_scale[self.max_level - lvl]) + for octave in range(self.num_scales_per_octave): + anchor_size = stride * ( + 2**(float(octave) / + float(self.num_scales_per_octave))) * self.base_scale + anchor_sizes.append(anchor_size) + fpn_name = fpn_name_list[self.max_level - lvl] + anchor, anchor_var = self.anchor_generator( + input=body_feats[fpn_name], + anchor_sizes=anchor_sizes, + aspect_ratios=self.anchor_generator.aspect_ratios, + stride=[stride, stride]) + anchor_list.append(anchor) + anchor_var_list.append(anchor_var) + return anchor_list, anchor_var_list + + def _get_output(self, body_feats, spatial_scale): + """ + Get class, bounding box predictions and anchor boxes of all level FPN level. + + Args: + fpn_dict(dict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. + + Returns: + cls_pred_input(list): Class prediction of all input fpn levels. + bbox_pred_input(list): Bounding box prediction of all input fpn + levels. + anchor_input(list): Anchors of all input fpn levels with shape of. + anchor_var_input(list): Anchor variance of all input fpn levels with + shape. 
+ """ + assert len(body_feats) == self.max_level - self.min_level + 1 + # class subnet + cls_pred_list = self._class_subnet(body_feats, spatial_scale) + # bbox subnet + bbox_pred_list = self._bbox_subnet(body_feats, spatial_scale) + #generate anchors + anchor_list, anchor_var_list = self._anchor_generate(body_feats, + spatial_scale) + cls_pred_reshape_list = [] + bbox_pred_reshape_list = [] + anchor_reshape_list = [] + anchor_var_reshape_list = [] + for i in range(self.max_level - self.min_level + 1): + cls_pred_transpose = fluid.layers.transpose( + cls_pred_list[i], perm=[0, 2, 3, 1]) + cls_pred_reshape = fluid.layers.reshape( + cls_pred_transpose, shape=(0, -1, self.num_classes - 1)) + bbox_pred_transpose = fluid.layers.transpose( + bbox_pred_list[i], perm=[0, 2, 3, 1]) + bbox_pred_reshape = fluid.layers.reshape( + bbox_pred_transpose, shape=(0, -1, 4)) + anchor_reshape = fluid.layers.reshape(anchor_list[i], shape=(-1, 4)) + anchor_var_reshape = fluid.layers.reshape( + anchor_var_list[i], shape=(-1, 4)) + cls_pred_reshape_list.append(cls_pred_reshape) + bbox_pred_reshape_list.append(bbox_pred_reshape) + anchor_reshape_list.append(anchor_reshape) + anchor_var_reshape_list.append(anchor_var_reshape) + output = {} + output['cls_pred'] = cls_pred_reshape_list + output['bbox_pred'] = bbox_pred_reshape_list + output['anchor'] = anchor_reshape_list + output['anchor_var'] = anchor_var_reshape_list + return output + + def get_prediction(self, body_feats, spatial_scale, im_info): + """ + Get prediction bounding box in test stage. + + Args: + fpn_dict(dict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. + im_info (Variable): A 2-D LoDTensor with shape [B, 3]. B is the + number of input images, each element consists of im_height, + im_width, im_scale. + + Returns: + pred_result(Variable): Prediction result with shape [N, 6]. 
Each + row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]. + N is the total number of prediction. + """ + output = self._get_output(body_feats, spatial_scale) + cls_pred_reshape_list = output['cls_pred'] + bbox_pred_reshape_list = output['bbox_pred'] + anchor_reshape_list = output['anchor'] + for i in range(self.max_level - self.min_level + 1): + cls_pred_reshape_list[i] = fluid.layers.sigmoid( + cls_pred_reshape_list[i]) + pred_result = self.output_decoder( + bboxes=bbox_pred_reshape_list, + scores=cls_pred_reshape_list, + anchors=anchor_reshape_list, + im_info=im_info) + return {'bbox': pred_result} + + def get_loss(self, body_feats, spatial_scale, im_info, gt_box, gt_label, + is_crowd): + """ + Calculate the loss of retinanet. + Args: + fpn_dict(dict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. + im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the + number of input images, each element consists of im_height, + im_width, im_scale. + gt_box(Variable): The ground-truth bounding boxes with shape [M, 4]. + M is the number of groundtruth. + gt_label(Variable): The ground-truth labels with shape [M, 1]. + M is the number of groundtruth. + is_crowd(Variable): Indicates groud-truth is crowd or not with + shape [M, 1]. M is the number of groundtruth. + + Returns: + Type: dict + loss_cls(Variable): focal loss. + loss_bbox(Variable): smooth l1 loss. 
+ """ + output = self._get_output(body_feats, spatial_scale) + cls_pred_reshape_list = output['cls_pred'] + bbox_pred_reshape_list = output['bbox_pred'] + anchor_reshape_list = output['anchor'] + anchor_var_reshape_list = output['anchor_var'] + + cls_pred_input = fluid.layers.concat(cls_pred_reshape_list, axis=1) + bbox_pred_input = fluid.layers.concat(bbox_pred_reshape_list, axis=1) + anchor_input = fluid.layers.concat(anchor_reshape_list, axis=0) + anchor_var_input = fluid.layers.concat(anchor_var_reshape_list, axis=0) + score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight, fg_num = \ + self.target_assign( + bbox_pred=bbox_pred_input, + cls_logits=cls_pred_input, + anchor_box=anchor_input, + anchor_var=anchor_var_input, + gt_boxes=gt_box, + gt_labels=gt_label, + is_crowd=is_crowd, + im_info=im_info, + num_classes=self.num_classes - 1) + fg_num = fluid.layers.reduce_sum(fg_num, name='fg_num') + loss_cls = fluid.layers.sigmoid_focal_loss( + x=score_pred, + label=score_tgt, + fg_num=fg_num, + gamma=self.gamma, + alpha=self.alpha) + loss_cls = fluid.layers.reduce_sum(loss_cls, name='loss_cls') + loss_bbox = fluid.layers.smooth_l1( + x=loc_pred, + y=loc_tgt, + sigma=self.sigma, + inside_weight=bbox_weight, + outside_weight=bbox_weight) + loss_bbox = fluid.layers.reduce_sum(loss_bbox, name='loss_bbox') + loss_bbox = loss_bbox / fg_num + return {'loss_cls': loss_cls, 'loss_bbox': loss_bbox} diff --git a/ppdet/modeling/anchor_heads/rpn_head.py b/ppdet/modeling/anchor_heads/rpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..876aafe36553b31cb1b41fec402949eb5a4c9f4b --- /dev/null +++ b/ppdet/modeling/anchor_heads/rpn_head.py @@ -0,0 +1,497 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import Normal +from paddle.fluid.regularizer import L2Decay + +from ppdet.core.workspace import register +from ppdet.modeling.ops import (AnchorGenerator, RPNTargetAssign, + GenerateProposals) + +__all__ = ['RPNTargetAssign', 'GenerateProposals', 'RPNHead', 'FPNRPNHead'] + + +@register +class RPNHead(object): + """ + RPN Head + + Args: + anchor_generator (object): `AnchorGenerator` instance + rpn_target_assign (object): `RPNTargetAssign` instance + train_proposal (object): `GenerateProposals` instance for training + test_proposal (object): `GenerateProposals` instance for testing + num_classes (int): number of classes in rpn output + """ + __inject__ = [ + 'anchor_generator', 'rpn_target_assign', 'train_proposal', + 'test_proposal' + ] + + def __init__(self, + anchor_generator=AnchorGenerator().__dict__, + rpn_target_assign=RPNTargetAssign().__dict__, + train_proposal=GenerateProposals(12000, 2000).__dict__, + test_proposal=GenerateProposals().__dict__, + num_classes=1): + super(RPNHead, self).__init__() + self.anchor_generator = anchor_generator + self.rpn_target_assign = rpn_target_assign + self.train_proposal = train_proposal + self.test_proposal = test_proposal + self.num_classes = num_classes + if isinstance(anchor_generator, dict): + self.anchor_generator = AnchorGenerator(**anchor_generator) + if isinstance(rpn_target_assign, dict): + 
self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) + if isinstance(train_proposal, dict): + self.train_proposal = GenerateProposals(**train_proposal) + if isinstance(test_proposal, dict): + self.test_proposal = GenerateProposals(**test_proposal) + + def _get_output(self, input): + """ + Get anchor and RPN head output. + + Args: + input(Variable): feature map from backbone with shape of [N, C, H, W] + + Returns: + rpn_cls_score(Variable): Output of rpn head with shape of + [N, num_anchors, H, W]. + rpn_bbox_pred(Variable): Output of rpn head with shape of + [N, num_anchors * 4, H, W]. + """ + dim_out = input.shape[1] + rpn_conv = fluid.layers.conv2d( + input=input, + num_filters=dim_out, + filter_size=3, + stride=1, + padding=1, + act='relu', + name='conv_rpn', + param_attr=ParamAttr( + name="conv_rpn_w", initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name="conv_rpn_b", learning_rate=2., regularizer=L2Decay(0.))) + # Generate anchors + self.anchor, self.anchor_var = self.anchor_generator(input=rpn_conv) + num_anchor = self.anchor.shape[2] + # Proposal classification scores + self.rpn_cls_score = fluid.layers.conv2d( + rpn_conv, + num_filters=num_anchor * self.num_classes, + filter_size=1, + stride=1, + padding=0, + act=None, + name='rpn_cls_score', + param_attr=ParamAttr( + name="rpn_cls_logits_w", initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name="rpn_cls_logits_b", + learning_rate=2., + regularizer=L2Decay(0.))) + # Proposal bbox regression deltas + self.rpn_bbox_pred = fluid.layers.conv2d( + rpn_conv, + num_filters=4 * num_anchor, + filter_size=1, + stride=1, + padding=0, + act=None, + name='rpn_bbox_pred', + param_attr=ParamAttr( + name="rpn_bbox_pred_w", initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name="rpn_bbox_pred_b", + learning_rate=2., + regularizer=L2Decay(0.))) + return self.rpn_cls_score, self.rpn_bbox_pred + + def get_proposals(self, body_feats, im_info, mode='train'): + 
""" + Get proposals according to the output of backbone. + + Args: + body_feats (dict): The dictionary of feature maps from backbone. + im_info(Variable): The information of image with shape [N, 3] with + shape (height, width, scale). + body_feat_names(list): A list of names of feature maps from + backbone. + + Returns: + rpn_rois(Variable): Output proposals with shape of (rois_num, 4). + """ + + # In RPN Heads, only the last feature map of backbone is used. + # And body_feat_names[-1] represents the last level name of backbone. + body_feat = list(body_feats.values())[-1] + rpn_cls_score, rpn_bbox_pred = self._get_output(body_feat) + + if self.num_classes == 1: + rpn_cls_prob = fluid.layers.sigmoid( + rpn_cls_score, name='rpn_cls_prob') + else: + rpn_cls_score = fluid.layers.transpose( + rpn_cls_score, perm=[0, 2, 3, 1]) + rpn_cls_score = fluid.layers.reshape( + rpn_cls_score, shape=(0, 0, 0, -1, self.num_classes)) + rpn_cls_prob_tmp = fluid.layers.softmax( + rpn_cls_score, use_cudnn=False, name='rpn_cls_prob') + rpn_cls_prob_slice = fluid.layers.slice( + rpn_cls_prob_tmp, axes=[4], starts=[1], + ends=[self.num_classes]) + rpn_cls_prob, _ = fluid.layers.topk(rpn_cls_prob_slice, 1) + rpn_cls_prob = fluid.layers.reshape( + rpn_cls_prob, shape=(0, 0, 0, -1)) + rpn_cls_prob = fluid.layers.transpose( + rpn_cls_prob, perm=[0, 3, 1, 2]) + prop_op = self.train_proposal if mode == 'train' else self.test_proposal + rpn_rois, rpn_roi_probs = prop_op( + scores=rpn_cls_prob, + bbox_deltas=rpn_bbox_pred, + im_info=im_info, + anchors=self.anchor, + variances=self.anchor_var) + return rpn_rois + + def _transform_input(self, rpn_cls_score, rpn_bbox_pred, anchor, + anchor_var): + rpn_cls_score = fluid.layers.transpose(rpn_cls_score, perm=[0, 2, 3, 1]) + rpn_bbox_pred = fluid.layers.transpose(rpn_bbox_pred, perm=[0, 2, 3, 1]) + anchor = fluid.layers.reshape(anchor, shape=(-1, 4)) + anchor_var = fluid.layers.reshape(anchor_var, shape=(-1, 4)) + rpn_cls_score = fluid.layers.reshape( + 
x=rpn_cls_score, shape=(0, -1, self.num_classes)) + rpn_bbox_pred = fluid.layers.reshape(x=rpn_bbox_pred, shape=(0, -1, 4)) + return rpn_cls_score, rpn_bbox_pred, anchor, anchor_var + + def _get_loss_input(self): + for attr in ['rpn_cls_score', 'rpn_bbox_pred', 'anchor', 'anchor_var']: + if not getattr(self, attr, None): + raise ValueError("self.{} should not be None,".format(attr), + "call RPNHead.get_proposals first") + return self._transform_input(self.rpn_cls_score, self.rpn_bbox_pred, + self.anchor, self.anchor_var) + + def get_loss(self, im_info, gt_box, is_crowd, gt_label=None): + """ + Sample proposals and Calculate rpn loss. + + Args: + im_info(Variable): The information of image with shape [N, 3] with + shape (height, width, scale). + gt_box(Variable): The ground-truth bounding boxes with shape [M, 4]. + M is the number of groundtruth. + is_crowd(Variable): Indicates groud-truth is crowd or not with + shape [M, 1]. M is the number of groundtruth. + + Returns: + Type: dict + rpn_cls_loss(Variable): RPN classification loss. + rpn_bbox_loss(Variable): RPN bounding box regression loss. 
+ + """ + rpn_cls, rpn_bbox, anchor, anchor_var = self._get_loss_input() + if self.num_classes == 1: + score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \ + self.rpn_target_assign( + bbox_pred=rpn_bbox, + cls_logits=rpn_cls, + anchor_box=anchor, + anchor_var=anchor_var, + gt_boxes=gt_box, + is_crowd=is_crowd, + im_info=im_info) + score_tgt = fluid.layers.cast(x=score_tgt, dtype='float32') + score_tgt.stop_gradient = True + rpn_cls_loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=score_pred, label=score_tgt) + else: + score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight = \ + self.rpn_target_assign( + bbox_pred=rpn_bbox, + cls_logits=rpn_cls, + anchor_box=anchor, + anchor_var=anchor_var, + gt_boxes=gt_box, + gt_labels=gt_label, + is_crowd=is_crowd, + num_classes=self.num_classes, + im_info=im_info) + labels_int64 = fluid.layers.cast(x=score_tgt, dtype='int64') + labels_int64.stop_gradient = True + rpn_cls_loss = fluid.layers.softmax_with_cross_entropy( + logits=score_pred, label=labels_int64, numeric_stable_mode=True) + + rpn_cls_loss = fluid.layers.reduce_mean( + rpn_cls_loss, name='loss_rpn_cls') + + loc_tgt = fluid.layers.cast(x=loc_tgt, dtype='float32') + loc_tgt.stop_gradient = True + rpn_reg_loss = fluid.layers.smooth_l1( + x=loc_pred, + y=loc_tgt, + sigma=3.0, + inside_weight=bbox_weight, + outside_weight=bbox_weight) + rpn_reg_loss = fluid.layers.reduce_sum( + rpn_reg_loss, name='loss_rpn_bbox') + score_shape = fluid.layers.shape(score_tgt) + score_shape = fluid.layers.cast(x=score_shape, dtype='float32') + norm = fluid.layers.reduce_prod(score_shape) + norm.stop_gradient = True + rpn_reg_loss = rpn_reg_loss / norm + + return {'loss_rpn_cls': rpn_cls_loss, 'loss_rpn_bbox': rpn_reg_loss} + + +@register +class FPNRPNHead(RPNHead): + """ + RPN Head that supports FPN input + + Args: + anchor_generator (object): `AnchorGenerator` instance + rpn_target_assign (object): `RPNTargetAssign` instance + train_proposal (object): `GenerateProposals` 
instance for training + test_proposal (object): `GenerateProposals` instance for testing + anchor_start_size (int): size of anchor at the first scale + num_chan (int): number of FPN output channels + min_level (int): lowest level of FPN output + max_level (int): highest level of FPN output + num_classes (int): number of classes in rpn output + """ + + __inject__ = [ + 'anchor_generator', 'rpn_target_assign', 'train_proposal', + 'test_proposal' + ] + + def __init__(self, + anchor_generator=AnchorGenerator().__dict__, + rpn_target_assign=RPNTargetAssign().__dict__, + train_proposal=GenerateProposals(12000, 2000).__dict__, + test_proposal=GenerateProposals().__dict__, + anchor_start_size=32, + num_chan=256, + min_level=2, + max_level=6, + num_classes=1): + super(FPNRPNHead, self).__init__(anchor_generator, rpn_target_assign, + train_proposal, test_proposal) + self.anchor_start_size = anchor_start_size + self.num_chan = num_chan + self.min_level = min_level + self.max_level = max_level + self.num_classes = num_classes + + self.fpn_rpn_list = [] + self.anchors_list = [] + self.anchor_var_list = [] + + def _get_output(self, input, feat_lvl): + """ + Get anchor and FPN RPN head output at one level. + + Args: + input(Variable): Body feature from backbone. + feat_lvl(int): Indicate the level of rpn output corresponding + to the level of feature map. + + Return: + rpn_cls_score(Variable): Output of one level of fpn rpn head with + shape of [N, num_anchors, H, W]. + rpn_bbox_pred(Variable): Output of one level of fpn rpn head with + shape of [N, num_anchors * 4, H, W]. 
+ """ + slvl = str(feat_lvl) + conv_name = 'conv_rpn_fpn' + slvl + cls_name = 'rpn_cls_logits_fpn' + slvl + bbox_name = 'rpn_bbox_pred_fpn' + slvl + conv_share_name = 'conv_rpn_fpn' + str(self.min_level) + cls_share_name = 'rpn_cls_logits_fpn' + str(self.min_level) + bbox_share_name = 'rpn_bbox_pred_fpn' + str(self.min_level) + + num_anchors = len(self.anchor_generator.aspect_ratios) + conv_rpn_fpn = fluid.layers.conv2d( + input=input, + num_filters=self.num_chan, + filter_size=3, + padding=1, + act='relu', + name=conv_name, + param_attr=ParamAttr( + name=conv_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=conv_share_name + '_b', + learning_rate=2., + regularizer=L2Decay(0.))) + + self.anchors, self.anchor_var = self.anchor_generator( + input=conv_rpn_fpn, + anchor_sizes=(self.anchor_start_size * 2. + **(feat_lvl - self.min_level), ), + stride=(2.**feat_lvl, 2.**feat_lvl)) + + cls_num_filters = num_anchors * self.num_classes + self.rpn_cls_score = fluid.layers.conv2d( + input=conv_rpn_fpn, + num_filters=cls_num_filters, + filter_size=1, + act=None, + name=cls_name, + param_attr=ParamAttr( + name=cls_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=cls_share_name + '_b', + learning_rate=2., + regularizer=L2Decay(0.))) + self.rpn_bbox_pred = fluid.layers.conv2d( + input=conv_rpn_fpn, + num_filters=num_anchors * 4, + filter_size=1, + act=None, + name=bbox_name, + param_attr=ParamAttr( + name=bbox_share_name + '_w', + initializer=Normal( + loc=0., scale=0.01)), + bias_attr=ParamAttr( + name=bbox_share_name + '_b', + learning_rate=2., + regularizer=L2Decay(0.))) + return self.rpn_cls_score, self.rpn_bbox_pred + + def _get_single_proposals(self, body_feat, im_info, feat_lvl, mode='train'): + """ + Get proposals in one level according to the output of fpn rpn head + + Args: + body_feat(Variable): the feature map from backone. 
+ im_info(Variable): The information of image with shape [N, 3] with + format (height, width, scale). + feat_lvl(int): Indicate the level of proposals corresponding to + the feature maps. + + Returns: + rpn_rois_fpn(Variable): Output proposals with shape of (rois_num, 4). + rpn_roi_probs_fpn(Variable): Scores of proposals with + shape of (rois_num, 1). + """ + + rpn_cls_score_fpn, rpn_bbox_pred_fpn = self._get_output(body_feat, + feat_lvl) + + prop_op = self.train_proposal if mode == 'train' else self.test_proposal + if self.num_classes == 1: + rpn_cls_prob_fpn = fluid.layers.sigmoid( + rpn_cls_score_fpn, name='rpn_cls_prob_fpn' + str(feat_lvl)) + else: + rpn_cls_score_fpn = fluid.layers.transpose( + rpn_cls_score_fpn, perm=[0, 2, 3, 1]) + rpn_cls_score_fpn = fluid.layers.reshape( + rpn_cls_score_fpn, shape=(0, 0, 0, -1, self.num_classes)) + rpn_cls_prob_fpn = fluid.layers.softmax( + rpn_cls_score_fpn, + use_cudnn=False, + name='rpn_cls_prob_fpn' + str(feat_lvl)) + rpn_cls_prob_fpn = fluid.layers.slice( + rpn_cls_prob_fpn, axes=[4], starts=[1], + ends=[self.num_classes]) + rpn_cls_prob_fpn, _ = fluid.layers.topk(rpn_cls_prob_fpn, 1) + rpn_cls_prob_fpn = fluid.layers.reshape( + rpn_cls_prob_fpn, shape=(0, 0, 0, -1)) + rpn_cls_prob_fpn = fluid.layers.transpose( + rpn_cls_prob_fpn, perm=[0, 3, 1, 2]) + rpn_rois_fpn, rpn_roi_prob_fpn = prop_op( + scores=rpn_cls_prob_fpn, + bbox_deltas=rpn_bbox_pred_fpn, + im_info=im_info, + anchors=self.anchors, + variances=self.anchor_var) + return rpn_rois_fpn, rpn_roi_prob_fpn + + def get_proposals(self, fpn_feats, im_info, mode='train'): + """ + Get proposals in multiple levels according to the output of fpn + rpn head + + Args: + fpn_feats(dict): A dictionary represents the output feature map + of FPN with their name. + im_info(Variable): The information of image with shape [N, 3] with + format (height, width, scale). 
+ + Return: + rois_list(Variable): Output proposals in shape of [rois_num, 4] + """ + rois_list = [] + roi_probs_list = [] + fpn_feat_names = list(fpn_feats.keys()) + for lvl in range(self.min_level, self.max_level + 1): + fpn_feat_name = fpn_feat_names[self.max_level - lvl] + fpn_feat = fpn_feats[fpn_feat_name] + rois_fpn, roi_probs_fpn = self._get_single_proposals( + fpn_feat, im_info, lvl, mode) + self.fpn_rpn_list.append((self.rpn_cls_score, self.rpn_bbox_pred)) + rois_list.append(rois_fpn) + roi_probs_list.append(roi_probs_fpn) + self.anchors_list.append(self.anchors) + self.anchor_var_list.append(self.anchor_var) + prop_op = self.train_proposal if mode == 'train' else self.test_proposal + post_nms_top_n = prop_op.post_nms_top_n + rois_collect = fluid.layers.collect_fpn_proposals( + rois_list, + roi_probs_list, + self.min_level, + self.max_level, + post_nms_top_n, + name='collect') + return rois_collect + + def _get_loss_input(self): + rpn_clses = [] + rpn_bboxes = [] + anchors = [] + anchor_vars = [] + for i in range(len(self.fpn_rpn_list)): + single_input = self._transform_input( + self.fpn_rpn_list[i][0], self.fpn_rpn_list[i][1], + self.anchors_list[i], self.anchor_var_list[i]) + rpn_clses.append(single_input[0]) + rpn_bboxes.append(single_input[1]) + anchors.append(single_input[2]) + anchor_vars.append(single_input[3]) + + rpn_cls = fluid.layers.concat(rpn_clses, axis=1) + rpn_bbox = fluid.layers.concat(rpn_bboxes, axis=1) + anchors = fluid.layers.concat(anchors) + anchor_var = fluid.layers.concat(anchor_vars) + return rpn_cls, rpn_bbox, anchors, anchor_var diff --git a/ppdet/modeling/anchor_heads/yolo_head.py b/ppdet/modeling/anchor_heads/yolo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7e756f267762827b3666e8143dce9a695fc526e2 --- /dev/null +++ b/ppdet/modeling/anchor_heads/yolo_head.py @@ -0,0 +1,309 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.regularizer import L2Decay + +from ppdet.modeling.ops import MultiClassNMS +from ppdet.core.workspace import register + +__all__ = ['YOLOv3Head'] + + +@register +class YOLOv3Head(object): + """ + Head block for YOLOv3 network + + Args: + norm_decay (float): weight decay for normalization layer weights + num_classes (int): number of output classes + ignore_thresh (float): threshold to ignore confidence loss + label_smooth (bool): whether to use label smoothing + anchors (list): anchors + anchor_masks (list): anchor masks + nms (object): an instance of `MultiClassNMS` + """ + __inject__ = ['nms'] + __shared__ = ['num_classes', 'weight_prefix_name'] + + def __init__(self, + norm_decay=0., + num_classes=80, + ignore_thresh=0.7, + label_smooth=True, + anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], + [59, 119], [116, 90], [156, 198], [373, 326]], + anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], + nms=MultiClassNMS( + score_threshold=0.01, + nms_top_k=1000, + keep_top_k=100, + nms_threshold=0.45, + background_label=-1).__dict__, + weight_prefix_name=''): + self.norm_decay = norm_decay + self.num_classes = num_classes + self.ignore_thresh = ignore_thresh + self.label_smooth = label_smooth + self.anchor_masks = 
anchor_masks + self._parse_anchors(anchors) + self.nms = nms + self.prefix_name = weight_prefix_name + if isinstance(nms, dict): + self.nms = MultiClassNMS(**nms) + + def _conv_bn(self, + input, + ch_out, + filter_size, + stride, + padding, + act='leaky', + is_test=True, + name=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + act=None, + param_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + bn_param_attr = ParamAttr( + regularizer=L2Decay(self.norm_decay), name=bn_name + '.scale') + bn_bias_attr = ParamAttr( + regularizer=L2Decay(self.norm_decay), name=bn_name + '.offset') + out = fluid.layers.batch_norm( + input=conv, + act=None, + is_test=is_test, + param_attr=bn_param_attr, + bias_attr=bn_bias_attr, + moving_mean_name=bn_name + '.mean', + moving_variance_name=bn_name + '.var') + + if act == 'leaky': + out = fluid.layers.leaky_relu(x=out, alpha=0.1) + return out + + def _detection_block(self, input, channel, is_test=True, name=None): + assert channel % 2 == 0, \ + "channel {} cannot be divided by 2 in detection block {}" \ + .format(channel, name) + + conv = input + for j in range(2): + conv = self._conv_bn( + conv, + channel, + filter_size=1, + stride=1, + padding=0, + is_test=is_test, + name='{}.{}.0'.format(name, j)) + conv = self._conv_bn( + conv, + channel * 2, + filter_size=3, + stride=1, + padding=1, + is_test=is_test, + name='{}.{}.1'.format(name, j)) + route = self._conv_bn( + conv, + channel, + filter_size=1, + stride=1, + padding=0, + is_test=is_test, + name='{}.2'.format(name)) + tip = self._conv_bn( + route, + channel * 2, + filter_size=3, + stride=1, + padding=1, + is_test=is_test, + name='{}.tip'.format(name)) + return route, tip + + def _upsample(self, input, scale=2, name=None): + out = fluid.layers.resize_nearest( + input=input, scale=float(scale), name=name) + return out + + def _parse_anchors(self, anchors): + 
""" + Check ANCHORS/ANCHOR_MASKS in config and parse mask_anchors + + """ + self.anchors = [] + self.mask_anchors = [] + + assert len(anchors) > 0, "ANCHORS not set." + assert len(self.anchor_masks) > 0, "ANCHOR_MASKS not set." + + for anchor in anchors: + assert len(anchor) == 2, "anchor {} len should be 2".format(anchor) + self.anchors.extend(anchor) + + anchor_num = len(anchors) + for masks in self.anchor_masks: + self.mask_anchors.append([]) + for mask in masks: + assert mask < anchor_num, "anchor mask index overflow" + self.mask_anchors[-1].extend(anchors[mask]) + + def _get_outputs(self, input, is_train=True): + """ + Get YOLOv3 head output + + Args: + input (list): List of Variables, output of backbone stages + is_train (bool): whether in train or test mode + + Returns: + outputs (list): Variables of each output layer + """ + + outputs = [] + + # get last out_layer_num blocks in reverse order + out_layer_num = len(self.anchor_masks) + blocks = input[-1:-out_layer_num - 1:-1] + + route = None + for i, block in enumerate(blocks): + if i > 0: # perform concat in first 2 detection_block + block = fluid.layers.concat(input=[route, block], axis=1) + route, tip = self._detection_block( + block, + channel=512 // (2**i), + is_test=(not is_train), + name=self.prefix_name + "yolo_block.{}".format(i)) + + # out channel number = mask_num * (5 + class_num) + num_filters = len(self.anchor_masks[i]) * (self.num_classes + 5) + block_out = fluid.layers.conv2d( + input=tip, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + act=None, + param_attr=ParamAttr(name=self.prefix_name + + "yolo_output.{}.conv.weights".format(i)), + bias_attr=ParamAttr( + regularizer=L2Decay(0.), + name=self.prefix_name + + "yolo_output.{}.conv.bias".format(i))) + outputs.append(block_out) + + if i < len(blocks) - 1: + # do not perform upsample in the last detection_block + route = self._conv_bn( + input=route, + ch_out=256 // (2**i), + filter_size=1, + stride=1, + padding=0, + 
is_test=(not is_train), + name=self.prefix_name + "yolo_transition.{}".format(i)) + # upsample + route = self._upsample(route) + + return outputs + + def get_loss(self, input, gt_box, gt_label, gt_score): + """ + Get final loss of network of YOLOv3. + + Args: + input (list): List of Variables, output of backbone stages + gt_box (Variable): The ground-truth boudding boxes. + gt_label (Variable): The ground-truth class labels. + gt_score (Variable): The ground-truth boudding boxes mixup scores. + + Returns: + loss (Variable): The loss Variable of YOLOv3 network. + + """ + outputs = self._get_outputs(input, is_train=True) + + losses = [] + downsample = 32 + for i, output in enumerate(outputs): + anchor_mask = self.anchor_masks[i] + loss = fluid.layers.yolov3_loss( + x=output, + gt_box=gt_box, + gt_label=gt_label, + gt_score=gt_score, + anchors=self.anchors, + anchor_mask=anchor_mask, + class_num=self.num_classes, + ignore_thresh=self.ignore_thresh, + downsample_ratio=downsample, + use_label_smooth=self.label_smooth, + name=self.prefix_name + "yolo_loss" + str(i)) + losses.append(fluid.layers.reduce_mean(loss)) + downsample //= 2 + + return sum(losses) + + def get_prediction(self, input, im_size): + """ + Get prediction result of YOLOv3 network + + Args: + input (list): List of Variables, output of backbone stages + im_size (Variable): Variable of size([h, w]) of each image + + Returns: + pred (Variable): The prediction result after non-max suppress. 
+ + """ + + outputs = self._get_outputs(input, is_train=False) + + boxes = [] + scores = [] + downsample = 32 + for i, output in enumerate(outputs): + box, score = fluid.layers.yolo_box( + x=output, + img_size=im_size, + anchors=self.mask_anchors[i], + class_num=self.num_classes, + conf_thresh=self.nms.score_threshold, + downsample_ratio=downsample, + name=self.prefix_name + "yolo_box" + str(i)) + boxes.append(box) + scores.append(fluid.layers.transpose(score, perm=[0, 2, 1])) + + downsample //= 2 + + yolo_boxes = fluid.layers.concat(boxes, axis=1) + yolo_scores = fluid.layers.concat(scores, axis=2) + pred = self.nms(bboxes=yolo_boxes, scores=yolo_scores) + return {'bbox': pred} diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d2f8aef6172f37e1389a573aa12c4bcc4f71b6c --- /dev/null +++ b/ppdet/modeling/architectures/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from . import faster_rcnn +from . import mask_rcnn +from . import cascade_rcnn +from . import cascade_mask_rcnn +from . import yolov3 +from . import ssd +from . import retinanet +from . import blazeface +from . 
import faceboxes + +from .faster_rcnn import * +from .mask_rcnn import * +from .cascade_rcnn import * +from .cascade_mask_rcnn import * +from .yolov3 import * +from .ssd import * +from .retinanet import * +from .blazeface import * +from .faceboxes import * diff --git a/ppdet/modeling/architectures/blazeface.py b/ppdet/modeling/architectures/blazeface.py new file mode 100644 index 0000000000000000000000000000000000000000..cc9a2bb338fd1f3b40be09b7e351c24df06651a4 --- /dev/null +++ b/ppdet/modeling/architectures/blazeface.py @@ -0,0 +1,182 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from paddle import fluid + +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.regularizer import L2Decay + +from ppdet.core.workspace import register +from ppdet.modeling.ops import SSDOutputDecoder + +__all__ = ['BlazeFace'] + + +@register +class BlazeFace(object): + """ + BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs, + see https://arxiv.org/abs/1907.05047 + + Args: + backbone (object): backbone instance + output_decoder (object): `SSDOutputDecoder` instance + min_sizes (list|None): min sizes of generated prior boxes. + max_sizes (list|None): max sizes of generated prior boxes. Default: None. 
+ num_classes (int): number of output classes + use_density_prior_box (bool): whether or not use density_prior_box + instead of prior_box + densities (list|None): the densities of generated density prior boxes, + this attribute should be a list or tuple of integers + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'output_decoder'] + __shared__ = ['num_classes'] + + def __init__(self, + backbone="BlazeNet", + output_decoder=SSDOutputDecoder().__dict__, + min_sizes=[[16., 24.], [32., 48., 64., 80., 96., 128.]], + max_sizes=None, + steps=[8., 16.], + num_classes=2, + use_density_prior_box=False, + densities=[[2, 2], [2, 1, 1, 1, 1, 1]]): + super(BlazeFace, self).__init__() + self.backbone = backbone + self.num_classes = num_classes + self.output_decoder = output_decoder + if isinstance(output_decoder, dict): + self.output_decoder = SSDOutputDecoder(**output_decoder) + self.min_sizes = min_sizes + self.max_sizes = max_sizes + self.steps = steps + self.use_density_prior_box = use_density_prior_box + self.densities = densities + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + if mode == 'train': + gt_box = feed_vars['gt_box'] + gt_label = feed_vars['gt_label'] + + body_feats = self.backbone(im) + locs, confs, box, box_var = self._multi_box_head( + inputs=body_feats, + image=im, + num_classes=self.num_classes, + use_density_prior_box=self.use_density_prior_box) + + if mode == 'train': + loss = fluid.layers.ssd_loss( + locs, + confs, + gt_box, + gt_label, + box, + box_var, + overlap_threshold=0.35, + neg_overlap=0.35) + loss = fluid.layers.reduce_sum(loss) + loss.persistable = True + return {'loss': loss} + else: + pred = self.output_decoder(locs, confs, box, box_var) + return {'bbox': pred} + + def _multi_box_head(self, + inputs, + image, + num_classes=2, + use_density_prior_box=False): + def permute_and_reshape(input, last_dim): + trans = fluid.layers.transpose(input, perm=[0, 2, 3, 1]) + compile_shape = [0, -1, last_dim] + 
return fluid.layers.reshape(trans, shape=compile_shape) + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + locs, confs = [], [] + boxes, vars = [], [] + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + + for i, input in enumerate(inputs): + min_size = self.min_sizes[i] + + if use_density_prior_box: + densities = self.densities[i] + box, var = fluid.layers.density_prior_box( + input, + image, + densities=densities, + fixed_sizes=min_size, + fixed_ratios=[1.], + clip=False, + offset=0.5) + else: + box, var = fluid.layers.prior_box( + input, + image, + min_sizes=min_size, + max_sizes=None, + steps=[self.steps[i]] * 2, + aspect_ratios=[1.], + clip=False, + flip=False, + offset=0.5) + + num_boxes = box.shape[2] + + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + num_loc_output = num_boxes * 4 + num_conf_output = num_boxes * num_classes + # get loc + mbox_loc = fluid.layers.conv2d( + input, num_loc_output, 3, 1, 1, bias_attr=b_attr) + loc = permute_and_reshape(mbox_loc, 4) + # get conf + mbox_conf = fluid.layers.conv2d( + input, num_conf_output, 3, 1, 1, bias_attr=b_attr) + conf = permute_and_reshape(mbox_conf, 2) + + locs.append(loc) + confs.append(conf) + boxes.append(box) + vars.append(var) + + face_mbox_loc = fluid.layers.concat(locs, axis=1) + face_mbox_conf = fluid.layers.concat(confs, axis=1) + prior_boxes = fluid.layers.concat(boxes) + box_vars = fluid.layers.concat(vars) + return face_mbox_loc, face_mbox_conf, prior_boxes, box_vars + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars): + return self.build(feed_vars, 'eval') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') + + def is_bbox_normalized(self): + return True diff --git a/ppdet/modeling/architectures/cascade_mask_rcnn.py b/ppdet/modeling/architectures/cascade_mask_rcnn.py new file mode 100644 index 
0000000000000000000000000000000000000000..f77ee62759bd9fe7e2b97669e0bdd82d1bddd9ad --- /dev/null +++ b/ppdet/modeling/architectures/cascade_mask_rcnn.py @@ -0,0 +1,384 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +import paddle.fluid as fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['CascadeMaskRCNN'] + + +@register +class CascadeMaskRCNN(object): + """ + Cascade Mask R-CNN architecture, see https://arxiv.org/abs/1712.00726 + + Args: + backbone (object): backbone instance + rpn_head (object): `RPNhead` instance + bbox_assigner (object): `BBoxAssigner` instance + roi_extractor (object): ROI extractor instance + bbox_head (object): `BBoxHead` instance + mask_assigner (object): `MaskAssigner` instance + mask_head (object): `MaskHead` instance + fpn (object): feature pyramid network instance + """ + + __category__ = 'architecture' + __inject__ = [ + 'backbone', 'rpn_head', 'bbox_assigner', 'roi_extractor', 'bbox_head', + 'mask_assigner', 'mask_head', 'fpn' + ] + + def __init__(self, + backbone, + rpn_head, + roi_extractor='FPNRoIAlign', + bbox_head='CascadeBBoxHead', + bbox_assigner='CascadeBBoxAssigner', + mask_assigner='MaskAssigner', + mask_head='MaskHead', + 
rpn_only=False, + fpn='FPN'): + super(CascadeMaskRCNN, self).__init__() + assert fpn is not None, "cascade RCNN requires FPN" + self.backbone = backbone + self.fpn = fpn + self.rpn_head = rpn_head + self.bbox_assigner = bbox_assigner + self.roi_extractor = roi_extractor + self.bbox_head = bbox_head + self.mask_assigner = mask_assigner + self.mask_head = mask_head + self.rpn_only = rpn_only + # Cascade local cfg + self.cls_agnostic_bbox_reg = 2 + (brw0, brw1, brw2) = self.bbox_assigner.bbox_reg_weights + self.cascade_bbox_reg_weights = [ + [1. / brw0, 1. / brw0, 2. / brw0, 2. / brw0], + [1. / brw1, 1. / brw1, 2. / brw1, 2. / brw1], + [1. / brw2, 1. / brw2, 2. / brw2, 2. / brw2] + ] + self.cascade_rcnn_loss_weight = [1.0, 0.5, 0.25] + + def build(self, feed_vars, mode='train'): + if mode == 'train': + required_fields = [ + 'gt_label', 'gt_box', 'gt_mask', 'is_crowd', 'im_info' + ] + else: + required_fields = ['im_shape', 'im_info'] + self._input_check(required_fields, feed_vars) + + im = feed_vars['image'] + if mode == 'train': + gt_box = feed_vars['gt_box'] + is_crowd = feed_vars['is_crowd'] + + im_info = feed_vars['im_info'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + # backbone + body_feats = self.backbone(im) + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32')) + for k, v in body_feats.items()) + + # FPN + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + # rpn proposals + rpn_rois = self.rpn_head.get_proposals(body_feats, im_info, mode=mode) + + if mode == 'train': + rpn_loss = self.rpn_head.get_loss(im_info, gt_box, is_crowd) + else: + if self.rpn_only: + im_scale = fluid.layers.slice( + im_info, [1], starts=[2], ends=[3]) + im_scale = fluid.layers.sequence_expand(im_scale, rpn_rois) + rois = rpn_rois / im_scale + return 
{'proposal': rois} + + proposal_list = [] + roi_feat_list = [] + rcnn_pred_list = [] + rcnn_target_list = [] + + proposals = None + bbox_pred = None + for i in range(3): + if i > 0: + refined_bbox = self._decode_box( + proposals, + bbox_pred, + curr_stage=i - 1, ) + else: + refined_bbox = rpn_rois + + if mode == 'train': + outs = self.bbox_assigner( + input_rois=refined_bbox, feed_vars=feed_vars, curr_stage=i) + + proposals = outs[0] + rcnn_target_list.append(outs) + else: + proposals = refined_bbox + proposal_list.append(proposals) + + # extract roi features + roi_feat = self.roi_extractor(body_feats, proposals, spatial_scale) + roi_feat_list.append(roi_feat) + + # bbox head + cls_score, bbox_pred = self.bbox_head.get_output( + roi_feat, + wb_scalar=1.0 / self.cascade_rcnn_loss_weight[i], + name='_' + str(i + 1) if i > 0 else '') + rcnn_pred_list.append((cls_score, bbox_pred)) + + # get mask rois + rois = proposal_list[2] + + if mode == 'train': + loss = self.bbox_head.get_loss(rcnn_pred_list, rcnn_target_list, + self.cascade_rcnn_loss_weight) + loss.update(rpn_loss) + + labels_int32 = rcnn_target_list[2][1] + + mask_rois, roi_has_mask_int32, mask_int32 = self.mask_assigner( + rois=rois, + gt_classes=feed_vars['gt_label'], + is_crowd=feed_vars['is_crowd'], + gt_segms=feed_vars['gt_mask'], + im_info=feed_vars['im_info'], + labels_int32=labels_int32) + + if self.fpn is None: + bbox_head_feat = self.bbox_head.get_head_feat() + feat = fluid.layers.gather(bbox_head_feat, roi_has_mask_int32) + else: + feat = self.roi_extractor( + body_feats, mask_rois, spatial_scale, is_mask=True) + mask_loss = self.mask_head.get_loss(feat, mask_int32) + loss.update(mask_loss) + + total_loss = fluid.layers.sum(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + else: + mask_name = 'mask_pred' + mask_pred, bbox_pred = self.single_scale_eval( + body_feats, spatial_scale, im_info, mask_name, bbox_pred, + roi_feat_list, rcnn_pred_list, proposal_list, + 
feed_vars['im_shape']) + return {'bbox': bbox_pred, 'mask': mask_pred} + + def build_multi_scale(self, feed_vars, mask_branch=False): + required_fields = ['image', 'im_info'] + self._input_check(required_fields, feed_vars) + + ims = [] + for k in feed_vars.keys(): + if 'image' in k: + ims.append(feed_vars[k]) + result = {} + + if not mask_branch: + assert 'im_shape' in feed_vars, \ + "{} has no im_shape field".format(feed_vars) + result.update(feed_vars) + + for i, im in enumerate(ims): + im_info = fluid.layers.slice( + input=feed_vars['im_info'], + axes=[1], + starts=[3 * i], + ends=[3 * i + 3]) + body_feats = self.backbone(im) + result.update(body_feats) + + # FPN + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + rois = self.rpn_head.get_proposals(body_feats, im_info, mode='test') + if not mask_branch: + im_shape = feed_vars['im_shape'] + body_feat_names = list(body_feats.keys()) + proposal_list = [] + roi_feat_list = [] + rcnn_pred_list = [] + + proposals = None + bbox_pred = None + for i in range(3): + if i > 0: + refined_bbox = self._decode_box( + proposals, + bbox_pred, + curr_stage=i - 1, ) + else: + refined_bbox = rois + + proposals = refined_bbox + proposal_list.append(proposals) + + # extract roi features + roi_feat = self.roi_extractor(body_feats, proposals, + spatial_scale) + roi_feat_list.append(roi_feat) + + # bbox head + cls_score, bbox_pred = self.bbox_head.get_output( + roi_feat, + wb_scalar=1.0 / self.cascade_rcnn_loss_weight[i], + name='_' + str(i + 1) if i > 0 else '') + rcnn_pred_list.append((cls_score, bbox_pred)) + + # get mask rois + if self.fpn is None: + body_feat = body_feats[body_feat_names[-1]] + pred = self.bbox_head.get_prediction( + im_info, + im_shape, + roi_feat_list, + rcnn_pred_list, + proposal_list, + self.cascade_bbox_reg_weights, + return_box_score=True) + bbox_name = 'bbox_' + str(i) + score_name = 'score_' + str(i) + if 'flip' in im.name: + bbox_name += '_flip' + score_name += '_flip' 
+ result[bbox_name] = pred['bbox'] + result[score_name] = pred['score'] + else: + mask_name = 'mask_pred_' + str(i) + bbox_pred = feed_vars['bbox'] + result.update({im.name: im}) + if 'flip' in im.name: + mask_name += '_flip' + bbox_pred = feed_vars['bbox_flip'] + mask_pred, bbox_pred = self.single_scale_eval( + body_feats, + spatial_scale, + im_info, + mask_name, + bbox_pred=bbox_pred, + use_multi_test=True) + result[mask_name] = mask_pred + return result + + def single_scale_eval(self, + body_feats, + spatial_scale, + im_info, + mask_name, + bbox_pred, + roi_feat_list=None, + rcnn_pred_list=None, + proposal_list=None, + im_shape=None, + use_multi_test=False): + if self.fpn is None: + last_feat = body_feats[list(body_feats.keys())[-1]] + if not use_multi_test: + bbox_pred = self.bbox_head.get_prediction( + im_info, im_shape, roi_feat_list, rcnn_pred_list, proposal_list, + self.cascade_bbox_reg_weights) + bbox_pred = bbox_pred['bbox'] + + # share weight + bbox_shape = fluid.layers.shape(bbox_pred) + bbox_size = fluid.layers.reduce_prod(bbox_shape) + bbox_size = fluid.layers.reshape(bbox_size, [1, 1]) + size = fluid.layers.fill_constant([1, 1], value=6, dtype='int32') + cond = fluid.layers.less_than(x=bbox_size, y=size) + + mask_pred = fluid.layers.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=False, + name=mask_name) + with fluid.layers.control_flow.Switch() as switch: + with switch.case(cond): + fluid.layers.assign(input=bbox_pred, output=mask_pred) + with switch.default(): + bbox = fluid.layers.slice(bbox_pred, [1], starts=[2], ends=[6]) + + im_scale = fluid.layers.slice( + im_info, [1], starts=[2], ends=[3]) + im_scale = fluid.layers.sequence_expand(im_scale, bbox) + + mask_rois = bbox * im_scale + if self.fpn is None: + mask_feat = self.roi_extractor(last_feat, mask_rois) + mask_feat = self.bbox_head.get_head_feat(mask_feat) + else: + mask_feat = self.roi_extractor( + body_feats, mask_rois, spatial_scale, is_mask=True) + + 
mask_out = self.mask_head.get_prediction(mask_feat, bbox) + fluid.layers.assign(input=mask_out, output=mask_pred) + return mask_pred, bbox_pred + + def _input_check(self, require_fields, feed_vars): + for var in require_fields: + assert var in feed_vars, \ + "{} has no {} field".format(feed_vars, var) + + def _decode_box(self, proposals, bbox_pred, curr_stage): + rcnn_loc_delta_r = fluid.layers.reshape( + bbox_pred, (-1, self.cls_agnostic_bbox_reg, 4)) + # only use fg box delta to decode box + rcnn_loc_delta_s = fluid.layers.slice( + rcnn_loc_delta_r, axes=[1], starts=[1], ends=[2]) + refined_bbox = fluid.layers.box_coder( + prior_box=proposals, + prior_box_var=self.cascade_bbox_reg_weights[curr_stage], + target_box=rcnn_loc_delta_s, + code_type='decode_center_size', + box_normalized=False, + axis=1, ) + refined_bbox = fluid.layers.reshape(refined_bbox, shape=[-1, 4]) + + return refined_bbox + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars, multi_scale=None, mask_branch=False): + if multi_scale: + return self.build_multi_scale(feed_vars, mask_branch) + return self.build(feed_vars, 'test') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') diff --git a/ppdet/modeling/architectures/cascade_rcnn.py b/ppdet/modeling/architectures/cascade_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..b80a8d7f62ccd13f632fe8124372156d656d2abc --- /dev/null +++ b/ppdet/modeling/architectures/cascade_rcnn.py @@ -0,0 +1,289 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +import paddle.fluid as fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['CascadeRCNN'] + + +@register +class CascadeRCNN(object): + """ + Cascade R-CNN architecture, see https://arxiv.org/abs/1712.00726 + + Args: + backbone (object): backbone instance + rpn_head (object): `RPNhead` instance + bbox_assigner (object): `BBoxAssigner` instance + roi_extractor (object): ROI extractor instance + bbox_head (object): `BBoxHead` instance + fpn (object): feature pyramid network instance + """ + + __category__ = 'architecture' + __inject__ = [ + 'backbone', 'fpn', 'rpn_head', 'bbox_assigner', 'roi_extractor', + 'bbox_head' + ] + + def __init__(self, + backbone, + rpn_head, + roi_extractor='FPNRoIAlign', + bbox_head='CascadeBBoxHead', + bbox_assigner='CascadeBBoxAssigner', + rpn_only=False, + fpn='FPN'): + super(CascadeRCNN, self).__init__() + assert fpn is not None, "cascade RCNN requires FPN" + self.backbone = backbone + self.fpn = fpn + self.rpn_head = rpn_head + self.bbox_assigner = bbox_assigner + self.roi_extractor = roi_extractor + self.bbox_head = bbox_head + self.rpn_only = rpn_only + # Cascade local cfg + self.cls_agnostic_bbox_reg = 2 + (brw0, brw1, brw2) = self.bbox_assigner.bbox_reg_weights + self.cascade_bbox_reg_weights = [ + [1. / brw0, 1. / brw0, 2. / brw0, 2. / brw0], + [1. / brw1, 1. / brw1, 2. / brw1, 2. / brw1], + [1. 
/ brw2, 1. / brw2, 2. / brw2, 2. / brw2] + ] + self.cascade_rcnn_loss_weight = [1.0, 0.5, 0.25] + + def build(self, feed_vars, mode='train'): + if mode == 'train': + required_fields = ['gt_label', 'gt_box', 'is_crowd', 'im_info'] + else: + required_fields = ['im_shape', 'im_info'] + self._input_check(required_fields, feed_vars) + + im = feed_vars['image'] + im_info = feed_vars['im_info'] + + if mode == 'train': + gt_box = feed_vars['gt_box'] + is_crowd = feed_vars['is_crowd'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + # backbone + body_feats = self.backbone(im) + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32')) + for k, v in body_feats.items()) + + # FPN + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + # rpn proposals + rpn_rois = self.rpn_head.get_proposals(body_feats, im_info, mode=mode) + + if mode == 'train': + rpn_loss = self.rpn_head.get_loss(im_info, gt_box, is_crowd) + else: + if self.rpn_only: + im_scale = fluid.layers.slice( + im_info, [1], starts=[2], ends=[3]) + im_scale = fluid.layers.sequence_expand(im_scale, rpn_rois) + rois = rpn_rois / im_scale + return {'proposal': rois} + + proposal_list = [] + roi_feat_list = [] + rcnn_pred_list = [] + rcnn_target_list = [] + + proposals = None + bbox_pred = None + for i in range(3): + if i > 0: + refined_bbox = self._decode_box( + proposals, + bbox_pred, + curr_stage=i - 1, ) + else: + refined_bbox = rpn_rois + + if mode == 'train': + outs = self.bbox_assigner( + input_rois=refined_bbox, feed_vars=feed_vars, curr_stage=i) + + proposals = outs[0] + rcnn_target_list.append(outs) + else: + proposals = refined_bbox + proposal_list.append(proposals) + + # extract roi features + roi_feat = self.roi_extractor(body_feats, proposals, spatial_scale) + 
roi_feat_list.append(roi_feat) + + # bbox head + cls_score, bbox_pred = self.bbox_head.get_output( + roi_feat, + wb_scalar=1.0 / self.cascade_rcnn_loss_weight[i], + name='_' + str(i + 1) if i > 0 else '') + rcnn_pred_list.append((cls_score, bbox_pred)) + + if mode == 'train': + loss = self.bbox_head.get_loss(rcnn_pred_list, rcnn_target_list, + self.cascade_rcnn_loss_weight) + loss.update(rpn_loss) + total_loss = fluid.layers.sum(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + else: + pred = self.bbox_head.get_prediction( + im_info, feed_vars['im_shape'], roi_feat_list, rcnn_pred_list, + proposal_list, self.cascade_bbox_reg_weights, + self.cls_agnostic_bbox_reg) + return pred + + def build_multi_scale(self, feed_vars): + required_fields = ['image', 'im_shape', 'im_info'] + self._input_check(required_fields, feed_vars) + ims = [] + for k in feed_vars.keys(): + if 'image' in k: + ims.append(feed_vars[k]) + result = {} + result.update(feed_vars) + for i, im in enumerate(ims): + im_info = fluid.layers.slice( + input=feed_vars['im_info'], + axes=[1], + starts=[3 * i], + ends=[3 * i + 3]) + im_shape = feed_vars['im_shape'] + + # backbone + body_feats = self.backbone(im) + result.update(body_feats) + body_feat_names = list(body_feats.keys()) + + # FPN + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + # rpn proposals + rpn_rois = self.rpn_head.get_proposals( + body_feats, im_info, mode='test') + + proposal_list = [] + roi_feat_list = [] + rcnn_pred_list = [] + + proposals = None + bbox_pred = None + for i in range(3): + if i > 0: + refined_bbox = self._decode_box( + proposals, + bbox_pred, + curr_stage=i - 1, ) + else: + refined_bbox = rpn_rois + + proposals = refined_bbox + proposal_list.append(proposals) + + # extract roi features + roi_feat = self.roi_extractor(body_feats, proposals, + spatial_scale) + roi_feat_list.append(roi_feat) + + # bbox head + cls_score, bbox_pred = self.bbox_head.get_output( + 
roi_feat, + wb_scalar=1.0 / self.cascade_rcnn_loss_weight[i], + name='_' + str(i + 1) if i > 0 else '') + rcnn_pred_list.append((cls_score, bbox_pred)) + + # get mask rois + rois = proposal_list[2] + + if self.fpn is None: + last_feat = body_feats[list(body_feats.keys())[-1]] + roi_feat = self.roi_extractor(last_feat, rois) + else: + roi_feat = self.roi_extractor(body_feats, rois, spatial_scale) + + pred = self.bbox_head.get_prediction( + im_info, + im_shape, + roi_feat_list, + rcnn_pred_list, + proposal_list, + self.cascade_bbox_reg_weights, + self.cls_agnostic_bbox_reg, + return_box_score=True) + bbox_name = 'bbox_' + str(i) + score_name = 'score_' + str(i) + if 'flip' in im.name: + bbox_name += '_flip' + score_name += '_flip' + result[bbox_name] = pred['bbox'] + result[score_name] = pred['score'] + return result + + def _input_check(self, require_fields, feed_vars): + for var in require_fields: + assert var in feed_vars, \ + "{} has no {} field".format(feed_vars, var) + + def _decode_box(self, proposals, bbox_pred, curr_stage): + rcnn_loc_delta_r = fluid.layers.reshape( + bbox_pred, (-1, self.cls_agnostic_bbox_reg, 4)) + # only use fg box delta to decode box + rcnn_loc_delta_s = fluid.layers.slice( + rcnn_loc_delta_r, axes=[1], starts=[1], ends=[2]) + refined_bbox = fluid.layers.box_coder( + prior_box=proposals, + prior_box_var=self.cascade_bbox_reg_weights[curr_stage], + target_box=rcnn_loc_delta_s, + code_type='decode_center_size', + box_normalized=False, + axis=1, ) + refined_bbox = fluid.layers.reshape(refined_bbox, shape=[-1, 4]) + + return refined_bbox + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars, multi_scale=None): + if multi_scale: + return self.build_multi_scale(feed_vars) + return self.build(feed_vars, 'test') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') diff --git a/ppdet/modeling/architectures/faceboxes.py b/ppdet/modeling/architectures/faceboxes.py new file mode 100644 
index 0000000000000000000000000000000000000000..194b3a7e864f30051a20e514bbf9a1e970548afc --- /dev/null +++ b/ppdet/modeling/architectures/faceboxes.py @@ -0,0 +1,154 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from paddle import fluid + +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.regularizer import L2Decay + +from ppdet.core.workspace import register +from ppdet.modeling.ops import SSDOutputDecoder + +__all__ = ['FaceBoxes'] + + +@register +class FaceBoxes(object): + """ + FaceBoxes: A CPU Real-time Face Detector with High Accuracy, + see https://arxiv.org/abs/1708.05234 + + Args: + backbone (object): backbone instance + output_decoder (object): `SSDOutputDecoder` instance + densities (list|None): the densities of generated density prior boxes, + this attribute should be a list or tuple of integers. + fixed_sizes (list|None): the fixed sizes of generated density prior boxes, + this attribute should be a list or tuple of the same length as `densities`. 
+ num_classes (int): number of output classes + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'output_decoder'] + __shared__ = ['num_classes'] + + def __init__(self, + backbone="FaceBoxNet", + output_decoder=SSDOutputDecoder().__dict__, + densities=[[4, 2, 1], [1], [1]], + fixed_sizes=[[32., 64., 128.], [256.], [512.]], + num_classes=2): + super(FaceBoxes, self).__init__() + self.backbone = backbone + self.num_classes = num_classes + self.output_decoder = output_decoder + if isinstance(output_decoder, dict): + self.output_decoder = SSDOutputDecoder(**output_decoder) + self.densities = densities + self.fixed_sizes = fixed_sizes + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + if mode == 'train': + gt_box = feed_vars['gt_box'] + gt_label = feed_vars['gt_label'] + + body_feats = self.backbone(im) + locs, confs, box, box_var = self._multi_box_head( + inputs=body_feats, image=im, num_classes=self.num_classes) + + if mode == 'train': + loss = fluid.layers.ssd_loss( + locs, + confs, + gt_box, + gt_label, + box, + box_var, + overlap_threshold=0.35, + neg_overlap=0.35) + loss = fluid.layers.reduce_sum(loss) + loss.persistable = True + return {'loss': loss} + else: + pred = self.output_decoder(locs, confs, box, box_var) + return {'bbox': pred} + + def _multi_box_head(self, inputs, image, num_classes=2): + def permute_and_reshape(input, last_dim): + trans = fluid.layers.transpose(input, perm=[0, 2, 3, 1]) + compile_shape = [0, -1, last_dim] + return fluid.layers.reshape(trans, shape=compile_shape) + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + locs, confs = [], [] + boxes, vars = [], [] + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + + for i, input in enumerate(inputs): + densities = self.densities[i] + fixed_sizes = self.fixed_sizes[i] + box, var = fluid.layers.density_prior_box( + input, + image, + densities=densities, + fixed_sizes=fixed_sizes, + 
fixed_ratios=[1.], + clip=False, + offset=0.5) + + num_boxes = box.shape[2] + + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + num_loc_output = num_boxes * 4 + num_conf_output = num_boxes * num_classes + # get loc + mbox_loc = fluid.layers.conv2d( + input, num_loc_output, 3, 1, 1, bias_attr=b_attr) + loc = permute_and_reshape(mbox_loc, 4) + # get conf + mbox_conf = fluid.layers.conv2d( + input, num_conf_output, 3, 1, 1, bias_attr=b_attr) + conf = permute_and_reshape(mbox_conf, 2) + + locs.append(loc) + confs.append(conf) + boxes.append(box) + vars.append(var) + + face_mbox_loc = fluid.layers.concat(locs, axis=1) + face_mbox_conf = fluid.layers.concat(confs, axis=1) + prior_boxes = fluid.layers.concat(boxes) + box_vars = fluid.layers.concat(vars) + return face_mbox_loc, face_mbox_conf, prior_boxes, box_vars + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars): + return self.build(feed_vars, 'eval') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') + + def is_bbox_normalized(self): + return True diff --git a/ppdet/modeling/architectures/faster_rcnn.py b/ppdet/modeling/architectures/faster_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..e0ef7355c0d358d7b409ce3080f2416cd38de0b3 --- /dev/null +++ b/ppdet/modeling/architectures/faster_rcnn.py @@ -0,0 +1,204 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +from paddle import fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['FasterRCNN'] + + +@register +class FasterRCNN(object): + """ + Faster R-CNN architecture, see https://arxiv.org/abs/1506.01497 + Args: + backbone (object): backbone instance + rpn_head (object): `RPNhead` instance + bbox_assigner (object): `BBoxAssigner` instance + roi_extractor (object): ROI extractor instance + bbox_head (object): `BBoxHead` instance + fpn (object): feature pyramid network instance + """ + + __category__ = 'architecture' + __inject__ = [ + 'backbone', 'rpn_head', 'bbox_assigner', 'roi_extractor', 'bbox_head', + 'fpn' + ] + + def __init__(self, + backbone, + rpn_head, + roi_extractor, + bbox_head='BBoxHead', + bbox_assigner='BBoxAssigner', + rpn_only=False, + fpn=None): + super(FasterRCNN, self).__init__() + self.backbone = backbone + self.rpn_head = rpn_head + self.bbox_assigner = bbox_assigner + self.roi_extractor = roi_extractor + self.bbox_head = bbox_head + self.fpn = fpn + self.rpn_only = rpn_only + + def build(self, feed_vars, mode='train'): + if mode == 'train': + required_fields = ['gt_label', 'gt_box', 'is_crowd', 'im_info'] + else: + required_fields = ['im_shape', 'im_info'] + self._input_check(required_fields, feed_vars) + + im = feed_vars['image'] + im_info = feed_vars['im_info'] + if mode == 'train': + gt_box = feed_vars['gt_box'] + is_crowd = feed_vars['is_crowd'] + else: + im_shape = feed_vars['im_shape'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + body_feats = self.backbone(im) + body_feat_names = 
list(body_feats.keys()) + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32')) + for k, v in body_feats.items()) + + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + rois = self.rpn_head.get_proposals(body_feats, im_info, mode=mode) + + if mode == 'train': + rpn_loss = self.rpn_head.get_loss(im_info, gt_box, is_crowd) + # sampled rpn proposals + for var in ['gt_label', 'is_crowd', 'gt_box', 'im_info']: + assert var in feed_vars, "{} has no {}".format(feed_vars, var) + outs = self.bbox_assigner( + rpn_rois=rois, + gt_classes=feed_vars['gt_label'], + is_crowd=feed_vars['is_crowd'], + gt_boxes=feed_vars['gt_box'], + im_info=feed_vars['im_info']) + + rois = outs[0] + labels_int32 = outs[1] + bbox_targets = outs[2] + bbox_inside_weights = outs[3] + bbox_outside_weights = outs[4] + else: + if self.rpn_only: + im_scale = fluid.layers.slice( + im_info, [1], starts=[2], ends=[3]) + im_scale = fluid.layers.sequence_expand(im_scale, rois) + rois = rois / im_scale + return {'proposal': rois} + if self.fpn is None: + # in models without FPN, roi extractor only uses the last level of + # feature maps. And body_feat_names[-1] represents the name of + # last feature map. 
+ body_feat = body_feats[body_feat_names[-1]] + roi_feat = self.roi_extractor(body_feat, rois) + else: + roi_feat = self.roi_extractor(body_feats, rois, spatial_scale) + + if mode == 'train': + loss = self.bbox_head.get_loss(roi_feat, labels_int32, bbox_targets, + bbox_inside_weights, + bbox_outside_weights) + loss.update(rpn_loss) + total_loss = fluid.layers.sum(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + else: + pred = self.bbox_head.get_prediction(roi_feat, rois, im_info, + im_shape) + return pred + + def build_multi_scale(self, feed_vars): + required_fields = ['image', 'im_info', 'im_shape'] + self._input_check(required_fields, feed_vars) + ims = [] + for k in feed_vars.keys(): + if 'image' in k: + ims.append(feed_vars[k]) + result = {} + result.update(feed_vars) + for i, im in enumerate(ims): + im_info = fluid.layers.slice( + input=feed_vars['im_info'], + axes=[1], + starts=[3 * i], + ends=[3 * i + 3]) + im_shape = feed_vars['im_shape'] + body_feats = self.backbone(im) + result.update(body_feats) + body_feat_names = list(body_feats.keys()) + + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + rois = self.rpn_head.get_proposals(body_feats, im_info, mode='test') + + if self.fpn is None: + # in models without FPN, roi extractor only uses the last level of + # feature maps. And body_feat_names[-1] represents the name of + # last feature map. 
+ body_feat = body_feats[body_feat_names[-1]] + roi_feat = self.roi_extractor(body_feat, rois) + else: + roi_feat = self.roi_extractor(body_feats, rois, spatial_scale) + + pred = self.bbox_head.get_prediction( + roi_feat, rois, im_info, im_shape, return_box_score=True) + bbox_name = 'bbox_' + str(i) + score_name = 'score_' + str(i) + if 'flip' in im.name: + bbox_name += '_flip' + score_name += '_flip' + result[bbox_name] = pred['bbox'] + result[score_name] = pred['score'] + return result + + def _input_check(self, require_fields, feed_vars): + for var in require_fields: + assert var in feed_vars, \ + "{} has no {} field".format(feed_vars, var) + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars, multi_scale=None): + if multi_scale: + return self.build_multi_scale(feed_vars) + return self.build(feed_vars, 'test') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') diff --git a/ppdet/modeling/architectures/mask_rcnn.py b/ppdet/modeling/architectures/mask_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..715f3efa90e52c3330f36e9ba787082722f2c8bb --- /dev/null +++ b/ppdet/modeling/architectures/mask_rcnn.py @@ -0,0 +1,285 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +import paddle.fluid as fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['MaskRCNN'] + + +@register +class MaskRCNN(object): + """ + Mask R-CNN architecture, see https://arxiv.org/abs/1703.06870 + Args: + backbone (object): backbone instance + rpn_head (object): `RPNhead` instance + bbox_assigner (object): `BBoxAssigner` instance + roi_extractor (object): ROI extractor instance + bbox_head (object): `BBoxHead` instance + mask_assigner (object): `MaskAssigner` instance + mask_head (object): `MaskHead` instance + fpn (object): feature pyramid network instance + """ + + __category__ = 'architecture' + __inject__ = [ + 'backbone', 'rpn_head', 'bbox_assigner', 'roi_extractor', 'bbox_head', + 'mask_assigner', 'mask_head', 'fpn' + ] + + def __init__(self, + backbone, + rpn_head, + bbox_head='BBoxHead', + bbox_assigner='BBoxAssigner', + roi_extractor='RoIAlign', + mask_assigner='MaskAssigner', + mask_head='MaskHead', + rpn_only=False, + fpn=None): + super(MaskRCNN, self).__init__() + self.backbone = backbone + self.rpn_head = rpn_head + self.bbox_assigner = bbox_assigner + self.roi_extractor = roi_extractor + self.bbox_head = bbox_head + self.mask_assigner = mask_assigner + self.mask_head = mask_head + self.rpn_only = rpn_only + self.fpn = fpn + + def build(self, feed_vars, mode='train'): + if mode == 'train': + required_fields = [ + 'gt_label', 'gt_box', 'gt_mask', 'is_crowd', 'im_info' + ] + else: + required_fields = ['im_shape', 'im_info'] + self._input_check(required_fields, feed_vars) + im = feed_vars['image'] + im_info = feed_vars['im_info'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + # backbone + body_feats = 
self.backbone(im) + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32')) + for k, v in body_feats.items()) + + # FPN + spatial_scale = None + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + # RPN proposals + rois = self.rpn_head.get_proposals(body_feats, im_info, mode=mode) + + if mode == 'train': + rpn_loss = self.rpn_head.get_loss(im_info, feed_vars['gt_box'], + feed_vars['is_crowd']) + + outs = self.bbox_assigner( + rpn_rois=rois, + gt_classes=feed_vars['gt_label'], + is_crowd=feed_vars['is_crowd'], + gt_boxes=feed_vars['gt_box'], + im_info=feed_vars['im_info']) + rois = outs[0] + labels_int32 = outs[1] + + if self.fpn is None: + last_feat = body_feats[list(body_feats.keys())[-1]] + roi_feat = self.roi_extractor(last_feat, rois) + else: + roi_feat = self.roi_extractor(body_feats, rois, spatial_scale) + + loss = self.bbox_head.get_loss(roi_feat, labels_int32, *outs[2:]) + loss.update(rpn_loss) + + mask_rois, roi_has_mask_int32, mask_int32 = self.mask_assigner( + rois=rois, + gt_classes=feed_vars['gt_label'], + is_crowd=feed_vars['is_crowd'], + gt_segms=feed_vars['gt_mask'], + im_info=feed_vars['im_info'], + labels_int32=labels_int32) + if self.fpn is None: + bbox_head_feat = self.bbox_head.get_head_feat() + feat = fluid.layers.gather(bbox_head_feat, roi_has_mask_int32) + else: + feat = self.roi_extractor( + body_feats, mask_rois, spatial_scale, is_mask=True) + + mask_loss = self.mask_head.get_loss(feat, mask_int32) + loss.update(mask_loss) + + total_loss = fluid.layers.sum(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + + else: + if self.rpn_only: + im_scale = fluid.layers.slice( + im_info, [1], starts=[2], ends=[3]) + im_scale = fluid.layers.sequence_expand(im_scale, rois) + rois = rois / im_scale + return {'proposal': rois} + mask_name = 'mask_pred' + mask_pred, bbox_pred = self.single_scale_eval( + body_feats, mask_name, 
rois, im_info, feed_vars['im_shape'], + spatial_scale) + return {'bbox': bbox_pred, 'mask': mask_pred} + + def build_multi_scale(self, feed_vars, mask_branch=False): + required_fields = ['image', 'im_info'] + self._input_check(required_fields, feed_vars) + + ims = [] + for k in feed_vars.keys(): + if 'image' in k: + ims.append(feed_vars[k]) + result = {} + + if not mask_branch: + assert 'im_shape' in feed_vars, \ + "{} has no im_shape field".format(feed_vars) + result.update(feed_vars) + + for i, im in enumerate(ims): + im_info = fluid.layers.slice( + input=feed_vars['im_info'], + axes=[1], + starts=[3 * i], + ends=[3 * i + 3]) + body_feats = self.backbone(im) + result.update(body_feats) + + # FPN + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + rois = self.rpn_head.get_proposals(body_feats, im_info, mode='test') + if not mask_branch: + im_shape = feed_vars['im_shape'] + body_feat_names = list(body_feats.keys()) + if self.fpn is None: + body_feat = body_feats[body_feat_names[-1]] + roi_feat = self.roi_extractor(body_feat, rois) + else: + roi_feat = self.roi_extractor(body_feats, rois, + spatial_scale) + pred = self.bbox_head.get_prediction( + roi_feat, rois, im_info, im_shape, return_box_score=True) + bbox_name = 'bbox_' + str(i) + score_name = 'score_' + str(i) + if 'flip' in im.name: + bbox_name += '_flip' + score_name += '_flip' + result[bbox_name] = pred['bbox'] + result[score_name] = pred['score'] + else: + mask_name = 'mask_pred_' + str(i) + bbox_pred = feed_vars['bbox'] + result.update({im.name: im}) + if 'flip' in im.name: + mask_name += '_flip' + bbox_pred = feed_vars['bbox_flip'] + mask_pred, bbox_pred = self.single_scale_eval( + body_feats, mask_name, rois, im_info, feed_vars['im_shape'], + spatial_scale, bbox_pred) + result[mask_name] = mask_pred + return result + + def single_scale_eval(self, + body_feats, + mask_name, + rois, + im_info, + im_shape, + spatial_scale, + bbox_pred=None): + if self.fpn is None: + 
last_feat = body_feats[list(body_feats.keys())[-1]] + roi_feat = self.roi_extractor(last_feat, rois) + else: + roi_feat = self.roi_extractor(body_feats, rois, spatial_scale) + if not bbox_pred: + bbox_pred = self.bbox_head.get_prediction(roi_feat, rois, im_info, + im_shape) + bbox_pred = bbox_pred['bbox'] + + # share weight + bbox_shape = fluid.layers.shape(bbox_pred) + bbox_size = fluid.layers.reduce_prod(bbox_shape) + bbox_size = fluid.layers.reshape(bbox_size, [1, 1]) + size = fluid.layers.fill_constant([1, 1], value=6, dtype='int32') + cond = fluid.layers.less_than(x=bbox_size, y=size) + + mask_pred = fluid.layers.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=False, + name=mask_name) + with fluid.layers.control_flow.Switch() as switch: + with switch.case(cond): + fluid.layers.assign(input=bbox_pred, output=mask_pred) + with switch.default(): + bbox = fluid.layers.slice(bbox_pred, [1], starts=[2], ends=[6]) + + im_scale = fluid.layers.slice( + im_info, [1], starts=[2], ends=[3]) + im_scale = fluid.layers.sequence_expand(im_scale, bbox) + + mask_rois = bbox * im_scale + if self.fpn is None: + mask_feat = self.roi_extractor(last_feat, mask_rois) + mask_feat = self.bbox_head.get_head_feat(mask_feat) + else: + mask_feat = self.roi_extractor( + body_feats, mask_rois, spatial_scale, is_mask=True) + + mask_out = self.mask_head.get_prediction(mask_feat, bbox) + fluid.layers.assign(input=mask_out, output=mask_pred) + return mask_pred, bbox_pred + + def _input_check(self, require_fields, feed_vars): + for var in require_fields: + assert var in feed_vars, \ + "{} has no {} field".format(feed_vars, var) + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars, multi_scale=None, mask_branch=False): + if multi_scale: + return self.build_multi_scale(feed_vars, mask_branch) + return self.build(feed_vars, 'test') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') diff --git 
a/ppdet/modeling/architectures/retinanet.py b/ppdet/modeling/architectures/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..4ce5ac500c65fa713b3735b480e8e40b9b123063 --- /dev/null +++ b/ppdet/modeling/architectures/retinanet.py @@ -0,0 +1,92 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +import paddle.fluid as fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['RetinaNet'] + + +@register +class RetinaNet(object): + """ + RetinaNet architecture, see https://arxiv.org/abs/1708.02002 + + Args: + backbone (object): backbone instance + fpn (object): feature pyramid network instance + retina_head (object): `RetinaHead` instance + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'fpn', 'retina_head'] + + def __init__(self, backbone, fpn, retina_head): + super(RetinaNet, self).__init__() + self.backbone = backbone + self.fpn = fpn + self.retina_head = retina_head + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + im_info = feed_vars['im_info'] + if mode == 'train': + gt_box = feed_vars['gt_box'] + gt_label = feed_vars['gt_label'] + is_crowd = feed_vars['is_crowd'] + + mixed_precision_enabled = 
mixed_precision_global_state() is not None + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + # backbone + body_feats = self.backbone(im) + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32')) + for k, v in body_feats.items()) + + # FPN + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + # retinanet head + if mode == 'train': + loss = self.retina_head.get_loss(body_feats, spatial_scale, im_info, + gt_box, gt_label, is_crowd) + total_loss = fluid.layers.sum(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + else: + pred = self.retina_head.get_prediction(body_feats, spatial_scale, + im_info) + return pred + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars): + return self.build(feed_vars, 'test') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') diff --git a/ppdet/modeling/architectures/ssd.py b/ppdet/modeling/architectures/ssd.py new file mode 100644 index 0000000000000000000000000000000000000000..e899075f21291a38a5af0e4c8d5e70af4e55eaec --- /dev/null +++ b/ppdet/modeling/architectures/ssd.py @@ -0,0 +1,105 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +import paddle.fluid as fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register +from ppdet.modeling.ops import SSDOutputDecoder + +__all__ = ['SSD'] + + +@register +class SSD(object): + """ + Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325 + + Args: + backbone (object): backbone instance + multi_box_head (object): `MultiBoxHead` instance + output_decoder (object): `SSDOutputDecoder` instance + num_classes (int): number of output classes + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'multi_box_head', 'output_decoder'] + __shared__ = ['num_classes'] + + def __init__(self, + backbone, + multi_box_head='MultiBoxHead', + output_decoder=SSDOutputDecoder().__dict__, + num_classes=21): + super(SSD, self).__init__() + self.backbone = backbone + self.multi_box_head = multi_box_head + self.num_classes = num_classes + self.output_decoder = output_decoder + if isinstance(output_decoder, dict): + self.output_decoder = SSDOutputDecoder(**output_decoder) + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + if mode == 'train' or mode == 'eval': + gt_box = feed_vars['gt_box'] + gt_label = feed_vars['gt_label'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + # backbone + body_feats = self.backbone(im) + + if isinstance(body_feats, OrderedDict): + body_feat_names = list(body_feats.keys()) + body_feats = [body_feats[name] for name in body_feat_names] + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = [fluid.layers.cast(v, 'float32') for v in body_feats] + + locs, confs, box, box_var = self.multi_box_head( + inputs=body_feats, image=im, num_classes=self.num_classes) + + 
if mode == 'train': + loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, + box_var) + loss = fluid.layers.reduce_sum(loss) + return {'loss': loss} + else: + pred = self.output_decoder(locs, confs, box, box_var) + return {'bbox': pred} + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars): + return self.build(feed_vars, 'eval') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') + + def is_bbox_normalized(self): + # SSD use output_decoder in output layers, bbox is normalized + # to range [0, 1], is_bbox_normalized is used in eval.py and infer.py + return True diff --git a/ppdet/modeling/architectures/yolov3.py b/ppdet/modeling/architectures/yolov3.py new file mode 100644 index 0000000000000000000000000000000000000000..2912ffda5215af594d57255397b8a572455aa090 --- /dev/null +++ b/ppdet/modeling/architectures/yolov3.py @@ -0,0 +1,86 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +from paddle import fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['YOLOv3'] + + +@register +class YOLOv3(object): + """ + YOLOv3 network, see https://arxiv.org/abs/1804.02767 + + Args: + backbone (object): an backbone instance + yolo_head (object): an `YOLOv3Head` instance + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'yolo_head'] + + def __init__(self, backbone, yolo_head='YOLOv3Head'): + super(YOLOv3, self).__init__() + self.backbone = backbone + self.yolo_head = yolo_head + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + body_feats = self.backbone(im) + + if isinstance(body_feats, OrderedDict): + body_feat_names = list(body_feats.keys()) + body_feats = [body_feats[name] for name in body_feat_names] + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = [fluid.layers.cast(v, 'float32') for v in body_feats] + + if mode == 'train': + gt_box = feed_vars['gt_box'] + gt_label = feed_vars['gt_label'] + gt_score = feed_vars['gt_score'] + + return { + 'loss': self.yolo_head.get_loss(body_feats, gt_box, gt_label, + gt_score) + } + else: + im_size = feed_vars['im_size'] + return self.yolo_head.get_prediction(body_feats, im_size) + + def train(self, feed_vars): + return self.build(feed_vars, mode='train') + + def eval(self, feed_vars): + return self.build(feed_vars, mode='test') + + def test(self, feed_vars): + return self.build(feed_vars, mode='test') diff --git a/ppdet/modeling/backbones/__init__.py b/ppdet/modeling/backbones/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..48081429fecd83ad086fb66fb7eac99e0cc96fb5 --- /dev/null +++ b/ppdet/modeling/backbones/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from . import resnet +from . import resnext +from . import darknet +from . import mobilenet +from . import senet +from . import fpn +from . import vgg +from . import blazenet +from . import faceboxnet + +from .resnet import * +from .resnext import * +from .darknet import * +from .mobilenet import * +from .senet import * +from .fpn import * +from .vgg import * +from .blazenet import * +from .faceboxnet import * diff --git a/ppdet/modeling/backbones/blazenet.py b/ppdet/modeling/backbones/blazenet.py new file mode 100644 index 0000000000000000000000000000000000000000..54c3f7e262464661f39fb73a9c5c70eabe4955c9 --- /dev/null +++ b/ppdet/modeling/backbones/blazenet.py @@ -0,0 +1,314 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr + +from ppdet.core.workspace import register + +__all__ = ['BlazeNet'] + + +@register +class BlazeNet(object): + """ + BlazeFace, see https://arxiv.org/abs/1907.05047 + + Args: + blaze_filters (list): number of filter for each blaze block + double_blaze_filters (list): number of filter for each double_blaze block + with_extra_blocks (bool): whether or not extra blocks should be added + lite_edition (bool): whether or not is blazeface-lite + """ + + def __init__( + self, + blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]], + double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], + [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]], + with_extra_blocks=True, + lite_edition=False): + super(BlazeNet, self).__init__() + + self.blaze_filters = blaze_filters + self.double_blaze_filters = double_blaze_filters + self.with_extra_blocks = with_extra_blocks + self.lite_edition = lite_edition + + def __call__(self, input): + if not self.lite_edition: + conv1_num_filters = self.blaze_filters[0][0] + conv = self._conv_norm( + input=input, + num_filters=conv1_num_filters, + filter_size=3, + stride=2, + padding=1, + act='relu', + name="conv1") + + for k, v in enumerate(self.blaze_filters): + assert len(v) in [2, 3], \ + "blaze_filters {} not in [2, 3]" + if len(v) == 2: + conv = self.BlazeBlock( + conv, v[0], v[1], name='blaze_{}'.format(k)) + elif len(v) == 3: + 
conv = self.BlazeBlock( + conv, + v[0], + v[1], + stride=v[2], + name='blaze_{}'.format(k)) + + layers = [] + for k, v in enumerate(self.double_blaze_filters): + assert len(v) in [3, 4], \ + "blaze_filters {} not in [3, 4]" + if len(v) == 3: + conv = self.BlazeBlock( + conv, + v[0], + v[1], + double_channels=v[2], + name='double_blaze_{}'.format(k)) + elif len(v) == 4: + layers.append(conv) + conv = self.BlazeBlock( + conv, + v[0], + v[1], + double_channels=v[2], + stride=v[3], + name='double_blaze_{}'.format(k)) + layers.append(conv) + + if not self.with_extra_blocks: + return layers[-1] + return layers[-2], layers[-1] + else: + conv1 = self._conv_norm( + input=input, + num_filters=24, + filter_size=5, + stride=2, + padding=2, + act='relu', + name="conv1") + conv2 = self.Blaze_lite(conv1, 24, 24, 1, 'conv2') + conv3 = self.Blaze_lite(conv2, 24, 28, 1, 'conv3') + conv4 = self.Blaze_lite(conv3, 28, 32, 2, 'conv4') + conv5 = self.Blaze_lite(conv4, 32, 36, 1, 'conv5') + conv6 = self.Blaze_lite(conv5, 36, 42, 1, 'conv6') + conv7 = self.Blaze_lite(conv6, 42, 48, 2, 'conv7') + in_ch = 48 + for i in range(5): + conv7 = self.Blaze_lite(conv7, in_ch, in_ch + 8, 1, + 'conv{}'.format(8 + i)) + in_ch += 8 + assert in_ch == 88 + conv13 = self.Blaze_lite(conv7, 88, 96, 2, 'conv13') + for i in range(4): + conv13 = self.Blaze_lite(conv13, 96, 96, 1, + 'conv{}'.format(14 + i)) + + return conv7, conv13 + + def BlazeBlock(self, + input, + in_channels, + out_channels, + double_channels=None, + stride=1, + use_5x5kernel=True, + name=None): + assert stride in [1, 2] + use_pool = not stride == 1 + use_double_block = double_channels is not None + act = 'relu' if use_double_block else None + + if use_5x5kernel: + conv_dw = self._conv_norm( + input=input, + filter_size=5, + num_filters=in_channels, + stride=stride, + padding=2, + num_groups=in_channels, + use_cudnn=False, + name=name + "1_dw") + else: + conv_dw_1 = self._conv_norm( + input=input, + filter_size=3, + num_filters=in_channels, 
+ stride=1, + padding=1, + num_groups=in_channels, + use_cudnn=False, + name=name + "1_dw_1") + conv_dw = self._conv_norm( + input=conv_dw_1, + filter_size=3, + num_filters=in_channels, + stride=stride, + padding=1, + num_groups=in_channels, + use_cudnn=False, + name=name + "1_dw_2") + + conv_pw = self._conv_norm( + input=conv_dw, + filter_size=1, + num_filters=out_channels, + stride=1, + padding=0, + act=act, + name=name + "1_sep") + + if use_double_block: + if use_5x5kernel: + conv_dw = self._conv_norm( + input=conv_pw, + filter_size=5, + num_filters=out_channels, + stride=1, + padding=2, + use_cudnn=False, + name=name + "2_dw") + else: + conv_dw_1 = self._conv_norm( + input=conv_pw, + filter_size=3, + num_filters=out_channels, + stride=1, + padding=1, + num_groups=out_channels, + use_cudnn=False, + name=name + "2_dw_1") + conv_dw = self._conv_norm( + input=conv_dw_1, + filter_size=3, + num_filters=out_channels, + stride=1, + padding=1, + num_groups=out_channels, + use_cudnn=False, + name=name + "2_dw_2") + + conv_pw = self._conv_norm( + input=conv_dw, + filter_size=1, + num_filters=double_channels, + stride=1, + padding=0, + name=name + "2_sep") + + # shortcut + if use_pool: + shortcut_channel = double_channels or out_channels + shortcut_pool = self._pooling_block(input, stride, stride) + channel_pad = self._conv_norm( + input=shortcut_pool, + filter_size=1, + num_filters=shortcut_channel, + stride=1, + padding=0, + name="shortcut" + name) + return fluid.layers.elementwise_add( + x=channel_pad, y=conv_pw, act='relu') + return fluid.layers.elementwise_add(x=input, y=conv_pw, act='relu') + + def Blaze_lite(self, input, in_channels, out_channels, stride=1, name=None): + assert stride in [1, 2] + use_pool = not stride == 1 + ues_pad = not in_channels == out_channels + conv_dw = self._conv_norm( + input=input, + filter_size=3, + num_filters=in_channels, + stride=stride, + padding=1, + num_groups=in_channels, + name=name + "_dw") + + conv_pw = self._conv_norm( + 
input=conv_dw, + filter_size=1, + num_filters=out_channels, + stride=1, + padding=0, + name=name + "_sep") + + if use_pool: + shortcut_pool = self._pooling_block(input, stride, stride) + if ues_pad: + conv_pad = shortcut_pool if use_pool else input + channel_pad = self._conv_norm( + input=conv_pad, + filter_size=1, + num_filters=out_channels, + stride=1, + padding=0, + name="shortcut" + name) + return fluid.layers.elementwise_add( + x=channel_pad, y=conv_pw, act='relu') + return fluid.layers.elementwise_add(x=input, y=conv_pw, act='relu') + + def _conv_norm( + self, + input, + filter_size, + num_filters, + stride, + padding, + num_groups=1, + act='relu', # None + use_cudnn=True, + name=None): + parameter_attr = ParamAttr( + learning_rate=0.1, + initializer=fluid.initializer.MSRA(), + name=name + "_weights") + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=parameter_attr, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def _pooling_block(self, + conv, + pool_size, + pool_stride, + pool_padding=0, + ceil_mode=True): + pool = fluid.layers.pool2d( + input=conv, + pool_size=pool_size, + pool_type='max', + pool_stride=pool_stride, + pool_padding=pool_padding, + ceil_mode=ceil_mode) + return pool diff --git a/ppdet/modeling/backbones/darknet.py b/ppdet/modeling/backbones/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..37583ab29d8cd0602b0f406b52c2ce8dae1ce3e7 --- /dev/null +++ b/ppdet/modeling/backbones/darknet.py @@ -0,0 +1,170 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six

from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay

from ppdet.core.workspace import register

__all__ = ['DarkNet']


@register
class DarkNet(object):
    """
    DarkNet backbone, see https://pjreddie.com/darknet/yolo/

    Args:
        depth (int): network depth, currently only darknet 53 is supported
        norm_type (str): normalization type, 'bn' and 'sync_bn' are supported
        norm_decay (float): weight decay for normalization layer weights
        weight_prefix_name (str): prefix put in front of every layer name
            built in `__call__`
    """
    __shared__ = ['norm_type', 'weight_prefix_name']

    def __init__(self,
                 depth=53,
                 norm_type='bn',
                 norm_decay=0.,
                 weight_prefix_name=''):
        assert depth in [53], "unsupported depth value"
        self.prefix_name = weight_prefix_name
        self.norm_decay = norm_decay
        self.norm_type = norm_type
        self.depth = depth
        # depth -> (number of residual blocks per stage, block builder)
        self.depth_cfg = {53: ([1, 2, 8, 8, 4], self.basicblock)}

    def _conv_norm(self,
                   input,
                   ch_out,
                   filter_size,
                   stride,
                   padding,
                   act='leaky',
                   name=None):
        """
        Bias-free convolution followed by batch norm and, when `act` is
        'leaky', a leaky ReLU.

        `name` is concatenated into every parameter name, so a string has
        to be passed even though the default is None.
        """
        conv_out = fluid.layers.conv2d(
            input=input,
            num_filters=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            act=None,
            param_attr=ParamAttr(name=name + ".conv.weights"),
            bias_attr=False)

        bn_prefix = name + ".bn"
        decay = float(self.norm_decay)
        scale_attr = ParamAttr(
            regularizer=L2Decay(decay), name=bn_prefix + '.scale')
        offset_attr = ParamAttr(
            regularizer=L2Decay(decay), name=bn_prefix + '.offset')

        normed = fluid.layers.batch_norm(
            input=conv_out,
            act=None,
            param_attr=scale_attr,
            bias_attr=offset_attr,
            moving_mean_name=bn_prefix + '.mean',
            moving_variance_name=bn_prefix + '.var')

        # The leaky relu needs `alpha` = 0.1, which can not be requested
        # through the `act` argument of batch_norm, hence the separate op.
        if act != 'leaky':
            return normed
        return fluid.layers.leaky_relu(x=normed, alpha=0.1)

    def _downsample(self,
                    input,
                    ch_out,
                    filter_size=3,
                    stride=2,
                    padding=1,
                    name=None):
        """Conv + norm layer whose defaults are a 3x3 kernel with stride 2."""
        return self._conv_norm(
            input,
            ch_out=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            name=name)

    def basicblock(self, input, ch_out, name=None):
        """
        Residual block: a 1x1 conv with `ch_out` filters, a 3x3 conv with
        `ch_out * 2` filters, then an element-wise add with `input`.
        """
        squeezed = self._conv_norm(
            input,
            ch_out=ch_out,
            filter_size=1,
            stride=1,
            padding=0,
            name=name + ".0")
        expanded = self._conv_norm(
            squeezed,
            ch_out=ch_out * 2,
            filter_size=3,
            stride=1,
            padding=1,
            name=name + ".1")
        return fluid.layers.elementwise_add(x=input, y=expanded, act=None)

    def layer_warp(self, block_func, input, ch_out, count, name=None):
        """
        Chain `block_func` blocks named '<name>.0', '<name>.1', ...

        The first block is always built; blocks 1 .. count-1 follow it.
        """
        later_names = [
            '{}.{}'.format(name, j) for j in six.moves.xrange(1, count)
        ]
        feat = block_func(input, ch_out=ch_out, name='{}.0'.format(name))
        for block_name in later_names:
            feat = block_func(feat, ch_out=ch_out, name=block_name)
        return feat

    def __call__(self, input):
        """
        Get the backbone of DarkNet, that is output for the 5 stages.

        Args:
            input (Variable): input variable.

        Returns:
            list: the last variable of each stage, in stage order.
        """
        stage_counts, block_func = self.depth_cfg[self.depth]
        stage_counts = stage_counts[0:5]
        prefix = self.prefix_name

        stem = self._conv_norm(
            input=input,
            ch_out=32,
            filter_size=3,
            stride=1,
            padding=1,
            name=prefix + "yolo_input")
        feat = self._downsample(
            input=stem,
            ch_out=stem.shape[1] * 2,
            name=prefix + "yolo_input.downsample")

        last_idx = len(stage_counts) - 1
        outputs = []
        for idx, count in enumerate(stage_counts):
            stage_out = self.layer_warp(
                block_func=block_func,
                input=feat,
                ch_out=32 * 2**idx,
                count=count,
                name=prefix + "stage.{}".format(idx))
            outputs.append(stage_out)
            if idx != last_idx:  # do not downsample after the last stage
                feat = self._downsample(
                    input=stage_out,
                    ch_out=stage_out.shape[1] * 2,
                    name=prefix + "stage.{}.downsample".format(idx))
        return outputs


# diff --git a/ppdet/modeling/backbones/faceboxnet.py b/ppdet/modeling/backbones/faceboxnet.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0b82c86b232a9784928167f1d226eb07562821c2
# --- /dev/null
# +++ b/ppdet/modeling/backbones/faceboxnet.py
# @@ -0,0 +1,364 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import fluid
from paddle.fluid.param_attr import ParamAttr

from ppdet.core.workspace import register

__all__ = ['FaceBoxNet']


@register
class FaceBoxNet(object):
    """
    FaceBoxes, see https://arxiv.org/abs/1708.05234

    Args:
        with_extra_blocks (bool): whether or not extra blocks should be added
        lite_edition (bool): whether or not is FaceBoxes-lite
    """

    def __init__(self, with_extra_blocks=True, lite_edition=False):
        super(FaceBoxNet, self).__init__()

        self.with_extra_blocks = with_extra_blocks
        self.lite_edition = lite_edition

    def __call__(self, input):
        """
        Build the backbone on `input`.

        Returns:
            With `with_extra_blocks` False, only the last feature map.
            Otherwise a tuple of feature maps: two for the lite edition,
            three for the original edition.
        """
        if self.lite_edition:
            return self._simplified_edition(input)
        return self._original_edition(input)

    def _simplified_edition(self, input):
        """Build the FaceBoxes-lite graph (selected by `lite_edition`)."""
        conv_1_1 = self._conv_norm_crelu(
            input=input,
            num_filters=8,
            filter_size=3,
            stride=2,
            padding=1,
            act='relu',
            name="conv_1_1")

        conv_1_2 = self._conv_norm_crelu(
            input=conv_1_1,
            num_filters=24,
            filter_size=3,
            stride=2,
            padding=1,
            act='relu',
            name="conv_1_2")

        pool1 = fluid.layers.pool2d(
            input=conv_1_2,
            pool_size=3,
            pool_padding=1,
            pool_type='avg',
            name="pool_1")

        conv_2_1 = self._conv_norm(
            input=pool1,
            num_filters=48,
            filter_size=3,
            stride=2,
            padding=1,
            act='relu',
            name="conv_2_1")

        conv_2_2 = self._conv_norm(
            input=conv_2_1,
            num_filters=64,
            filter_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_2_2")

        conv_inception = conv_2_2
        for i in range(3):
            conv_inception = self._inceptionA(conv_inception, i)

        layers = [conv_inception]

        conv_3_1 = self._conv_norm(
            input=conv_inception,
            num_filters=128,
            filter_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_3_1")

        conv_3_2 = self._conv_norm(
            input=conv_3_1,
            num_filters=256,
            filter_size=3,
            stride=2,
            padding=1,
            act='relu',
            name="conv_3_2")

        layers.append(conv_3_2)

        if not self.with_extra_blocks:
            return layers[-1]
        return layers[-2], layers[-1]

    def _original_edition(self, input):
        """Build the full FaceBoxes graph (selected when not lite)."""
        conv_1 = self._conv_norm_crelu(
            input=input,
            num_filters=24,
            filter_size=7,
            stride=4,
            padding=3,
            act='relu',
            name="conv_1")

        pool_1 = fluid.layers.pool2d(
            input=conv_1,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max',
            name="pool_1")

        conv_2 = self._conv_norm_crelu(
            input=pool_1,
            num_filters=64,
            filter_size=5,
            stride=2,
            padding=2,
            act='relu',
            name="conv_2")

        # Fixed: this pooling used to read `conv_1`, which left `conv_2`
        # unused and skipped the second conv/pool stage entirely.
        # NOTE(review): checkpoints trained with the old wiring will not
        # match the inception input channels any more.
        pool_2 = fluid.layers.pool2d(
            input=conv_2,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max',
            name="pool_2")

        conv_inception = pool_2
        for i in range(3):
            conv_inception = self._inceptionA(conv_inception, i)

        layers = [conv_inception]

        conv_3_1 = self._conv_norm(
            input=conv_inception,
            num_filters=128,
            filter_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_3_1")

        conv_3_2 = self._conv_norm(
            input=conv_3_1,
            num_filters=256,
            filter_size=3,
            stride=2,
            padding=1,
            act='relu',
            name="conv_3_2")

        layers.append(conv_3_2)

        conv_4_1 = self._conv_norm(
            input=conv_3_2,
            num_filters=128,
            filter_size=1,
            stride=1,
            padding=0,
            act='relu',
            name="conv_4_1")

        conv_4_2 = self._conv_norm(
            input=conv_4_1,
            num_filters=256,
            filter_size=3,
            stride=2,
            padding=1,
            act='relu',
            name="conv_4_2")

        layers.append(conv_4_2)

        if not self.with_extra_blocks:
            return layers[-1]

        return layers[-3], layers[-2], layers[-1]

    def _conv_norm(self,
                   input,
                   filter_size,
                   num_filters,
                   stride,
                   padding,
                   num_groups=1,
                   act='relu',
                   use_cudnn=True,
                   name=None):
        """
        Bias-free conv2d (MSRA init, learning rate 0.1) followed by
        batch norm with activation `act`.

        `name` is used to build the weight name '<name>_weights', so a
        string has to be passed even though the default is None.
        """
        parameter_attr = ParamAttr(
            learning_rate=0.1,
            initializer=fluid.initializer.MSRA(),
            name=name + "_weights")
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            act=None,
            use_cudnn=use_cudnn,
            param_attr=parameter_attr,
            bias_attr=False)
        return fluid.layers.batch_norm(input=conv, act=act)

    def _conv_norm_crelu(self,
                         input,
                         filter_size,
                         num_filters,
                         stride,
                         padding,
                         num_groups=1,
                         act='relu',
                         use_cudnn=True,
                         name=None):
        """
        `_conv_norm` whose output is concatenated with its own negation
        along axis 1, doubling the channel count.
        """
        conv_a = self._conv_norm(
            input=input,
            filter_size=filter_size,
            num_filters=num_filters,
            stride=stride,
            padding=padding,
            num_groups=num_groups,
            act=act,
            use_cudnn=use_cudnn,
            name=name)
        # NOTE(review): the negation is applied after the activation, so
        # with act='relu' the second half is never positive - confirm that
        # this is the intended CReLU variant.
        conv_b = fluid.layers.scale(conv_a, -1)

        return fluid.layers.concat([conv_a, conv_b], axis=1)

    def _pooling_block(self,
                       conv,
                       pool_size,
                       pool_stride,
                       pool_padding=0,
                       ceil_mode=True):
        """Max pooling over `conv` with the given window, stride, padding."""
        pool = fluid.layers.pool2d(
            input=conv,
            pool_size=pool_size,
            pool_type='max',
            pool_stride=pool_stride,
            pool_padding=pool_padding,
            ceil_mode=ceil_mode)
        return pool

    def _inceptionA(self, data, idx):
        """
        Inception block number `idx`: four branches of 32 filters each
        (avg-pool + 1x1, 1x1, 1x1 -> 3x3, 1x1 -> 3x3 -> 3x3), concatenated
        along axis 1.
        """
        idx = str(idx)
        prefix = 'inceptionA_' + idx

        pool1 = fluid.layers.pool2d(
            input=data,
            pool_size=3,
            pool_padding=1,
            pool_type='avg',
            name=prefix + '_pool1')
        conv1 = self._conv_norm(
            input=pool1,
            filter_size=1,
            num_filters=32,
            stride=1,
            padding=0,
            act='relu',
            name=prefix + '_conv1')

        conv2 = self._conv_norm(
            input=data,
            filter_size=1,
            num_filters=32,
            stride=1,
            padding=0,
            act='relu',
            name=prefix + '_conv2')

        conv3 = self._conv_norm(
            input=data,
            filter_size=1,
            num_filters=24,
            stride=1,
            padding=0,
            act='relu',
            name=prefix + '_conv3_1')
        conv3 = self._conv_norm(
            input=conv3,
            filter_size=3,
            num_filters=32,
            stride=1,
            padding=1,
            act='relu',
            name=prefix + '_conv3_2')

        conv4 = self._conv_norm(
            input=data,
            filter_size=1,
            num_filters=24,
            stride=1,
            padding=0,
            act='relu',
            name=prefix + '_conv4_1')
        conv4 = self._conv_norm(
            input=conv4,
            filter_size=3,
            num_filters=32,
            stride=1,
            padding=1,
            act='relu',
            name=prefix + '_conv4_2')
        conv4 = self._conv_norm(
            input=conv4,
            filter_size=3,
            num_filters=32,
            stride=1,
            padding=1,
            act='relu',
            name=prefix + '_conv4_3')

        concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1)

        return concat


# diff --git a/ppdet/modeling/backbones/fpn.py b/ppdet/modeling/backbones/fpn.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..9bd491a662dd640c62668c5878d47ea48c21d223
# --- /dev/null
# +++ b/ppdet/modeling/backbones/fpn.py
# @@ -0,0 +1,216 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import OrderedDict
import copy
from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Xavier
from paddle.fluid.regularizer import L2Decay

from ppdet.core.workspace import register
from ppdet.modeling.ops import ConvNorm

__all__ = ['FPN']


@register
class FPN(object):
    """
    Feature Pyramid Network, see https://arxiv.org/abs/1612.03144

    Args:
        num_chan (int): number of feature channels
        min_level (int): lowest level of the backbone feature map to use
        max_level (int): highest level of the backbone feature map to use
        spatial_scale (list): feature map scaling factor
        has_extra_convs (bool): whether has extra convolutions in higher levels
        norm_type (str|None): normalization type, 'bn'/'sync_bn'/'affine_channel'
        freeze_norm (bool): forwarded to ConvNorm when `norm_type` is set
    """
    __shared__ = ['norm_type', 'freeze_norm']

    def __init__(self,
                 num_chan=256,
                 min_level=2,
                 max_level=6,
                 spatial_scale=[1. / 32., 1. / 16., 1. / 8., 1. / 4.],
                 has_extra_convs=False,
                 norm_type=None,
                 freeze_norm=False):
        # `spatial_scale` is only read here; get_output works on a deep
        # copy, so the shared default list is never mutated.
        self.freeze_norm = freeze_norm
        self.num_chan = num_chan
        self.min_level = min_level
        self.max_level = max_level
        self.spatial_scale = spatial_scale
        self.has_extra_convs = has_extra_convs
        self.norm_type = norm_type

    def _fpn_conv(self, input, filter_size, fan, name, padding=0):
        """
        Conv with `num_chan` output channels and Xavier(fan_out=fan) init.

        Uses ConvNorm when `norm_type` is set (`padding` is not forwarded
        in that case), otherwise a plain conv2d with a bias whose learning
        rate is 2 and whose weight decay is 0.
        """
        if self.norm_type:
            return ConvNorm(
                input,
                self.num_chan,
                filter_size,
                initializer=Xavier(fan_out=fan),
                norm_type=self.norm_type,
                freeze_norm=self.freeze_norm,
                name=name,
                norm_name=name)
        return fluid.layers.conv2d(
            input,
            self.num_chan,
            filter_size,
            padding=padding,
            param_attr=ParamAttr(
                name=name + "_w", initializer=Xavier(fan_out=fan)),
            bias_attr=ParamAttr(
                name=name + "_b", learning_rate=2., regularizer=L2Decay(0.)),
            name=name)

    def _add_topdown_lateral(self, body_name, body_input, upper_output):
        """
        Merge one backbone level into the pyramid: 1x1 lateral conv on
        `body_input` plus `upper_output` upsampled 2x (nearest).
        """
        lateral_name = 'fpn_inner_' + body_name + '_lateral'
        topdown_name = 'fpn_topdown_' + body_name
        fan = body_input.shape[1]
        lateral = self._fpn_conv(body_input, 1, fan, lateral_name)
        topdown = fluid.layers.resize_nearest(
            upper_output, scale=2., name=topdown_name)

        return lateral + topdown

    def get_output(self, body_dict):
        """
        Add FPN onto backbone.

        Args:
            body_dict(OrderedDict): Dictionary of variables and each element is the
                output of backbone.

        Return:
            fpn_dict(OrderedDict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor.
        """
        spatial_scale = copy.deepcopy(self.spatial_scale)
        # Reverse the key order so that index 0 is the last backbone output
        body_name_list = list(body_dict.keys())[::-1]
        num_backbone_stages = len(body_name_list)
        self.fpn_inner_output = [[] for _ in range(num_backbone_stages)]

        # Top level: 1x1 conv only, there is nothing above to merge with
        fpn_inner_name = 'fpn_inner_' + body_name_list[0]
        body_input = body_dict[body_name_list[0]]
        fan = body_input.shape[1]
        self.fpn_inner_output[0] = self._fpn_conv(body_input, 1, fan,
                                                  fpn_inner_name)

        # Remaining levels: lateral conv + upsampled previous level
        for i in range(1, num_backbone_stages):
            body_name = body_name_list[i]
            body_input = body_dict[body_name]
            top_output = self.fpn_inner_output[i - 1]
            self.fpn_inner_output[i] = self._add_topdown_lateral(
                body_name, body_input, top_output)

        # 3x3 output conv on every merged level
        fpn_dict = {}
        fpn_name_list = []
        for i in range(num_backbone_stages):
            fpn_name = 'fpn_' + body_name_list[i]
            fan = self.fpn_inner_output[i].shape[1] * 3 * 3
            fpn_dict[fpn_name] = self._fpn_conv(
                self.fpn_inner_output[i], 3, fan, fpn_name, padding=1)
            fpn_name_list.append(fpn_name)

        if not self.has_extra_convs and self.max_level - self.min_level == len(
                spatial_scale):
            # One extra level obtained by subsampling the top output by 2
            body_top_name = fpn_name_list[0]
            body_top_extension = fluid.layers.pool2d(
                fpn_dict[body_top_name],
                1,
                'max',
                pool_stride=2,
                name=body_top_name + '_subsampled_2x')
            fpn_dict[body_top_name + '_subsampled_2x'] = body_top_extension
            fpn_name_list.insert(0, body_top_name + '_subsampled_2x')
            spatial_scale.insert(0, spatial_scale[0] * 0.5)

        # Coarser FPN levels introduced for RetinaNet
        highest_backbone_level = self.min_level + len(spatial_scale) - 1
        if self.has_extra_convs and self.max_level > highest_backbone_level:
            fpn_blob = body_dict[body_name_list[0]]
            for i in range(highest_backbone_level + 1, self.max_level + 1):
                fpn_blob_in = fpn_blob
                fpn_name = 'fpn_' + str(i)
                # Every extra level after the first gets a relu on its input
                if i > highest_backbone_level + 1:
                    fpn_blob_in = fluid.layers.relu(fpn_blob)
                fan = fpn_blob_in.shape[1] * 3 * 3
                fpn_blob = fluid.layers.conv2d(
                    input=fpn_blob_in,
                    num_filters=self.num_chan,
                    filter_size=3,
                    stride=2,
                    padding=1,
                    param_attr=ParamAttr(
                        name=fpn_name + "_w", initializer=Xavier(fan_out=fan)),
                    bias_attr=ParamAttr(
                        name=fpn_name + "_b",
                        learning_rate=2.,
                        regularizer=L2Decay(0.)),
                    name=fpn_name)
                fpn_dict[fpn_name] = fpn_blob
                fpn_name_list.insert(0, fpn_name)
                spatial_scale.insert(0, spatial_scale[0] * 0.5)

        res_dict = OrderedDict([(k, fpn_dict[k]) for k in fpn_name_list])
        return res_dict, spatial_scale


# diff --git a/ppdet/modeling/backbones/mobilenet.py b/ppdet/modeling/backbones/mobilenet.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..56afdf96454a994591a3d97e71b8d9619e0e68c8
# --- /dev/null
# +++ b/ppdet/modeling/backbones/mobilenet.py
# @@ -0,0 +1,210 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from paddle import fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay

from ppdet.core.workspace import register

__all__ = ['MobileNet']


@register
class MobileNet(object):
    """
    MobileNet v1, see https://arxiv.org/abs/1704.04861

    Args:
        norm_type (str): normalization type, 'bn' and 'sync_bn' are supported
        norm_decay (float): weight decay for normalization layer weights
        conv_group_scale (int): scaling factor for convolution groups
        conv_learning_rate (float): learning rate set on every conv weight
        with_extra_blocks (bool): if extra blocks should be added
        extra_block_filters (list): number of filter for each extra block
        weight_prefix_name (str): prefix put in front of every layer name
            built in `__call__`
    """
    __shared__ = ['norm_type', 'weight_prefix_name']

    def __init__(self,
                 norm_type='bn',
                 norm_decay=0.,
                 conv_group_scale=1,
                 conv_learning_rate=1.0,
                 with_extra_blocks=False,
                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
                                      [64, 128]],
                 weight_prefix_name=''):
        self.norm_type = norm_type
        self.norm_decay = norm_decay
        self.conv_group_scale = conv_group_scale
        self.conv_learning_rate = conv_learning_rate
        self.with_extra_blocks = with_extra_blocks
        # Only read, never mutated, so the shared default list is safe
        self.extra_block_filters = extra_block_filters
        self.prefix_name = weight_prefix_name

    def _conv_norm(self,
                   input,
                   filter_size,
                   num_filters,
                   stride,
                   padding,
                   num_groups=1,
                   act='relu',
                   use_cudnn=True,
                   name=None):
        """
        Bias-free conv2d (MSRA init) followed by batch norm with
        activation `act`.

        `name` is concatenated into every parameter name, so a string has
        to be passed even though the default is None.
        """
        parameter_attr = ParamAttr(
            learning_rate=self.conv_learning_rate,
            initializer=fluid.initializer.MSRA(),
            name=name + "_weights")
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            act=None,
            use_cudnn=use_cudnn,
            param_attr=parameter_attr,
            bias_attr=False)

        bn_name = name + "_bn"
        norm_decay = self.norm_decay
        bn_param_attr = ParamAttr(
            regularizer=L2Decay(norm_decay), name=bn_name + '_scale')
        bn_bias_attr = ParamAttr(
            regularizer=L2Decay(norm_decay), name=bn_name + '_offset')
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=bn_param_attr,
            bias_attr=bn_bias_attr,
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def depthwise_separable(self,
                            input,
                            num_filters1,
                            num_filters2,
                            num_groups,
                            stride,
                            scale,
                            name=None):
        """
        Grouped 3x3 conv ('<name>_dw', carries `stride`) followed by a
        1x1 conv ('<name>_sep'). Filter and group counts are multiplied
        by `scale` and truncated to int.
        """
        depthwise_conv = self._conv_norm(
            input=input,
            filter_size=3,
            num_filters=int(num_filters1 * scale),
            stride=stride,
            padding=1,
            num_groups=int(num_groups * scale),
            use_cudnn=False,
            name=name + "_dw")

        pointwise_conv = self._conv_norm(
            input=depthwise_conv,
            filter_size=1,
            num_filters=int(num_filters2 * scale),
            stride=1,
            padding=0,
            name=name + "_sep")
        return pointwise_conv

    def _extra_block(self,
                     input,
                     num_filters1,
                     num_filters2,
                     num_groups,
                     stride,
                     name=None):
        """
        Extra block: 1x1 conv ('<name>_extra1') followed by a 3x3 conv
        ('<name>_extra2') that carries `stride`.
        """
        pointwise_conv = self._conv_norm(
            input=input,
            filter_size=1,
            num_filters=int(num_filters1),
            stride=1,
            num_groups=int(num_groups),
            padding=0,
            name=name + "_extra1")
        # Fixed: `stride` used to be ignored in favour of a hard-coded 2.
        # Every caller in this class passes 2, so their graphs are unchanged.
        normal_conv = self._conv_norm(
            input=pointwise_conv,
            filter_size=3,
            num_filters=int(num_filters2),
            stride=stride,
            num_groups=int(num_groups),
            padding=1,
            name=name + "_extra2")
        return normal_conv

    def __call__(self, input):
        """
        Build the backbone on `input`.

        Returns:
            Without extra blocks, a list of three feature maps (the ones
            marked 1/8, 1/16 and 1/32 below). With extra blocks, a tuple
            of six: the output of conv5_5, the output of conv6 and the
            four extra block outputs.
        """
        scale = self.conv_group_scale
        prefix = self.prefix_name

        blocks = []
        # input 1/1
        out = self._conv_norm(
            input, 3, int(32 * scale), 2, 1, name=prefix + "conv1")
        # 1/2
        out = self.depthwise_separable(
            out, 32, 64, 32, 1, scale, name=prefix + "conv2_1")
        out = self.depthwise_separable(
            out, 64, 128, 64, 2, scale, name=prefix + "conv2_2")
        # 1/4
        out = self.depthwise_separable(
            out, 128, 128, 128, 1, scale, name=prefix + "conv3_1")
        out = self.depthwise_separable(
            out, 128, 256, 128, 2, scale, name=prefix + "conv3_2")
        # 1/8
        blocks.append(out)
        out = self.depthwise_separable(
            out, 256, 256, 256, 1, scale, name=prefix + "conv4_1")
        out = self.depthwise_separable(
            out, 256, 512, 256, 2, scale, name=prefix + "conv4_2")
        # 1/16
        blocks.append(out)
        for i in range(5):
            out = self.depthwise_separable(
                out,
                512,
                512,
                512,
                1,
                scale,
                name=prefix + "conv5_" + str(i + 1))
        module11 = out

        out = self.depthwise_separable(
            out, 512, 1024, 512, 2, scale, name=prefix + "conv5_6")
        # 1/32
        out = self.depthwise_separable(
            out, 1024, 1024, 1024, 1, scale, name=prefix + "conv6")
        module13 = out
        blocks.append(out)
        if not self.with_extra_blocks:
            return blocks

        # Exactly four extra blocks are chained, using the first four
        # entries of `extra_block_filters`.
        num_filters = self.extra_block_filters
        extras = []
        feat = module13
        for idx in range(4):
            feat = self._extra_block(feat, num_filters[idx][0],
                                     num_filters[idx][1], 1, 2,
                                     prefix + "conv7_" + str(idx + 1))
            extras.append(feat)
        return (module11, module13) + tuple(extras)


# diff --git a/ppdet/modeling/backbones/name_adapter.py b/ppdet/modeling/backbones/name_adapter.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..2cb16d0c9cc82d9c07988ae47c6347973add0dc8
# --- /dev/null
# +++ b/ppdet/modeling/backbones/name_adapter.py
# @@ -0,0 +1,73 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class NameAdapter(object): + """Fix the backbones variable names for pretrained weight""" + + def __init__(self, model): + super(NameAdapter, self).__init__() + self.model = model + + @property + def model_type(self): + return getattr(self.model, '_model_type', '') + + @property + def variant(self): + return getattr(self.model, 'variant', '') + + def fix_conv_norm_name(self, name): + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + # the naming rule is same as pretrained weight + if self.model_type == 'SEResNeXt': + bn_name = name + "_bn" + return bn_name + + def fix_shortcut_name(self, name): + if self.model_type == 'SEResNeXt': + name = 'conv' + name + '_prj' + return name + + def fix_bottleneck_name(self, name): + if self.model_type == 'SEResNeXt': + conv_name1 = 'conv' + name + '_x1' + conv_name2 = 'conv' + name + '_x2' + conv_name3 = 'conv' + name + '_x3' + shortcut_name = name + else: + conv_name1 = name + "_branch2a" + conv_name2 = name + "_branch2b" + conv_name3 = name + "_branch2c" + shortcut_name = name + "_branch1" + return conv_name1, conv_name2, conv_name3, shortcut_name + + def fix_layer_warp_name(self, stage_num, count, i): + name = 'res' + str(stage_num) + if count > 10 and stage_num == 4: + if i == 0: + conv_name = name + "a" + else: + conv_name = name + "b" + str(i) + else: + conv_name = name + chr(ord("a") + i) + if self.model_type == 'SEResNeXt': + conv_name = 
str(stage_num + 2) + '_' + str(i + 1) + return conv_name + + def fix_c1_stage_name(self): + return "res_conv1" if self.model_type == 'ResNeXt' else "conv1" diff --git a/ppdet/modeling/backbones/resnet.py b/ppdet/modeling/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..496f5e76cbec09639c63b3b5d6499ee0d6f32598 --- /dev/null +++ b/ppdet/modeling/backbones/resnet.py @@ -0,0 +1,431 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.framework import Variable +from paddle.fluid.regularizer import L2Decay +from paddle.fluid.initializer import Constant + +from ppdet.core.workspace import register, serializable +from numbers import Integral + +from .name_adapter import NameAdapter + +__all__ = ['ResNet', 'ResNetC5'] + + +@register +@serializable +class ResNet(object): + """ + Residual Network, see https://arxiv.org/abs/1512.03385 + Args: + depth (int): ResNet depth, should be 18, 34, 50, 101, 152. 
+ freeze_at (int): freeze the backbone at which stage + norm_type (str): normalization type, 'bn'/'sync_bn'/'affine_channel' + freeze_norm (bool): freeze normalization layers + norm_decay (float): weight decay for normalization layer weights + variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently + feature_maps (list): index of stages whose feature maps are returned + dcn_v2_stages (list): index of stages who select deformable conv v2 + """ + __shared__ = ['norm_type', 'freeze_norm', 'weight_prefix_name'] + + def __init__(self, + depth=50, + freeze_at=2, + norm_type='affine_channel', + freeze_norm=True, + norm_decay=0., + variant='b', + feature_maps=[2, 3, 4, 5], + dcn_v2_stages=[], + weight_prefix_name=''): + super(ResNet, self).__init__() + + if isinstance(feature_maps, Integral): + feature_maps = [feature_maps] + + assert depth in [18, 34, 50, 101, 152], \ + "depth {} not in [18, 34, 50, 101, 152]" + assert variant in ['a', 'b', 'c', 'd'], "invalid ResNet variant" + assert 0 <= freeze_at <= 4, "freeze_at should be 0, 1, 2, 3 or 4" + assert len(feature_maps) > 0, "need one or more feature maps" + assert norm_type in ['bn', 'sync_bn', 'affine_channel'] + + self.depth = depth + self.freeze_at = freeze_at + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + self.variant = variant + self._model_type = 'ResNet' + self.feature_maps = feature_maps + self.dcn_v2_stages = dcn_v2_stages + self.depth_cfg = { + 18: ([2, 2, 2, 2], self.basicblock), + 34: ([3, 4, 6, 3], self.basicblock), + 50: ([3, 4, 6, 3], self.bottleneck), + 101: ([3, 4, 23, 3], self.bottleneck), + 152: ([3, 8, 36, 3], self.bottleneck) + } + self.stage_filters = [64, 128, 256, 512] + self._c1_out_chan_num = 64 + self.na = NameAdapter(self) + self.prefix_name = weight_prefix_name + + def _conv_offset(self, + input, + filter_size, + stride, + padding, + act=None, + name=None): + out_channel = filter_size * filter_size * 3 + out = fluid.layers.conv2d( + 
input, + num_filters=out_channel, + filter_size=filter_size, + stride=stride, + padding=padding, + param_attr=ParamAttr( + initializer=Constant(0.0), name=name + ".w_0"), + bias_attr=ParamAttr( + initializer=Constant(0.0), name=name + ".b_0"), + act=act, + name=name) + return out + + def _conv_norm(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + dcn_v2=False): + _name = self.prefix_name + name if self.prefix_name != '' else name + if not dcn_v2: + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + param_attr=ParamAttr(name=_name + "_weights"), + bias_attr=False, + name=_name + '.conv2d.output.1') + else: + # select deformable conv" + offset_mask = self._conv_offset( + input=input, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + act=None, + name=_name + "_conv_offset") + offset_channel = filter_size**2 * 2 + mask_channel = filter_size**2 + offset, mask = fluid.layers.split( + input=offset_mask, + num_or_sections=[offset_channel, mask_channel], + dim=1) + mask = fluid.layers.sigmoid(mask) + conv = fluid.layers.deformable_conv( + input=input, + offset=offset, + mask=mask, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + deformable_groups=1, + im2col_step=1, + param_attr=ParamAttr(name=_name + "_weights"), + bias_attr=False, + name=_name + ".conv2d.output.1") + + bn_name = self.na.fix_conv_norm_name(name) + bn_name = self.prefix_name + bn_name if self.prefix_name != '' else bn_name + + norm_lr = 0. if self.freeze_norm else 1. 
+ norm_decay = self.norm_decay + pattr = ParamAttr( + name=bn_name + '_scale', + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay)) + battr = ParamAttr( + name=bn_name + '_offset', + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay)) + + if self.norm_type in ['bn', 'sync_bn']: + global_stats = True if self.freeze_norm else False + out = fluid.layers.batch_norm( + input=conv, + act=act, + name=bn_name + '.output.1', + param_attr=pattr, + bias_attr=battr, + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + use_global_stats=global_stats) + scale = fluid.framework._get_var(pattr.name) + bias = fluid.framework._get_var(battr.name) + elif self.norm_type == 'affine_channel': + scale = fluid.layers.create_parameter( + shape=[conv.shape[1]], + dtype=conv.dtype, + attr=pattr, + default_initializer=fluid.initializer.Constant(1.)) + bias = fluid.layers.create_parameter( + shape=[conv.shape[1]], + dtype=conv.dtype, + attr=battr, + default_initializer=fluid.initializer.Constant(0.)) + out = fluid.layers.affine_channel( + x=conv, scale=scale, bias=bias, act=act) + if self.freeze_norm: + scale.stop_gradient = True + bias.stop_gradient = True + return out + + def _shortcut(self, input, ch_out, stride, is_first, name): + max_pooling_in_short_cut = self.variant == 'd' + ch_in = input.shape[1] + # the naming rule is same as pretrained weight + name = self.na.fix_shortcut_name(name) + std_senet = getattr(self, 'std_senet', False) + if ch_in != ch_out or stride != 1 or (self.depth < 50 and is_first): + if std_senet: + if is_first: + return self._conv_norm(input, ch_out, 1, stride, name=name) + else: + return self._conv_norm(input, ch_out, 3, stride, name=name) + if max_pooling_in_short_cut and not is_first: + input = fluid.layers.pool2d( + input=input, + pool_size=2, + pool_stride=2, + pool_padding=0, + ceil_mode=True, + pool_type='avg') + return self._conv_norm(input, ch_out, 1, 1, name=name) + return self._conv_norm(input, ch_out, 1, 
def bottleneck(self,
               input,
               num_filters,
               stride,
               is_first,
               name,
               dcn_v2=False):
    """
    Build one bottleneck residual block (1x1 -> 3x3 -> 1x1 plus shortcut).

    Args:
        input (Variable): input feature map.
        num_filters (int): base channel width of the block.
        stride (int): stride of the block.
        is_first (bool): forwarded to ``_shortcut``.
        name (str): base name passed to the name adapter.
        dcn_v2 (bool): use deformable conv v2 for the 3x3 convolution.

    Returns:
        Variable: ReLU of the sum of the shortcut and residual branches.
    """
    # variant 'a' puts the stride on the first 1x1 conv, the other
    # variants put it on the 3x3 conv
    stride1, stride2 = (stride, 1) if self.variant == 'a' else (1, stride)

    # ResNeXt
    groups = getattr(self, 'groups', 1)
    group_width = getattr(self, 'group_width', -1)
    if groups == 1:
        expand = 4
    elif groups * group_width == 256:
        expand = 1
    else:
        # FIXME hard code for now, handles 32x4d, 64x4d and 32x8d
        num_filters = num_filters // 2
        expand = 2

    conv_name1, conv_name2, conv_name3, shortcut_name = \
        self.na.fix_bottleneck_name(name)

    # with std_senet the first 1x1 conv uses half the width
    if getattr(self, 'std_senet', False):
        first_width = int(num_filters / 2)
    else:
        first_width = num_filters

    # (channels, kernel, stride, activation, groups, name)
    conv_def = (
        (first_width, 1, stride1, 'relu', 1, conv_name1),
        (num_filters, 3, stride2, 'relu', groups, conv_name2),
        (num_filters * expand, 1, 1, None, 1, conv_name3),
    )

    residual = input
    for idx, spec in enumerate(conv_def):
        width, ksize, conv_stride, conv_act, conv_groups, conv_name = spec
        residual = self._conv_norm(
            input=residual,
            num_filters=width,
            filter_size=ksize,
            stride=conv_stride,
            act=conv_act,
            groups=conv_groups,
            name=conv_name,
            # only the middle (3x3) conv may be deformable
            dcn_v2=(idx == 1 and dcn_v2))

    short = self._shortcut(
        input,
        num_filters * expand,
        stride,
        is_first=is_first,
        name=shortcut_name)

    # Squeeze-and-Excitation, only when the class provides the method
    if callable(getattr(self, '_squeeze_excitation', None)):
        residual = self._squeeze_excitation(
            input=residual, num_channels=num_filters, name='fc' + name)

    return fluid.layers.elementwise_add(
        x=short, y=residual, act='relu', name=name + ".add.output.5")
def layer_warp(self, input, stage_num):
    """
    Stack all residual blocks of one stage.

    Args:
        input (Variable): input variable.
        stage_num (int): the stage number, should be 2, 3, 4, 5

    Returns:
        The output variable of the last block in the stage.
    """
    assert stage_num in [2, 3, 4, 5]

    block_counts, block_func = self.depth_cfg[self.depth]
    stage_idx = stage_num - 2
    num_blocks = block_counts[stage_idx]
    out_channels = self.stage_filters[stage_idx]

    in_first_stage = stage_num == 2
    use_dcn = stage_num in self.dcn_v2_stages
    shallow = self.depth < 50

    # Make the layer name and parameter name consistent
    # with ImageNet pre-trained model
    feat = input
    for block_idx in range(num_blocks):
        block_name = self.na.fix_layer_warp_name(stage_num, num_blocks,
                                                 block_idx)
        leading = block_idx == 0
        if shallow:
            # for depth < 50 only the very first block of stage 2 is "first"
            first_flag = leading and in_first_stage
        else:
            first_flag = in_first_stage
        feat = block_func(
            input=feat,
            num_filters=out_channels,
            # stages 3-5 downsample in their leading block
            stride=2 if leading and not in_first_stage else 1,
            is_first=first_flag,
            name=block_name,
            dcn_v2=use_dcn)
    return feat
@register
@serializable
class ResNetC5(ResNet):
    # reuse the parent documentation; the arguments are the same
    __doc__ = ResNet.__doc__

    def __init__(self,
                 depth=50,
                 freeze_at=2,
                 norm_type='affine_channel',
                 freeze_norm=True,
                 norm_decay=0.,
                 variant='b',
                 feature_maps=[5],
                 weight_prefix_name=''):
        """
        Build a ResNet whose `severed_head` flag is set.

        The arguments are forwarded positionally to ``ResNet.__init__``;
        ``weight_prefix_name`` is the prefix prepended to parameter names.
        """
        super(ResNetC5, self).__init__(depth, freeze_at, norm_type, freeze_norm,
                                       norm_decay, variant, feature_maps)
        # `weight_prefix_name` used to be accepted and silently dropped.
        # `_conv_norm` reads the prefix from `self.prefix_name`, so store it
        # there; the default '' leaves parameter names unchanged.
        self.prefix_name = weight_prefix_name
        # with this flag `__call__` skips `c1_stage` and builds only the
        # stages listed in `feature_maps`
        self.severed_head = True
@register
@serializable
class ResNeXt(ResNet):
    """
    ResNeXt, see https://arxiv.org/abs/1611.05431
    Args:
        depth (int): network depth, should be 50, 101, 152.
        groups (int): group convolution cardinality
        group_width (int): width of each group convolution
        freeze_at (int): freeze the backbone at which stage
        norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
        freeze_norm (bool): freeze normalization layers
        norm_decay (float): weight decay for normalization layer weights
        variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
        feature_maps (list): index of the stages whose feature maps are returned
        dcn_v2_stages (list): index of stages who select deformable conv v2
        weight_prefix_name (str): prefix prepended to parameter names
    """

    # NOTE(review): `norm_decay` defaults to True although it is documented
    # as a float and ends up as the L2Decay coefficient (True acts as 1.0);
    # the sibling backbones default to 0. Left unchanged here to keep the
    # public default stable -- confirm the intended value.
    def __init__(self,
                 depth=50,
                 groups=64,
                 group_width=4,
                 freeze_at=2,
                 norm_type='affine_channel',
                 freeze_norm=True,
                 norm_decay=True,
                 variant='a',
                 feature_maps=[2, 3, 4, 5],
                 dcn_v2_stages=[],
                 weight_prefix_name=''):
        # the message previously lacked `.format(depth)` and printed a
        # literal "{}" instead of the offending value
        assert depth in [50, 101, 152], \
            "depth {} should be 50, 101 or 152".format(depth)
        super(ResNeXt, self).__init__(depth, freeze_at, norm_type, freeze_norm,
                                      norm_decay, variant, feature_maps)
        # every supported depth is built from bottleneck blocks;
        # the lists give the number of blocks in stages 2..5
        self.depth_cfg = {
            50: ([3, 4, 6, 3], self.bottleneck),
            101: ([3, 4, 23, 3], self.bottleneck),
            152: ([3, 8, 36, 3], self.bottleneck)
        }
        self.stage_filters = [256, 512, 1024, 2048]
        self.groups = groups
        self.group_width = group_width
        self._model_type = 'ResNeXt'
        self.dcn_v2_stages = dcn_v2_stages
        # `weight_prefix_name` used to be accepted and silently dropped.
        # `_conv_norm` reads the prefix from `self.prefix_name`; the
        # default '' leaves parameter names unchanged.
        self.prefix_name = weight_prefix_name
super(ResNeXtC5, self).__init__(depth, groups, group_width, freeze_at, + norm_type, freeze_norm, norm_decay, + variant, feature_maps) + self.severed_head = True diff --git a/ppdet/modeling/backbones/senet.py b/ppdet/modeling/backbones/senet.py new file mode 100644 index 0000000000000000000000000000000000000000..09c69ff199f03837b979127664d7e752db7e587a --- /dev/null +++ b/ppdet/modeling/backbones/senet.py @@ -0,0 +1,122 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@register
@serializable
class SENet(ResNeXt):
    """
    Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507
    Args:
        depth (int): SENet depth, should be 50, 101, 152
        groups (int): group convolution cardinality
        group_width (int): width of each group convolution
        freeze_at (int): freeze the backbone at which stage
        norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
        freeze_norm (bool): freeze normalization layers
        norm_decay (float): weight decay for normalization layer weights
        variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
        feature_maps (list): index of the stages whose feature maps are returned
        dcn_v2_stages (list): index of stages who select deformable conv v2
        std_senet (bool): flag read by the residual blocks to select the
            SENet-specific convolution layout
    """

    def __init__(self,
                 depth=50,
                 groups=64,
                 group_width=4,
                 freeze_at=2,
                 norm_type='affine_channel',
                 freeze_norm=True,
                 norm_decay=0.,
                 variant='d',
                 feature_maps=[2, 3, 4, 5],
                 dcn_v2_stages=[],
                 std_senet=False,
                 weight_prefix_name=''):
        super(SENet, self).__init__(depth, groups, group_width, freeze_at,
                                    norm_type, freeze_norm, norm_decay, variant,
                                    feature_maps)
        # depth 152 uses the wider stage configuration
        self.stage_filters = ([256, 512, 1024, 2048]
                              if depth >= 152 else [128, 256, 512, 1024])
        self.reduction_ratio = 16
        self.std_senet = std_senet
        self._c1_out_chan_num = 128
        self._model_type = 'SEResNeXt'
        self.dcn_v2_stages = dcn_v2_stages

    def _squeeze_excitation(self, input, num_channels, name=None):
        """
        Re-weight `input` with gates computed from its global average.

        Args:
            input (Variable): feature map to be re-weighted.
            num_channels (int): output size of the excitation layer; the
                squeeze layer uses ``num_channels / reduction_ratio``.
            name (str): prefix of the fully connected parameter names.

        Returns:
            Variable: `input` multiplied by the sigmoid gates.
        """
        pooled = fluid.layers.pool2d(
            input=input,
            pool_size=0,
            pool_type='avg',
            global_pooling=True,
            use_cudnn=False)

        # squeeze: uniform init bounded by 1 / sqrt(fan_in)
        bound = 1.0 / math.sqrt(pooled.shape[1] * 1.0)
        squeezed = fluid.layers.fc(
            input=pooled,
            size=int(num_channels / self.reduction_ratio),
            act='relu',
            param_attr=ParamAttr(
                initializer=fluid.initializer.Uniform(-bound, bound),
                name=name + '_sqz_weights'),
            bias_attr=ParamAttr(name=name + '_sqz_offset'))

        # excitation: same init rule, sigmoid output
        bound = 1.0 / math.sqrt(squeezed.shape[1] * 1.0)
        gates = fluid.layers.fc(
            input=squeezed,
            size=num_channels,
            act='sigmoid',
            param_attr=ParamAttr(
                initializer=fluid.initializer.Uniform(-bound, bound),
                name=name + '_exc_weights'),
            bias_attr=ParamAttr(name=name + '_exc_offset'))

        return fluid.layers.elementwise_mul(x=input, y=gates, axis=0)
@register
class VGG(object):
    """
    VGG, see https://arxiv.org/abs/1409.1556

    Args:
        depth (int): the VGG net depth (16 or 19)
        normalizations (list): params list of init scale in l2 norm, skip init
            scale if param is -1.
        with_extra_blocks (bool): whether or not extra blocks should be added
        extra_block_filters (list): in each extra block, params:
            [in_channel, out_channel, padding_size, stride_size, filter_size]
    """

    def __init__(self,
                 depth=16,
                 with_extra_blocks=False,
                 normalizations=[20., -1, -1, -1, -1, -1],
                 extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3],
                                      [128, 256, 0, 1, 3], [128, 256, 0, 1, 3]]):
        # the message previously lacked `.format(depth)` and printed a
        # literal "{}" instead of the offending value
        assert depth in [16, 19], \
            "depth {} not in [16, 19]".format(depth)

        self.depth = depth
        # number of conv layers in each of the five VGG blocks
        self.depth_cfg = {
            16: [2, 2, 3, 3, 3],
            19: [2, 2, 4, 4, 4]
        }
        self.with_extra_blocks = with_extra_blocks
        self.normalizations = normalizations
        self.extra_block_filters = extra_block_filters

    def __call__(self, input):
        """
        Build the network on `input`.

        Returns:
            The fc7 variable when `with_extra_blocks` is False, otherwise
            the list of feature maps (fourth conv block, fc7 and every
            extra block), with l2 norm scaling applied where the matching
            entry of `normalizations` is not -1.
        """
        layers = []
        layers += self._vgg_block(input)

        if not self.with_extra_blocks:
            return layers[-1]

        layers += self._add_extras_block(layers[-1])
        norm_cfg = self.normalizations
        for k, v in enumerate(layers):
            if norm_cfg[k] != -1:
                layers[k] = self._l2_norm_scale(v, init_scale=norm_cfg[k])

        return layers

    def _vgg_block(self, input):
        """Build the five conv blocks plus fc6/fc7; return [block 4, fc7]."""
        nums = self.depth_cfg[self.depth]
        vgg_base = [64, 128, 256, 512, 512]
        conv = input
        layers = []
        for k, v in enumerate(vgg_base):
            conv = self._conv_block(
                conv, v, nums[k], name="conv{}_".format(k + 1))
            layers.append(conv)
            if k == 4:
                # the last block is pooled with stride 1
                conv = self._pooling_block(conv, 3, 1, pool_padding=1)
            else:
                conv = self._pooling_block(conv, 2, 2)

        # fc6 is a 3x3 conv with padding 6 and dilation 6, fc7 a 1x1 conv
        fc6 = self._conv_layer(conv, 1024, 3, 1, 6, dilation=6, name="fc6")
        fc7 = self._conv_layer(fc6, 1024, 1, 1, 0, name="fc7")

        return [layers[3], fc7]

    def _add_extras_block(self, input):
        """Chain the extra blocks described by `extra_block_filters`."""
        cfg = self.extra_block_filters
        conv = input
        layers = []
        for k, v in enumerate(cfg):
            assert len(v) == 5, "extra_block_filters size not fix"
            conv = self._extra_block(
                conv,
                v[0],
                v[1],
                v[2],
                v[3],
                v[4],
                name="conv{}_".format(6 + k))
            layers.append(conv)

        return layers

    def _conv_block(self, input, num_filter, groups, name=None):
        """Stack `groups` 3x3 ReLU convolutions named `name` + 1, 2, ..."""
        conv = input
        for i in range(groups):
            conv = self._conv_layer(
                input=conv,
                num_filters=num_filter,
                filter_size=3,
                stride=1,
                padding=1,
                act='relu',
                name=name + str(i + 1))
        return conv

    def _extra_block(self,
                     input,
                     num_filters1,
                     num_filters2,
                     padding_size,
                     stride_size,
                     filter_size,
                     name=None):
        """Build one extra block: a 1x1 conv followed by a configurable conv."""
        # 1x1 conv
        conv_1 = self._conv_layer(
            input=input,
            num_filters=int(num_filters1),
            filter_size=1,
            stride=1,
            act='relu',
            padding=0,
            name=name + "1")

        # second conv: kernel, stride and padding come from the config
        conv_2 = self._conv_layer(
            input=conv_1,
            num_filters=int(num_filters2),
            filter_size=filter_size,
            stride=stride_size,
            act='relu',
            padding=padding_size,
            name=name + "2")
        return conv_2

    def _conv_layer(self,
                    input,
                    num_filters,
                    filter_size,
                    stride,
                    padding,
                    dilation=1,
                    act='relu',
                    use_cudnn=True,
                    name=None):
        """Create a conv2d with weights and biases named after `name`."""
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            act=act,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=ParamAttr(name=name + "_biases"),
            name=name + '.conv2d.output.1')
        return conv

    def _pooling_block(self,
                       conv,
                       pool_size,
                       pool_stride,
                       pool_padding=0,
                       ceil_mode=True):
        """Apply max pooling with the given size, stride and padding."""
        pool = fluid.layers.pool2d(
            input=conv,
            pool_size=pool_size,
            pool_type='max',
            pool_stride=pool_stride,
            pool_padding=pool_padding,
            ceil_mode=ceil_mode)
        return pool

    def _l2_norm_scale(self, input, init_scale=1.0, channel_shared=False):
        """
        L2-normalize `input` along the channel axis and multiply it by a
        learnable scale initialized to `init_scale`.

        Args:
            input (Variable): feature map to normalize.
            init_scale (float): initial value of the scale parameter.
            channel_shared (bool): use one scalar scale instead of one
                value per channel.
        """
        from paddle.fluid.layer_helper import LayerHelper
        from paddle.fluid.initializer import Constant
        helper = LayerHelper("Scale")
        l2_norm = fluid.layers.l2_normalize(
            input, axis=1)  # l2 norm along channel
        shape = [1] if channel_shared else [input.shape[1]]
        scale = helper.create_parameter(
            attr=helper.param_attr,
            shape=shape,
            dtype=input.dtype,
            default_initializer=Constant(init_scale))
        # NOTE(review): the output name is fixed, so it is shared by every
        # layer that gets normalized; the default `normalizations` only
        # scales the first one -- confirm before enabling more.
        out = fluid.layers.elementwise_mul(
            x=l2_norm, y=scale, axis=-1 if channel_shared else 1,
            name="conv4_3_norm_scale")
        return out
def create_feed(feed, use_pyreader=True, sub_prog_feed=False):
    """
    Create the input variables (and optionally a PyReader) for a feed config.

    Args:
        feed: feed configuration; `image_shape`, `fields` and
            `sample_transforms` are read, `num_max_boxes` and `num_scale`
            are optional. `feed.fields` is rewritten when multi-scale
            testing or `sub_prog_feed` adds inputs.
        use_pyreader (bool): also create a non-iterable `fluid.io.PyReader`
            over the created variables.
        sub_prog_feed (bool): append the 'bbox' and 'bbox_flip' inputs.

    Returns:
        tuple: ``(pyreader, feed_vars)``; `pyreader` is None when
        `use_pyreader` is False and `feed_vars` is an OrderedDict keyed by
        field name, in the order of `feed.fields`.
    """
    image_shape = feed.image_shape
    # Copy every spec: the entries are edited below, and editing the
    # module-level `feed_var_def` dicts in place would leak one feed's
    # shapes and lod levels into every later call.
    feed_var_map = {var['name']: dict(var) for var in feed_var_def}
    feed_var_map['image'] = {
        'name': 'image',
        'shape': image_shape,
        'dtype': 'float32',
        'lod_level': 0
    }

    # tensor padding with 0 is used instead of LoD tensor when
    # num_max_boxes is set
    if getattr(feed, 'num_max_boxes', None) is not None:
        feed_var_map['gt_label']['shape'] = [feed.num_max_boxes]
        feed_var_map['gt_score']['shape'] = [feed.num_max_boxes]
        feed_var_map['gt_box']['shape'] = [feed.num_max_boxes, 4]
        feed_var_map['is_difficult']['shape'] = [feed.num_max_boxes]
        feed_var_map['gt_label']['lod_level'] = 0
        feed_var_map['gt_score']['lod_level'] = 0
        feed_var_map['gt_box']['lod_level'] = 0
        feed_var_map['is_difficult']['lod_level'] = 0

    base_name_list = ['image']
    num_scale = getattr(feed, 'num_scale', 1)
    sample_transform = feed.sample_transforms
    multiscale_test = False
    aug_flip = False
    # the first MultiscaleTestResize transform decides the test-time setup
    for t in sample_transform:
        if isinstance(t, MultiscaleTestResize):
            multiscale_test = True
            aug_flip = t.use_flip
            assert (len(t.target_size)+1)*(aug_flip+1) == num_scale, \
                "num_scale: {} is not equal to the actual number of scale: {}."\
                .format(num_scale, (len(t.target_size)+1)*(aug_flip+1))
            break

    if aug_flip:
        # half of the scales belong to the flipped image
        num_scale //= 2
        base_name_list.insert(0, 'flip_image')
        feed_var_map['flip_image'] = {
            'name': 'flip_image',
            'shape': image_shape,
            'dtype': 'float32',
            'lod_level': 0
        }

    image_name_list = []
    if multiscale_test:
        for base_name in base_name_list:
            for i in range(0, num_scale):
                # scale 0 keeps the base name, the others get a suffix
                name = base_name if i == 0 else base_name + '_scale_' + str(i -
                                                                            1)
                feed_var_map[name] = {
                    'name': name,
                    'shape': image_shape,
                    'dtype': 'float32',
                    'lod_level': 0
                }
                image_name_list.append(name)
        feed_var_map['im_info']['shape'] = [feed.num_scale * 3]
        # the first field is replaced by the per-scale image inputs
        feed.fields = image_name_list + feed.fields[1:]
    if sub_prog_feed:
        box_names = ['bbox', 'bbox_flip']
        for box_name in box_names:
            # separate local name: do not rebind the `sub_prog_feed` flag
            box_var = {
                'name': box_name,
                'shape': [6],
                'dtype': 'float32',
                'lod_level': 1
            }

            feed.fields = feed.fields + [box_name]
            feed_var_map[box_name] = box_var

    feed_vars = OrderedDict([(key, fluid.layers.data(
        name=feed_var_map[key]['name'],
        shape=feed_var_map[key]['shape'],
        dtype=feed_var_map[key]['dtype'],
        lod_level=feed_var_map[key]['lod_level'])) for key in feed.fields])

    pyreader = None
    if use_pyreader:
        pyreader = fluid.io.PyReader(
            feed_list=list(feed_vars.values()),
            capacity=64,
            use_double_buffer=True,
            iterable=False)
    return pyreader, feed_vars
def ConvNorm(input,
             num_filters,
             filter_size,
             stride=1,
             groups=1,
             norm_decay=0.,
             norm_type='affine_channel',
             norm_groups=32,
             dilation=1,
             lr_scale=1,
             freeze_norm=False,
             act=None,
             norm_name=None,
             initializer=None,
             name=None):
    """
    Apply a bias-free convolution followed by a normalization layer.

    Args:
        input (Variable): input feature map.
        num_filters (int): number of output channels of the convolution.
        filter_size (int): square kernel size; the padding is
            ``((filter_size - 1) // 2) * dilation``.
        stride (int): convolution stride.
        groups (int): number of convolution groups.
        norm_decay (float): L2 decay of the normalization parameters.
        norm_type (str): 'bn', 'sync_bn', 'gn' or 'affine_channel'.
        norm_groups (int): number of groups, used by 'gn' only.
        dilation (int): convolution dilation.
        lr_scale (float): learning rate multiplier for the convolution and
            normalization parameters.
        freeze_norm (bool): keep the normalization parameters fixed.
        act (str): activation handed to the normalization layer.
        norm_name (str): name prefix of the normalization parameters.
        initializer: initializer of the convolution weights.
        name (str): name prefix of the convolution.

    Returns:
        Variable: output of the normalization layer.

    Raises:
        ValueError: if `norm_type` is not one of the supported values.
    """
    # Fail before any graph op is created; an unsupported type used to
    # surface as an UnboundLocalError after the convolution was built.
    if norm_type not in ('bn', 'sync_bn', 'gn', 'affine_channel'):
        raise ValueError(
            "unsupported norm_type '{}', expected one of 'bn', 'sync_bn', "
            "'gn' or 'affine_channel'".format(norm_type))

    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=((filter_size - 1) // 2) * dilation,
        dilation=dilation,
        groups=groups,
        act=None,
        param_attr=ParamAttr(
            name=name + "_weights",
            initializer=initializer,
            learning_rate=lr_scale),
        bias_attr=False,
        name=name + '.conv2d.output.1')

    # a frozen norm layer keeps its parameters fixed via a zero learning rate
    norm_lr = 0. if freeze_norm else 1.
    pattr = ParamAttr(
        name=norm_name + '_scale',
        learning_rate=norm_lr * lr_scale,
        regularizer=L2Decay(norm_decay))
    battr = ParamAttr(
        name=norm_name + '_offset',
        learning_rate=norm_lr * lr_scale,
        regularizer=L2Decay(norm_decay))

    if norm_type in ['bn', 'sync_bn']:
        out = fluid.layers.batch_norm(
            input=conv,
            act=act,
            name=norm_name + '.output.1',
            param_attr=pattr,
            bias_attr=battr,
            moving_mean_name=norm_name + '_mean',
            moving_variance_name=norm_name + '_variance',
            use_global_stats=bool(freeze_norm))
        scale = fluid.framework._get_var(pattr.name)
        bias = fluid.framework._get_var(battr.name)
    elif norm_type == 'gn':
        out = fluid.layers.group_norm(
            input=conv,
            act=act,
            name=norm_name + '.output.1',
            groups=norm_groups,
            param_attr=pattr,
            bias_attr=battr)
        scale = fluid.framework._get_var(pattr.name)
        bias = fluid.framework._get_var(battr.name)
    else:  # 'affine_channel'
        scale = fluid.layers.create_parameter(
            shape=[conv.shape[1]],
            dtype=conv.dtype,
            attr=pattr,
            default_initializer=fluid.initializer.Constant(1.))
        bias = fluid.layers.create_parameter(
            shape=[conv.shape[1]],
            dtype=conv.dtype,
            attr=battr,
            default_initializer=fluid.initializer.Constant(0.))
        out = fluid.layers.affine_channel(
            x=conv, scale=scale, bias=bias, act=act)
    if freeze_norm:
        scale.stop_gradient = True
        bias.stop_gradient = True
    return out
@register
@serializable
class MultiClassNMS(object):
    """Configuration holder for `fluid.layers.multiclass_nms`."""
    __op__ = fluid.layers.multiclass_nms
    __append_doc__ = True

    def __init__(self,
                 score_threshold=.05,
                 nms_top_k=-1,
                 keep_top_k=100,
                 nms_threshold=.5,
                 normalized=False,
                 nms_eta=1.0,
                 background_label=0):
        super(MultiClassNMS, self).__init__()
        # score filtering and candidate limits
        self.score_threshold = score_threshold
        self.nms_top_k = nms_top_k
        self.keep_top_k = keep_top_k
        # suppression settings
        self.nms_threshold = nms_threshold
        self.nms_eta = nms_eta
        self.normalized = normalized
        self.background_label = background_label
@register
class RoIAlign(object):
    """Configuration holder for `fluid.layers.roi_align`."""
    __op__ = fluid.layers.roi_align
    __append_doc__ = True

    def __init__(self, resolution=7, spatial_scale=1. / 16, sampling_ratio=0):
        super(RoIAlign, self).__init__()
        # a single integer means a square output; otherwise the first two
        # entries are taken as (height, width)
        if isinstance(resolution, Integral):
            height = width = resolution
        else:
            height, width = resolution[0], resolution[1]
        self.pooled_height = height
        self.pooled_width = width
        self.spatial_scale = spatial_scale
        self.sampling_ratio = sampling_ratio
@register
class MultiBoxHead(object):
    """Configuration holder for `fluid.layers.multi_box_head`."""
    __op__ = fluid.layers.multi_box_head
    __append_doc__ = True

    def __init__(self,
                 min_ratio=20,
                 max_ratio=90,
                 base_size=300,
                 min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0],
                 max_sizes=[[], 150.0, 195.0, 240.0, 285.0, 300.0],
                 aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.],
                                [2., 3.]],
                 steps=None,
                 offset=0.5,
                 flip=True,
                 min_max_aspect_ratios_order=False,
                 kernel_size=1,
                 pad=0):
        super(MultiBoxHead, self).__init__()
        # prior box sizes
        self.base_size = base_size
        self.min_ratio = min_ratio
        self.max_ratio = max_ratio
        self.min_sizes = min_sizes
        self.max_sizes = max_sizes
        # prior box shapes and placement
        self.aspect_ratios = aspect_ratios
        self.flip = flip
        self.min_max_aspect_ratios_order = min_max_aspect_ratios_order
        self.steps = steps
        self.offset = offset
        # prediction convolution
        self.kernel_size = kernel_size
        self.pad = pad
RetinaOutputDecoder(object): + __op__ = fluid.layers.retinanet_detection_output + __append_doc__ = True + + def __init__(self, + score_thresh=0.05, + nms_thresh=0.3, + pre_nms_top_n=1000, + detections_per_im=100, + nms_eta=1.0): + super(RetinaOutputDecoder, self).__init__() + self.score_threshold = score_thresh + self.nms_threshold = nms_thresh + self.nms_top_k = pre_nms_top_n + self.keep_top_k = detections_per_im + self.nms_eta = nms_eta diff --git a/ppdet/modeling/roi_extractors/__init__.py b/ppdet/modeling/roi_extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15d2525db8c8dd2f72c09641ced94a0c0864b2a0 --- /dev/null +++ b/ppdet/modeling/roi_extractors/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from . import roi_extractor +from .roi_extractor import * diff --git a/ppdet/modeling/roi_extractors/roi_extractor.py b/ppdet/modeling/roi_extractors/roi_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..1caf3936f584bc0eb116d32a7e38559a917afe85 --- /dev/null +++ b/ppdet/modeling/roi_extractors/roi_extractor.py @@ -0,0 +1,97 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+
+from ppdet.core.workspace import register
+from ppdet.modeling.ops import RoIAlign, RoIPool
+
+__all__ = ['RoIPool', 'RoIAlign', 'FPNRoIAlign']
+
+
+@register
+class FPNRoIAlign(object):
+    """
+    RoI align pooling for FPN feature maps
+
+    Args:
+        sampling_ratio (int): number of sampling points, forwarded to
+            `fluid.layers.roi_align`
+        min_level (int): lowest level of FPN layer
+        max_level (int): highest level of FPN layer
+        canconical_level (int): the canonical (reference) FPN level passed
+            to `distribute_fpn_proposals`; the argument keeps its original
+            spelling so existing configs continue to work
+        canonical_size (int): the canonical (reference) RoI size passed to
+            `distribute_fpn_proposals`
+        box_resolution (int): pooled output size for box RoIs
+        mask_resolution (int): pooled output size for mask RoIs
+    """
+
+    def __init__(self,
+                 sampling_ratio=0,
+                 min_level=2,
+                 max_level=5,
+                 canconical_level=4,
+                 canonical_size=224,
+                 box_resolution=7,
+                 mask_resolution=14):
+        super(FPNRoIAlign, self).__init__()
+        self.sampling_ratio = sampling_ratio
+        self.min_level = min_level
+        self.max_level = max_level
+        self.canconical_level = canconical_level
+        self.canonical_size = canonical_size
+        self.box_resolution = box_resolution
+        self.mask_resolution = mask_resolution
+
+    def __call__(self, head_inputs, rois, spatial_scale, is_mask=False):
+        """
+        Adopt RoI align onto several level of feature maps to get RoI features.
+        Distribute RoIs to different levels by area and get a list of RoI
+        features by distributed RoIs and their corresponding feature maps.
+
+        Args:
+            head_inputs (dict): feature maps keyed by name; only the last
+                `max_level - min_level + 1` entries (in key order) are used
+            rois (Variable): RoIs to distribute over the levels; also the
+                LoD source for the returned features
+            spatial_scale (list): one scale per feature map; only the last
+                `max_level - min_level + 1` entries are used
+            is_mask (bool): pool at `mask_resolution` when True, otherwise
+                at `box_resolution`
+
+        Returns:
+            roi_feat(Variable): RoI features with shape of [M, C, R, R],
+            where M is the number of RoIs and R is RoI resolution
+
+        """
+        k_min = self.min_level
+        k_max = self.max_level
+        num_roi_lvls = k_max - k_min + 1
+        name_list = list(head_inputs.keys())
+        input_name_list = name_list[-num_roi_lvls:]
+        spatial_scale = spatial_scale[-num_roi_lvls:]
+        rois_dist, restore_index = fluid.layers.distribute_fpn_proposals(
+            rois, k_min, k_max, self.canconical_level, self.canonical_size)
+        # A conditional expression instead of `is_mask and a or b`: the old
+        # form silently fell back to box_resolution whenever
+        # mask_resolution was falsy.
+        resolution = (self.mask_resolution
+                      if is_mask else self.box_resolution)
+        # rois_dist is in ascending level order, while the feature maps and
+        # scales are indexed from the opposite end, hence the mirrored
+        # index below.
+        roi_out_list = []
+        for lvl in range(num_roi_lvls):
+            name_index = num_roi_lvls - lvl - 1
+            rois_input = rois_dist[lvl]
+            head_input = head_inputs[input_name_list[name_index]]
+            sc = spatial_scale[name_index]
+            roi_out = fluid.layers.roi_align(
+                input=head_input,
+                rois=rois_input,
+                pooled_height=resolution,
+                pooled_width=resolution,
+                spatial_scale=sc,
+                sampling_ratio=self.sampling_ratio)
+            roi_out_list.append(roi_out)
+        # Concatenate the per-level outputs, then gather with restore_index
+        # to undo the reordering done by distribute_fpn_proposals.
+        roi_feat_shuffle = fluid.layers.concat(roi_out_list)
+        roi_feat_ = fluid.layers.gather(roi_feat_shuffle, restore_index)
+        roi_feat = fluid.layers.lod_reset(roi_feat_, rois)
+
+        return roi_feat
diff --git a/ppdet/modeling/roi_heads/__init__.py b/ppdet/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..345a0eb3e30d69af15b8a5f0b4766e8693462e5d
--- /dev/null
+++ b/ppdet/modeling/roi_heads/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from . import bbox_head +from . import mask_head +from . import cascade_head + +from .bbox_head import * +from .mask_head import * +from .cascade_head import * diff --git a/ppdet/modeling/roi_heads/bbox_head.py b/ppdet/modeling/roi_heads/bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..314aeb6087e05aa1c33d26b6b838075523ff2b10 --- /dev/null +++ b/ppdet/modeling/roi_heads/bbox_head.py @@ -0,0 +1,319 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+
+from paddle import fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import Normal, Xavier
+from paddle.fluid.regularizer import L2Decay
+from paddle.fluid.initializer import MSRA
+
+from ppdet.modeling.ops import MultiClassNMS
+from ppdet.modeling.ops import ConvNorm
+from ppdet.core.workspace import register, serializable
+from ppdet.experimental import mixed_precision_global_state
+
+__all__ = ['BBoxHead', 'TwoFCHead', 'XConvNormHead']
+
+
+@register
+@serializable
+class BoxCoder(object):
+    """
+    Argument holder for `fluid.layers.box_coder`.
+
+    The constructor arguments are stored unchanged as attributes.
+    `BBoxHead` calls an instance as
+    `self.box_coder(prior_box=..., target_box=...)`.
+    """
+    __op__ = fluid.layers.box_coder
+    __append_doc__ = True
+
+    def __init__(self,
+                 prior_box_var=[0.1, 0.1, 0.2, 0.2],
+                 code_type='decode_center_size',
+                 box_normalized=False,
+                 axis=1):
+        super(BoxCoder, self).__init__()
+        self.prior_box_var = prior_box_var
+        self.code_type = code_type
+        self.box_normalized = box_normalized
+        self.axis = axis
+
+
+@register
+class XConvNormHead(object):
+    """
+    RCNN head with several convolution layers followed by one fc layer
+
+    Args:
+        num_conv (int): num of convolution layers for the rcnn head
+        conv_dim (int): num of filters for the conv layers
+        mlp_dim (int): num of filters for the fc layers
+        norm_type (str): normalization type forwarded to `ConvNorm`
+        freeze_norm (bool): forwarded to `ConvNorm`
+    """
+    __shared__ = ['norm_type', 'freeze_norm']
+
+    def __init__(self,
+                 num_conv=4,
+                 conv_dim=256,
+                 mlp_dim=1024,
+                 norm_type=None,
+                 freeze_norm=False):
+        super(XConvNormHead, self).__init__()
+        self.conv_dim = conv_dim
+        self.mlp_dim = mlp_dim
+        self.num_conv = num_conv
+        self.norm_type = norm_type
+        self.freeze_norm = freeze_norm
+
+    def __call__(self, roi_feat):
+        """
+        Apply `num_conv` 3x3 ConvNorm layers and one fc layer to roi_feat.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+
+        Returns:
+            Variable: output of the fc layer with `mlp_dim` units.
+        """
+        conv = roi_feat
+        fan = self.conv_dim * 3 * 3
+        initializer = MSRA(uniform=False, fan_in=fan)
+        # The fc layer below is named after the LAST conv layer (e.g.
+        # 'fc6bbox_head_conv3' for num_conv=4). Start from '' so that
+        # num_conv == 0 no longer fails with an unbound `name`; this
+        # matches the default suffix used by CascadeXConvNormHead.
+        name = ''
+        for i in range(self.num_conv):
+            name = 'bbox_head_conv' + str(i)
+            conv = ConvNorm(
+                conv,
+                self.conv_dim,
+                3,
+                act='relu',
+                initializer=initializer,
+                norm_type=self.norm_type,
+                freeze_norm=self.freeze_norm,
+                name=name,
+                norm_name=name)
+        fan = conv.shape[1] * conv.shape[2] * conv.shape[3]
+        head_feat = fluid.layers.fc(input=conv,
+                                    size=self.mlp_dim,
+                                    act='relu',
+                                    name='fc6' + name,
+                                    param_attr=ParamAttr(
+                                        name='fc6%s_w' % name,
+                                        initializer=Xavier(fan_out=fan)),
+                                    bias_attr=ParamAttr(
+                                        name='fc6%s_b' % name,
+                                        learning_rate=2,
+                                        regularizer=L2Decay(0.)))
+        return head_feat
+
+
+@register
+class TwoFCHead(object):
+    """
+    RCNN head with two Fully Connected layers
+
+    Args:
+        mlp_dim (int): num of filters for the fc layers
+    """
+
+    def __init__(self, mlp_dim=1024):
+        super(TwoFCHead, self).__init__()
+        self.mlp_dim = mlp_dim
+
+    def __call__(self, roi_feat):
+        """
+        Apply the 'fc6' and 'fc7' layers to roi_feat.
+
+        When `mixed_precision_global_state()` is not None the input is
+        cast to float16 before the fc layers and the result is cast back
+        to float32.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+
+        Returns:
+            Variable: output of 'fc7' with `mlp_dim` units.
+        """
+        fan = roi_feat.shape[1] * roi_feat.shape[2] * roi_feat.shape[3]
+
+        mixed_precision_enabled = mixed_precision_global_state() is not None
+
+        if mixed_precision_enabled:
+            roi_feat = fluid.layers.cast(roi_feat, 'float16')
+
+        fc6 = fluid.layers.fc(input=roi_feat,
+                              size=self.mlp_dim,
+                              act='relu',
+                              name='fc6',
+                              param_attr=ParamAttr(
+                                  name='fc6_w',
+                                  initializer=Xavier(fan_out=fan)),
+                              bias_attr=ParamAttr(
+                                  name='fc6_b',
+                                  learning_rate=2.,
+                                  regularizer=L2Decay(0.)))
+        head_feat = fluid.layers.fc(input=fc6,
+                                    size=self.mlp_dim,
+                                    act='relu',
+                                    name='fc7',
+                                    param_attr=ParamAttr(
+                                        name='fc7_w', initializer=Xavier()),
+                                    bias_attr=ParamAttr(
+                                        name='fc7_b',
+                                        learning_rate=2.,
+                                        regularizer=L2Decay(0.)))
+
+        if mixed_precision_enabled:
+            head_feat = fluid.layers.cast(head_feat, 'float32')
+
+        return head_feat
+
+
+@register
+class BBoxHead(object):
+    """
+    RCNN bbox head
+
+    Args:
+        head (object): the head module instance, e.g., `ResNetC5`, `TwoFCHead`
+        box_coder (object): `BoxCoder` instance, or a dict of `BoxCoder`
+            constructor arguments
+        nms (object): `MultiClassNMS` instance, or a dict of `MultiClassNMS`
+            constructor arguments
+        num_classes: number of output classes
+    """
+    __inject__ = ['head', 'box_coder', 'nms']
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 head,
+                 box_coder=BoxCoder().__dict__,
+                 nms=MultiClassNMS().__dict__,
+                 num_classes=81):
+        super(BBoxHead, self).__init__()
+        self.head = head
+        self.num_classes = num_classes
+        self.box_coder = box_coder
+        self.nms = nms
+        # dict arguments (including the defaults) are turned into instances
+        if isinstance(box_coder, dict):
+            self.box_coder = BoxCoder(**box_coder)
+        if isinstance(nms, dict):
+            self.nms = MultiClassNMS(**nms)
+        self.head_feat = None
+
+    def get_head_feat(self, input=None):
+        """
+        Get the bbox head feature map.
+
+        When `input` is given, the head is run on it and the result is
+        cached in `self.head_feat`; when `input` is None the cached value
+        is returned. If the head returns an OrderedDict, its first value
+        is used.
+        """
+
+        if input is not None:
+            feat = self.head(input)
+            if isinstance(feat, OrderedDict):
+                feat = list(feat.values())[0]
+            self.head_feat = feat
+        return self.head_feat
+
+    def _get_output(self, roi_feat):
+        """
+        Get bbox head output.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+
+        Returns:
+            cls_score(Variable): Output of the 'cls_score' fc layer with
+                `num_classes` units.
+            bbox_pred(Variable): Output of the 'bbox_pred' fc layer with
+                `4 * num_classes` units.
+        """
+        head_feat = self.get_head_feat(roi_feat)
+        # when ResNetC5 output a single feature map
+        if not isinstance(self.head, (TwoFCHead, XConvNormHead)):
+            head_feat = fluid.layers.pool2d(
+                head_feat, pool_type='avg', global_pooling=True)
+        cls_score = fluid.layers.fc(input=head_feat,
+                                    size=self.num_classes,
+                                    act=None,
+                                    name='cls_score',
+                                    param_attr=ParamAttr(
+                                        name='cls_score_w',
+                                        initializer=Normal(
+                                            loc=0.0, scale=0.01)),
+                                    bias_attr=ParamAttr(
+                                        name='cls_score_b',
+                                        learning_rate=2.,
+                                        regularizer=L2Decay(0.)))
+        bbox_pred = fluid.layers.fc(input=head_feat,
+                                    size=4 * self.num_classes,
+                                    act=None,
+                                    name='bbox_pred',
+                                    param_attr=ParamAttr(
+                                        name='bbox_pred_w',
+                                        initializer=Normal(
+                                            loc=0.0, scale=0.001)),
+                                    bias_attr=ParamAttr(
+                                        name='bbox_pred_b',
+                                        learning_rate=2.,
+                                        regularizer=L2Decay(0.)))
+        return cls_score, bbox_pred
+
+    def get_loss(self, roi_feat, labels_int32, bbox_targets,
+                 bbox_inside_weights, bbox_outside_weights):
+        """
+        Get bbox_head loss.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            labels_int32(Variable): Class label of a RoI with shape [P, 1].
+                P is the number of RoI.
+            bbox_targets(Variable): Box label of a RoI with shape
+                [P, 4 * class_nums].
+            bbox_inside_weights(Variable): Indicates whether a box should
+                contribute to loss. Same shape as bbox_targets.
+            bbox_outside_weights(Variable): Indicates whether a box should
+                contribute to loss. Same shape as bbox_targets.
+
+        Return:
+            Type: Dict
+                loss_cls(Variable): mean softmax cross entropy loss.
+                loss_bbox(Variable): mean smooth L1 loss.
+        """
+
+        cls_score, bbox_pred = self._get_output(roi_feat)
+
+        labels_int64 = fluid.layers.cast(x=labels_int32, dtype='int64')
+        labels_int64.stop_gradient = True
+        loss_cls = fluid.layers.softmax_with_cross_entropy(
+            logits=cls_score, label=labels_int64, numeric_stable_mode=True)
+        loss_cls = fluid.layers.reduce_mean(loss_cls)
+        loss_bbox = fluid.layers.smooth_l1(
+            x=bbox_pred,
+            y=bbox_targets,
+            inside_weight=bbox_inside_weights,
+            outside_weight=bbox_outside_weights,
+            sigma=1.0)
+        loss_bbox = fluid.layers.reduce_mean(loss_bbox)
+        return {'loss_cls': loss_cls, 'loss_bbox': loss_bbox}
+
+    def get_prediction(self,
+                       roi_feat,
+                       rois,
+                       im_info,
+                       im_shape,
+                       return_box_score=False):
+        """
+        Get prediction bounding box in test stage.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            rois (Variable): Output of generate_proposals in rpn head.
+            im_info (Variable): A 2-D LoDTensor with shape [B, 3]. B is the
+                number of input images, each element consists of im_height,
+                im_width, im_scale.
+            im_shape (Variable): Actual shape of original image with shape
+                [B, 3]. B is the number of images, each element consists of
+                original_height, original_width, 1
+            return_box_score (bool): when True, skip NMS and return the
+                clipped boxes together with the class probabilities.
+
+        Returns:
+            Dict: `{'bbox': pred_result}` by default, where pred_result has
+                shape [N, 6]. Each row has 6 values: [label, confidence,
+                xmin, ymin, xmax, ymax]. N is the total number of
+                prediction. With `return_box_score=True` the dict is
+                `{'bbox': clipped_box, 'score': cls_prob}` instead.
+        """
+        cls_score, bbox_pred = self._get_output(roi_feat)
+
+        # rois are divided by im_scale (third column of im_info) before
+        # decoding
+        im_scale = fluid.layers.slice(im_info, [1], starts=[2], ends=[3])
+        im_scale = fluid.layers.sequence_expand(im_scale, rois)
+        boxes = rois / im_scale
+        cls_prob = fluid.layers.softmax(cls_score, use_cudnn=False)
+        bbox_pred = fluid.layers.reshape(bbox_pred, (-1, self.num_classes, 4))
+        decoded_box = self.box_coder(prior_box=boxes, target_box=bbox_pred)
+        clipped_box = fluid.layers.box_clip(
+            input=decoded_box, im_info=im_shape)
+        if return_box_score:
+            return {'bbox': clipped_box, 'score': cls_prob}
+        pred_result = self.nms(bboxes=clipped_box, scores=cls_prob)
+        return {'bbox': pred_result}
diff --git a/ppdet/modeling/roi_heads/cascade_head.py b/ppdet/modeling/roi_heads/cascade_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d36ff4c7541d8825e9491e696f787e10ef95b97e
--- /dev/null
+++ b/ppdet/modeling/roi_heads/cascade_head.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import Normal, Xavier
+from paddle.fluid.regularizer import L2Decay
+from paddle.fluid.initializer import MSRA
+
+from ppdet.modeling.ops import MultiClassNMS
+from ppdet.modeling.ops import ConvNorm
+from ppdet.core.workspace import register
+
+__all__ = ['CascadeBBoxHead']
+
+
+@register
+class CascadeBBoxHead(object):
+    """
+    Cascade RCNN bbox head
+
+    Args:
+        head (object): the head module instance
+        nms (object): `MultiClassNMS` instance, or a dict of `MultiClassNMS`
+            constructor arguments
+        num_classes: number of output classes
+    """
+    __inject__ = ['head', 'nms']
+    __shared__ = ['num_classes']
+
+    def __init__(self, head, nms=MultiClassNMS().__dict__, num_classes=81):
+        super(CascadeBBoxHead, self).__init__()
+        self.head = head
+        self.nms = nms
+        self.num_classes = num_classes
+        # a dict argument (including the default) is turned into an instance
+        if isinstance(nms, dict):
+            self.nms = MultiClassNMS(**nms)
+
+    def get_output(self,
+                   roi_feat,
+                   cls_agnostic_bbox_reg=2,
+                   wb_scalar=1.0,
+                   name=''):
+        """
+        Get bbox head output.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            cls_agnostic_bbox_reg(Int): BBox regressor are class agnostic;
+                the bbox fc layer has `4 * cls_agnostic_bbox_reg` units.
+            wb_scalar(Float): Weights and Bias's learning rate.
+            name(String): Layer's name suffix, appended to the layer and
+                parameter names.
+
+        Returns:
+            cls_score(Variable): cls score.
+            bbox_pred(Variable): bbox regression.
+        """
+        head_feat = self.head(roi_feat, wb_scalar, name)
+        cls_score = fluid.layers.fc(input=head_feat,
+                                    size=self.num_classes,
+                                    act=None,
+                                    name='cls_score' + name,
+                                    param_attr=ParamAttr(
+                                        name='cls_score%s_w' % name,
+                                        initializer=Normal(
+                                            loc=0.0, scale=0.01),
+                                        learning_rate=wb_scalar),
+                                    bias_attr=ParamAttr(
+                                        name='cls_score%s_b' % name,
+                                        learning_rate=wb_scalar * 2,
+                                        regularizer=L2Decay(0.)))
+        bbox_pred = fluid.layers.fc(input=head_feat,
+                                    size=4 * cls_agnostic_bbox_reg,
+                                    act=None,
+                                    name='bbox_pred' + name,
+                                    param_attr=ParamAttr(
+                                        name='bbox_pred%s_w' % name,
+                                        initializer=Normal(
+                                            loc=0.0, scale=0.001),
+                                        learning_rate=wb_scalar),
+                                    bias_attr=ParamAttr(
+                                        name='bbox_pred%s_b' % name,
+                                        learning_rate=wb_scalar * 2,
+                                        regularizer=L2Decay(0.)))
+        return cls_score, bbox_pred
+
+    def get_loss(self, rcnn_pred_list, rcnn_target_list, rcnn_loss_weight_list):
+        """
+        Get bbox_head loss.
+
+        Args:
+            rcnn_pred_list(List): Cascade RCNN's head's output including
+                bbox_pred and cls_score
+            rcnn_target_list(List): Cascade rcnn's bbox and label target
+            rcnn_loss_weight_list(List): The weight of location and class loss
+
+        Return:
+            Dict: for every stage index i, 'loss_cls_<i>' (weighted mean
+                softmax cross entropy) and 'loss_loc_<i>' (weighted mean
+                smooth L1).
+        """
+        loss_dict = {}
+        for i, (rcnn_pred, rcnn_target
+                ) in enumerate(zip(rcnn_pred_list, rcnn_target_list)):
+            # rcnn_pred is indexed as (cls_score, bbox_pred); rcnn_target
+            # is read at [1] labels, [2] bbox targets, [3] inside weights,
+            # [4] outside weights.
+            labels_int64 = fluid.layers.cast(x=rcnn_target[1], dtype='int64')
+            labels_int64.stop_gradient = True
+
+            loss_cls = fluid.layers.softmax_with_cross_entropy(
+                logits=rcnn_pred[0],
+                label=labels_int64,
+                numeric_stable_mode=True, )
+            loss_cls = fluid.layers.reduce_mean(
+                loss_cls, name='loss_cls_' + str(i)) * rcnn_loss_weight_list[i]
+
+            loss_bbox = fluid.layers.smooth_l1(
+                x=rcnn_pred[1],
+                y=rcnn_target[2],
+                inside_weight=rcnn_target[3],
+                outside_weight=rcnn_target[4],
+                sigma=1.0,  # detectron use delta = 1./sigma**2
+            )
+            loss_bbox = fluid.layers.reduce_mean(
+                loss_bbox,
+                name='loss_bbox_' + str(i)) * rcnn_loss_weight_list[i]
+
+            loss_dict['loss_cls_%d' % i] = loss_cls
+            loss_dict['loss_loc_%d' % i] = loss_bbox
+
+        return loss_dict
+
+    def get_prediction(self,
+                       im_info,
+                       im_shape,
+                       roi_feat_list,
+                       rcnn_pred_list,
+                       proposal_list,
+                       cascade_bbox_reg_weights,
+                       cls_agnostic_bbox_reg=2,
+                       return_box_score=False):
+        """
+        Get prediction bounding box in test stage.
+
+        Args:
+            im_info (Variable): A 2-D LoDTensor with shape [B, 3]. B is the
+                number of input images, each element consists
+                of im_height, im_width, im_scale.
+            im_shape (Variable): Actual shape of original image with shape
+                [B, 3]. B is the number of images, each element consists of
+                original_height, original_width, 1
+            roi_feat_list (List): RoI feature from RoIExtractor.
+            rcnn_pred_list (Variable): Cascade rcnn's head's output
+                including bbox_pred and cls_score
+            proposal_list (List): RPN proposal boxes.
+            cascade_bbox_reg_weights (List): BBox decode var.
+            cls_agnostic_bbox_reg(Int): BBox regressor are class agnostic
+            return_box_score (bool): when True, skip NMS and return the
+                clipped boxes together with the averaged class
+                probabilities.
+
+        Returns:
+            Dict: `{'bbox': pred_result}` by default, where pred_result has
+                shape [N, 6]. Each row has 6 values: [label, confidence,
+                xmin, ymin, xmax, ymax]. N is the total number of
+                prediction. With `return_box_score=True` the dict is
+                `{'bbox': box_out, 'score': boxes_cls_prob_mean}` instead.
+        """
+        self.im_scale = fluid.layers.slice(im_info, [1], starts=[2], ends=[3])
+        boxes_cls_prob_l = []
+
+        rcnn_pred = rcnn_pred_list[-1]  # stage 3
+        repeat_num = 3
+        bbox_reg_w = cascade_bbox_reg_weights[-1]
+        for i in range(repeat_num):
+            # cls score: the last-stage RoI feature is scored with the
+            # heads named '' and '_2'; the third score is the last-stage
+            # prediction itself.
+            if i < 2:
+                cls_score, _ = self.get_output(
+                    roi_feat_list[-1],  # roi_feat_3
+                    name='_' + str(i + 1) if i > 0 else '')
+            else:
+                cls_score = rcnn_pred[0]
+            cls_prob = fluid.layers.softmax(cls_score, use_cudnn=False)
+            boxes_cls_prob_l.append(cls_prob)
+
+        boxes_cls_prob_mean = (
+            boxes_cls_prob_l[0] + boxes_cls_prob_l[1] + boxes_cls_prob_l[2]
+        ) / 3.0
+
+        # bbox pred
+        proposals_boxes = proposal_list[-1]
+        im_scale_lod = fluid.layers.sequence_expand(self.im_scale,
+                                                    proposals_boxes)
+        proposals_boxes = proposals_boxes / im_scale_lod
+        bbox_pred = rcnn_pred[1]
+        bbox_pred_new = fluid.layers.reshape(bbox_pred,
+                                             (-1, cls_agnostic_bbox_reg, 4))
+        if cls_agnostic_bbox_reg == 2:
+            # only use fg box delta to decode box
+            bbox_pred_new = fluid.layers.slice(
+                bbox_pred_new, axes=[1], starts=[1], ends=[2])
+            bbox_pred_new = fluid.layers.expand(bbox_pred_new,
+                                                [1, self.num_classes, 1])
+        decoded_box = fluid.layers.box_coder(
+            prior_box=proposals_boxes,
+            prior_box_var=bbox_reg_w,
+            target_box=bbox_pred_new,
+            code_type='decode_center_size',
+            box_normalized=False,
+            axis=1)
+
+        box_out = fluid.layers.box_clip(input=decoded_box, im_info=im_shape)
+        if return_box_score:
+            return {'bbox': box_out, 'score': boxes_cls_prob_mean}
+        pred_result = self.nms(bboxes=box_out, scores=boxes_cls_prob_mean)
+        return {"bbox": pred_result}
+
+
+@register
+class CascadeXConvNormHead(object):
+    """
+    RCNN head with several convolution layers followed by one fc layer
+
+    Args:
+        num_conv (int): num of convolution layers for the rcnn head
+        conv_dim (int): num of filters for the conv layers
+        mlp_dim (int): num of filters for the fc layers
+        norm_type (str): normalization type forwarded to `ConvNorm`
+        freeze_norm (bool): forwarded to `ConvNorm`
+    """
+    __shared__ = ['norm_type', 'freeze_norm']
+
+    def __init__(self,
+                 num_conv=4,
+                 conv_dim=256,
+                 mlp_dim=1024,
+                 norm_type=None,
+                 freeze_norm=False):
+        super(CascadeXConvNormHead, self).__init__()
+        self.conv_dim = conv_dim
+        self.mlp_dim = mlp_dim
+        self.num_conv = num_conv
+        self.norm_type = norm_type
+        self.freeze_norm = freeze_norm
+
+    def __call__(self, roi_feat, wb_scalar=1.0, name=''):
+        """
+        Apply `num_conv` 3x3 ConvNorm layers and one fc layer to roi_feat.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            wb_scalar (float): learning-rate scale for weights; biases use
+                `wb_scalar * 2`.
+            name (str): layer name suffix. It is only used when
+                `num_conv` is 0, see the note in the loop below.
+
+        Returns:
+            Variable: output of the fc layer with `mlp_dim` units.
+        """
+        conv = roi_feat
+        fan = self.conv_dim * 3 * 3
+        initializer = MSRA(uniform=False, fan_in=fan)
+        for i in range(self.num_conv):
+            # NOTE(review): this overwrites the `name` argument, so the
+            # caller's suffix never reaches the conv or fc parameter names
+            # and every call produces the same names. It looks unintended,
+            # but changing it would rename parameters (and presumably
+            # break existing checkpoints) -- confirm before fixing.
+            name = 'bbox_head_conv' + str(i)
+            conv = ConvNorm(
+                conv,
+                self.conv_dim,
+                3,
+                act='relu',
+                initializer=initializer,
+                norm_type=self.norm_type,
+                freeze_norm=self.freeze_norm,
+                lr_scale=wb_scalar,
+                name=name,
+                norm_name=name)
+        fan = conv.shape[1] * conv.shape[2] * conv.shape[3]
+        head_feat = fluid.layers.fc(input=conv,
+                                    size=self.mlp_dim,
+                                    act='relu',
+                                    name='fc6' + name,
+                                    param_attr=ParamAttr(
+                                        name='fc6%s_w' % name,
+                                        initializer=Xavier(fan_out=fan),
+                                        learning_rate=wb_scalar),
+                                    bias_attr=ParamAttr(
+                                        name='fc6%s_b' % name,
+                                        regularizer=L2Decay(0.),
+                                        learning_rate=wb_scalar * 2))
+        return head_feat
+
+
+@register
+class CascadeTwoFCHead(object):
+    """
+    RCNN head with two Fully Connected layers
+
+    Args:
+        mlp_dim (int): num of filters for the fc layers
+    """
+
+    def __init__(self, mlp_dim):
+        super(CascadeTwoFCHead, self).__init__()
+        self.mlp_dim = mlp_dim
+
+    def __call__(self, roi_feat, wb_scalar=1.0, name=''):
+        """
+        Apply the 'fc6<name>' and 'fc7<name>' layers to roi_feat.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            wb_scalar (float): learning-rate scale for weights; biases use
+                `wb_scalar * 2`.
+            name (str): suffix appended to layer and parameter names.
+
+        Returns:
+            Variable: output of the second fc layer with `mlp_dim` units.
+        """
+        fan = roi_feat.shape[1] * roi_feat.shape[2] * roi_feat.shape[3]
+        fc6 = fluid.layers.fc(input=roi_feat,
+                              size=self.mlp_dim,
+                              act='relu',
+                              name='fc6' + name,
+                              param_attr=ParamAttr(
+                                  name='fc6%s_w' % name,
+                                  initializer=Xavier(fan_out=fan),
+                                  learning_rate=wb_scalar),
+                              bias_attr=ParamAttr(
+                                  name='fc6%s_b' % name,
+                                  learning_rate=wb_scalar * 2,
+                                  regularizer=L2Decay(0.)))
+        head_feat = fluid.layers.fc(input=fc6,
+                                    size=self.mlp_dim,
+                                    act='relu',
+                                    name='fc7' + name,
+                                    param_attr=ParamAttr(
+                                        name='fc7%s_w' % name,
+                                        initializer=Xavier(),
+                                        learning_rate=wb_scalar),
+                                    bias_attr=ParamAttr(
+                                        name='fc7%s_b' % name,
+                                        learning_rate=wb_scalar * 2,
+                                        regularizer=L2Decay(0.)))
+        return head_feat
diff --git a/ppdet/modeling/roi_heads/mask_head.py b/ppdet/modeling/roi_heads/mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f61add0402664113de83e5537acae459260481ee
--- /dev/null
+++ b/ppdet/modeling/roi_heads/mask_head.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddle import fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.regularizer import L2Decay
+
+from ppdet.core.workspace import register
+from ppdet.modeling.ops import ConvNorm
+
+__all__ = ['MaskHead']
+
+
+@register
+class MaskHead(object):
+    """
+    RCNN mask head
+    Args:
+        num_convs (int): num of convolutions, 4 for FPN, 1 otherwise
+            (the constructor default is 0)
+        conv_dim (int): num of channels after first convolution
+        resolution (int): size of the output mask
+        dilation (int): dilation rate
+        num_classes (int): number of output classes
+        norm_type (str): 'gn' builds the intermediate convolutions with
+            `ConvNorm`; any other value uses plain `conv2d`
+    """
+
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 num_convs=0,
+                 conv_dim=256,
+                 resolution=14,
+                 dilation=1,
+                 num_classes=81,
+                 norm_type=None):
+        super(MaskHead, self).__init__()
+        self.num_convs = num_convs
+        self.conv_dim = conv_dim
+        self.resolution = resolution
+        self.dilation = dilation
+        self.num_classes = num_classes
+        self.norm_type = norm_type
+
+    def _mask_conv_head(self, roi_feat, num_convs, norm_type):
+        """
+        Build `num_convs` 3x3 convolutions followed by a 2x2, stride-2
+        transposed convolution ('conv5_mask').
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            num_convs (int): number of intermediate convolutions.
+            norm_type (str): 'gn' selects `ConvNorm`, anything else
+                selects `fluid.layers.conv2d`.
+
+        Returns:
+            Variable: output of the transposed convolution.
+        """
+        # The initializer depends only on conv_dim, so it is built once
+        # and shared by all intermediate layers.
+        fan = self.conv_dim * 3 * 3
+        initializer = MSRA(uniform=False, fan_in=fan)
+        for i in range(num_convs):
+            layer_name = "mask_inter_feat_" + str(i + 1)
+            if norm_type == 'gn':
+                # honour the norm_type argument rather than self.norm_type
+                roi_feat = ConvNorm(
+                    roi_feat,
+                    self.conv_dim,
+                    3,
+                    act='relu',
+                    dilation=self.dilation,
+                    initializer=initializer,
+                    norm_type=norm_type,
+                    name=layer_name,
+                    norm_name=layer_name)
+            else:
+                roi_feat = fluid.layers.conv2d(
+                    input=roi_feat,
+                    num_filters=self.conv_dim,
+                    filter_size=3,
+                    padding=1 * self.dilation,
+                    act='relu',
+                    stride=1,
+                    dilation=self.dilation,
+                    name=layer_name,
+                    param_attr=ParamAttr(
+                        name=layer_name + '_w', initializer=initializer),
+                    bias_attr=ParamAttr(
+                        name=layer_name + '_b',
+                        learning_rate=2.,
+                        regularizer=L2Decay(0.)))
+        fan = roi_feat.shape[1] * 2 * 2
+        feat = fluid.layers.conv2d_transpose(
+            input=roi_feat,
+            num_filters=self.conv_dim,
+            filter_size=2,
+            stride=2,
+            act='relu',
+            param_attr=ParamAttr(
+                name='conv5_mask_w',
+                initializer=MSRA(
+                    uniform=False, fan_in=fan)),
+            bias_attr=ParamAttr(
+                name='conv5_mask_b', learning_rate=2., regularizer=L2Decay(0.)))
+        return feat
+
+    def _get_output(self, roi_feat):
+        """
+        Get mask logits: the conv head followed by a 1x1 convolution with
+        `num_classes` output channels and no activation.
+        """
+        class_num = self.num_classes
+        # configure the conv number for FPN if necessary
+        head_feat = self._mask_conv_head(roi_feat, self.num_convs,
+                                         self.norm_type)
+        fan = class_num
+        mask_logits = fluid.layers.conv2d(
+            input=head_feat,
+            num_filters=class_num,
+            filter_size=1,
+            act=None,
+            param_attr=ParamAttr(
+                name='mask_fcn_logits_w',
+                initializer=MSRA(
+                    uniform=False, fan_in=fan)),
+            bias_attr=ParamAttr(
+                name="mask_fcn_logits_b",
+                learning_rate=2.,
+                regularizer=L2Decay(0.)))
+        return mask_logits
+
+    def get_loss(self, roi_feat, mask_int32):
+        """
+        Get mask head loss.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            mask_int32 (Variable): mask targets; cast to float32 and
+                compared against the logits reshaped to
+                [-1, num_classes * resolution * resolution]. Entries
+                equal to -1 are ignored.
+
+        Returns:
+            Dict: `{'loss_mask': loss_mask}`, the summed, normalized
+                sigmoid cross entropy.
+        """
+        mask_logits = self._get_output(roi_feat)
+        num_classes = self.num_classes
+        resolution = self.resolution
+        dim = num_classes * resolution * resolution
+        mask_logits = fluid.layers.reshape(mask_logits, (-1, dim))
+
+        mask_label = fluid.layers.cast(x=mask_int32, dtype='float32')
+        mask_label.stop_gradient = True
+        loss_mask = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=mask_logits, label=mask_label, ignore_index=-1, normalize=True)
+        loss_mask = fluid.layers.reduce_sum(loss_mask, name='loss_mask')
+        return {'loss_mask': loss_mask}
+
+    def get_prediction(self, roi_feat, bbox_pred):
+        """
+        Get prediction mask in test stage.
+
+        Args:
+            roi_feat (Variable): RoI feature from RoIExtractor.
+            bbox_pred (Variable): predicted bbox; its LoD is copied onto
+                the returned mask probabilities.
+
+        Returns:
+            mask_pred (Variable): Prediction mask with shape
+                [N, num_classes, resolution, resolution].
+        """
+        mask_logits = self._get_output(roi_feat)
+        mask_prob = fluid.layers.sigmoid(mask_logits)
+        mask_prob = fluid.layers.lod_reset(mask_prob, bbox_pred)
+        return mask_prob
diff --git a/ppdet/modeling/target_assigners.py b/ppdet/modeling/target_assigners.py
new file mode 100644
index 0000000000000000000000000000000000000000..57d7bd738d2ee730effa3406df04fddedf1e5cd8
--- /dev/null
+++ b/ppdet/modeling/target_assigners.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddle import fluid
+
+from ppdet.core.workspace import register
+from ppdet.modeling.ops import BBoxAssigner, MaskAssigner
+
+__all__ = ['BBoxAssigner', 'MaskAssigner', 'CascadeBBoxAssigner']
+
+
+@register
+class CascadeBBoxAssigner(object):
+    """
+    Per-stage proposal label generator for Cascade RCNN.
+
+    Wraps `fluid.layers.generate_proposal_labels`, picking the thresholds
+    and regression weights of the requested cascade stage.
+
+    Args:
+        batch_size_per_im (int): forwarded as `batch_size_per_im`
+        fg_fraction (float): forwarded as `fg_fraction`
+        fg_thresh (list): per-stage foreground threshold
+        bg_thresh_hi (list): per-stage upper background threshold
+        bg_thresh_lo (list): per-stage lower background threshold
+        bbox_reg_weights (list): per-stage regression weight w; the layer
+            receives [1/w, 1/w, 2/w, 2/w]
+        num_classes (int): stored as `class_nums`; not forwarded, the
+            layer is always called with `class_nums=2` and
+            `is_cls_agnostic=True`
+        shuffle_before_sample (bool): forwarded as `use_random`
+    """
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 batch_size_per_im=512,
+                 fg_fraction=.25,
+                 fg_thresh=[0.5, 0.6, 0.7],
+                 bg_thresh_hi=[0.5, 0.6, 0.7],
+                 bg_thresh_lo=[0., 0., 0.],
+                 bbox_reg_weights=[10, 20, 30],
+                 num_classes=81,
+                 shuffle_before_sample=True):
+        super(CascadeBBoxAssigner, self).__init__()
+        self.batch_size_per_im = batch_size_per_im
+        self.fg_fraction = fg_fraction
+        self.fg_thresh = fg_thresh
+        self.bg_thresh_hi = bg_thresh_hi
+        self.bg_thresh_lo = bg_thresh_lo
+        self.bbox_reg_weights = bbox_reg_weights
+        self.class_nums = num_classes
+        self.use_random = shuffle_before_sample
+
+    def __call__(self, input_rois, feed_vars, curr_stage):
+        """
+        Generate proposal labels for one cascade stage.
+
+        Args:
+            input_rois (Variable): RoIs fed to this stage.
+            feed_vars (dict): must provide 'gt_label', 'is_crowd',
+                'gt_box' and 'im_info'.
+            curr_stage (int): zero-based stage index used to select the
+                per-stage settings.
+
+        Returns:
+            The outputs of `fluid.layers.generate_proposal_labels`.
+        """
+        stage_weight = self.bbox_reg_weights[curr_stage]
+        curr_bbox_reg_w = [
+            1. / stage_weight,
+            1. / stage_weight,
+            2. / stage_weight,
+            2. / stage_weight,
+        ]
+        outs = fluid.layers.generate_proposal_labels(
+            rpn_rois=input_rois,
+            gt_classes=feed_vars['gt_label'],
+            is_crowd=feed_vars['is_crowd'],
+            gt_boxes=feed_vars['gt_box'],
+            im_info=feed_vars['im_info'],
+            batch_size_per_im=self.batch_size_per_im,
+            # fg_fraction used to be stored but never forwarded, so a
+            # configured value was silently ignored.
+            fg_fraction=self.fg_fraction,
+            fg_thresh=self.fg_thresh[curr_stage],
+            bg_thresh_hi=self.bg_thresh_hi[curr_stage],
+            bg_thresh_lo=self.bg_thresh_lo[curr_stage],
+            bbox_reg_weights=curr_bbox_reg_w,
+            use_random=self.use_random,
+            class_nums=2,
+            is_cls_agnostic=True,
+            # only stages after the first are flagged as cascade stages
+            is_cascade_rcnn=bool(curr_stage > 0))
+        return outs
diff --git a/ppdet/modeling/tests/__init__.py b/ppdet/modeling/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ed0ecf10ec4cad807ebb6df1590de65eeeab1e
--- /dev/null
+++ b/ppdet/modeling/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/ppdet/modeling/tests/decorator_helper.py b/ppdet/modeling/tests/decorator_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..894833ce15eab82ea06c2e66a8e53cb2e7e057b5
--- /dev/null
+++ b/ppdet/modeling/tests/decorator_helper.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools

import paddle.fluid as fluid

__all__ = ['prog_scope']


def prog_scope():
    """
    Build a decorator that isolates the wrapped callable's fluid state.

    Each call of the decorated function runs with a newly created main
    `fluid.Program`, a newly created startup `fluid.Program`, a new
    `fluid.core.Scope` and inside `fluid.unique_name.guard()`.

    Returns:
        a decorator; the decorated function keeps the original function's
        name and docstring and returns whatever the original returns.
    """

    def __impl__(fn):
        # Keep __name__/__doc__ of the wrapped function so test reports and
        # tracebacks show the real test name instead of `__fn__`.
        @functools.wraps(fn)
        def __fn__(*args, **kwargs):
            prog = fluid.Program()
            startup_prog = fluid.Program()
            scope = fluid.core.Scope()
            with fluid.scope_guard(scope):
                with fluid.program_guard(prog, startup_prog):
                    with fluid.unique_name.guard():
                        # Hand the result back to the caller instead of
                        # silently dropping it.
                        return fn(*args, **kwargs)

        return __fn__

    return __impl__

# diff --git a/ppdet/modeling/tests/test_architectures.py b/ppdet/modeling/tests/test_architectures.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..7df9580666858a834506a9a0beac742e548266f5
# --- /dev/null
# +++ b/ppdet/modeling/tests/test_architectures.py
# @@ -0,0 +1,79 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid as fluid + +from ppdet.modeling.tests.decorator_helper import prog_scope +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.modeling.model_input import create_feed + + +class TestFasterRCNN(unittest.TestCase): + def setUp(self): + self.set_config() + self.cfg = load_config(self.cfg_file) + self.detector_type = self.cfg['architecture'] + + def set_config(self): + self.cfg_file = 'configs/faster_rcnn_r50_1x.yml' + + @prog_scope() + def test_train(self): + train_feed = create(self.cfg['train_feed']) + model = create(self.detector_type) + _, feed_vars = create_feed(train_feed) + train_fetches = model.train(feed_vars) + + @prog_scope() + def test_test(self): + test_feed = create(self.cfg['eval_feed']) + model = create(self.detector_type) + _, feed_vars = create_feed(test_feed) + test_fetches = model.eval(feed_vars) + + +class TestMaskRCNN(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/mask_rcnn_r50_1x.yml' + + +class TestCascadeRCNN(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/cascade_rcnn_r50_fpn_1x.yml' + + +class TestYolov3(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/yolov3_darknet.yml' + + +class TestRetinaNet(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/retinanet_r50_fpn_1x.yml' + + +class TestSSD(TestFasterRCNN): + def set_config(self): + self.cfg_file = 'configs/ssd/ssd_mobilenet_v1_voc.yml' + + +if __name__ == '__main__': + unittest.main() diff --git a/ppdet/optimizer.py b/ppdet/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e695aad043b8778128f1b6870a27d2b0b3fe6adc --- /dev/null +++ b/ppdet/optimizer.py @@ -0,0 +1,141 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging + +from paddle import fluid + +import paddle.fluid.optimizer as optimizer +import paddle.fluid.regularizer as regularizer + +from ppdet.core.workspace import register, serializable + +__all__ = ['LearningRate', 'OptimizerBuilder'] + +logger = logging.getLogger(__name__) + + +@serializable +class PiecewiseDecay(object): + """ + Multi step learning rate decay + + Args: + gamma (float): decay factor + milestones (list): steps at which to decay learning rate + """ + + def __init__(self, gamma=0.1, milestones=[60000, 80000], values=None): + super(PiecewiseDecay, self).__init__() + self.gamma = gamma + self.milestones = milestones + self.values = values + + def __call__(self, base_lr=None, learning_rate=None): + if self.values is not None: + return fluid.layers.piecewise_decay(self.milestones, self.values) + assert base_lr is not None, "either base LR or values should be provided" + values = [base_lr] + lr = base_lr + for _ in self.milestones: + lr *= self.gamma + values.append(lr) + return fluid.layers.piecewise_decay(self.milestones, values) + + +@serializable +class LinearWarmup(object): + """ + Warm up learning rate linearly + + Args: + steps (int): warm up steps + start_factor (float): initial learning rate factor + """ + + def __init__(self, steps=500, start_factor=1. 
/ 3): + super(LinearWarmup, self).__init__() + self.steps = steps + self.start_factor = start_factor + + def __call__(self, base_lr, learning_rate): + start_lr = base_lr * self.start_factor + + return fluid.layers.linear_lr_warmup( + learning_rate=learning_rate, + warmup_steps=self.steps, + start_lr=start_lr, + end_lr=base_lr) + + +@register +class LearningRate(object): + """ + Learning Rate configuration + + Args: + base_lr (float): base learning rate + schedulers (list): learning rate schedulers + """ + __category__ = 'optim' + + def __init__(self, + base_lr=0.01, + schedulers=[PiecewiseDecay(), LinearWarmup()]): + super(LearningRate, self).__init__() + self.base_lr = base_lr + self.schedulers = schedulers + + def __call__(self): + lr = None + for sched in self.schedulers: + lr = sched(self.base_lr, lr) + return lr + + +@register +class OptimizerBuilder(): + """ + Build optimizer handles + + Args: + regularizer (object): an `Regularizer` instance + optimizer (object): an `Optimizer` instance + """ + __category__ = 'optim' + + def __init__(self, + regularizer={'type': 'L2', + 'factor': .0001}, + optimizer={'type': 'Momentum', + 'momentum': .9}): + self.regularizer = regularizer + self.optimizer = optimizer + + def __call__(self, learning_rate): + reg_type = self.regularizer['type'] + 'Decay' + reg_factor = self.regularizer['factor'] + regularization = getattr(regularizer, reg_type)(reg_factor) + + optim_args = self.optimizer.copy() + optim_type = optim_args['type'] + del optim_args['type'] + op = getattr(optimizer, optim_type) + return op(learning_rate=learning_rate, + regularization=regularization, + **optim_args) diff --git a/ppdet/utils/__init__.py b/ppdet/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/ppdet/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppdet/utils/check.py b/ppdet/utils/check.py new file mode 100644 index 0000000000000000000000000000000000000000..9e816eaadcbf21b48e80ab6e607bff97269e7c39 --- /dev/null +++ b/ppdet/utils/check.py @@ -0,0 +1,47 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys + +import paddle.fluid as fluid + +import logging +logger = logging.getLogger(__name__) + +__all__ = ['check_gpu'] + + +def check_gpu(use_gpu): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config use_gpu cannot be set as true while you are " \ + "using paddlepaddle cpu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ + "\t2. 
Set use_gpu as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and not fluid.is_compiled_with_cuda(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + diff --git a/ppdet/utils/checkpoint.py b/ppdet/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..54c364812b041280148a1d0b85543209dc449724 --- /dev/null +++ b/ppdet/utils/checkpoint.py @@ -0,0 +1,291 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import errno +import os +import shutil +import time +import numpy as np +import re +import paddle.fluid as fluid + +from .download import get_weights_path + +import logging +logger = logging.getLogger(__name__) + +__all__ = [ + 'load_checkpoint', + 'load_and_fusebn', + 'load_params', + 'save', +] + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. 
+ """ + return path.startswith('http://') or path.startswith('https://') + + +def _get_weight_path(path): + env = os.environ + if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: + trainer_id = int(env['PADDLE_TRAINER_ID']) + num_trainers = int(env['PADDLE_TRAINERS_NUM']) + if num_trainers <= 1: + path = get_weights_path(path) + else: + from ppdet.utils.download import map_path, WEIGHTS_HOME + weight_path = map_path(path, WEIGHTS_HOME) + lock_path = weight_path + '.lock' + if not os.path.exists(weight_path): + try: + os.makedirs(os.path.dirname(weight_path)) + except OSError as e: + if e.errno != errno.EEXIST: + raise + with open(lock_path, 'w'): # touch + os.utime(lock_path, None) + if trainer_id == 0: + get_weights_path(path) + os.remove(lock_path) + else: + while os.path.exists(lock_path): + time.sleep(1) + path = weight_path + else: + path = get_weights_path(path) + return path + + +def load_params(exe, prog, path, ignore_params=[]): + """ + Load model from the given path. + Args: + exe (fluid.Executor): The fluid.Executor object. + prog (fluid.Program): load weight to which Program object. + path (string): URL string or loca model path. + ignore_params (bool): ignore variable to load when finetuning. 
+ It can be specified by finetune_exclude_pretrained_params + and the usage can refer to docs/TRANSFER_LEARNING.md + """ + + if is_url(path): + path = _get_weight_path(path) + + if not os.path.exists(path): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) + + logger.info('Loading parameters from {}...'.format(path)) + + def _if_exist(var): + do_ignore = False + param_exist = os.path.exists(os.path.join(path, var.name)) + if len(ignore_params) > 0: + # Parameter related to num_classes will be ignored in finetuning + do_ignore_list = [ + bool(re.match(name, var.name)) for name in ignore_params + ] + do_ignore = any(do_ignore_list) + if do_ignore and param_exist: + logger.info('In load_params, ignore {}'.format(var.name)) + do_load = param_exist and not do_ignore + if do_load: + logger.debug('load weight {}'.format(var.name)) + return do_load + + fluid.io.load_vars(exe, path, prog, predicate=_if_exist) + + +def load_checkpoint(exe, prog, path): + """ + Load model from the given path. + Args: + exe (fluid.Executor): The fluid.Executor object. + prog (fluid.Program): load weight to which Program object. + path (string): URL string or loca model path. + """ + if is_url(path): + path = _get_weight_path(path) + + if not os.path.exists(path): + raise ValueError("Model checkpoint path {} does not " + "exists.".format(path)) + + logger.info('Loading checkpoint from {}...'.format(path)) + fluid.io.load_persistables(exe, path, prog) + + +def global_step(scope=None): + """ + Load global step in scope. + Args: + scope (fluid.Scope): load global step from which scope. If None, + from default global_scope(). + + Returns: + global step: int. + """ + if scope is None: + scope = fluid.global_scope() + v = scope.find_var('@LR_DECAY_COUNTER@') + step = np.array(v.get_tensor())[0] if v else 0 + return step + + +def save(exe, prog, path): + """ + Load model from the given path. + Args: + exe (fluid.Executor): The fluid.Executor object. 
+ prog (fluid.Program): save weight from which Program object. + path (string): the path to save model. + """ + if os.path.isdir(path): + shutil.rmtree(path) + logger.info('Save model to {}.'.format(path)) + fluid.io.save_persistables(exe, path, prog) + + +def load_and_fusebn(exe, prog, path): + """ + Fuse params of batch norm to scale and bias. + + Args: + exe (fluid.Executor): The fluid.Executor object. + prog (fluid.Program): save weight from which Program object. + path (string): the path to save model. + """ + logger.info('Load model and fuse batch norm if have from {}...'.format( + path)) + + if is_url(path): + path = _get_weight_path(path) + + if not os.path.exists(path): + raise ValueError("Model path {} does not exists.".format(path)) + + def _if_exist(var): + b = os.path.exists(os.path.join(path, var.name)) + + if b: + logger.debug('load weight {}'.format(var.name)) + return b + + all_vars = list(filter(_if_exist, prog.list_vars())) + + # Since the program uses affine-channel, there is no running mean and var + # in the program, here append running mean and var. 
+ # NOTE, the params of batch norm should be like: + # x_scale + # x_offset + # x_mean + # x_variance + # x is any prefix + mean_variances = set() + bn_vars = [] + + bn_in_path = True + + inner_prog = fluid.Program() + inner_start_prog = fluid.Program() + inner_block = inner_prog.global_block() + with fluid.program_guard(inner_prog, inner_start_prog): + for block in prog.blocks: + ops = list(block.ops) + if not bn_in_path: + break + for op in ops: + if op.type == 'affine_channel': + # remove 'scale' as prefix + scale_name = op.input('Scale')[0] # _scale + bias_name = op.input('Bias')[0] # _offset + prefix = scale_name[:-5] + mean_name = prefix + 'mean' + variance_name = prefix + 'variance' + + if not os.path.exists(os.path.join(path, mean_name)): + bn_in_path = False + break + if not os.path.exists(os.path.join(path, variance_name)): + bn_in_path = False + break + + bias = block.var(bias_name) + + mean_vb = inner_block.create_var( + name=mean_name, + type=bias.type, + shape=bias.shape, + dtype=bias.dtype, + persistable=True) + variance_vb = inner_block.create_var( + name=variance_name, + type=bias.type, + shape=bias.shape, + dtype=bias.dtype, + persistable=True) + + mean_variances.add(mean_vb) + mean_variances.add(variance_vb) + + bn_vars.append( + [scale_name, bias_name, mean_name, variance_name]) + + if not bn_in_path: + fluid.io.load_vars(exe, path, prog, vars=all_vars) + logger.warning( + "There is no paramters of batch norm in model {}. " + "Skip to fuse batch norm. And load paramters done.".format(path)) + return + + # load running mean and running variance on cpu place into global scope. + place = fluid.CPUPlace() + exe_cpu = fluid.Executor(place) + fluid.io.load_vars(exe_cpu, path, vars=[v for v in mean_variances]) + + # load params on real place into global scope. 
+ fluid.io.load_vars(exe, path, prog, vars=all_vars) + + eps = 1e-5 + for names in bn_vars: + scale_name, bias_name, mean_name, var_name = names + + scale = fluid.global_scope().find_var(scale_name).get_tensor() + bias = fluid.global_scope().find_var(bias_name).get_tensor() + mean = fluid.global_scope().find_var(mean_name).get_tensor() + var = fluid.global_scope().find_var(var_name).get_tensor() + + scale_arr = np.array(scale) + bias_arr = np.array(bias) + mean_arr = np.array(mean) + var_arr = np.array(var) + + bn_std = np.sqrt(np.add(var_arr, eps)) + new_scale = np.float32(np.divide(scale_arr, bn_std)) + new_bias = bias_arr - mean_arr * new_scale + + # fuse to scale and bias in affine_channel + scale.set(new_scale, exe.place) + bias.set(new_bias, exe.place) diff --git a/ppdet/utils/cli.py b/ppdet/utils/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..b8ba59d78f1ddf606012fd0cf6d71a71d79eea05 --- /dev/null +++ b/ppdet/utils/cli.py @@ -0,0 +1,151 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from argparse import ArgumentParser, RawDescriptionHelpFormatter + +import yaml +import re +from ppdet.core.workspace import get_registered_modules, dump_value + +__all__ = ['ColorTTY', 'ArgsParser'] + + +class ColorTTY(object): + def __init__(self): + super(ColorTTY, self).__init__() + self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan'] + + def __getattr__(self, attr): + if attr in self.colors: + color = self.colors.index(attr) + 31 + + def color_message(message): + return "[{}m{}".format(color, message) + + setattr(self, attr, color_message) + return color_message + + def bold(self, message): + return self.with_code('01', message) + + def with_code(self, code, message): + return "[{}m{}".format(code, message) + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument("-c", "--config", help="configuration file to use") + self.add_argument( + "-o", "--opt", nargs='*', help="set configuration options") + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + assert args.config is not None, \ + "Please specify --config=configure_file_path." + args.opt = self._parse_opt(args.opt) + return args + + def _parse_opt(self, opts): + config = {} + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=', 1) + if '.' 
not in k: + config[k] = yaml.load(v, Loader=yaml.Loader) + else: + keys = k.split('.') + if keys[0] not in config: + config[keys[0]] = {} + cur = config[keys[0]] + for idx, key in enumerate(keys[1:]): + if idx == len(keys) - 2: + cur[key] = yaml.load(v, Loader=yaml.Loader) + else: + cur[key] = {} + cur = cur[key] + return config + + +def print_total_cfg(config): + modules = get_registered_modules() + color_tty = ColorTTY() + green = '___{}___'.format(color_tty.colors.index('green') + 31) + + styled = {} + for key in config.keys(): + if not config[key]: # empty schema + continue + + if key not in modules and not hasattr(config[key], '__dict__'): + styled[key] = config[key] + continue + elif key in modules: + module = modules[key] + else: + type_name = type(config[key]).__name__ + if type_name in modules: + module = modules[type_name].copy() + module.update({ + k: v + for k, v in config[key].__dict__.items() + if k in module.schema + }) + key += " ({})".format(type_name) + default = module.find_default_keys() + missing = module.find_missing_keys() + mismatch = module.find_mismatch_keys() + extra = module.find_extra_keys() + dep_missing = [] + for dep in module.inject: + if isinstance(module[dep], str) and module[dep] != '': + if module[dep] not in modules: # not a valid module + dep_missing.append(dep) + else: + dep_mod = modules[module[dep]] + # empty dict but mandatory + if not dep_mod and dep_mod.mandatory(): + dep_missing.append(dep) + override = list( + set(module.keys()) - set(default) - set(extra) - set(dep_missing)) + replacement = {} + for name in set(override + default + extra + mismatch + missing): + new_name = name + if name in missing: + value = "" + else: + value = module[name] + + if name in extra: + value = dump_value(value) + " " + elif name in mismatch: + value = dump_value(value) + " " + elif name in dep_missing: + value = dump_value(value) + " " + elif name in override and value != '': + mark = green + new_name = mark + name + 
replacement[new_name] = value + styled[key] = replacement + buffer = yaml.dump(styled, default_flow_style=False, default_style='') + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", r"[33m[0m", buffer)) + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", + r"[31m[0m", buffer)) + buffer = re.sub(r"___(\d+)___(.*?):", r"[\1m\2[0m:", buffer) + print(buffer) diff --git a/ppdet/utils/coco_eval.py b/ppdet/utils/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..cb5df475fb16f10fffa18a02920687780a3fe298 --- /dev/null +++ b/ppdet/utils/coco_eval.py @@ -0,0 +1,560 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import json +import cv2 +import numpy as np +import matplotlib +matplotlib.use('Agg') + +import logging +logger = logging.getLogger(__name__) + +__all__ = [ + 'bbox_eval', + 'mask_eval', + 'bbox2out', + 'mask2out', + 'get_category_info', + 'proposal_eval', + 'cocoapi_eval', +] + + +def clip_bbox(bbox): + xmin = max(min(bbox[0], 1.), 0.) + ymin = max(min(bbox[1], 1.), 0.) + xmax = max(min(bbox[2], 1.), 0.) + ymax = max(min(bbox[3], 1.), 0.) 
+ return xmin, ymin, xmax, ymax + + +def proposal_eval(results, anno_file, outfile, max_dets=(100, 300, 1000)): + assert 'proposal' in results[0] + assert outfile.endswith('.json') + + xywh_results = proposal2out(results) + assert len( + xywh_results) > 0, "The number of valid proposal detected is zero.\n \ + Please use reasonable model and check input data." + + with open(outfile, 'w') as f: + json.dump(xywh_results, f) + + cocoapi_eval(outfile, 'proposal', anno_file=anno_file, max_dets=max_dets) + # flush coco evaluation result + sys.stdout.flush() + + +def bbox_eval(results, + anno_file, + outfile, + with_background=True, + is_bbox_normalized=False): + assert 'bbox' in results[0] + assert outfile.endswith('.json') + from pycocotools.coco import COCO + + coco_gt = COCO(anno_file) + cat_ids = coco_gt.getCatIds() + + # when with_background = True, mapping category to classid, like: + # background:0, first_class:1, second_class:2, ... + clsid2catid = dict( + {i + int(with_background): catid + for i, catid in enumerate(cat_ids)}) + + xywh_results = bbox2out( + results, clsid2catid, is_bbox_normalized=is_bbox_normalized) + + if len(xywh_results) == 0: + logger.warning("The number of valid bbox detected is zero.\n \ + Please use reasonable model and check input data.\n \ + stop eval!") + return [0.0] + with open(outfile, 'w') as f: + json.dump(xywh_results, f) + + map_stats = cocoapi_eval(outfile, 'bbox', coco_gt=coco_gt) + # flush coco evaluation result + sys.stdout.flush() + return map_stats + + +def mask_eval(results, anno_file, outfile, resolution, thresh_binarize=0.5): + assert 'mask' in results[0] + assert outfile.endswith('.json') + from pycocotools.coco import COCO + + coco_gt = COCO(anno_file) + clsid2catid = {i + 1: v for i, v in enumerate(coco_gt.getCatIds())} + + segm_results = mask2out(results, clsid2catid, resolution, thresh_binarize) + if len(segm_results) == 0: + logger.warning("The number of valid mask detected is zero.\n \ + Please use reasonable 
model and check input data.") + return + + with open(outfile, 'w') as f: + json.dump(segm_results, f) + + cocoapi_eval(outfile, 'segm', coco_gt=coco_gt) + + +def cocoapi_eval(jsonfile, + style, + coco_gt=None, + anno_file=None, + max_dets=(100, 300, 1000)): + """ + Args: + jsonfile: Evaluation json file, eg: bbox.json, mask.json. + style: COCOeval style, can be `bbox` , `segm` and `proposal`. + coco_gt: Whether to load COCOAPI through anno_file, + eg: coco_gt = COCO(anno_file) + anno_file: COCO annotations file. + max_dets: COCO evaluation maxDets. + """ + assert coco_gt != None or anno_file != None + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + if coco_gt == None: + coco_gt = COCO(anno_file) + logger.info("Start evaluate...") + coco_dt = coco_gt.loadRes(jsonfile) + if style == 'proposal': + coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') + coco_eval.params.useCats = 0 + coco_eval.params.maxDets = list(max_dets) + else: + coco_eval = COCOeval(coco_gt, coco_dt, style) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + return coco_eval.stats + + +def proposal2out(results, is_bbox_normalized=False): + xywh_res = [] + for t in results: + bboxes = t['proposal'][0] + lengths = t['proposal'][1][0] + im_ids = np.array(t['im_id'][0]) + if bboxes.shape == (1, 1) or bboxes is None: + continue + + k = 0 + for i in range(len(lengths)): + num = lengths[i] + im_id = int(im_ids[i][0]) + for j in range(num): + dt = bboxes[k] + xmin, ymin, xmax, ymax = dt.tolist() + + if is_bbox_normalized: + xmin, ymin, xmax, ymax = \ + clip_bbox([xmin, ymin, xmax, ymax]) + w = xmax - xmin + h = ymax - ymin + else: + w = xmax - xmin + 1 + h = ymax - ymin + 1 + + bbox = [xmin, ymin, w, h] + coco_res = { + 'image_id': im_id, + 'category_id': 1, + 'bbox': bbox, + 'score': 1.0 + } + xywh_res.append(coco_res) + k += 1 + return xywh_res + + +def bbox2out(results, clsid2catid, is_bbox_normalized=False): + """ + Args: + results: request a dict, 
should include: `bbox`, `im_id`, + if is_bbox_normalized=True, also need `im_shape`. + clsid2catid: class id to category id map of COCO2017 dataset. + is_bbox_normalized: whether or not bbox is normalized. + """ + xywh_res = [] + for t in results: + bboxes = t['bbox'][0] + lengths = t['bbox'][1][0] + im_ids = np.array(t['im_id'][0]) + if bboxes.shape == (1, 1) or bboxes is None: + continue + + k = 0 + for i in range(len(lengths)): + num = lengths[i] + im_id = int(im_ids[i][0]) + for j in range(num): + dt = bboxes[k] + clsid, score, xmin, ymin, xmax, ymax = dt.tolist() + catid = (clsid2catid[int(clsid)]) + + if is_bbox_normalized: + xmin, ymin, xmax, ymax = \ + clip_bbox([xmin, ymin, xmax, ymax]) + w = xmax - xmin + h = ymax - ymin + im_height, im_width = t['im_shape'][0][i].tolist() + xmin *= im_width + ymin *= im_height + w *= im_width + h *= im_height + else: + w = xmax - xmin + 1 + h = ymax - ymin + 1 + + bbox = [xmin, ymin, w, h] + coco_res = { + 'image_id': im_id, + 'category_id': catid, + 'bbox': bbox, + 'score': score + } + xywh_res.append(coco_res) + k += 1 + return xywh_res + + +def mask2out(results, clsid2catid, resolution, thresh_binarize=0.5): + import pycocotools.mask as mask_util + scale = (resolution + 2.0) / resolution + + segm_res = [] + + # for each batch + for t in results: + bboxes = t['bbox'][0] + + lengths = t['bbox'][1][0] + im_ids = np.array(t['im_id'][0]) + if bboxes.shape == (1, 1) or bboxes is None: + continue + if len(bboxes.tolist()) == 0: + continue + + masks = t['mask'][0] + + s = 0 + # for each sample + for i in range(len(lengths)): + num = lengths[i] + im_id = int(im_ids[i][0]) + im_shape = t['im_shape'][0][i] + + bbox = bboxes[s:s + num][:, 2:] + clsid_scores = bboxes[s:s + num][:, 0:2] + mask = masks[s:s + num] + s += num + + im_h = int(im_shape[0]) + im_w = int(im_shape[1]) + + expand_bbox = expand_boxes(bbox, scale) + expand_bbox = expand_bbox.astype(np.int32) + + padded_mask = np.zeros( + (resolution + 2, resolution + 2), 
dtype=np.float32) + + for j in range(num): + xmin, ymin, xmax, ymax = expand_bbox[j].tolist() + clsid, score = clsid_scores[j].tolist() + clsid = int(clsid) + padded_mask[1:-1, 1:-1] = mask[j, clsid, :, :] + + catid = clsid2catid[clsid] + + w = xmax - xmin + 1 + h = ymax - ymin + 1 + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + resized_mask = cv2.resize(padded_mask, (w, h)) + resized_mask = np.array( + resized_mask > thresh_binarize, dtype=np.uint8) + im_mask = np.zeros((im_h, im_w), dtype=np.uint8) + + x0 = min(max(xmin, 0), im_w) + x1 = min(max(xmax + 1, 0), im_w) + y0 = min(max(ymin, 0), im_h) + y1 = min(max(ymax + 1, 0), im_h) + + im_mask[y0:y1, x0:x1] = resized_mask[(y0 - ymin):(y1 - ymin), ( + x0 - xmin):(x1 - xmin)] + segm = mask_util.encode( + np.array( + im_mask[:, :, np.newaxis], order='F'))[0] + catid = clsid2catid[clsid] + segm['counts'] = segm['counts'].decode('utf8') + coco_res = { + 'image_id': im_id, + 'category_id': catid, + 'segmentation': segm, + 'score': score + } + segm_res.append(coco_res) + return segm_res + + +def expand_boxes(boxes, scale): + """ + Expand an array of boxes by a given scale. 
+ """ + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = np.zeros(boxes.shape) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + + return boxes_exp + + +def get_category_info(anno_file=None, + with_background=True, + use_default_label=False): + if use_default_label or anno_file is None \ + or not os.path.exists(anno_file): + logger.info("Not found annotation file {}, load " + "coco17 categories.".format(anno_file)) + return coco17_category_info(with_background) + else: + logger.info("Load categories from {}".format(anno_file)) + return get_category_info_from_anno(anno_file, with_background) + + +def get_category_info_from_anno(anno_file, with_background=True): + """ + Get class id to category id map and category id + to category name map from annotation file. + + Args: + anno_file (str): annotation file path + with_background (bool, default True): + whether load background as class 0. + """ + from pycocotools.coco import COCO + coco = COCO(anno_file) + cats = coco.loadCats(coco.getCatIds()) + clsid2catid = { + i + int(with_background): cat['id'] + for i, cat in enumerate(cats) + } + catid2name = {cat['id']: cat['name'] for cat in cats} + + return clsid2catid, catid2name + + +def coco17_category_info(with_background=True): + """ + Get class id to category id map and category id + to category name map of COCO2017 dataset + + Args: + with_background (bool, default True): + whether load background as class 0. 
+ """ + clsid2catid = { + 1: 1, + 2: 2, + 3: 3, + 4: 4, + 5: 5, + 6: 6, + 7: 7, + 8: 8, + 9: 9, + 10: 10, + 11: 11, + 12: 13, + 13: 14, + 14: 15, + 15: 16, + 16: 17, + 17: 18, + 18: 19, + 19: 20, + 20: 21, + 21: 22, + 22: 23, + 23: 24, + 24: 25, + 25: 27, + 26: 28, + 27: 31, + 28: 32, + 29: 33, + 30: 34, + 31: 35, + 32: 36, + 33: 37, + 34: 38, + 35: 39, + 36: 40, + 37: 41, + 38: 42, + 39: 43, + 40: 44, + 41: 46, + 42: 47, + 43: 48, + 44: 49, + 45: 50, + 46: 51, + 47: 52, + 48: 53, + 49: 54, + 50: 55, + 51: 56, + 52: 57, + 53: 58, + 54: 59, + 55: 60, + 56: 61, + 57: 62, + 58: 63, + 59: 64, + 60: 65, + 61: 67, + 62: 70, + 63: 72, + 64: 73, + 65: 74, + 66: 75, + 67: 76, + 68: 77, + 69: 78, + 70: 79, + 71: 80, + 72: 81, + 73: 82, + 74: 84, + 75: 85, + 76: 86, + 77: 87, + 78: 88, + 79: 89, + 80: 90 + } + + catid2name = { + 0: 'background', + 1: 'person', + 2: 'bicycle', + 3: 'car', + 4: 'motorcycle', + 5: 'airplane', + 6: 'bus', + 7: 'train', + 8: 'truck', + 9: 'boat', + 10: 'traffic light', + 11: 'fire hydrant', + 13: 'stop sign', + 14: 'parking meter', + 15: 'bench', + 16: 'bird', + 17: 'cat', + 18: 'dog', + 19: 'horse', + 20: 'sheep', + 21: 'cow', + 22: 'elephant', + 23: 'bear', + 24: 'zebra', + 25: 'giraffe', + 27: 'backpack', + 28: 'umbrella', + 31: 'handbag', + 32: 'tie', + 33: 'suitcase', + 34: 'frisbee', + 35: 'skis', + 36: 'snowboard', + 37: 'sports ball', + 38: 'kite', + 39: 'baseball bat', + 40: 'baseball glove', + 41: 'skateboard', + 42: 'surfboard', + 43: 'tennis racket', + 44: 'bottle', + 46: 'wine glass', + 47: 'cup', + 48: 'fork', + 49: 'knife', + 50: 'spoon', + 51: 'bowl', + 52: 'banana', + 53: 'apple', + 54: 'sandwich', + 55: 'orange', + 56: 'broccoli', + 57: 'carrot', + 58: 'hot dog', + 59: 'pizza', + 60: 'donut', + 61: 'cake', + 62: 'chair', + 63: 'couch', + 64: 'potted plant', + 65: 'bed', + 67: 'dining table', + 70: 'toilet', + 72: 'tv', + 73: 'laptop', + 74: 'mouse', + 75: 'remote', + 76: 'keyboard', + 77: 'cell phone', + 78: 'microwave', + 79: 
'oven', + 80: 'toaster', + 81: 'sink', + 82: 'refrigerator', + 84: 'book', + 85: 'clock', + 86: 'vase', + 87: 'scissors', + 88: 'teddy bear', + 89: 'hair drier', + 90: 'toothbrush' + } + + if not with_background: + clsid2catid = {k - 1: v for k, v in clsid2catid.items()} + + return clsid2catid, catid2name diff --git a/ppdet/utils/colormap.py b/ppdet/utils/colormap.py new file mode 100644 index 0000000000000000000000000000000000000000..566185ef90390e2f45747bee48e3b008f7dfc0e9 --- /dev/null +++ b/ppdet/utils/colormap.py @@ -0,0 +1,56 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + + +def colormap(rgb=False): + """ + Get colormap + """ + color_list = np.array([ + 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, + 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, + 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, + 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, + 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, + 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, + 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, + 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, + 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, + 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, + 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, + 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, + 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, + 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, + 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 + ]).astype(np.float32) + 
color_list = color_list.reshape((-1, 3)) * 255 + if not rgb: + color_list = color_list[:, ::-1] + return color_list diff --git a/ppdet/utils/dist_utils.py b/ppdet/utils/dist_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..32eead4a797ba70cb6980e0368ff9873102680c2 --- /dev/null +++ b/ppdet/utils/dist_utils.py @@ -0,0 +1,41 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +import os + +import paddle.fluid as fluid + + +def nccl2_prepare(trainer_id, startup_prog, main_prog): + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile( + trainer_id, + trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'), + current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'), + startup_program=startup_prog, + program=main_prog) + + +def prepare_for_multi_process(exe, build_strategy, startup_prog, main_prog): + trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + if num_trainers < 2: + return + build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id + nccl2_prepare(trainer_id, startup_prog, main_prog) diff --git a/ppdet/utils/download.py b/ppdet/utils/download.py new file mode 100644 index 0000000000000000000000000000000000000000..473cf5ff8fb72a0b203241208171e769f6ba244e --- 
/dev/null +++ b/ppdet/utils/download.py @@ -0,0 +1,355 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import shutil +import requests +import tqdm +import hashlib +import tarfile +import zipfile + +from .voc_utils import merge_and_create_list + +import logging +logger = logging.getLogger(__name__) + +__all__ = ['get_weights_path', 'get_dataset_path'] + +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") +DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") + +# dict of {dataset_name: (download_info, sub_dirs)} +# download info: (url, md5sum) +DATASETS = { + 'coco': ([ + ( + 'http://images.cocodataset.org/zips/train2017.zip', + 'cced6f7f71b7629ddf16f17bbcfab6b2', ), + ( + 'http://images.cocodataset.org/zips/val2017.zip', + '442b8da7639aecaf257c1dceb8ba8c80', ), + ( + 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', + 'f4bbac642086de4f52a3fdda2de5fa2c', ), + ], ["annotations", "train2017", "val2017"]), + 'voc': ([ + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', + '6cd6e144f989b92b3379bac3b3de84fd', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', + 'c52e279531787c972589f7e41ab4ae64', ), + ( + 
'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', + 'b6e924de25625d8de591ea690078ad9f', ), + ], ["VOCdevkit/VOC_all"]), + 'wider_face': ([ + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', + '3fedf70df600953d25982bcd13d91ba2', ), + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', + 'dfa7d7e790efa35df3788964cf0bbaea', ), + ( + 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', + 'a4a898d6193db4b9ef3260a68bad0dc7', ), + ], ["WIDER_train", "WIDER_val", "wider_face_split"]), + 'fruit': ([ + ( + 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar', + '374554a7633b1b68d6a5fbb7c061b8ba', ), + ], ["fruit-detection"]), +} + +DOWNLOAD_RETRY_LIMIT = 3 + + +def get_weights_path(url): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + """ + return get_path(url, WEIGHTS_HOME) + + +def get_dataset_path(path, annotation, image_dir): + """ + If path exists, return path. + Otherwise, get dataset path from DATASET_HOME, if not exists, + download it. 
+ """ + if _dataset_exists(path, annotation, image_dir): + return path + + logger.info("Dataset {} is not valid for reason above, try searching {} or " + "downloading dataset...".format( + osp.realpath(path), DATASET_HOME)) + + for name, dataset in DATASETS.items(): + if os.path.split(path.strip().lower())[-1] == name: + logger.info("Parse dataset_dir {} as dataset " + "{}".format(path, name)) + data_dir = osp.join(DATASET_HOME, name) + + # For voc, only check merged dir VOC_all + if name == 'voc': + check_dir = osp.join(data_dir, dataset[1][0]) + if osp.exists(check_dir): + logger.info("Found {}".format(check_dir)) + return data_dir + + for url, md5sum in dataset[0]: + get_path(url, data_dir, md5sum) + + # voc should merge dir and create list after download + if name == 'voc': + _merge_voc_dir(data_dir, dataset[1][0]) + return data_dir + + # not match any dataset in DATASETS + raise ValueError("Dataset {} is not valid and cannot parse dataset type " + "'{}' for automaticly downloading, which only supports " + "'voc' and 'coco' currently".format(path, + osp.split(path)[-1])) + + +def _merge_voc_dir(data_dir, output_subdir): + logger.info("Download voc dataset successed, merge " + "VOC2007 and VOC2012 to VOC_all...") + output_dir = osp.join(data_dir, output_subdir) + devkit_dir = "/".join(output_dir.split('/')[:-1]) + years = ['2007', '2012'] + # merge dir in output_tmp_dir at first, move to + # output_dir after merge sucessed. + output_tmp_dir = osp.join(data_dir, 'tmp') + if osp.isdir(output_tmp_dir): + shutil.rmtree(output_tmp_dir) + # NOTE: since using auto download VOC + # dataset, VOC default label list should be used, + # do not generate label_list.txt here. 
For default + # label, see ../data/source/voc_loader.py + merge_and_create_list(devkit_dir, years, output_tmp_dir) + shutil.move(output_tmp_dir, output_dir) + # remove source directory VOC2007 and VOC2012 + shutil.rmtree(osp.join(devkit_dir, "VOC2007")) + shutil.rmtree(osp.join(devkit_dir, "VOC2012")) + + +def map_path(url, root_dir): + # parse path after download to decompress under root_dir + fname = url.split('/')[-1] + zip_formats = ['.zip', '.tar', '.gz'] + fpath = fname + for zip_format in zip_formats: + fpath = fpath.replace(zip_format, '') + return osp.join(root_dir, fpath) + + +def get_path(url, root_dir, md5sum=None): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + """ + # parse path after download to decompress under root_dir + fullpath = map_path(url, root_dir) + + # For same zip file, decompressed directory name different + # from zip file name, rename by following map + decompress_name_map = { + "VOC": "VOCdevkit/VOC_all", + "annotations_trainval": "annotations" + } + for k, v in decompress_name_map.items(): + if fullpath.find(k) >= 0: + fullpath = '/'.join(fullpath.split('/')[:-1] + [v]) + + if osp.exists(fullpath): + logger.info("Found {}".format(fullpath)) + else: + fullname = _download(url, root_dir, md5sum) + _decompress(fullname) + + return fullpath + + +def download_dataset(path, dataset=None): + if dataset not in DATASETS.keys(): + logger.error("Unknown dataset {}, it should be " + "{}".format(dataset, DATASETS.keys())) + return + dataset_info = DATASETS[dataset][0] + for info in dataset_info: + get_path(info[0], path, info[1]) + if dataset == 'voc': + _merge_voc_dir(path, DATASETS[dataset][1][0]) + logger.info("Download dataset {} 
finished.".format(dataset)) + + +def _dataset_exists(path, annotation, image_dir): + """ + Check if user define dataset exists + """ + if not osp.exists(path): + logger.info("Config dataset_dir {} is not exits, " + "dataset config is not valid".format(path)) + return False + + if annotation: + annotation_path = osp.join(path, annotation) + if not osp.isfile(annotation_path): + logger.info("Config annotation {} is not a " + "file, dataset config is not " + "valid".format(annotation_path)) + return False + if image_dir: + image_path = osp.join(path, image_dir) + if not osp.isdir(image_path): + logger.info("Config image_dir {} is not a " + "directory, dataset config is not " + "valid".format(image_path)) + return False + return True + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = url.split('/')[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. 
" + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+ fpath = '/'.join(fname.split('/')[:-1]) + fpath_tmp = osp.join(fpath, 'tmp') + if osp.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + for f in os.listdir(fpath_tmp): + src_dir = osp.join(fpath_tmp, f) + dst_dir = osp.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not osp.exists(dst): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = osp.join(src, fp) + dst_fp = osp.join(dst, fp) + if osp.isdir(src_fp): + if osp.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif osp.isfile(src_fp) and \ + not osp.isfile(dst_fp): + shutil.move(src_fp, dst_fp) diff --git a/ppdet/utils/eval_utils.py b/ppdet/utils/eval_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dbded30197b46e12282d50991406a6585c0cb572 --- /dev/null +++ b/ppdet/utils/eval_utils.py @@ -0,0 +1,247 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import numpy as np +import os +import time + +import paddle.fluid as fluid + +from ppdet.utils.voc_eval import bbox_eval as voc_bbox_eval +from ppdet.utils.post_process import mstest_box_post_process, mstest_mask_post_process, box_flip + +__all__ = ['parse_fetches', 'eval_run', 'eval_results', 'json_eval_results'] + +logger = logging.getLogger(__name__) + + +def parse_fetches(fetches, prog=None, extra_keys=None): + """ + Parse fetch variable infos from model fetches, + values for fetch_list and keys for stat + """ + keys, values = [], [] + cls = [] + for k, v in fetches.items(): + if hasattr(v, 'name'): + keys.append(k) + v.persistable = True + values.append(v.name) + else: + cls.append(v) + + if prog is not None and extra_keys is not None: + for k in extra_keys: + try: + v = fluid.framework._get_var(k, prog) + keys.append(k) + values.append(v.name) + except Exception: + pass + + return keys, values, cls + + +def length2lod(length_lod): + offset_lod = [0] + for i in length_lod: + offset_lod.append(offset_lod[-1] + i) + return [offset_lod] + + +def get_sub_feed(input, place): + new_dict = {} + res_feed = {} + key_name = ['bbox', 'im_info', 'im_id', 'im_shape', 'bbox_flip'] + for k in key_name: + if k in input.keys(): + new_dict[k] = input[k] + for k in input.keys(): + if 'image' in k: + new_dict[k] = input[k] + for k, v in new_dict.items(): + data_t = fluid.LoDTensor() + data_t.set(v[0], place) + if 'bbox' in k: + lod = length2lod(v[1][0]) + data_t.set_lod(lod) + res_feed[k] = data_t + return res_feed + + +def clean_res(result, keep_name_list): + clean_result = {} + for k in result.keys(): + if k in keep_name_list: + clean_result[k] = result[k] + result.clear() + return clean_result + + +def eval_run(exe, + compile_program, + pyreader, + keys, + values, + cls, + cfg=None, + sub_prog=None, + sub_keys=None, + sub_values=None): + """ + 
Run evaluation program, return program outputs. + """ + iter_id = 0 + results = [] + if len(cls) != 0: + values = [] + for i in range(len(cls)): + _, accum_map = cls[i].get_map_var() + cls[i].reset(exe) + values.append(accum_map) + + images_num = 0 + start_time = time.time() + has_bbox = 'bbox' in keys + + try: + pyreader.start() + while True: + outs = exe.run(compile_program, + fetch_list=values, + return_numpy=False) + res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(keys, outs) + } + multi_scale_test = getattr(cfg, 'MultiScaleTEST', None) + mask_multi_scale_test = multi_scale_test and 'Mask' in cfg.architecture + + if multi_scale_test: + post_res = mstest_box_post_process(res, cfg) + res.update(post_res) + if mask_multi_scale_test: + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + sub_feed = get_sub_feed(res, place) + sub_prog_outs = exe.run(sub_prog, + feed=sub_feed, + fetch_list=sub_values, + return_numpy=False) + sub_prog_res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(sub_keys, sub_prog_outs) + } + post_res = mstest_mask_post_process(sub_prog_res, cfg) + res.update(post_res) + if multi_scale_test: + res = clean_res( + res, ['im_info', 'bbox', 'im_id', 'im_shape', 'mask']) + results.append(res) + if iter_id % 100 == 0: + logger.info('Test iter {}'.format(iter_id)) + iter_id += 1 + images_num += len(res['bbox'][1][0]) if has_bbox else 1 + except (StopIteration, fluid.core.EOFException): + pyreader.reset() + logger.info('Test finish iter {}'.format(iter_id)) + + end_time = time.time() + fps = images_num / (end_time - start_time) + if has_bbox: + logger.info('Total number of images: {}, inference time: {} fps.'. 
+ format(images_num, fps)) + else: + logger.info('Total iteration: {}, inference time: {} batch/s.'.format( + images_num, fps)) + + return results + + +def eval_results(results, + feed, + metric, + num_classes, + resolution=None, + is_bbox_normalized=False, + output_directory=None, + map_type='11point'): + """Evaluation for evaluation program results""" + box_ap_stats = [] + if metric == 'COCO': + from ppdet.utils.coco_eval import proposal_eval, bbox_eval, mask_eval + anno_file = getattr(feed.dataset, 'annotation', None) + with_background = getattr(feed, 'with_background', True) + if 'proposal' in results[0]: + output = 'proposal.json' + if output_directory: + output = os.path.join(output_directory, 'proposal.json') + proposal_eval(results, anno_file, output) + if 'bbox' in results[0]: + output = 'bbox.json' + if output_directory: + output = os.path.join(output_directory, 'bbox.json') + + box_ap_stats = bbox_eval( + results, + anno_file, + output, + with_background, + is_bbox_normalized=is_bbox_normalized) + + if 'mask' in results[0]: + output = 'mask.json' + if output_directory: + output = os.path.join(output_directory, 'mask.json') + mask_eval(results, anno_file, output, resolution) + else: + if 'accum_map' in results[-1]: + res = np.mean(results[-1]['accum_map'][0]) + logger.info('mAP: {:.2f}'.format(res * 100.)) + box_ap_stats.append(res * 100.) 
+ elif 'bbox' in results[0]: + box_ap = voc_bbox_eval( + results, + num_classes, + is_bbox_normalized=is_bbox_normalized, + map_type=map_type) + box_ap_stats.append(box_ap) + return box_ap_stats + + +def json_eval_results(feed, metric, json_directory=None): + """ + cocoapi eval with already exists proposal.json, bbox.json or mask.json + """ + assert metric == 'COCO' + from ppdet.utils.coco_eval import cocoapi_eval + anno_file = getattr(feed.dataset, 'annotation', None) + json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] + if json_directory: + assert os.path.exists( + json_directory), "The json directory:{} does not exist".format( + json_directory) + for k, v in enumerate(json_file_list): + json_file_list[k] = os.path.join(str(json_directory), v) + + coco_eval_style = ['proposal', 'bbox', 'segm'] + for i, v_json in enumerate(json_file_list): + if os.path.exists(v_json): + cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) + else: + logger.info("{} not exists!".format(v_json)) diff --git a/ppdet/utils/map_utils.py b/ppdet/utils/map_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fe0530596bb54920a81235ee6716e2d43918899a --- /dev/null +++ b/ppdet/utils/map_utils.py @@ -0,0 +1,214 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import numpy as np +import logging +logger = logging.getLogger(__name__) + +__all__ = ['bbox_area', 'jaccard_overlap', 'DetectionMAP'] + + +def bbox_area(bbox, is_bbox_normalized): + """ + Calculate area of a bounding box + """ + norm = 1. - float(is_bbox_normalized) + width = bbox[2] - bbox[0] + norm + height = bbox[3] - bbox[1] + norm + return width * height + + +def jaccard_overlap(pred, gt, is_bbox_normalized=False): + """ + Calculate jaccard overlap ratio between two bounding box + """ + if pred[0] >= gt[2] or pred[2] <= gt[0] or \ + pred[1] >= gt[3] or pred[3] <= gt[1]: + return 0. + inter_xmin = max(pred[0], gt[0]) + inter_ymin = max(pred[1], gt[1]) + inter_xmax = min(pred[2], gt[2]) + inter_ymax = min(pred[3], gt[3]) + inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], + is_bbox_normalized) + pred_size = bbox_area(pred, is_bbox_normalized) + gt_size = bbox_area(gt, is_bbox_normalized) + overlap = float(inter_size) / (pred_size + gt_size - inter_size) + return overlap + + +class DetectionMAP(object): + """ + Calculate detection mean average precision. + Currently support two types: 11point and integral + + Args: + class_num (int): the class number. + overlap_thresh (float): The threshold of overlap + ratio between prediction bounding box and + ground truth bounding box for deciding + true/false positive. Default 0.5. + map_type (str): calculation method of mean average + precision, currently support '11point' and + 'integral'. Default '11point'. + is_bbox_normalized (bool): whther bounding boxes + is normalized to range[0, 1]. Default False. + evaluate_difficult (bool): whether to evaluate + difficult bounding boxes. Default False. 
+ """ + + def __init__(self, + class_num, + overlap_thresh=0.5, + map_type='11point', + is_bbox_normalized=False, + evaluate_difficult=False): + self.class_num = class_num + self.overlap_thresh = overlap_thresh + assert map_type in ['11point', 'integral'], \ + "map_type currently only support '11point' "\ + "and 'integral'" + self.map_type = map_type + self.is_bbox_normalized = is_bbox_normalized + self.evaluate_difficult = evaluate_difficult + self.reset() + + def update(self, bbox, gt_box, gt_label, difficult=None): + """ + Update metric statics from given prediction and ground + truth infomations. + """ + if difficult is None: + difficult = np.zeros_like(gt_label) + + # record class gt count + for gtl, diff in zip(gt_label, difficult): + if self.evaluate_difficult or int(diff) == 0: + self.class_gt_counts[int(np.array(gtl))] += 1 + + # record class score positive + visited = [False] * len(gt_label) + for b in bbox: + label, score, xmin, ymin, xmax, ymax = b.tolist() + pred = [xmin, ymin, xmax, ymax] + max_idx = -1 + max_overlap = -1.0 + for i, gl in enumerate(gt_label): + if int(gl) == int(label): + overlap = jaccard_overlap(pred, gt_box[i], + self.is_bbox_normalized) + if overlap > max_overlap: + max_overlap = overlap + max_idx = i + + if max_overlap > self.overlap_thresh: + if self.evaluate_difficult or \ + int(np.array(difficult[max_idx])) == 0: + if not visited[max_idx]: + self.class_score_poss[int(label)].append([score, 1.0]) + visited[max_idx] = True + else: + self.class_score_poss[int(label)].append([score, 0.0]) + else: + self.class_score_poss[int(label)].append([score, 0.0]) + + def reset(self): + """ + Reset metric statics + """ + self.class_score_poss = [[] for _ in range(self.class_num)] + self.class_gt_counts = [0] * self.class_num + self.mAP = None + + def accumulate(self): + """ + Accumulate metric results and calculate mAP + """ + mAP = 0. 
+ valid_cnt = 0 + for score_pos, count in zip(self.class_score_poss, + self.class_gt_counts): + if count == 0 or len(score_pos) == 0: + continue + + accum_tp_list, accum_fp_list = \ + self._get_tp_fp_accum(score_pos) + precision = [] + recall = [] + for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): + precision.append(float(ac_tp) / (ac_tp + ac_fp)) + recall.append(float(ac_tp) / count) + + if self.map_type == '11point': + max_precisions = [0.] * 11 + start_idx = len(precision) - 1 + for j in range(10, -1, -1): + for i in range(start_idx, -1, -1): + if recall[i] < float(j) / 10.: + start_idx = i + if j > 0: + max_precisions[j - 1] = max_precisions[j] + break + else: + if max_precisions[j] < precision[i]: + max_precisions[j] = precision[i] + mAP += sum(max_precisions) / 11. + valid_cnt += 1 + elif self.map_type == 'integral': + import math + ap = 0. + prev_recall = 0. + for i in range(len(precision)): + recall_gap = math.fabs(recall[i] - prev_recall) + if recall_gap > 1e-6: + ap += precision[i] * recall_gap + prev_recall = recall[i] + mAP += ap + valid_cnt += 1 + else: + logger.error("Unspported mAP type {}".format(self.map_type)) + sys.exit(1) + + self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP + + def get_map(self): + """ + Get mAP result + """ + if self.mAP is None: + logger.error("mAP is not calculated.") + return self.mAP + + def _get_tp_fp_accum(self, score_pos_list): + """ + Calculate accumulating true/false positive results from + [score, pos] records + """ + sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) + accum_tp = 0 + accum_fp = 0 + accum_tp_list = [] + accum_fp_list = [] + for (score, pos) in sorted_list: + accum_tp += int(pos) + accum_tp_list.append(accum_tp) + accum_fp += 1 - int(pos) + accum_fp_list.append(accum_fp) + return accum_tp_list, accum_fp_list diff --git a/ppdet/utils/post_process.py b/ppdet/utils/post_process.py new file mode 100644 index 
0000000000000000000000000000000000000000..cc80bc186626c4e983328ffd53f53d217230a244 --- /dev/null +++ b/ppdet/utils/post_process.py @@ -0,0 +1,212 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import numpy as np + +import paddle.fluid as fluid + +__all__ = ['nms'] + +logger = logging.getLogger(__name__) + + +def box_flip(boxes, im_shape): + im_width = im_shape[0][1] + flipped_boxes = boxes.copy() + + flipped_boxes[:, 0::4] = im_width - boxes[:, 2::4] - 1 + flipped_boxes[:, 2::4] = im_width - boxes[:, 0::4] - 1 + return flipped_boxes + + +def nms(dets, thresh): + """Apply classic DPM-style greedy NMS.""" + if dets.shape[0] == 0: + return [] + scores = dets[:, 0] + x1 = dets[:, 1] + y1 = dets[:, 2] + x2 = dets[:, 3] + y2 = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int) + + # nominal indices + # _i, _j + # sorted indices + # i, j + # temp variables for box i's (the box currently under consideration) + # ix1, iy1, ix2, iy2, iarea + + # variables for computing overlap with box j (lower scoring box) + # xx1, yy1, xx2, yy2 + # w, h + # inter, ovr + + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + 
def bbox_area(box):
    """Return the area of a box given as (x1, y1, x2, y2).

    Both extents get ``+ 1`` (inclusive pixel convention). Because the box
    is only indexed, the four entries may also be arrays, in which case
    the areas are computed element-wise.
    """
    w = box[2] - box[0] + 1
    h = box[3] - box[1] + 1
    return w * h


def bbox_overlaps(x, y):
    """Compute the pairwise IoU between two sets of boxes.

    Args:
        x (np.ndarray): N boxes, one (x1, y1, x2, y2) row each.
        y (np.ndarray): K boxes in the same layout.

    Returns:
        np.ndarray: float32 matrix of shape (N, K). Entry ``[n, k]`` is the
        IoU of ``x[n]`` and ``y[k]``; pairs without a strictly positive
        intersection keep the value 0.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    overlaps = np.zeros((x.shape[0], y.shape[0]), dtype=np.float32)
    if overlaps.size == 0:
        return overlaps

    # Broadcast (N, 1) against (1, K) instead of looping in Python.
    iw = (np.minimum(x[:, None, 2], y[None, :, 2]) -
          np.maximum(x[:, None, 0], y[None, :, 0]) + 1)
    ih = (np.minimum(x[:, None, 3], y[None, :, 3]) -
          np.maximum(x[:, None, 1], y[None, :, 1]) + 1)
    hit = (iw > 0) & (ih > 0)
    inter = iw * ih
    union = bbox_area(x.T)[:, None] + bbox_area(y.T)[None, :] - inter
    # Only divide where the boxes really intersect; everything else stays 0
    # and a zero union can never be touched.
    overlaps[hit] = inter[hit] / union[hit]
    return overlaps


def box_voting(nms_dets, dets, vote_thresh):
    """Refine NMS survivors by score-weighted averaging of their neighbours.

    Args:
        nms_dets (np.ndarray): detections kept by NMS, rows of
            (score, x1, y1, x2, y2).
        dets (np.ndarray): all candidate detections in the same layout.
        vote_thresh (float): minimum IoU with a kept box for a candidate to
            take part in the vote.

    Returns:
        np.ndarray: copy of ``nms_dets`` whose box columns are replaced by
        the score-weighted mean of all voting candidates.
    """
    top_dets = nms_dets.copy()
    top_boxes = nms_dets[:, 1:]
    all_boxes = dets[:, 1:]
    all_scores = dets[:, 0]
    top_to_all_overlaps = bbox_overlaps(top_boxes, all_boxes)
    for k in range(nms_dets.shape[0]):
        inds_to_vote = np.where(top_to_all_overlaps[k] >= vote_thresh)[0]
        boxes_to_vote = all_boxes[inds_to_vote, :]
        ws = all_scores[inds_to_vote]
        top_dets[k, 1:] = np.average(boxes_to_vote, axis=0, weights=ws)

    return top_dets


def get_nms_result(boxes, scores, cfg):
    """Run per-class NMS (and optional box voting) on merged predictions.

    Args:
        boxes (np.ndarray): per-class boxes; class ``j`` occupies columns
            ``4 * j`` to ``4 * j + 3``.
        scores (np.ndarray): per-class scores, column ``j`` for class ``j``.
        cfg: config object providing ``num_classes`` and the
            ``MultiScaleTEST`` dict (``score_thresh``, ``nms_thresh``,
            ``enable_voting``, ``vote_thresh``, ``detections_per_im``).

    Returns:
        np.ndarray: float32 rows of (label, score, x1, y1, x2, y2) for all
        foreground classes (class 0 is skipped).
    """
    ms_cfg = cfg.MultiScaleTEST
    cls_boxes = [[] for _ in range(cfg.num_classes)]
    for j in range(1, cfg.num_classes):
        inds = np.where(scores[:, j] > ms_cfg['score_thresh'])[0]
        scores_j = scores[inds, j]
        boxes_j = boxes[inds, j * 4:(j + 1) * 4]
        dets_j = np.hstack((scores_j[:, np.newaxis], boxes_j)).astype(
            np.float32, copy=False)
        keep = nms(dets_j, ms_cfg['nms_thresh'])
        nms_dets = dets_j[keep, :]
        if ms_cfg['enable_voting']:
            nms_dets = box_voting(nms_dets, dets_j, ms_cfg['vote_thresh'])
        # Prepend the class label as the first column.
        label = np.full(len(keep), j)
        nms_dets = np.hstack((label[:, np.newaxis], nms_dets)).astype(
            np.float32, copy=False)
        cls_boxes[j] = nms_dets

    # Limit to max_per_image detections **over all classes**
    image_scores = np.hstack(
        [cls_boxes[j][:, 1] for j in range(1, cfg.num_classes)])
    if len(image_scores) > ms_cfg['detections_per_im']:
        image_thresh = np.sort(image_scores)[-ms_cfg['detections_per_im']]
        for j in range(1, cfg.num_classes):
            keep = np.where(cls_boxes[j][:, 1] >= image_thresh)[0]
            cls_boxes[j] = cls_boxes[j][keep, :]

    im_results = np.vstack([cls_boxes[j] for j in range(1, cfg.num_classes)])
    return im_results


def mstest_box_post_process(result, cfg):
    """
    Multi-scale Test
    Only available for batch_size=1 now.

    Collects every ``'bbox*'`` entry of ``result`` together with its
    matching ``'score*'`` entry, flips the boxes of ``'flip'`` entries back,
    and runs ``get_nms_result`` on the concatenation.

    Returns:
        dict: ``{'bbox': (detections, [[count]])}``, plus a ``'bbox_flip'``
        entry with horizontally flipped boxes when any flipped input was
        present.
    """
    post_bbox = {}
    use_flip = False
    ms_boxes = []
    ms_scores = []
    im_shape = result['im_shape'][0]
    for k in result.keys():
        if 'bbox' in k:
            boxes = result[k][0]
            boxes = np.reshape(boxes, (-1, 4 * cfg.num_classes))
            # 'bbox<suffix>' is paired with 'score<suffix>'.
            scores = result['score' + k[4:]][0]
            if 'flip' in k:
                boxes = box_flip(boxes, im_shape)
                use_flip = True
            ms_boxes.append(boxes)
            ms_scores.append(scores)

    ms_boxes = np.concatenate(ms_boxes)
    ms_scores = np.concatenate(ms_scores)
    bbox_pred = get_nms_result(ms_boxes, ms_scores, cfg)
    post_bbox.update({'bbox': (bbox_pred, [[len(bbox_pred)]])})
    if use_flip:
        bbox = bbox_pred[:, 2:]
        bbox_flip = np.append(
            bbox_pred[:, :2], box_flip(bbox, im_shape), axis=1)
        post_bbox.update({'bbox_flip': (bbox_flip, [[len(bbox_flip)]])})
    return post_bbox


def mstest_mask_post_process(result, cfg):
    """Average the mask predictions of all test-time augmentations.

    Every ``'mask*'`` entry of ``result`` is collected; entries whose key
    contains ``'flip'`` are mirrored back along the last axis. An entry
    that is not 4-D is replaced by an empty ``(0, M, M)`` array, where
    ``M = cfg.FPNRoIAlign['mask_resolution']``.

    Returns:
        dict: ``{'mask': (mean_mask, [[count]])}``.
    """
    mask_list = []
    im_shape = result['im_shape'][0]
    M = cfg.FPNRoIAlign['mask_resolution']
    for k in result.keys():
        if 'mask' in k:
            masks = result[k][0]
            if len(masks.shape) != 4:
                masks = np.zeros((0, M, M))
                mask_list.append(masks)
                continue
            if 'flip' in k:
                masks = masks[:, :, :, ::-1]
            mask_list.append(masks)

    mask_pred = np.mean(mask_list, axis=0)
    return {'mask': (mask_pred, [[len(mask_pred)]])}
class SmoothedValue(object):
    """Keep the most recent ``window_size`` values and report their median."""

    def __init__(self, window_size):
        # A bounded deque silently drops the oldest value once it is full.
        self.deque = collections.deque(maxlen=window_size)

    def add_value(self, value):
        """Record one more observation."""
        self.deque.append(value)

    def get_median_value(self):
        """Return the median of the values currently in the window."""
        window = self.deque
        return np.median(window)


def Time():
    """Return the current local time as 'YYYY-mm-dd HH:MM:SS.ffffff'."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S.%f')


class TrainingStats(object):
    """Median-smoothed view over a fixed set of training statistics."""

    def __init__(self, window_size, stats_keys):
        trackers = {}
        for key in stats_keys:
            trackers[key] = SmoothedValue(window_size)
        self.smoothed_losses_and_metrics = trackers

    def update(self, stats):
        """Feed one value per tracked key; extra keys in ``stats`` are ignored."""
        for name, tracker in self.smoothed_losses_and_metrics.items():
            tracker.add_value(stats[name])

    def get(self, extras=None):
        """Return an ordered mapping: ``extras`` first, then smoothed stats.

        Smoothed values are rendered as strings with six decimals.
        """
        merged = collections.OrderedDict()
        if extras:
            for name, value in extras.items():
                merged[name] = value
        for name, tracker in self.smoothed_losses_and_metrics.items():
            merged[name] = format(tracker.get_median_value(), '.6f')

        return merged

    def log(self, extras=None):
        """Render ``get(extras)`` as a single comma-separated string."""
        parts = []
        for name, value in self.get(extras).items():
            # Same text as a one-entry dict literal without its braces.
            parts.append(str(dict({name: value})).strip('{}'))
        return ', '.join(parts)
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import pycocotools.mask as mask_util +from PIL import Image, ImageDraw + +from .colormap import colormap + +__all__ = ['visualize_results'] + + +def visualize_results(image, + im_id, + catid2name, + threshold=0.5, + bbox_results=None, + mask_results=None): + """ + Visualize bbox and mask results + """ + if mask_results: + image = draw_mask(image, im_id, mask_results, threshold) + if bbox_results: + image = draw_bbox(image, im_id, catid2name, bbox_results, + threshold) + return image + + +def draw_mask(image, im_id, segms, threshold, alpha=0.7): + """ + Draw mask on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = colormap(rgb=True) + img_array = np.array(image).astype('float32') + for dt in np.array(segms): + if im_id != dt['image_id']: + continue + segm, score = dt['segmentation'], dt['score'] + if score < threshold: + continue + mask = mask_util.decode(segm) * 255 + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 + idx = np.nonzero(mask) + img_array[idx[0], idx[1], :] *= 1.0 - alpha + img_array[idx[0], idx[1], :] += alpha * color_mask + return Image.fromarray(img_array.astype('uint8')) + + +def draw_bbox(image, im_id, catid2name, bboxes, threshold): + """ + Draw bbox on image + """ + draw = ImageDraw.Draw(image) + + catid2color = {} + color_list = colormap(rgb=True)[:40] + for dt in np.array(bboxes): + if im_id != dt['image_id']: + continue + catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] + if score < threshold: + continue + + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + + if catid not in catid2color: + idx = np.random.randint(len(color_list)) + catid2color[catid] = color_list[idx] + color = tuple(catid2color[catid]) + + # 
def bbox_eval(results,
              class_num,
              overlap_thresh=0.5,
              map_type='11point',
              is_bbox_normalized=False,
              evaluate_difficult=False):
    """
    Bounding box evaluation for VOC dataset

    Args:
        results (list): prediction bounding box results.
        class_num (int): evaluation class number.
        overlap_thresh (float): the positive threshold of
                        bbox overlap
        map_type (string): method for mAP calculation,
                        can only be '11point' or 'integral'
        is_bbox_normalized (bool): whether bbox is normalized
                        to range [0, 1].
        evaluate_difficult (bool): whether to evaluate
                        difficult gt bbox.

    Returns:
        float: mAP scaled to percent (``100 * detection_map.get_map()``).
    """
    assert 'bbox' in results[0]
    logger.info("Start evaluate...")

    detection_map = DetectionMAP(class_num=class_num,
                                 overlap_thresh=overlap_thresh,
                                 map_type=map_type,
                                 is_bbox_normalized=is_bbox_normalized,
                                 evaluate_difficult=evaluate_difficult)

    for t in results:
        bboxes = t['bbox'][0]
        bbox_lengths = t['bbox'][1][0]

        # Test for None first: the original order read `.shape` before the
        # None check and would raise AttributeError on a missing prediction.
        if bboxes is None or bboxes.shape == (1, 1):
            continue

        gt_boxes = t['gt_box'][0]
        gt_labels = t['gt_label'][0]
        difficults = t['is_difficult'][0] if not evaluate_difficult \
                            else None

        if len(t['gt_box'][1]) == 0:
            # gt_box, gt_label, difficult read as zero padded Tensor
            bbox_idx = 0
            for i in range(len(gt_boxes)):
                gt_box = gt_boxes[i]
                gt_label = gt_labels[i]
                difficult = None if difficults is None \
                                else difficults[i]
                bbox_num = bbox_lengths[i]
                bbox = bboxes[bbox_idx: bbox_idx + bbox_num]
                gt_box, gt_label, difficult = prune_zero_padding(
                    gt_box, gt_label, difficult)
                detection_map.update(bbox, gt_box, gt_label, difficult)
                bbox_idx += bbox_num
        else:
            # gt_box, gt_label, difficult read as LoDTensor
            gt_box_lengths = t['gt_box'][1][0]
            bbox_idx = 0
            gt_box_idx = 0
            for i in range(len(bbox_lengths)):
                bbox_num = bbox_lengths[i]
                gt_box_num = gt_box_lengths[i]
                bbox = bboxes[bbox_idx: bbox_idx + bbox_num]
                gt_box = gt_boxes[gt_box_idx: gt_box_idx + gt_box_num]
                gt_label = gt_labels[gt_box_idx: gt_box_idx + gt_box_num]
                difficult = None if difficults is None else \
                            difficults[gt_box_idx: gt_box_idx + gt_box_num]
                detection_map.update(bbox, gt_box, gt_label, difficult)
                bbox_idx += bbox_num
                gt_box_idx += gt_box_num

    logger.info("Accumulating evaluatation results...")
    detection_map.accumulate()
    map_stat = 100. * detection_map.get_map()
    logger.info("mAP({:.2f}, {}) = {:.2f}".format(overlap_thresh,
                                                  map_type, map_stat))
    return map_stat


def prune_zero_padding(gt_box, gt_label, difficult=None):
    """Cut ground-truth arrays at the first all-zero box.

    Rows of ``gt_box`` are scanned in order; the first row whose four
    coordinates are all 0 is taken as the start of the padding, and all
    three inputs are truncated to the rows before it.

    Returns:
        tuple: (gt_box, gt_label, difficult) truncated to the same length;
        ``difficult`` stays ``None`` when it was not given.
    """
    valid_cnt = 0
    for i in range(len(gt_box)):
        if gt_box[i, 0] == 0 and gt_box[i, 1] == 0 and \
                gt_box[i, 2] == 0 and gt_box[i, 3] == 0:
            break
        valid_cnt += 1
    return (gt_box[:valid_cnt], gt_label[:valid_cnt],
            difficult[:valid_cnt] if difficult is not None else None)


def get_category_info(anno_file=None,
                      with_background=True,
                      use_default_label=False):
    """Return ``(clsid2catid, catid2name)`` for a VOC-style dataset.

    The built-in VOC label list is used when ``use_default_label`` is set,
    when ``anno_file`` is None, or when it does not exist; otherwise the
    categories are read from ``anno_file`` (one name per line).
    """
    if use_default_label or anno_file is None \
            or not os.path.exists(anno_file):
        logger.info("Not found annotation file {}, load "
                    "voc2012 categories.".format(anno_file))
        return vocall_category_info(with_background)
    else:
        logger.info("Load categories from {}".format(anno_file))
        return get_category_info_from_anno(anno_file, with_background)


def get_category_info_from_anno(anno_file, with_background=True):
    """
    Get class id to category id map and category id
    to category name map from annotation file.

    Args:
        anno_file (str): annotation file path
        with_background (bool, default True):
            whether load background as class 0.

    Returns:
        tuple: ``(clsid2catid, catid2name)``; both are keyed by the
        zero-based position of the category in the (adjusted) list.
    """
    with open(anno_file) as f:
        cats = [line.strip() for line in f]

    # `cats` may be empty (empty label file), so never index it blindly.
    has_background = bool(cats) and cats[0] == 'background'
    if with_background and not has_background:
        cats.insert(0, 'background')
    if not with_background and has_background:
        cats = cats[1:]

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}

    return clsid2catid, catid2name


def vocall_category_info(with_background=True):
    """
    Get class id to category id map and category id
    to category name map of mixup voc dataset

    Args:
        with_background (bool, default True):
            whether load background as class 0.
    """
    label_map = pascalvoc_label(with_background)
    # Order the names by their label id.
    label_map = sorted(label_map.items(), key=lambda x: x[1])
    cats = [item[0] for item in label_map]

    if with_background:
        cats.insert(0, 'background')

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}

    return clsid2catid, catid2name
+ """ + label_map = pascalvoc_label(with_background) + label_map = sorted(label_map.items(), key=lambda x: x[1]) + cats = [l[0] for l in label_map] + + if with_background: + cats.insert(0, 'background') + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name diff --git a/ppdet/utils/voc_utils.py b/ppdet/utils/voc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2d7fc4a3618e047275d3c8f5366fdf03545be6de --- /dev/null +++ b/ppdet/utils/voc_utils.py @@ -0,0 +1,99 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import re +import random +import shutil + +__all__ = ['merge_and_create_list'] + + +def merge_and_create_list(devkit_dir, years, output_dir): + """ + Merge VOC2007 and VOC2012 to output_dir and create following list: + 1. train.txt + 2. val.txt + 3. 
test.txt + """ + os.makedirs(osp.join(output_dir, 'Annotations/')) + os.makedirs(osp.join(output_dir, 'ImageSets/Main/')) + os.makedirs(osp.join(output_dir, 'JPEGImages/')) + + trainval_list = [] + test_list = [] + for year in years: + trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) + trainval_list.extend(trainval) + test_list.extend(test) + + main_dir = osp.join(output_dir, 'ImageSets/Main/') + random.shuffle(trainval_list) + with open(osp.join(main_dir, 'train.txt'), 'w') as ftrainval: + for item in trainval_list: + ftrainval.write(item + '\n') + + with open(osp.join(main_dir, 'val.txt'), 'w') as fval: + with open(osp.join(main_dir, 'test.txt'), 'w') as ftest: + ct = 0 + for item in test_list: + ct += 1 + fval.write(item + '\n') + if ct <= 1000: + ftest.write(item + '\n') + + +def _get_voc_dir(devkit_dir, year, type): + return osp.join(devkit_dir, 'VOC' + year, type) + + +def _walk_voc_dir(devkit_dir, year, output_dir): + filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') + annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') + img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') + trainval_list = [] + test_list = [] + added = set() + + for _, _, files in os.walk(filelist_dir): + for fname in files: + img_ann_list = [] + if re.match('[a-z]+_trainval\.txt', fname): + img_ann_list = trainval_list + elif re.match('[a-z]+_test\.txt', fname): + img_ann_list = test_list + else: + continue + fpath = osp.join(filelist_dir, fname) + for line in open(fpath): + name_prefix = line.strip().split()[0] + if name_prefix in added: + continue + added.add(name_prefix) + ann_path = osp.join(annotation_dir, name_prefix + '.xml') + img_path = osp.join(img_dir, name_prefix + '.jpg') + new_ann_path = osp.join(output_dir, 'Annotations/', + name_prefix + '.xml') + new_img_path = osp.join(output_dir, 'JPEGImages/', + name_prefix + '.jpg') + shutil.copy(ann_path, new_ann_path) + shutil.copy(img_path, new_img_path) + img_ann_list.append(name_prefix) + + 
def to_chw_bgr(image):
    """
    Transpose image from HWC to CHW and from RGB to BGR.
    Args:
        image (np.array): an image with HWC and RGB layout.
    """
    # HWC to CHW
    if len(image.shape) == 3:
        image = np.swapaxes(image, 1, 2)
        image = np.swapaxes(image, 1, 0)
    # RGB to BGR: reverse the (now leading) channel axis
    image = image[[2, 1, 0], :, :]
    return image


def bbox_vote(det):
    """Merge overlapping detections by score-weighted box voting.

    Args:
        det (np.ndarray): detections, one (x1, y1, x2, y2, score) row each.

    Boxes are processed in descending score order. The current top box and
    every remaining box with IoU >= 0.3 form a cluster. A cluster with more
    than one box becomes a single box whose coordinates are the
    score-weighted mean and whose score is the cluster maximum. A cluster
    of one is only kept when it is the last box left (behaviour of the
    original code, preserved here).

    Returns:
        np.ndarray: at most 750 merged rows whose score is >= 0.01.
    """
    order = det[:, 4].ravel().argsort()[::-1]
    det = det[order, :]
    merged = []
    if det.shape[0] == 0:
        # Placeholder row; its score is below the final 0.01 cut, so an
        # empty input yields an empty (0, 5) result.
        merged.append(np.array([[10, 10, 20, 20, 0.002]]))
    while det.shape[0] > 0:
        # IOU of the top box against all remaining boxes
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)

        # nms
        merge_index = np.where(o >= 0.3)[0]
        if merge_index.shape[0] == 0:
            # A degenerate top box does not even overlap itself; consume it
            # anyway so the loop is guaranteed to terminate.
            merge_index = np.array([0])
        det_accu = det[merge_index, :]
        det = np.delete(det, merge_index, 0)
        if merge_index.shape[0] <= 1:
            if det.shape[0] == 0:
                merged.append(det_accu)
            continue
        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
        max_score = np.max(det_accu[:, 4])
        det_accu_sum = np.zeros((1, 5))
        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
                                      axis=0) / np.sum(det_accu[:, -1:])
        det_accu_sum[:, 4] = max_score
        merged.append(det_accu_sum)

    # The final loop iteration always appends, so `merged` is never empty.
    dets = np.vstack(merged)
    dets = dets[0:750, :]
    # Only keep detections scoring 0.01 or more
    keep_index = np.where(dets[:, 4] >= 0.01)[0]
    dets = dets[keep_index, :]
    return dets


def get_shrink(height, width):
    """
    Args:
        height (int): image height.
        width (int): image width.

    Returns:
        tuple: ``(shrink, max_shrink)``. ``max_shrink`` is the largest
        scale factor considered safe for this image size after the safety
        margins below; ``shrink`` is ``max_shrink`` capped at 1.
    """
    # avoid out of memory
    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5

    def get_round(x, loc):
        """Truncate ``x`` to ``loc`` decimals when it has three or more."""
        str_x = str(x)
        # Scientific notation has no plain decimal part to cut.
        if '.' in str_x and 'e' not in str_x:
            str_before, str_after = str_x.split('.')
            if len(str_after) >= 3:
                return float(str_before + '.' + str_after[0:loc])
        # Always hand back a number; the original could fall through and
        # return None, which crashed on the subtraction below.
        return x

    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
    if max_shrink >= 1.5 and max_shrink < 2:
        max_shrink = max_shrink - 0.1
    elif max_shrink >= 2 and max_shrink < 3:
        max_shrink = max_shrink - 0.2
    elif max_shrink >= 3 and max_shrink < 4:
        max_shrink = max_shrink - 0.3
    elif max_shrink >= 4 and max_shrink < 5:
        max_shrink = max_shrink - 0.4
    elif max_shrink >= 5:
        max_shrink = max_shrink - 0.5

    shrink = max_shrink if max_shrink < 1 else 1
    return shrink, max_shrink
def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
    """Write the detections of one image as ``<class>/<image>.txt``.

    Args:
        image_path (str): '/'-separated path; the last two components are
            used as event class and image file name.
        bboxes_scores (np.ndarray): rows of (xmin, ymin, xmax, ymax, score).
        output_dir (str): root directory of the result files.

    The file holds ``<class>/<image name>``, the detection count, then one
    ``xmin ymin width height score`` line per detection (width and height
    include the ``+ 1``).
    """
    image_name = image_path.split('/')[-1]
    image_class = image_path.split('/')[-2]
    odir = os.path.join(output_dir, image_class)
    if not os.path.exists(odir):
        os.makedirs(odir)

    # image_name[:-4] drops a three-letter extension such as '.jpg'.
    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
    with open(ofname, 'w') as f:
        f.write('{:s}\n'.format(image_class + '/' + image_name))
        f.write('{:d}\n'.format(bboxes_scores.shape[0]))
        for box_score in bboxes_scores:
            xmin, ymin, xmax, ymax, score = box_score
            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
                xmax - xmin + 1), (ymax - ymin + 1), score))
    logger.info("The predicted result is saved as {}".format(ofname))


def save_fddb_bboxes(bboxes_scores,
                     output_dir,
                     output_fname='pred_fddb_res.txt'):
    """Write all detections into one FDDB-style result file.

    Args:
        bboxes_scores (dict): image path -> array with rows of
            (xmin, ymin, xmax, ymax, score).
        output_dir (str): directory of the result file (created if absent).
        output_fname (str): name of the result file.

    Returns:
        str: path of the written file.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    predict_file = os.path.join(output_dir, output_fname)
    with open(predict_file, 'w') as f:
        # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
        for image_path, dets in bboxes_scores.items():
            f.write('{:s}\n'.format(image_path))
            f.write('{:d}\n'.format(dets.shape[0]))
            for box_score in dets:
                xmin, ymin, xmax, ymax, score = box_score
                width, height = xmax - xmin, ymax - ymin
                f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
                        .format(xmin, ymin, width, height, score))
    logger.info("The predicted result is saved as {}".format(predict_file))
    return predict_file


def get_category_info(anno_file=None,
                      with_background=True,
                      use_default_label=False):
    """Return ``(clsid2catid, catid2name)`` for the WIDER FACE dataset.

    The built-in label list is used when ``use_default_label`` is set, when
    ``anno_file`` is None, or when it does not exist; otherwise the
    categories are read from ``anno_file`` (one name per line).
    """
    if use_default_label or anno_file is None \
            or not os.path.exists(anno_file):
        logger.info("Not found annotation file {}, load "
                    "wider-face categories.".format(anno_file))
        return widerfaceall_category_info(with_background)
    else:
        logger.info("Load categories from {}".format(anno_file))
        return get_category_info_from_anno(anno_file, with_background)


def get_category_info_from_anno(anno_file, with_background=True):
    """
    Get class id to category id map and category id
    to category name map from annotation file.
    Args:
        anno_file (str): annotation file path
        with_background (bool, default True):
            whether load background as class 0.
    Returns:
        tuple: ``(clsid2catid, catid2name)``; both are keyed by the
        zero-based position of the category in the (adjusted) list.
    """
    with open(anno_file) as f:
        cats = [line.strip() for line in f]

    # `cats` may be empty (empty label file), so never index it blindly.
    has_background = bool(cats) and cats[0] == 'background'
    if with_background and not has_background:
        cats.insert(0, 'background')
    if not with_background and has_background:
        cats = cats[1:]

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}

    return clsid2catid, catid2name


def widerfaceall_category_info(with_background=True):
    """
    Get class id to category id map and category id
    to category name map of mixup wider_face dataset

    Args:
        with_background (bool, default True):
            whether load background as class 0.
    """
    label_map = widerface_label(with_background)
    # Order the names by their label id.
    label_map = sorted(label_map.items(), key=lambda x: x[1])
    cats = [item[0] for item in label_map]

    if with_background:
        cats.insert(0, 'background')

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}

    return clsid2catid, catid2name
+ """ + label_map = widerface_label(with_background) + label_map = sorted(label_map.items(), key=lambda x: x[1]) + cats = [l[0] for l in label_map] + + if with_background: + cats.insert(0, 'background') + + clsid2catid = {i: i for i in range(len(cats))} + catid2name = {i: name for i, name in enumerate(cats)} + + return clsid2catid, catid2name diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..798b006ad7baa3b4f24b4a7db15a63d6d2e533f4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +tqdm +docstring_parser @ http://github.com/willthefrog/docstring_parser/tarball/master +typeguard ; python_version >= '3.4' +tb-paddle +tb-nightly diff --git a/slim/distillation/README.md b/slim/distillation/README.md new file mode 100755 index 0000000000000000000000000000000000000000..e970cc42b54c17a6131c4873662fb2be46767b60 --- /dev/null +++ b/slim/distillation/README.md @@ -0,0 +1,141 @@ +>运行该示例前请安装Paddle1.6或更高版本 + +# 检测模型蒸馏示例 + +## 概述 + +该示例使用PaddleSlim提供的[蒸馏策略](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#3-蒸馏)对检测库中的模型进行蒸馏训练。 +在阅读该示例前,建议您先了解以下内容: + +- [检测库的常规训练方法](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleDetection) +- [PaddleSlim使用文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md) + + +## 配置文件说明 + +关于配置文件如何编写您可以参考: + +- [PaddleSlim配置文件编写说明](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#122-%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6%E7%9A%84%E4%BD%BF%E7%94%A8) +- [蒸馏策略配置文件编写说明](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#23-蒸馏) + +这里以ResNet34-YoloV3蒸馏MobileNetV1-YoloV3模型为例,首先,为了对`student model`和`teacher model`有个总体的认识,从而进一步确认蒸馏的对象,我们通过以下命令分别观察两个网络变量(Variable)的名称和形状: + +```python +# 观察student model的Variable +for v in fluid.default_main_program().list_vars(): + if "py_reader" not in v.name and "double_buffer" not in v.name and "generated_var" not in v.name: + 
print(v.name, v.shape) +# 观察teacher model的Variable +for v in teacher_program.list_vars(): + print(v.name, v.shape) +``` + +经过对比可以发现,`student model`和`teacher model`的部分中间结果分别为: + +```bash +# student model +conv2d_15.tmp_0 +# teacher model +teacher_teacher_conv2d_1.tmp_0 +``` + + +所以,我们用`l2_distiller`对这两个特征图做蒸馏。在配置文件中进行如下配置: + +```yaml +distillers: + l2_distiller: + class: 'L2Distiller' + teacher_feature_map: 'teacher_teacher_conv2d_1.tmp_0' + student_feature_map: 'conv2d_15.tmp_0' + distillation_loss_weight: 1 +strategies: + distillation_strategy: + class: 'DistillationStrategy' + distillers: ['l2_distiller'] + start_epoch: 0 + end_epoch: 270 +``` + +我们也可以根据上述操作为蒸馏策略选择其他loss,PaddleSlim支持的有`FSP_loss`, `L2_loss`和`softmax_with_cross_entropy_loss` 。 + +## 训练 + +根据[PaddleDetection/tools/train.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/tools/train.py)编写压缩脚本compress.py。 +在该脚本中定义了Compressor对象,用于执行压缩任务。 + + + + +您可以通过运行脚本`run.sh`运行该示例。 + + +### 保存断点(checkpoint) + +如果在配置文件中设置了`checkpoint_path`, 则在蒸馏任务执行过程中会自动保存断点,当任务异常中断时, +重启任务会自动从`checkpoint_path`路径下按数字顺序加载最新的checkpoint文件。如果不想让重启的任务从断点恢复, +需要修改配置文件中的`checkpoint_path`,或者将`checkpoint_path`路径下文件清空。 + +>注意:配置文件中的信息不会保存在断点中,重启前对配置文件的修改将会生效。 + + +## 评估 + +如果在配置文件中设置了`checkpoint_path`,则每个epoch会保存一个压缩后的用于评估的模型, +该模型会保存在`${checkpoint_path}/${epoch_id}/eval_model/`路径下,包含`__model__`和`__params__`两个文件。 +其中,`__model__`用于保存模型结构信息,`__params__`用于保存参数(parameters)信息。 + +如果不需要保存评估模型,可以在定义Compressor对象时,将`save_eval_model`选项设置为False(默认为True)。 + +运行命令为: +``` +python ../eval.py \ + --model_path ${checkpoint_path}/${epoch_id}/eval_model/ \ + --model_name __model__ \ + --params_name __params__ \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" +``` + +## 预测 + +如果在配置文件中设置了`checkpoint_path`,并且在定义Compressor对象时指定了`prune_infer_model`选项,则每个epoch都会 +保存一个`inference model`。该模型是通过删除eval_program中多余的operators而得到的。 + 
+该模型会保存在`${checkpoint_path}/${epoch_id}/eval_model/`路径下,包含`__model__.infer`和`__params__`两个文件。 +其中,`__model__.infer`用于保存模型结构信息,`__params__`用于保存参数(parameters)信息。 + +更多关于`prune_infer_model`选项的介绍,请参考:[Compressor介绍](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#121-%E5%A6%82%E4%BD%95%E6%94%B9%E5%86%99%E6%99%AE%E9%80%9A%E8%AE%AD%E7%BB%83%E8%84%9A%E6%9C%AC) + +### python预测 + +在脚本slim/infer.py中展示了如何使用fluid python API加载使用预测模型进行预测。 + +运行命令为: +``` +python ../infer.py \ + --model_path ${checkpoint_path}/${epoch_id}/eval_model/ \ + --model_name __model__.infer \ + --params_name __params__ \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + --infer_dir ../../demo +``` + +### PaddleLite + +该示例中产出的预测(inference)模型可以直接用PaddleLite进行加载使用。 +关于PaddleLite如何使用,请参考:[PaddleLite使用文档](https://github.com/PaddlePaddle/Paddle-Lite/wiki#%E4%BD%BF%E7%94%A8) + +## 示例结果 + +>当前release的结果并非超参调优后的最好结果,仅做示例参考,后续我们会优化当前结果。 + +### MobileNetV1-YOLO-V3 + +| FLOPS |Box AP| +|---|---| +|baseline|76.2 | +|蒸馏后|- | + + +## FAQ diff --git a/slim/distillation/compress.py b/slim/distillation/compress.py new file mode 100644 index 0000000000000000000000000000000000000000..6dc4fff9742ec258b30cad4408b5e6f428273495 --- /dev/null +++ b/slim/distillation/compress.py @@ -0,0 +1,324 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import multiprocessing +import numpy as np +from collections import deque, OrderedDict +from paddle.fluid.contrib.slim.core import Compressor +from paddle.fluid.framework import IrGraph + + +def set_paddle_flags(**kwargs): + for key, value in kwargs.items(): + if os.environ.get(key, None) is None: + os.environ[key] = str(value) + + +# NOTE(paddle-dev): All of these flags should be set before +# `import paddle`. Otherwise, it would not take any effect. +set_paddle_flags( + FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory +) + +from paddle import fluid + +import sys +sys.path.append("../../") +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.data.data_feed import create_reader +from ppdet.utils.eval_utils import parse_fetches, eval_results +from ppdet.utils.stats import TrainingStats +from ppdet.utils.cli import ArgsParser +from ppdet.utils.check import check_gpu +import ppdet.utils.checkpoint as checkpoint +from ppdet.modeling.model_input import create_feed + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def eval_run(exe, compile_program, reader, keys, values, cls, test_feed): + """ + Run evaluation program, return program outputs. 
+ """ + iter_id = 0 + results = [] + if len(cls) != 0: + values = [] + for i in range(len(cls)): + _, accum_map = cls[i].get_map_var() + cls[i].reset(exe) + values.append(accum_map) + + images_num = 0 + start_time = time.time() + has_bbox = 'bbox' in keys + for data in reader(): + data = test_feed.feed(data) + feed_data = {'image': data['image'], 'im_size': data['im_size']} + outs = exe.run(compile_program, + feed=feed_data, + fetch_list=[values[0]], + return_numpy=False) + outs.append(data['gt_box']) + outs.append(data['gt_label']) + outs.append(data['is_difficult']) + res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(keys, outs) + } + results.append(res) + if iter_id % 100 == 0: + logger.info('Test iter {}'.format(iter_id)) + iter_id += 1 + images_num += len(res['bbox'][1][0]) if has_bbox else 1 + logger.info('Test finish iter {}'.format(iter_id)) + + end_time = time.time() + fps = images_num / (end_time - start_time) + if has_bbox: + logger.info('Total number of images: {}, inference time: {} fps.'. 
def main():
    """Build the student and teacher programs and run PaddleSlim distillation.

    Steps, in order:
      1. Load the student config (``FLAGS.config``), apply the ``FLAGS.opt``
         overrides and build the student training program, including its
         optimizer, in the default main program.
      2. Build a separate evaluation program for the student and load
         ``cfg.pretrain_weights`` into the default main program.
      3. Load the teacher config (``FLAGS.teacher_config``), build the teacher
         under the ``teacher_`` unique-name prefix on clones of the student
         feed variables, and load the variables found in
         ``FLAGS.teacher_pretrained``.
      4. Pass everything to ``Compressor``, configured by ``FLAGS.slim_file``.

    Raises:
        ValueError: if the student config has no ``architecture`` entry, if
            ``--teacher_config`` is missing, or if ``--teacher_pretrained`` is
            missing or does not exist on disk.
    """
    # Fail fast on missing teacher inputs instead of discovering them after
    # both student programs have been built. Explicit raises are used because
    # ``assert`` statements are stripped under ``python -O``.
    if not FLAGS.teacher_config:
        raise ValueError("teacher_config should be set for distillation.")
    if not (FLAGS.teacher_pretrained and
            os.path.exists(FLAGS.teacher_pretrained)):
        raise ValueError(
            "teacher_pretrained should be set when teacher_model is not None.")

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)
    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)

    if 'train_feed' not in cfg:
        train_feed = create(main_arch + 'TrainFeed')
    else:
        train_feed = create(cfg.train_feed)

    if 'eval_feed' not in cfg:
        eval_feed = create(main_arch + 'EvalFeed')
    else:
        eval_feed = create(cfg.eval_feed)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build the student training program in the default main program
    train_model = create(main_arch)
    _, train_feed_vars = create_feed(train_feed, False)
    train_fetches = train_model.train(train_feed_vars)
    loss = train_fetches['loss']
    lr = lr_builder()
    opt = optim_builder(lr)
    opt.minimize(loss)

    # NOTE(review): this unconditionally replaces any max_iters coming from
    # the config file or from ``-o max_iters=...``. It is kept because the
    # accompanying run.sh relies on it; presumably 258 is the number of
    # batches per epoch for the VOC setup -- confirm before parameterizing.
    cfg.max_iters = 258
    train_reader = create_reader(train_feed, cfg.max_iters, FLAGS.dataset_dir)

    exe.run(fluid.default_startup_program())

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_keys.append('lr')
    train_values.append(lr.name)

    train_fetch_list = list(zip(train_keys, train_values))
    print("train_fetch_list: {}".format(train_fetch_list))

    # build the student evaluation program
    eval_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            # Kept under its own name: eval_func below needs the *student*
            # evaluation model, and the teacher is created later on.
            eval_model = create(main_arch)
            _, test_feed_vars = create_feed(eval_feed, False)
            fetches = eval_model.eval(test_feed_vars)
    eval_prog = eval_prog.clone(True)

    eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
    test_data_feed = fluid.DataFeeder(test_feed_vars.values(), place)

    # parse eval fetches
    extra_keys = []
    if cfg.metric == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg.metric == 'VOC':
        extra_keys = ['gt_box', 'gt_label', 'is_difficult']
    eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                     extra_keys)

    eval_fetch_list = list(zip(eval_keys, eval_values))
    print("eval_fetch_list: {}".format(eval_fetch_list))

    exe.run(startup_prog)
    checkpoint.load_params(exe,
                           fluid.default_main_program(), cfg.pretrain_weights)

    # One-element list so the closure below can update the running best.
    best_box_ap_list = []

    def eval_func(program, scope):
        """Evaluate ``program`` and return the best box AP seen so far."""
        results = eval_run(exe, program, eval_reader, eval_keys, eval_values,
                           eval_cls, test_data_feed)

        resolution = None
        is_bbox_normalized = False
        if 'mask' in results[0]:
            resolution = eval_model.mask_head.resolution
        box_ap_stats = eval_results(results, eval_feed, cfg.metric,
                                    cfg.num_classes, resolution,
                                    is_bbox_normalized, FLAGS.output_eval)
        if not best_box_ap_list:
            best_box_ap_list.append(box_ap_stats[0])
        elif box_ap_stats[0] > best_box_ap_list[0]:
            best_box_ap_list[0] = box_ap_stats[0]
        logger.info("Best test box ap: {}".format(best_box_ap_list[0]))
        return best_box_ap_list[0]

    test_feed = [('image', test_feed_vars['image'].name),
                 ('im_size', test_feed_vars['im_size'].name)]

    # NOTE(review): load_config is defined outside this file; if it merges
    # into the same global config object that ``cfg`` refers to, this call
    # must stay after the student programs are built -- confirm before
    # reordering.
    teacher_cfg = load_config(FLAGS.teacher_config)
    teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()
    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard('teacher_'):
            # The teacher consumes clones of the student's feed variables.
            teacher_feed_vars = OrderedDict()
            for name, var in train_feed_vars.items():
                teacher_feed_vars[name] = teacher_program.global_block(
                )._clone_variable(
                    var, force_persistable=False)
            teacher_model = create(teacher_arch)
            teacher_model.train(teacher_feed_vars)

    exe.run(teacher_startup_program)

    def if_exist(var):
        """Return True when a file named after ``var`` was saved on disk."""
        return os.path.exists(os.path.join(FLAGS.teacher_pretrained, var.name))

    fluid.io.load_vars(
        exe,
        FLAGS.teacher_pretrained,
        main_program=teacher_program,
        predicate=if_exist)

    teacher_programs = [teacher_program.clone(for_test=True)]

    com = Compressor(
        place,
        fluid.global_scope(),
        fluid.default_main_program(),
        train_reader=train_reader,
        train_feed_list=[(key, value.name)
                         for key, value in train_feed_vars.items()],
        train_fetch_list=train_fetch_list,
        eval_program=eval_prog,
        eval_reader=eval_reader,
        eval_feed_list=test_feed,
        eval_func={'map': eval_func},
        eval_fetch_list=eval_fetch_list[0:1],
        save_eval_model=True,
        prune_infer_model=[["image", "im_size"], ["multiclass_nms_0.tmp_0"]],
        teacher_programs=teacher_programs,
        train_optimizer=None,
        distiller_optimizer=opt,
        log_period=20)
    com.config(FLAGS.slim_file)
    com.run()
#!/usr/bin/env bash
# Download the YOLOv3-ResNet34 teacher weights, prefix every file in the
# extracted directory with "teacher_", then start distillation training in
# the background and follow its log.

# download pretrain model
root_url="https://paddlemodels.bj.bcebos.com/object_detection"
yolov3_r34_voc="yolov3_r34_voc.tar"
pretrain_dir='./pretrain'

mkdir -p "${pretrain_dir}"

# Stop if the directory cannot be entered: everything below would otherwise
# download into, and rename files of, whatever directory we happen to be in.
cd "${pretrain_dir}" || exit 1

if [ ! -f "${yolov3_r34_voc}" ]; then
    # Remove a partial download so that the next run retries it.
    wget "${root_url}/${yolov3_r34_voc}" || { rm -f "${yolov3_r34_voc}"; exit 1; }
    tar xf "${yolov3_r34_voc}" || exit 1
fi
cd - || exit 1

# enable GC strategy
export FLAGS_fast_eager_deletion_mode=1
export FLAGS_eager_delete_tensor_gb=0.0

# for distillation
#-----------------
export CUDA_VISIBLE_DEVICES=0,1,2,3


# Fixing name conflicts in distillation
cd "${pretrain_dir}/yolov3_r34_voc" || exit 1
# First strip an existing "teacher_" prefix so that re-running the script
# does not stack prefixes, then add the prefix to every file.
for files in teacher_*; do
    [ -e "${files}" ] || continue    # the glob matched nothing
    mv "${files}" "${files#*_}"
done
for files in *; do
    [ -e "${files}" ] || continue    # empty directory
    mv "${files}" "teacher_${files}"
done
cd - || exit 1

python -u compress.py \
    -c ../../configs/yolov3_mobilenet_v1_voc.yml \
    -t yolov3_resnet34.yml \
    -s yolov3_mobilenet_v1_yolov3_resnet34_distillation.yml \
    -o YoloTrainFeed.batch_size=64 \
    -d ../../dataset/voc \
    --teacher_pretrained ./pretrain/yolov3_r34_voc \
    > yolov3_distallation.log 2>&1 &
# tailf is no longer shipped with current util-linux; tail -f is equivalent.
tail -f yolov3_distallation.log
+num_classes: 20 +weight_prefix_name: teacher_ + +YOLOv3: + backbone: ResNet + yolo_head: YOLOv3Head + +ResNet: + norm_type: sync_bn + freeze_at: 0 + freeze_norm: false + norm_decay: 0. + depth: 34 + feature_maps: [3, 4, 5] + +YOLOv3Head: + anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + anchors: [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + norm_decay: 0. + ignore_thresh: 0.7 + label_smooth: false + nms: + background_label: -1 + keep_top_k: 100 + nms_threshold: 0.45 + nms_top_k: 1000 + normalized: false + score_threshold: 0.01 diff --git a/slim/eval.py b/slim/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..579f58d2932e215362efa9e8864c16d1a451b26a --- /dev/null +++ b/slim/eval.py @@ -0,0 +1,194 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def set_paddle_flags(**kwargs):
    """Export each keyword argument as an environment variable unless set.

    Variables that already exist in ``os.environ`` are left untouched, so a
    value exported by the user takes precedence over the default passed here.
    Values are converted with ``str`` before being stored.
    """
    for key, value in kwargs.items():
        # setdefault does the "only when absent" check and the assignment in
        # a single step.
        os.environ.setdefault(key, str(value))
def main():
    """Evaluate a saved inference model on the configured evaluation set.

    Loads the config given by ``FLAGS.config``, builds the evaluation feed
    and reader, loads the inference model stored under ``FLAGS.model_path``
    and passes the per-batch outputs of ``eval_run`` to ``eval_results``.

    Raises:
        ValueError: if the config has no ``architecture`` entry, if
            ``--model_path`` is missing or does not exist, or if the reader
            produced no batches.
    """
    # Validate the model location before doing any work. The previous
    # ``assert os.path.exists(FLAGS.model_path)`` raised a TypeError when the
    # option was omitted and is stripped entirely under ``python -O``.
    if not FLAGS.model_path or not os.path.exists(FLAGS.model_path):
        raise ValueError(
            "model_path {} does not exist".format(FLAGS.model_path))

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)
    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)

    if 'eval_feed' not in cfg:
        eval_feed = create(main_arch + 'EvalFeed')
    else:
        eval_feed = create(cfg.eval_feed)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    _, test_feed_vars = create_feed(eval_feed, False)

    eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
    test_data_feed = fluid.DataFeeder(test_feed_vars.values(), place)

    infer_prog, _, fetch_targets = fluid.io.load_inference_model(
        dirname=FLAGS.model_path,
        executor=exe,
        model_filename=FLAGS.model_name,
        params_filename=FLAGS.params_name)

    # Only the first fetch target is evaluated; the ground-truth entries are
    # appended from the feed data inside eval_run.
    eval_keys = ['bbox', 'gt_box', 'gt_label', 'is_difficult']
    eval_values = [fetch_targets[0], 'gt_box', 'gt_label', 'is_difficult']
    eval_cls = []

    results = eval_run(exe, infer_prog, eval_reader, eval_keys, eval_values,
                       eval_cls, test_data_feed)
    if not results:
        raise ValueError("no evaluation batch was produced by the reader")

    # This script loads a frozen inference model and never builds a model
    # object, and eval_keys contains no 'mask' entry, so there is no mask
    # resolution to pass on. The old mask branch referenced an undefined
    # ``model`` and could only have ended in a NameError.
    resolution = None
    eval_results(results, eval_feed, cfg.metric, cfg.num_classes, resolution,
                 False, FLAGS.output_eval)
def set_paddle_flags(**kwargs):
    """Set default environment variables from the given keyword arguments.

    Each ``name=value`` pair is written to ``os.environ`` as ``str(value)``,
    but only when ``name`` is not already defined there; existing settings
    are never overwritten.
    """
    for key, value in kwargs.items():
        # setdefault leaves an existing variable as it is.
        os.environ.setdefault(key, str(value))
def _time_inference(exe, program, feed, fetch_list, warmup_times=10,
                    repeats_time=100):
    """Return the mean wall-clock time of one ``exe.run`` call, in ms.

    ``program`` is first run ``warmup_times`` times without timing, then the
    duration of ``repeats_time`` further runs is averaged.
    """
    for _ in range(warmup_times):
        exe.run(program, feed=feed, fetch_list=fetch_list, return_numpy=False)
    start_time = time.time()
    for _ in range(repeats_time):
        exe.run(program, feed=feed, fetch_list=fetch_list, return_numpy=False)
    return (time.time() - start_time) * 1000 / repeats_time


def main():
    """Run a saved inference model on images and save the visualizations.

    Loads the config given by ``FLAGS.config``, collects the images selected
    by ``--infer_img`` / ``--infer_dir``, loads the inference model stored
    under ``FLAGS.model_path`` and, for every batch, draws the detected boxes
    and saves the result under ``FLAGS.output_dir``. The first batch is also
    used to print an average inference time. With ``--use_tb`` the original
    and annotated images are additionally logged through tb-paddle.

    Raises:
        ValueError: if the config has no ``architecture`` entry, if
            ``--model_path`` is missing, or if ``cfg.metric`` is neither
            ``'COCO'`` nor ``'VOC'``.
    """
    if not FLAGS.model_path:
        raise ValueError("--model_path should be set")

    cfg = load_config(FLAGS.config)

    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)

    # Explicit raise instead of ``assert`` so that the check also runs under
    # ``python -O``.
    if cfg.metric not in ('COCO', 'VOC'):
        raise ValueError("unknown metric type {}".format(cfg.metric))

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)

    if 'test_feed' not in cfg:
        test_feed = create(main_arch + 'TestFeed')
    else:
        test_feed = create(cfg.test_feed)

    test_images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img)
    test_feed.dataset.add_images(test_images)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    infer_prog, feed_var_names, fetch_list = fluid.io.load_inference_model(
        dirname=FLAGS.model_path,
        model_filename=FLAGS.model_name,
        params_filename=FLAGS.params_name,
        executor=exe)

    reader = create_reader(test_feed)
    feeder = fluid.DataFeeder(
        place=place, feed_list=feed_var_names, program=infer_prog)

    # parse infer fetches
    if cfg.metric == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    else:
        extra_keys = ['im_id', 'im_shape']
    # NOTE(review): the parsed keys/values are not used below (only 'bbox' is
    # read from the outputs). The call is kept in case parse_fetches, which
    # is defined outside this file, has side effects -- confirm and remove.
    parse_fetches({'bbox': fetch_list}, infer_prog, extra_keys)

    # parse dataset category
    if cfg.metric == 'COCO':
        from ppdet.utils.coco_eval import bbox2out, get_category_info
    else:
        from ppdet.utils.voc_eval import bbox2out, get_category_info

    anno_file = getattr(test_feed.dataset, 'annotation', None)
    with_background = getattr(test_feed, 'with_background', True)
    use_default_label = getattr(test_feed, 'use_default_label', False)
    clsid2catid, catid2name = get_category_info(anno_file, with_background,
                                                use_default_label)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False

    # use tb-paddle to log image
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_image_step = 0
        tb_image_frame = 0  # each frame can display ten pictures at most.

    imid2path = reader.imid2path
    keys = ['bbox']
    compile_prog = fluid.compiler.CompiledProgram(infer_prog)

    for iter_id, data in enumerate(reader()):
        feed_data = [[d[0], d[1]] for d in data]

        # Time the model once, on the first batch only.
        if iter_id == 0:
            elapsed_ms = _time_inference(exe, compile_prog,
                                         feeder.feed(feed_data), fetch_list)
            print("infer time: {} ms/sample".format(elapsed_ms))

        outs = exe.run(compile_prog,
                       feed=feeder.feed(feed_data),
                       fetch_list=fetch_list,
                       return_numpy=False)
        res = {
            k: (np.array(v), v.recursive_sequence_lengths())
            for k, v in zip(keys, outs)
        }
        res['im_id'] = [[d[2] for d in data]]
        logger.info('Infer iter {}'.format(iter_id))

        bbox_results = None
        if 'bbox' in res:
            bbox_results = bbox2out([res], clsid2catid, is_bbox_normalized)
        # Only 'bbox' is ever placed in ``res`` and no model object exists in
        # this script, so there are no mask results. The old mask branch
        # referenced an undefined ``model`` (and, for VOC, an undefined
        # ``mask2out``).
        mask_results = None

        # visualize result
        im_ids = res['im_id'][0]
        for im_id in im_ids:
            image_path = imid2path[int(im_id)]
            image = Image.open(image_path).convert('RGB')

            # use tb-paddle to log original image
            if FLAGS.use_tb:
                original_image_np = np.array(image)
                tb_writer.add_image(
                    "original/frame_{}".format(tb_image_frame),
                    original_image_np,
                    tb_image_step,
                    dataformats='HWC')

            image = visualize_results(image,
                                      int(im_id), catid2name,
                                      FLAGS.draw_threshold, bbox_results,
                                      mask_results)

            # use tb-paddle to log image with bbox
            if FLAGS.use_tb:
                infer_image_np = np.array(image)
                tb_writer.add_image(
                    "bbox/frame_{}".format(tb_image_frame),
                    infer_image_np,
                    tb_image_step,
                    dataformats='HWC')
                tb_image_step += 1
                # start a new frame after every ten logged images
                if tb_image_step % 10 == 0:
                    tb_image_step = 0
                    tb_image_frame += 1

            save_name = get_save_image_name(FLAGS.output_dir, image_path)
            logger.info("Detection bbox results save in {}".format(save_name))
            image.save(save_name, quality=95)
[检测模型数据准备](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/docs/INSTALL_cn.md#%E6%95%B0%E6%8D%AE%E9%9B%86) +- [PaddleSlim使用文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md) + + +## 配置文件说明 + +关于配置文件如何编写您可以参考: + +- [PaddleSlim配置文件编写说明](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#122-%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6%E7%9A%84%E4%BD%BF%E7%94%A8) +- [裁剪策略配置文件编写说明](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#22-%E6%A8%A1%E5%9E%8B%E9%80%9A%E9%81%93%E5%89%AA%E8%A3%81) + +其中,配置文件中的`pruned_params`需要根据当前模型的网络结构特点设置,它用来指定要裁剪的parameters. + +这里以MobileNetV1-YoloV3模型为例,其卷积可以三种:主干网络中的普通卷积,主干网络中的`depthwise convolution`和`yolo block`里的普通卷积。PaddleSlim暂时无法对`depthwise convolution`直接进行剪裁, 因为`depthwise convolution`的`channel`的变化会同时影响到前后的卷积层。我们这里只对主干网络中的普通卷积和`yolo block`里的普通卷积做裁剪。 + +通过以下方式可视化模型结构: + +``` +from paddle.fluid.framework import IrGraph +from paddle.fluid import core + +graph = IrGraph(core.Graph(train_prog.desc), for_test=True) +marked_nodes = set() +for op in graph.all_op_nodes(): + print(op.name()) + if op.name().find('conv') > -1: + marked_nodes.add(op) +graph.draw('.', 'forward', marked_nodes) +``` + +该示例中MobileNetV1-YoloV3模型结构的可视化结果:MobileNetV1-YoloV3.pdf + +同时通过以下命令观察目标卷积层的参数(parameters)的名称和shape: + +``` +for param in fluid.default_main_program().global_block().all_parameters(): + if 'weights' in param.name: + print(param.name, param.shape) +``` + + +从可视化结果,我们可以排除后续会做concat的卷积层,最终得到如下要裁剪的参数名称: + +``` +conv2_1_sep_weights +conv2_2_sep_weights +conv3_1_sep_weights +conv4_1_sep_weights +conv5_1_sep_weights +conv5_2_sep_weights +conv5_3_sep_weights +conv5_4_sep_weights +conv5_5_sep_weights +conv5_6_sep_weights +yolo_block.0.0.0.conv.weights +yolo_block.0.0.1.conv.weights +yolo_block.0.1.0.conv.weights +yolo_block.0.1.1.conv.weights +yolo_block.1.0.0.conv.weights +yolo_block.1.0.1.conv.weights +yolo_block.1.1.0.conv.weights 
+yolo_block.1.1.1.conv.weights +yolo_block.1.2.conv.weights +yolo_block.2.0.0.conv.weights +yolo_block.2.0.1.conv.weights +yolo_block.2.1.1.conv.weights +yolo_block.2.2.conv.weights +yolo_block.2.tip.conv.weights +``` + +``` +(conv2_1_sep_weights)|(conv2_2_sep_weights)|(conv3_1_sep_weights)|(conv4_1_sep_weights)|(conv5_1_sep_weights)|(conv5_2_sep_weights)|(conv5_3_sep_weights)|(conv5_4_sep_weights)|(conv5_5_sep_weights)|(conv5_6_sep_weights)|(yolo_block.0.0.0.conv.weights)|(yolo_block.0.0.1.conv.weights)|(yolo_block.0.1.0.conv.weights)|(yolo_block.0.1.1.conv.weights)|(yolo_block.1.0.0.conv.weights)|(yolo_block.1.0.1.conv.weights)|(yolo_block.1.1.0.conv.weights)|(yolo_block.1.1.1.conv.weights)|(yolo_block.1.2.conv.weights)|(yolo_block.2.0.0.conv.weights)|(yolo_block.2.0.1.conv.weights)|(yolo_block.2.1.1.conv.weights)|(yolo_block.2.2.conv.weights)|(yolo_block.2.tip.conv.weights) +``` + +综上,我们将MobileNetV2配置文件中的`pruned_params`设置为以下正则表达式: + +``` +(conv2_1_sep_weights)|(conv2_2_sep_weights)|(conv3_1_sep_weights)|(conv4_1_sep_weights)|(conv5_1_sep_weights)|(conv5_2_sep_weights)|(conv5_3_sep_weights)|(conv5_4_sep_weights)|(conv5_5_sep_weights)|(conv5_6_sep_weights)|(yolo_block.0.0.0.conv.weights)|(yolo_block.0.0.1.conv.weights)|(yolo_block.0.1.0.conv.weights)|(yolo_block.0.1.1.conv.weights)|(yolo_block.1.0.0.conv.weights)|(yolo_block.1.0.1.conv.weights)|(yolo_block.1.1.0.conv.weights)|(yolo_block.1.1.1.conv.weights)|(yolo_block.1.2.conv.weights)|(yolo_block.2.0.0.conv.weights)|(yolo_block.2.0.1.conv.weights)|(yolo_block.2.1.1.conv.weights)|(yolo_block.2.2.conv.weights)|(yolo_block.2.tip.conv.weights) +``` + +我们可以用上述操作观察其它检测模型的参数名称规律,然后设置合适的正则表达式来剪裁合适的参数。 + +## 训练 + +根据PaddleDetection/tools/train.py编写压缩脚本compress.py。 +在该脚本中定义了Compressor对象,用于执行压缩任务。 + +### 执行示例 + +step1: 设置gpu卡 +``` +export CUDA_VISIBLE_DEVICES=0 +``` +step2: 开始训练 + +使用PaddleDetection提供的配置文件在用8卡进行训练: + +``` +python compress.py \ + -s yolov3_mobilenet_v1_slim.yaml \ + -c 
../../configs/yolov3_mobilenet_v1_voc.yml \ + -o max_iters=258 \ + YoloTrainFeed.batch_size=64 \ + -d "../../dataset/voc" +``` + +>通过命令行覆盖设置max_iters选项,因为PaddleDetection中训练是以`batch`为单位迭代的,并没有涉及`epoch`的概念,但是PaddleSlim需要知道当前训练进行到第几个`epoch`, 所以需要将`max_iters`设置为一个`epoch`内的`batch`的数量。 + +如果要调整训练卡数,需要调整配置文件`yolov3_mobilenet_v1_voc.yml`中的以下参数: + +- **max_iters:** 一个`epoch`中batch的数量,需要设置为`total_num / batch_size`, 其中`total_num`为训练样本总数量,`batch_size`为多卡上总的batch size. +- **YoloTrainFeed.batch_size:** 当使用DataLoader时,表示单张卡上的batch size; 当使用普通reader时,则表示多卡上的总的`batch_size`。`batch_size`受限于显存大小。 +- **LearningRate.base_lr:** 根据多卡的总`batch_size`调整`base_lr`,两者大小正相关,可以简单的按比例进行调整。 +- **LearningRate.schedulers.PiecewiseDecay.milestones:** 请根据batch size的变化对其调整。 +- **LearningRate.schedulers.PiecewiseDecay.LinearWarmup.steps:** 请根据batch size的变化对其进行调整。 + + +以下为4卡训练示例,通过命令行覆盖`yolov3_mobilenet_v1_voc.yml`中的参数: + +``` +python compress.py \ + -s yolov3_mobilenet_v1_slim.yaml \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -o max_iters=258 \ + YoloTrainFeed.batch_size=64 \ + -d "../../dataset/voc" +``` + +以下为2卡训练示例,受显存所限,单卡`batch_size`不变,总`batch_size`减小,`base_lr`减小,一个epoch内batch数量增加,同时需要调整学习率相关参数,如下: +``` +python compress.py \ + -s yolov3_mobilenet_v1_slim.yaml \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -o max_iters=516 \ + LearningRate.base_lr=0.005 \ + YoloTrainFeed.batch_size=32 \ + LearningRate.schedulers='[!PiecewiseDecay {gamma: 0.1, milestones: [110000, 124000]}, !LinearWarmup {start_factor: 0., steps: 2000}]' \ + -d "../../dataset/voc" +``` + +通过`python compress.py --help`查看可配置参数。 +通过`python ../../tools/configure.py ${option_name} help`查看如何通过命令行覆盖配置文件`yolov3_mobilenet_v1_voc.yml`中的参数。 + +### 保存断点(checkpoint) + +如果在配置文件中设置了`checkpoint_path`, 则在压缩任务执行过程中会自动保存断点,当任务异常中断时, +重启任务会自动从`checkpoint_path`路径下按数字顺序加载最新的checkpoint文件。如果不想让重启的任务从断点恢复, +需要修改配置文件中的`checkpoint_path`,或者将`checkpoint_path`路径下文件清空。 + +>注意:配置文件中的信息不会保存在断点中,重启前对配置文件的修改将会生效。 + + +## 评估 + 
+如果在配置文件中设置了`checkpoint_path`,则每个epoch会保存一个压缩后的用于评估的模型, +该模型会保存在`${checkpoint_path}/${epoch_id}/eval_model/`路径下,包含`__model__`和`__params__`两个文件。 +其中,`__model__`用于保存模型结构信息,`__params__`用于保存参数(parameters)信息。 + +如果不需要保存评估模型,可以在定义Compressor对象时,将`save_eval_model`选项设置为False(默认为True)。 + +运行命令为: +``` +python ../eval.py \ + --model_path ${checkpoint_path}/${epoch_id}/eval_model/ \ + --model_name __model__ \ + --params_name __params__ \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" +``` + +## 预测 + +如果在配置文件中设置了`checkpoint_path`,并且在定义Compressor对象时指定了`prune_infer_model`选项,则每个epoch都会 +保存一个`inference model`。该模型是通过删除eval_program中多余的operators而得到的。 + +该模型会保存在`${checkpoint_path}/${epoch_id}/eval_model/`路径下,包含`__model__.infer`和`__params__`两个文件。 +其中,`__model__.infer`用于保存模型结构信息,`__params__`用于保存参数(parameters)信息。 + +更多关于`prune_infer_model`选项的介绍,请参考:[Compressor介绍](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#121-%E5%A6%82%E4%BD%95%E6%94%B9%E5%86%99%E6%99%AE%E9%80%9A%E8%AE%AD%E7%BB%83%E8%84%9A%E6%9C%AC) + +### python预测 + +在脚本PaddleDetection/tools/infer.py中展示了如何使用fluid python API加载使用预测模型进行预测。 + +运行命令为: +``` +python ../infer.py \ + --model_path ${checkpoint_path}/${epoch_id}/eval_model/ \ + --model_name __model__.infer \ + --params_name __params__ \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + --infer_dir ../../demo +``` + +### PaddleLite + +该示例中产出的预测(inference)模型可以直接用PaddleLite进行加载使用。 +关于PaddleLite如何使用,请参考:[PaddleLite使用文档](https://github.com/PaddlePaddle/Paddle-Lite/wiki#%E4%BD%BF%E7%94%A8) + +## 示例结果 + +> 当前release的结果并非超参调优后的最好结果,仅做示例参考,后续我们会优化当前结果。 + +### MobileNetV1-YOLO-V3 + +| FLOPS |Box AP| model_size |Paddle Fluid inference time(ms)| Paddle Lite inference time(ms)| +|---|---|---|---|---| +|baseline|76.2 |93M |- |-| +|-50%|69.48 |51M |- |-| + +## FAQ diff --git a/slim/prune/compress.py b/slim/prune/compress.py new file mode 100644 index 0000000000000000000000000000000000000000..66f9a0cb50514bfbdf2964024064caf6702cfbc4 --- 
def eval_run(exe, compile_program, reader, keys, values, cls, test_feed):
    """
    Run evaluation program, return program outputs.

    Args:
        exe: executor whose ``run`` method executes ``compile_program``.
        compile_program: program passed to ``exe.run``.
        reader: callable returning an iterable of raw batches.
        keys: result names; they are zipped with the fetched output followed
            by the ``gt_box``, ``gt_label`` and ``is_difficult`` feed entries.
        values: fetch targets; only ``values[0]`` is fetched.
        cls: metric objects; when non-empty, each one is reset and ``values``
            is replaced by their accumulated-mAP variables.
        test_feed: feeder whose ``feed`` method turns a raw batch into a
            mapping with ``image``, ``im_size``, ``gt_box``, ``gt_label`` and
            ``is_difficult`` entries.

    Returns:
        A list with one dict per batch, mapping each key to a
        ``(numpy array, recursive sequence lengths)`` tuple.
    """
    results = []
    if cls:
        values = []
        for metric in cls:
            _, accum_map = metric.get_map_var()
            metric.reset(exe)
            values.append(accum_map)

    iter_id = 0
    images_num = 0
    start_time = time.time()
    has_bbox = 'bbox' in keys
    for data in reader():
        data = test_feed.feed(data)
        feed_data = {'image': data['image'], 'im_size': data['im_size']}
        outs = exe.run(compile_program,
                       feed=feed_data,
                       fetch_list=[values[0]],
                       return_numpy=False)
        # The ground truth travels with the feed data, not through the
        # program, so it is appended to the fetched outputs here.
        outs.extend(
            [data['gt_box'], data['gt_label'], data['is_difficult']])
        res = {
            k: (np.array(v), v.recursive_sequence_lengths())
            for k, v in zip(keys, outs)
        }
        results.append(res)
        if iter_id % 100 == 0:
            logger.info('Test iter {}'.format(iter_id))
        iter_id += 1
        # With a bbox output, count the entries of its first sequence-length
        # level; otherwise count one unit per iteration.
        images_num += len(res['bbox'][1][0]) if has_bbox else 1
    logger.info('Test finish iter {}'.format(iter_id))

    elapsed = time.time() - start_time
    # An empty reader on a coarse clock can give a zero elapsed time; report
    # a rate of 0 instead of raising ZeroDivisionError.
    fps = images_num / elapsed if elapsed > 0 else 0.0
    if has_bbox:
        logger.info('Total number of images: {}, inference time: {} fps.'.
                    format(images_num, fps))
    else:
        logger.info('Total iteration: {}, inference time: {} batch/s.'.format(
            images_num, fps))

    return results
+ format(images_num, fps)) + else: + logger.info('Total iteration: {}, inference time: {} batch/s.'.format( + images_num, fps)) + + return results + + +def main(): + cfg = load_config(FLAGS.config) + if 'architecture' in cfg: + main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + if 'log_iter' not in cfg: + cfg.log_iter = 20 + + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + + if cfg.use_gpu: + devices_num = fluid.core.get_cuda_device_count() + else: + devices_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + if 'train_feed' not in cfg: + train_feed = create(main_arch + 'TrainFeed') + else: + train_feed = create(cfg.train_feed) + + if 'eval_feed' not in cfg: + eval_feed = create(main_arch + 'EvalFeed') + else: + eval_feed = create(cfg.eval_feed) + + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + lr_builder = create('LearningRate') + optim_builder = create('OptimizerBuilder') + + # build program + startup_prog = fluid.Program() + train_prog = fluid.Program() + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + model = create(main_arch) + _, feed_vars = create_feed(train_feed, False) + train_fetches = model.train(feed_vars) + loss = train_fetches['loss'] + lr = lr_builder() + optimizer = optim_builder(lr) + optimizer.minimize(loss) + + train_reader = create_reader(train_feed, cfg.max_iters, FLAGS.dataset_dir) + + # parse train fetches + train_keys, train_values, _ = parse_fetches(train_fetches) + train_keys.append("lr") + train_values.append(lr.name) + + train_fetch_list = [] + for k, v in zip(train_keys, train_values): + train_fetch_list.append((k, v)) + + eval_prog = fluid.Program() + with fluid.program_guard(eval_prog, startup_prog): + with fluid.unique_name.guard(): + model = create(main_arch) + _, test_feed_vars = create_feed(eval_feed, 
False) + fetches = model.eval(test_feed_vars) + + eval_prog = eval_prog.clone(True) + + eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir) + test_data_feed = fluid.DataFeeder(test_feed_vars.values(), place) + + # parse eval fetches + extra_keys = [] + if cfg.metric == 'COCO': + extra_keys = ['im_info', 'im_id', 'im_shape'] + if cfg.metric == 'VOC': + extra_keys = ['gt_box', 'gt_label', 'is_difficult'] + eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, + extra_keys) + eval_fetch_list = [] + for k, v in zip(eval_keys, eval_values): + eval_fetch_list.append((k, v)) + + exe.run(startup_prog) + checkpoint.load_params(exe, train_prog, cfg.pretrain_weights) + + best_box_ap_list = [] + + def eval_func(program, scope): + + #place = fluid.CPUPlace() + #exe = fluid.Executor(place) + results = eval_run(exe, program, eval_reader, eval_keys, eval_values, + eval_cls, test_data_feed) + + resolution = None + if 'mask' in results[0]: + resolution = model.mask_head.resolution + box_ap_stats = eval_results(results, eval_feed, cfg.metric, + cfg.num_classes, resolution, False, + FLAGS.output_eval) + if len(best_box_ap_list) == 0: + best_box_ap_list.append(box_ap_stats[0]) + elif box_ap_stats[0] > best_box_ap_list[0]: + best_box_ap_list[0] = box_ap_stats[0] + logger.info("Best test box ap: {}".format(best_box_ap_list[0])) + return best_box_ap_list[0] + + test_feed = [('image', test_feed_vars['image'].name), + ('im_size', test_feed_vars['im_size'].name)] + + com = Compressor( + place, + fluid.global_scope(), + train_prog, + train_reader=train_reader, + train_feed_list=[(key, value.name) for key, value in feed_vars.items()], + train_fetch_list=train_fetch_list, + eval_program=eval_prog, + eval_reader=eval_reader, + eval_feed_list=test_feed, + eval_func={'map': eval_func}, + eval_fetch_list=[eval_fetch_list[0]], + save_eval_model=True, + prune_infer_model=[["image", "im_size"], ["multiclass_nms_0.tmp_0"]], + train_optimizer=None) + 
com.config(FLAGS.slim_file) + com.run() + + +if __name__ == '__main__': + parser = ArgsParser() + parser.add_argument( + "-s", + "--slim_file", + default=None, + type=str, + help="Config file of PaddleSlim.") + parser.add_argument( + "--output_eval", + default=None, + type=str, + help="Evaluation directory, default is current directory.") + parser.add_argument( + "-d", + "--dataset_dir", + default=None, + type=str, + help="Dataset path, same as DataFeed.dataset.dataset_dir") + FLAGS = parser.parse_args() + main() diff --git a/slim/prune/images/MobileNetV1-YoloV3.pdf b/slim/prune/images/MobileNetV1-YoloV3.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f5d3a22db5030ffc6beb1e7f8c92bddd61e366e6 Binary files /dev/null and b/slim/prune/images/MobileNetV1-YoloV3.pdf differ diff --git a/slim/prune/yolov3_mobilenet_v1_slim.yaml b/slim/prune/yolov3_mobilenet_v1_slim.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff9859eafe5872dee13f33a3703502e334d1c85e --- /dev/null +++ b/slim/prune/yolov3_mobilenet_v1_slim.yaml @@ -0,0 +1,23 @@ +version: 1.0 +pruners: + pruner_1: + class: 'StructurePruner' + pruning_axis: + '*': 0 + criterions: + '*': 'l1_norm' +strategies: + uniform_pruning_strategy: + class: 'UniformPruneStrategy' + pruner: 'pruner_1' + start_epoch: 0 + target_ratio: 0.5 + pruned_params: 
'(conv2_1_sep_weights)|(conv2_2_sep_weights)|(conv3_1_sep_weights)|(conv4_1_sep_weights)|(conv5_1_sep_weights)|(conv5_2_sep_weights)|(conv5_3_sep_weights)|(conv5_4_sep_weights)|(conv5_5_sep_weights)|(conv5_6_sep_weights)|(yolo_block.0.0.0.conv.weights)|(yolo_block.0.0.1.conv.weights)|(yolo_block.0.1.0.conv.weights)|(yolo_block.0.1.1.conv.weights)|(yolo_block.1.0.0.conv.weights)|(yolo_block.1.0.1.conv.weights)|(yolo_block.1.1.0.conv.weights)|(yolo_block.1.1.1.conv.weights)|(yolo_block.1.2.conv.weights)|(yolo_block.2.0.0.conv.weights)|(yolo_block.2.0.1.conv.weights)|(yolo_block.2.1.1.conv.weights)|(yolo_block.2.2.conv.weights)|(yolo_block.2.tip.conv.weights)' + metric_name: 'acc_top1' +compressor: + epoch: 271 + eval_epoch: 10 + #init_model: './checkpoints/0' # Please enable this option for loading checkpoint. + checkpoint_path: './checkpoints/' + strategies: + - uniform_pruning_strategy diff --git a/slim/quantization/README.md b/slim/quantization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..acb4c9efcbd49bccc4682c7eb7af294885e5d42a --- /dev/null +++ b/slim/quantization/README.md @@ -0,0 +1,241 @@ +>运行该示例前请安装Paddle1.6或更高版本 + +# 检测模型量化压缩示例 + +## 概述 + +该示例使用PaddleSlim提供的[量化压缩策略](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D)对检测模型进行压缩。 +在阅读该示例前,建议您先了解以下内容: + +- [检测模型的常规训练方法](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleDetection) +- [PaddleSlim使用文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md) + + +## 配置文件说明 + +关于配置文件如何编写您可以参考: + +- [PaddleSlim配置文件编写说明](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#122-%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6%E7%9A%84%E4%BD%BF%E7%94%A8) +- [量化策略配置文件编写说明](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/usage.md#21-%E9%87%8F%E5%8C%96%E8%AE%AD%E7%BB%83) +
+其中save_out_nodes需要得到检测结果的Variable的名称,下面介绍如何确定save_out_nodes的参数 +以MobileNet V1为例,可在compress.py中构建好网络之后,直接打印Variable得到Variable的名称信息。 +代码示例: +``` + eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, + extra_keys) + # print(eval_values) +``` +根据运行结果可看到Variable的名字为:`multiclass_nms_0.tmp_0`。 +## 训练 + +根据 [PaddleCV/PaddleDetection/tools/train.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/PaddleDetection/tools/train.py) 编写压缩脚本compress.py。 +在该脚本中定义了Compressor对象,用于执行压缩任务。 + +通过`python compress.py --help`查看可配置参数,简述如下: + +- config: 检测库的配置,其中配置了训练超参数、数据集信息等。 +- slim_file: PaddleSlim的配置文件,参见[配置文件说明](#配置文件说明)。 + +您可以通过运行以下命令运行该示例。 + +step1: 设置gpu卡 +``` +export CUDA_VISIBLE_DEVICES=0 +``` +step2: 开始训练 +使用PaddleDetection提供的配置文件在用8卡进行训练: + +``` +python compress.py \ + -s yolov3_mobilenet_v1_slim.yaml \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" \ + -o max_iters=258 \ + LearningRate.base_lr=0.0001 \ + LearningRate.schedulers="[!PiecewiseDecay {gamma: 0.1, milestones: [258, 516]}]" \ + pretrain_weights=https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar \ + YoloTrainFeed.batch_size=64 +``` + +>通过命令行覆盖设置max_iters选项,因为PaddleDetection中训练是以`batch`为单位迭代的,并没有涉及`epoch`的概念,但是PaddleSlim需要知道当前训练进行到第几个`epoch`, 所以需要将`max_iters`设置为一个`epoch`内的`batch`的数量。 + +如果要调整训练卡数,需要调整配置文件`yolov3_mobilenet_v1_voc.yml`中的以下参数: + +- **max_iters:** 一个`epoch`中batch的数量,需要设置为`total_num / batch_size`, 其中`total_num`为训练样本总数量,`batch_size`为多卡上总的batch size. 
+- **YoloTrainFeed.batch_size:** 当使用DataLoader时,表示单张卡上的batch size; 当使用普通reader时,则表示多卡上的总的batch_size。batch_size受限于显存大小。 +- **LearningRate.base_lr:** 根据多卡的总`batch_size`调整`base_lr`,两者大小正相关,可以简单的按比例进行调整。 +- **LearningRate.schedulers.PiecewiseDecay.milestones:** 请根据batch size的变化对其调整。 +- **LearningRate.schedulers.PiecewiseDecay.LinearWarmup.steps:** 请根据batch size的变化对其进行调整。 + + +以下为4卡训练示例,通过命令行覆盖`yolov3_mobilenet_v1_voc.yml`中的参数: + +``` +python compress.py \ + -s yolov3_mobilenet_v1_slim.yaml \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" \ + -o max_iters=258 \ + LearningRate.base_lr=0.0001 \ + LearningRate.schedulers="[!PiecewiseDecay {gamma: 0.1, milestones: [258, 516]}]" \ + pretrain_weights=https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar \ + YoloTrainFeed.batch_size=64 + +``` + +以下为2卡训练示例,受显存所限,单卡`batch_size`不变, 总`batch_size`减小,`base_lr`减小,一个epoch内batch数量增加,同时需要调整学习率相关参数,如下: + +``` +python compress.py \ + -s yolov3_mobilenet_v1_slim.yaml \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" \ + -o max_iters=516 \ + LearningRate.base_lr=0.00005 \ + LearningRate.schedulers="[!PiecewiseDecay {gamma: 0.1, milestones: [516, 1012]}]" \ + pretrain_weights=https://paddlemodels.bj.bcebos.com/object_detection/yolov3_mobilenet_v1_voc.tar \ + YoloTrainFeed.batch_size=32 +``` + +通过`python compress.py --help`查看可配置参数。 +通过`python ../../tools/configure.py ${option_name} help`查看如何通过命令行覆盖配置文件`yolov3_mobilenet_v1_voc.yml`中的参数。 + + + +### 训练时的模型结构 +这部分介绍来源于[量化low-level API介绍](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api#1-%E9%87%8F%E5%8C%96%E8%AE%AD%E7%BB%83low-level-apis%E4%BB%8B%E7%BB%8D)。 + +PaddlePaddle框架中和量化相关的IrPass, 分别有QuantizationTransformPass、QuantizationFreezePass、ConvertToInt8Pass。在训练时,对网络应用了QuantizationTransformPass,作用是在网络中的conv2d、depthwise_conv2d、mul等算子的各个输入前插入连续的量化op和反量化op,并改变相应反向算子的某些输入。示例图如下: +

+
+图1:应用QuantizationTransformPass后的结果 +

+ +### 保存断点(checkpoint) + +如果在配置文件中设置了`checkpoint_path`, 则在压缩任务执行过程中会自动保存断点,当任务异常中断时, +重启任务会自动从`checkpoint_path`路径下按数字顺序加载最新的checkpoint文件。如果不想让重启的任务从断点恢复, +需要修改配置文件中的`checkpoint_path`,或者将`checkpoint_path`路径下文件清空。 + +>注意:配置文件中的信息不会保存在断点中,重启前对配置文件的修改将会生效。 + + +### 保存评估和预测模型 + +如果在配置文件的量化策略中设置了`float_model_save_path`, `int8_model_save_path` 在训练结束后,会保存模型量化压缩之后用于预测的模型。接下来介绍这2种预测模型的区别。 + +#### FP32模型 +在介绍量化训练时的模型结构时介绍了PaddlePaddle框架中和量化相关的IrPass, 分别是QuantizationTransformPass、QuantizationFreezePass、ConvertToInt8Pass。FP32模型是在应用QuantizationFreezePass并删除eval_program中多余的operators之后,保存的模型。 + +QuantizationFreezePass主要用于改变IrGraph中量化op和反量化op的顺序,即将类似图1中的量化op和反量化op顺序改变为图2中的布局。除此之外,QuantizationFreezePass还会将`conv2d`、`depthwise_conv2d`、`mul`等算子的权重离线量化为int8_t范围内的值(但数据类型仍为float32),以减少预测过程中对权重的量化操作,示例如图2: + +

+
+图2:应用QuantizationFreezePass后的结果 +

+ +#### 8-bit模型 +在对训练网络进行QuantizationFreezePass之后,执行ConvertToInt8Pass, +其主要目的是将执行完QuantizationFreezePass后输出的权重类型由`FP32`更改为`INT8`。换言之,用户可以选择将量化后的权重保存为float32类型(不执行ConvertToInt8Pass)或者int8_t类型(执行ConvertToInt8Pass),示例如图3: + +

+
+图3:应用ConvertToInt8Pass后的结果 +

+ +> 综上,可得在量化过程中有以下几种模型结构: + +1. 原始模型 +2. 经QuantizationTransformPass之后得到的适用于训练的量化模型结构,在${checkpoint_path}下保存的`eval_model`是这种结构,在训练过程中每个epoch结束时也使用这个网络结构进行评估,虽然这个模型结构不是最终想要的模型结构,但是每个epoch的评估结果可用来挑选模型。 +3. 经QuantizationFreezePass之后得到的FP32模型结构,具体结构已在上面进行介绍。本文档中列出的数据集的评估结果是对FP32模型结构进行评估得到的结果。这种模型结构在训练过程中只会保存一次,也就是在量化配置文件中设置的`end_epoch`结束时进行保存,如果想将其他epoch的训练结果转化成FP32模型,可使用脚本 slim/quantization/freeze.py进行转化,具体使用方法在[评估](#评估)中介绍。 +4. 经ConvertToInt8Pass之后得到的8-bit模型结构,具体结构已在上面进行介绍。这种模型结构在训练过程中只会保存一次,也就是在量化配置文件中设置的`end_epoch`结束时进行保存,如果想将其他epoch的训练结果转化成8-bit模型,可使用脚本 slim/quantization/freeze.py进行转化,具体使用方法在[评估](#评估)中介绍。 + + +## 评估 + +### 每个epoch保存的评估模型 +因为量化的最终模型只有在end_epoch时保存一次,不能保证保存的模型是最好的,因此 +如果在配置文件中设置了`checkpoint_path`,则每个epoch会保存一个量化后的用于评估的模型, +该模型会保存在`${checkpoint_path}/${epoch_id}/eval_model/`路径下,包含`__model__`和`__params__`两个文件。 +其中,`__model__`用于保存模型结构信息,`__params__`用于保存参数(parameters)信息。模型结构和训练时一样。 + +如果不需要保存评估模型,可以在定义Compressor对象时,将`save_eval_model`选项设置为False(默认为True)。 + +脚本slim/eval.py中为使用该模型在评估数据集上做评估的示例。 +运行命令为: +``` +python ../eval.py \ + --model_path ${checkpoint_path}/${epoch_id}/eval_model/ \ + --model_name __model__ \ + --params_name __params__ \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" +``` + +在评估之后,选取效果最好的epoch的模型,可使用脚本 slim/quantization/freeze.py将该模型转化为以上介绍的2种模型:FP32模型,int8模型,需要配置的参数为: + +- model_path, 加载的模型路径,为`${checkpoint_path}/${epoch_id}/eval_model/` +- weight_quant_type 模型参数的量化方式,和配置文件中的类型保持一致 +- save_path `FP32`, `8-bit` 模型的保存路径,分别为 `${save_path}/float/`, `${save_path}/int8/` + +运行命令示例: +``` +python freeze.py \ + --model_path ${checkpoint_path}/${epoch_id}/eval_model/ \ + --weight_quant_type ${weight_quant_type} \ + --save_path ${any path you want} \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" +``` + +### 最终评估模型 +最终使用的评估模型是FP32模型,脚本slim/eval.py中为使用该模型在评估数据集上做评估的示例。 +运行命令为: +``` +python ../eval.py \ + --model_path ${float_model_path} \ + --model_name model \ +
--params_name weights \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + -d "../../dataset/voc" +``` + +## 预测 + +### python预测 +FP32模型可直接使用原生PaddlePaddle Fluid预测方法进行预测。 + +在脚本slim/infer.py中展示了如何使用fluid python API加载使用预测模型进行预测。 + +运行命令示例: +``` +python ../infer.py \ + --model_path ${save_path}/float \ + --model_name model \ + --params_name weights \ + -c ../../configs/yolov3_mobilenet_v1_voc.yml \ + --infer_dir ../../demo +``` + + +### PaddleLite预测 +FP32模型可使用PaddleLite进行加载预测,可参见教程[Paddle-Lite如何加载运行量化模型](https://github.com/PaddlePaddle/Paddle-Lite/wiki/model_quantization) + + +## 示例结果 + +>当前release的结果并非超参调优后的最好结果,仅做示例参考,后续我们会优化当前结果。 + +### MobileNetV1-YOLO-V3 + +| weight量化方式 | activation量化方式| Box ap |Paddle Fluid inference time(ms)| Paddle Lite inference time(ms)| +|---|---|---|---|---| +|baseline|- |76.2%|- |-| +|abs_max|abs_max|- |- |-| +|abs_max|moving_average_abs_max|- |- |-| +|channel_wise_abs_max|abs_max|- |- |-| + + +## FAQ diff --git a/slim/quantization/compress.py b/slim/quantization/compress.py new file mode 100644 index 0000000000000000000000000000000000000000..b4a5553cf46eabcd25f7cc1ce6c50fccefd2e5df --- /dev/null +++ b/slim/quantization/compress.py @@ -0,0 +1,267 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import multiprocessing +import numpy as np +import datetime +from collections import deque +import sys +sys.path.append("../../") +from paddle.fluid.contrib.slim import Compressor +from paddle.fluid.framework import IrGraph +from paddle.fluid import core + + +def set_paddle_flags(**kwargs): + for key, value in kwargs.items(): + if os.environ.get(key, None) is None: + os.environ[key] = str(value) + + +# NOTE(paddle-dev): All of these flags should be set before +# `import paddle`. Otherwise, it would not take any effect. +set_paddle_flags( + FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory +) + +from paddle import fluid + +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.data.data_feed import create_reader + +from ppdet.utils.eval_utils import parse_fetches, eval_results +from ppdet.utils.stats import TrainingStats +from ppdet.utils.cli import ArgsParser, print_total_cfg +from ppdet.utils.check import check_gpu, check_version +import ppdet.utils.checkpoint as checkpoint +from ppdet.modeling.model_input import create_feed + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def eval_run(exe, compile_program, reader, keys, values, cls, test_feed): + """ + Run evaluation program, return program outputs. 
+ """ + iter_id = 0 + results = [] + if len(cls) != 0: + values = [] + for i in range(len(cls)): + _, accum_map = cls[i].get_map_var() + cls[i].reset(exe) + values.append(accum_map) + + images_num = 0 + start_time = time.time() + has_bbox = 'bbox' in keys + for data in reader(): + data = test_feed.feed(data) + feed_data = {'image': data['image'], 'im_size': data['im_size']} + outs = exe.run(compile_program, + feed=feed_data, + fetch_list=[values[0]], + return_numpy=False) + outs.append(data['gt_box']) + outs.append(data['gt_label']) + outs.append(data['is_difficult']) + res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(keys, outs) + } + results.append(res) + if iter_id % 100 == 0: + logger.info('Test iter {}'.format(iter_id)) + iter_id += 1 + images_num += len(res['bbox'][1][0]) if has_bbox else 1 + logger.info('Test finish iter {}'.format(iter_id)) + + end_time = time.time() + fps = images_num / (end_time - start_time) + if has_bbox: + logger.info('Total number of images: {}, inference time: {} fps.'. 
+ format(images_num, fps)) + else: + logger.info('Total iteration: {}, inference time: {} batch/s.'.format( + images_num, fps)) + + return results + + +def main(): + cfg = load_config(FLAGS.config) + if 'architecture' in cfg: + main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + if 'log_iter' not in cfg: + cfg.log_iter = 20 + + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + # print_total_cfg(cfg) + #check_version() + if cfg.use_gpu: + devices_num = fluid.core.get_cuda_device_count() + else: + devices_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + if 'train_feed' not in cfg: + train_feed = create(main_arch + 'TrainFeed') + else: + train_feed = create(cfg.train_feed) + + if 'eval_feed' not in cfg: + eval_feed = create(main_arch + 'EvalFeed') + else: + eval_feed = create(cfg.eval_feed) + + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + lr_builder = create('LearningRate') + optim_builder = create('OptimizerBuilder') + + # build program + startup_prog = fluid.Program() + train_prog = fluid.Program() + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + model = create(main_arch) + _, feed_vars = create_feed(train_feed, False) + train_fetches = model.train(feed_vars) + loss = train_fetches['loss'] + lr = lr_builder() + optimizer = optim_builder(lr) + optimizer.minimize(loss) + + train_reader = create_reader(train_feed, cfg.max_iters, FLAGS.dataset_dir) + + # parse train fetches + train_keys, train_values, _ = parse_fetches(train_fetches) + train_values.append(lr) + + train_fetch_list = [] + for k, v in zip(train_keys, train_values): + train_fetch_list.append((k, v)) + print("train_fetch_list: {}".format(train_fetch_list)) + + eval_prog = fluid.Program() + with fluid.program_guard(eval_prog, startup_prog): + with fluid.unique_name.guard(): + 
model = create(main_arch) + _, test_feed_vars = create_feed(eval_feed, False) + fetches = model.eval(test_feed_vars) + eval_prog = eval_prog.clone(True) + + eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir) + #eval_pyreader.decorate_sample_list_generator(eval_reader, place) + test_data_feed = fluid.DataFeeder(test_feed_vars.values(), place) + + # parse eval fetches + extra_keys = [] + if cfg.metric == 'COCO': + extra_keys = ['im_info', 'im_id', 'im_shape'] + if cfg.metric == 'VOC': + extra_keys = ['gt_box', 'gt_label', 'is_difficult'] + eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, + extra_keys) + # print(eval_values) + + eval_fetch_list = [] + for k, v in zip(eval_keys, eval_values): + eval_fetch_list.append((k, v)) + + exe.run(startup_prog) + + start_iter = 0 + + checkpoint.load_params(exe, train_prog, cfg.pretrain_weights) + + best_box_ap_list = [] + + def eval_func(program, scope): + + #place = fluid.CPUPlace() + #exe = fluid.Executor(place) + results = eval_run(exe, program, eval_reader, eval_keys, eval_values, + eval_cls, test_data_feed) + + resolution = None + if 'mask' in results[0]: + resolution = model.mask_head.resolution + box_ap_stats = eval_results(results, eval_feed, cfg.metric, + cfg.num_classes, resolution, False, + FLAGS.output_eval) + if len(best_box_ap_list) == 0: + best_box_ap_list.append(box_ap_stats[0]) + elif box_ap_stats[0] > best_box_ap_list[0]: + best_box_ap_list[0] = box_ap_stats[0] + logger.info("Best test box ap: {}".format(best_box_ap_list[0])) + return best_box_ap_list[0] + + test_feed = [('image', test_feed_vars['image'].name), + ('im_size', test_feed_vars['im_size'].name)] + + com = Compressor( + place, + fluid.global_scope(), + train_prog, + train_reader=train_reader, + train_feed_list=[(key, value.name) for key, value in feed_vars.items()], + train_fetch_list=train_fetch_list, + eval_program=eval_prog, + eval_reader=eval_reader, + eval_feed_list=test_feed, + eval_func={'map': eval_func}, + 
eval_fetch_list=[eval_fetch_list[0]], + prune_infer_model=[["image", "im_size"], ["multiclass_nms_0.tmp_0"]], + train_optimizer=None) + com.config(FLAGS.slim_file) + com.run() + + +if __name__ == '__main__': + parser = ArgsParser() + parser.add_argument( + "-s", + "--slim_file", + default=None, + type=str, + help="Config file of PaddleSlim.") + parser.add_argument( + "--output_eval", + default=None, + type=str, + help="Evaluation directory, default is current directory.") + parser.add_argument( + "-d", + "--dataset_dir", + default=None, + type=str, + help="Dataset path, same as DataFeed.dataset.dataset_dir") + FLAGS = parser.parse_args() + main() diff --git a/slim/quantization/freeze.py b/slim/quantization/freeze.py new file mode 100644 index 0000000000000000000000000000000000000000..38c06578e3d22e1cc4f2bdcc933298553c1c1f37 --- /dev/null +++ b/slim/quantization/freeze.py @@ -0,0 +1,239 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import multiprocessing +import numpy as np +import datetime +from collections import deque +import sys +sys.path.append("../../") +from paddle.fluid.contrib.slim import Compressor +from paddle.fluid.framework import IrGraph +from paddle.fluid import core +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass +from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass +from paddle.fluid.contrib.slim.quantization import TransformForMobilePass + + +def set_paddle_flags(**kwargs): + for key, value in kwargs.items(): + if os.environ.get(key, None) is None: + os.environ[key] = str(value) + + +# NOTE(paddle-dev): All of these flags should be set before +# `import paddle`. Otherwise, it would not take any effect. +set_paddle_flags( + FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory +) + +from paddle import fluid + +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.data.data_feed import create_reader + +from ppdet.utils.eval_utils import parse_fetches, eval_results +from ppdet.utils.stats import TrainingStats +from ppdet.utils.cli import ArgsParser +from ppdet.utils.check import check_gpu +import ppdet.utils.checkpoint as checkpoint +from ppdet.modeling.model_input import create_feed + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def eval_run(exe, compile_program, reader, keys, values, cls, test_feed): + """ + Run evaluation program, return program outputs. 
+ """ + iter_id = 0 + results = [] + + images_num = 0 + start_time = time.time() + has_bbox = 'bbox' in keys + for data in reader(): + data = test_feed.feed(data) + feed_data = {'image': data['image'], 'im_size': data['im_size']} + outs = exe.run(compile_program, + feed=feed_data, + fetch_list=values[0], + return_numpy=False) + outs.append(data['gt_box']) + outs.append(data['gt_label']) + outs.append(data['is_difficult']) + res = { + k: (np.array(v), v.recursive_sequence_lengths()) + for k, v in zip(keys, outs) + } + results.append(res) + if iter_id % 100 == 0: + logger.info('Test iter {}'.format(iter_id)) + iter_id += 1 + images_num += len(res['bbox'][1][0]) if has_bbox else 1 + logger.info('Test finish iter {}'.format(iter_id)) + + end_time = time.time() + fps = images_num / (end_time - start_time) + if has_bbox: + logger.info('Total number of images: {}, inference time: {} fps.'. + format(images_num, fps)) + else: + logger.info('Total iteration: {}, inference time: {} batch/s.'.format( + images_num, fps)) + + return results + + +def main(): + cfg = load_config(FLAGS.config) + if 'architecture' in cfg: + main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + if 'log_iter' not in cfg: + cfg.log_iter = 20 + + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + + if cfg.use_gpu: + devices_num = fluid.core.get_cuda_device_count() + else: + devices_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + if 'eval_feed' not in cfg: + eval_feed = create(main_arch + 'EvalFeed') + else: + eval_feed = create(cfg.eval_feed) + + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + _, test_feed_vars = create_feed(eval_feed, False) + + eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir) + #eval_pyreader.decorate_sample_list_generator(eval_reader, place) + test_data_feed = 
fluid.DataFeeder(test_feed_vars.values(), place) + + assert os.path.exists(FLAGS.model_path) + infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model( + dirname=FLAGS.model_path, + executor=exe, + model_filename='__model__.infer', + params_filename='__params__') + + eval_keys = ['bbox', 'gt_box', 'gt_label', 'is_difficult'] + eval_values = [ + 'multiclass_nms_0.tmp_0', 'gt_box', 'gt_label', 'is_difficult' + ] + eval_cls = [] + eval_values[0] = fetch_targets[0] + + results = eval_run(exe, infer_prog, eval_reader, eval_keys, eval_values, + eval_cls, test_data_feed) + + resolution = None + if 'mask' in results[0]: + resolution = model.mask_head.resolution + box_ap_stats = eval_results(results, eval_feed, cfg.metric, cfg.num_classes, + resolution, False, FLAGS.output_eval) + + logger.info("freeze the graph for inference") + test_graph = IrGraph(core.Graph(infer_prog.desc), for_test=True) + + freeze_pass = QuantizationFreezePass( + scope=fluid.global_scope(), + place=place, + weight_quantize_type=FLAGS.weight_quant_type) + freeze_pass.apply(test_graph) + server_program = test_graph.to_program() + fluid.io.save_inference_model( + dirname=os.path.join(FLAGS.save_path, 'float'), + feeded_var_names=feed_names, + target_vars=fetch_targets, + executor=exe, + main_program=server_program, + model_filename='model', + params_filename='weights') + + logger.info("convert the weights into int8 type") + convert_int8_pass = ConvertToInt8Pass( + scope=fluid.global_scope(), place=place) + convert_int8_pass.apply(test_graph) + server_int8_program = test_graph.to_program() + fluid.io.save_inference_model( + dirname=os.path.join(FLAGS.save_path, 'int8'), + feeded_var_names=feed_names, + target_vars=fetch_targets, + executor=exe, + main_program=server_int8_program, + model_filename='model', + params_filename='weights') + + logger.info("convert the freezed pass to paddle-lite execution") + mobile_pass = TransformForMobilePass() + mobile_pass.apply(test_graph) + mobile_program 
= test_graph.to_program() + fluid.io.save_inference_model( + dirname=os.path.join(FLAGS.save_path, 'mobile'), + feeded_var_names=feed_names, + target_vars=fetch_targets, + executor=exe, + main_program=mobile_program, + model_filename='model', + params_filename='weights') + + +if __name__ == '__main__': + parser = ArgsParser() + parser.add_argument( + "-m", "--model_path", default=None, type=str, help="path of checkpoint") + parser.add_argument( + "--output_eval", + default=None, + type=str, + help="Evaluation directory, default is current directory.") + parser.add_argument( + "-d", + "--dataset_dir", + default=None, + type=str, + help="Dataset path, same as DataFeed.dataset.dataset_dir") + parser.add_argument( + "--weight_quant_type", + default='abs_max', + type=str, + help="quantization type for weight") + parser.add_argument( + "--save_path", + default='./output', + type=str, + help="path to save quantization inference model") + + FLAGS = parser.parse_args() + main() diff --git a/slim/quantization/images/ConvertToInt8Pass.png b/slim/quantization/images/ConvertToInt8Pass.png new file mode 100644 index 0000000000000000000000000000000000000000..8b5849819c0bc8e592dc8f864d8945330df85ab1 Binary files /dev/null and b/slim/quantization/images/ConvertToInt8Pass.png differ diff --git a/slim/quantization/images/FreezePass.png b/slim/quantization/images/FreezePass.png new file mode 100644 index 0000000000000000000000000000000000000000..acd2b0a890a8af85bec6eecdb22e47ad386a178c Binary files /dev/null and b/slim/quantization/images/FreezePass.png differ diff --git a/slim/quantization/images/TransformForMobilePass.png b/slim/quantization/images/TransformForMobilePass.png new file mode 100644 index 0000000000000000000000000000000000000000..4104cacc67af0be1c7bc152696e2ae544127aace Binary files /dev/null and b/slim/quantization/images/TransformForMobilePass.png differ diff --git a/slim/quantization/images/TransformPass.png b/slim/quantization/images/TransformPass.png new file mode 
100644 index 0000000000000000000000000000000000000000..f29ab62753e0e6ddf28d0c1dda7139705fc24b18 Binary files /dev/null and b/slim/quantization/images/TransformPass.png differ diff --git a/slim/quantization/yolov3_mobilenet_v1_slim.yaml b/slim/quantization/yolov3_mobilenet_v1_slim.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60a66f656f9e419cd862231654ab4eaca6057ea2 --- /dev/null +++ b/slim/quantization/yolov3_mobilenet_v1_slim.yaml @@ -0,0 +1,20 @@ +version: 1.0 +strategies: + quantization_strategy: + class: 'QuantizationStrategy' + start_epoch: 0 + end_epoch: 4 + float_model_save_path: './output/yolov3/float' + mobile_model_save_path: './output/yolov3/mobile' + int8_model_save_path: './output/yolov3/int8' + weight_bits: 8 + activation_bits: 8 + weight_quantize_type: 'abs_max' + activation_quantize_type: 'moving_average_abs_max' + save_in_nodes: ['image', 'im_size'] + save_out_nodes: ['multiclass_nms_0.tmp_0'] +compressor: + epoch: 5 + checkpoint_path: './checkpoints/yolov3/' + strategies: + - quantization_strategy diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/tools/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tools/configure.py b/tools/configure.py new file mode 100644 index 0000000000000000000000000000000000000000..560d161513ae8f0115d8d3d5f97f6a0695642015 --- /dev/null +++ b/tools/configure.py @@ -0,0 +1,202 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +from argparse import ArgumentParser, RawDescriptionHelpFormatter + +import yaml + +from ppdet.core.workspace import get_registered_modules, load_config, dump_value +from ppdet.utils.cli import ColorTTY, print_total_cfg + +color_tty = ColorTTY() + +MISC_CONFIG = { + "architecture": "", + "max_iters": "", + "train_feed": "", + "eval_feed": "", + "test_feed": "", + "pretrain_weights": "", + "save_dir": "", + "weights": "", + "metric": "", + "map_type": "11point", + "log_smooth_window": 20, + "snapshot_iter": 10000, + "log_iter": 20, + "use_gpu": True, + "finetune_exclude_pretrained_params": "", +} + + +def dump_config(module, minimal=False): + args = module.schema.values() + if minimal: + args = [arg for arg in args if not arg.has_default()] + return yaml.dump( + { + module.name: { + arg.name: arg.default if arg.has_default() else "" + for arg in args + } + }, + default_flow_style=False, + default_style='') + + +def list_modules(**kwargs): + target_category = kwargs['category'] + module_schema = get_registered_modules() + module_by_category = {} + + for schema in module_schema.values(): + 
category = schema.category + if target_category is not None and schema.category != target_category: + continue + if category not in module_by_category: + module_by_category[category] = [schema] + else: + module_by_category[category].append(schema) + + for cat, modules in module_by_category.items(): + print("Available modules in the category '{}':".format(cat)) + print("") + max_len = max([len(mod.name) for mod in modules]) + for mod in modules: + print(color_tty.green(mod.name.ljust(max_len)), + mod.doc.split('\n')[0]) + print("") + + +def help_module(**kwargs): + schema = get_registered_modules()[kwargs['module']] + + doc = schema.doc is None and "Not documented" or "{}".format(schema.doc) + func_args = {arg.name: arg.doc for arg in schema.schema.values()} + max_len = max([len(k) for k in func_args.keys()]) + opts = "\n".join([ + "{} {}".format(color_tty.green(k.ljust(max_len)), v) + for k, v in func_args.items() + ]) + template = dump_config(schema) + print("{}\n\n{}\n\n{}\n\n{}\n\n{}\n\n{}\n{}\n".format( + color_tty.bold(color_tty.blue("MODULE DESCRIPTION:")), + doc, + color_tty.bold(color_tty.blue("MODULE OPTIONS:")), + opts, + color_tty.bold(color_tty.blue("CONFIGURATION TEMPLATE:")), + template, + color_tty.bold(color_tty.blue("COMMAND LINE OPTIONS:")), )) + for arg in schema.schema.values(): + print("--opt {}.{}={}".format(schema.name, arg.name, + dump_value(arg.default) + if arg.has_default() else "")) + + +def generate_config(**kwargs): + minimal = kwargs['minimal'] + modules = kwargs['modules'] + module_schema = get_registered_modules() + visited = [] + schema = [] + + def walk(m): + if m in visited: + return + s = module_schema[m] + schema.append(s) + visited.append(m) + + for mod in modules: + walk(mod) + + # XXX try to be smart about when to add header, + # if any "architecture" module, is included, head will be added as well + if any([getattr(m, 'category', None) == 'architecture' for m in schema]): + # XXX for ordered printing + header = "" + for k, 
v in MISC_CONFIG.items(): + header += yaml.dump( + { + k: v + }, default_flow_style=False, default_style='') + print(header) + + for s in schema: + print(dump_config(s, minimal)) + + +# FIXME this is pretty hackish, maybe implement a custom YAML printer? +def analyze_config(**kwargs): + config = load_config(kwargs['file']) + print_total_cfg(config) + + +if __name__ == '__main__': + argv = sys.argv[1:] + + parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) + subparsers = parser.add_subparsers(help='Supported Commands') + list_parser = subparsers.add_parser("list", help="list available modules") + help_parser = subparsers.add_parser( + "help", help="show detail options for module") + generate_parser = subparsers.add_parser( + "generate", help="generate configuration template") + analyze_parser = subparsers.add_parser( + "analyze", help="analyze configuration file") + + list_parser.set_defaults(func=list_modules) + help_parser.set_defaults(func=help_module) + generate_parser.set_defaults(func=generate_config) + analyze_parser.set_defaults(func=analyze_config) + + list_group = list_parser.add_mutually_exclusive_group() + list_group.add_argument( + "-c", + "--category", + type=str, + default=None, + help="list modules for ") + + help_parser.add_argument( + "module", + help="module to show info for", + choices=list(get_registered_modules().keys())) + + generate_parser.add_argument( + "modules", + nargs='+', + help="include these module in generated configuration template", + choices=list(get_registered_modules().keys())) + generate_group = generate_parser.add_mutually_exclusive_group() + generate_group.add_argument( + "--minimal", action='store_true', help="only include required options") + generate_group.add_argument( + "--full", + action='store_false', + dest='minimal', + help="include all options") + + analyze_parser.add_argument("file", help="configuration file to analyze") + + if len(sys.argv) < 2: + parser.print_help() + sys.exit(1) + + args = 
parser.parse_args(argv) + if hasattr(args, 'func'): + args.func(**vars(args)) diff --git a/tools/eval.py b/tools/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..15692d3dd8b13789faaaa6a9a0bb3ed9385f37a6 --- /dev/null +++ b/tools/eval.py @@ -0,0 +1,183 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + + +def set_paddle_flags(**kwargs): + for key, value in kwargs.items(): + if os.environ.get(key, None) is None: + os.environ[key] = str(value) + + +# NOTE(paddle-dev): All of these flags should be set before +# `import paddle`. Otherwise, it would not take any effect. 
+set_paddle_flags( + FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory +) + +import paddle.fluid as fluid + +from ppdet.utils.eval_utils import parse_fetches, eval_run, eval_results, json_eval_results +import ppdet.utils.checkpoint as checkpoint +from ppdet.utils.check import check_gpu +from ppdet.modeling.model_input import create_feed +from ppdet.data.data_feed import create_reader +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.utils.cli import print_total_cfg +from ppdet.utils.cli import ArgsParser + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def main(): + """ + Main evaluate function + """ + cfg = load_config(FLAGS.config) + if 'architecture' in cfg: + main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + print_total_cfg(cfg) + + if 'eval_feed' not in cfg: + eval_feed = create(main_arch + 'EvalFeed') + else: + eval_feed = create(cfg.eval_feed) + + multi_scale_test = getattr(cfg, 'MultiScaleTEST', None) + + # define executor + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + # build program + model = create(main_arch) + startup_prog = fluid.Program() + eval_prog = fluid.Program() + with fluid.program_guard(eval_prog, startup_prog): + with fluid.unique_name.guard(): + pyreader, feed_vars = create_feed(eval_feed) + if multi_scale_test is None: + fetches = model.eval(feed_vars) + else: + fetches = model.eval(feed_vars, multi_scale_test) + eval_prog = eval_prog.clone(True) + reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir) + pyreader.decorate_sample_list_generator(reader, place) + + # eval already exists json file + if FLAGS.json_eval: + logger.info( + "In json_eval mode, 
PaddleDetection will evaluate json files in " + "output_eval directly. And proposal.json, bbox.json and mask.json " + "will be detected by default.") + json_eval_results( + eval_feed, cfg.metric, json_directory=FLAGS.output_eval) + return + + compile_program = fluid.compiler.CompiledProgram( + eval_prog).with_data_parallel() + + # load model + exe.run(startup_prog) + if 'weights' in cfg: + checkpoint.load_params(exe, eval_prog, cfg.weights) + + assert cfg.metric in ['COCO', 'VOC'], \ + "unknown metric type {}".format(cfg.metric) + extra_keys = [] + if cfg.metric == 'COCO': + extra_keys = ['im_info', 'im_id', 'im_shape'] + if cfg.metric == 'VOC': + extra_keys = ['gt_box', 'gt_label', 'is_difficult'] + + keys, values, cls = parse_fetches(fetches, eval_prog, extra_keys) + + # whether output bbox is normalized in model output layer + is_bbox_normalized = False + if hasattr(model, 'is_bbox_normalized') and \ + callable(model.is_bbox_normalized): + is_bbox_normalized = model.is_bbox_normalized() + + sub_eval_prog = None + sub_keys = None + sub_values = None + # build sub-program + if 'Mask' in main_arch and multi_scale_test: + sub_eval_prog = fluid.Program() + with fluid.program_guard(sub_eval_prog, startup_prog): + with fluid.unique_name.guard(): + _, feed_vars = create_feed( + eval_feed, use_pyreader=False, sub_prog_feed=True) + sub_fetches = model.eval( + feed_vars, multi_scale_test, mask_branch=True) + extra_keys = [] + if cfg.metric == 'COCO': + extra_keys = ['im_id', 'im_shape'] + if cfg.metric == 'VOC': + extra_keys = ['gt_box', 'gt_label', 'is_difficult'] + sub_keys, sub_values, _ = parse_fetches(sub_fetches, sub_eval_prog, + extra_keys) + sub_eval_prog = sub_eval_prog.clone(True) + + if 'weights' in cfg: + checkpoint.load_params(exe, sub_eval_prog, cfg.weights) + + results = eval_run(exe, compile_program, pyreader, keys, values, cls, cfg, + sub_eval_prog, sub_keys, sub_values) + + # evaluation + resolution = None + if 'mask' in results[0]: + resolution = 
model.mask_head.resolution + # if map_type not set, use default 11point, only use in VOC eval + map_type = cfg.map_type if 'map_type' in cfg else '11point' + eval_results(results, eval_feed, cfg.metric, cfg.num_classes, resolution, + is_bbox_normalized, FLAGS.output_eval, map_type) + + +if __name__ == '__main__': + parser = ArgsParser() + parser.add_argument( + "--json_eval", + action='store_true', + default=False, + help="Whether to re eval with already exists bbox.json or mask.json") + parser.add_argument( + "-d", + "--dataset_dir", + default=None, + type=str, + help="Dataset path, same as DataFeed.dataset.dataset_dir") + parser.add_argument( + "-f", + "--output_eval", + default=None, + type=str, + help="Evaluation file directory, default is current directory.") + FLAGS = parser.parse_args() + main() diff --git a/tools/export_model.py b/tools/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b0c9edac316220ca2b752fad05eec5437f698de8 --- /dev/null +++ b/tools/export_model.py @@ -0,0 +1,118 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from paddle import fluid + +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.modeling.model_input import create_feed +from ppdet.utils.cli import ArgsParser +import ppdet.utils.checkpoint as checkpoint + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def prune_feed_vars(feeded_var_names, target_vars, prog): + """ + Filter out feed variables which are not in program, + pruned feed variables are only used in post processing + on model output, which are not used in program, such + as im_id to identify image order, im_shape to clip bbox + in image. + """ + exist_var_names = [] + prog = prog.clone() + prog = prog._prune(targets=target_vars) + global_block = prog.global_block() + for name in feeded_var_names: + try: + v = global_block.var(name) + exist_var_names.append(str(v.name)) + except Exception: + logger.info('save_inference_model pruned unused feed ' + 'variables {}'.format(name)) + pass + return exist_var_names + + +def save_infer_model(FLAGS, exe, feed_vars, test_fetches, infer_prog): + cfg_name = os.path.basename(FLAGS.config).split('.')[0] + save_dir = os.path.join(FLAGS.output_dir, cfg_name) + feed_var_names = [var.name for var in feed_vars.values()] + target_vars = list(test_fetches.values()) + feed_var_names = prune_feed_vars(feed_var_names, target_vars, infer_prog) + logger.info("Export inference model to {}, input: {}, output: " + "{}...".format(save_dir, feed_var_names, + [str(var.name) for var in target_vars])) + fluid.io.save_inference_model( + save_dir, + feeded_var_names=feed_var_names, + target_vars=target_vars, + executor=exe, + main_program=infer_prog, + params_filename="__params__") + + +def main(): + cfg = load_config(FLAGS.config) + + if 'architecture' in cfg: + 
main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + + if 'test_feed' not in cfg: + test_feed = create(main_arch + 'TestFeed') + else: + test_feed = create(cfg.test_feed) + + # Use CPU for exporting inference model instead of GPU + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + model = create(main_arch) + + startup_prog = fluid.Program() + infer_prog = fluid.Program() + with fluid.program_guard(infer_prog, startup_prog): + with fluid.unique_name.guard(): + _, feed_vars = create_feed(test_feed, use_pyreader=False) + test_fetches = model.test(feed_vars) + infer_prog = infer_prog.clone(True) + + exe.run(startup_prog) + checkpoint.load_params(exe, infer_prog, cfg.weights) + + save_infer_model(FLAGS, exe, feed_vars, test_fetches, infer_prog) + + +if __name__ == '__main__': + parser = ArgsParser() + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Directory for storing the output model files.") + FLAGS = parser.parse_args() + main() diff --git a/tools/face_eval.py b/tools/face_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..f74d5ba431e2933e346d8430e9ba9b2ad20af170 --- /dev/null +++ b/tools/face_eval.py @@ -0,0 +1,298 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import paddle.fluid as fluid +import numpy as np +from PIL import Image +from collections import OrderedDict + +import ppdet.utils.checkpoint as checkpoint +from ppdet.utils.cli import ArgsParser +from ppdet.utils.check import check_gpu +from ppdet.utils.widerface_eval_utils import get_shrink, bbox_vote, \ + save_widerface_bboxes, save_fddb_bboxes, to_chw_bgr +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.modeling.model_input import create_feed + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def face_img_process(image, + mean=[104., 117., 123.], + std=[127.502231, 127.502231, 127.502231]): + img = np.array(image) + img = to_chw_bgr(img) + img = img.astype('float32') + img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32') + img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32') + img = [img] + img = np.array(img) + return img + + +def face_eval_run(exe, + compile_program, + fetches, + img_root_dir, + gt_file, + pred_dir='output/pred', + eval_mode='widerface', + multi_scale=False): + # load ground truth files + with open(gt_file, 'r') as f: + gt_lines = f.readlines() + imid2path = [] + pos_gt = 0 + while pos_gt < len(gt_lines): + name_gt = gt_lines[pos_gt].strip('\n\t').split()[0] + imid2path.append(name_gt) + pos_gt += 1 + n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0]) + pos_gt += 1 + n_gt + logger.info('The ground truth file load {} images'.format(len(imid2path))) + + dets_dist = OrderedDict() + for iter_id, im_path in enumerate(imid2path): + image_path = os.path.join(img_root_dir, im_path) + if eval_mode == 'fddb': + image_path += '.jpg' + image = Image.open(image_path).convert('RGB') + if multi_scale: + shrink, max_shrink = get_shrink(image.size[1], 
image.size[0]) + det0 = detect_face(exe, compile_program, fetches, image, shrink) + det1 = flip_test(exe, compile_program, fetches, image, shrink) + [det2, det3] = multi_scale_test(exe, compile_program, fetches, image, + max_shrink) + det4 = multi_scale_test_pyramid(exe, compile_program, fetches, image, + max_shrink) + det = np.row_stack((det0, det1, det2, det3, det4)) + dets = bbox_vote(det) + else: + dets = detect_face(exe, compile_program, fetches, image, 1) + if eval_mode == 'widerface': + save_widerface_bboxes(image_path, dets, pred_dir) + else: + dets_dist[im_path] = dets + if iter_id % 100 == 0: + logger.info('Test iter {}'.format(iter_id)) + if eval_mode == 'fddb': + save_fddb_bboxes(dets_dist, pred_dir) + logger.info("Finish evaluation.") + + +def detect_face(exe, compile_program, fetches, image, shrink): + image_shape = [3, image.size[1], image.size[0]] + if shrink != 1: + h, w = int(image_shape[1] * shrink), int(image_shape[2] * shrink) + image = image.resize((w, h), Image.ANTIALIAS) + image_shape = [3, h, w] + + img = face_img_process(image) + detection, = exe.run(compile_program, + feed={'image': img}, + fetch_list=[fetches['bbox']], + return_numpy=False) + detection = np.array(detection) + # layout: xmin, ymin, xmax. 
ymax, score + if np.prod(detection.shape) == 1: + logger.info("No face detected") + return np.array([[0, 0, 0, 0, 0]]) + det_conf = detection[:, 1] + det_xmin = image_shape[2] * detection[:, 2] / shrink + det_ymin = image_shape[1] * detection[:, 3] / shrink + det_xmax = image_shape[2] * detection[:, 4] / shrink + det_ymax = image_shape[1] * detection[:, 5] / shrink + + det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) + return det + + +def flip_test(exe, compile_program, fetches, image, shrink): + img = image.transpose(Image.FLIP_LEFT_RIGHT) + det_f = detect_face(exe, compile_program, fetches, img, shrink) + det_t = np.zeros(det_f.shape) + # image.size: [width, height] + det_t[:, 0] = image.size[0] - det_f[:, 2] + det_t[:, 1] = det_f[:, 1] + det_t[:, 2] = image.size[0] - det_f[:, 0] + det_t[:, 3] = det_f[:, 3] + det_t[:, 4] = det_f[:, 4] + return det_t + + +def multi_scale_test(exe, compile_program, fetches, image, max_shrink): + # Shrink detecting is only used to detect big faces + st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink + det_s = detect_face(exe, compile_program, fetches, image, st) + index = np.where( + np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) + > 30)[0] + det_s = det_s[index, :] + # Enlarge one times + bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2 + det_b = detect_face(exe, compile_program, fetches, image, bt) + + # Enlarge small image x times for small faces + if max_shrink > 2: + bt *= 2 + while bt < max_shrink: + det_b = np.row_stack((det_b, detect_face(exe, compile_program, + fetches, image, bt))) + bt *= 2 + det_b = np.row_stack((det_b, detect_face(exe, compile_program, fetches, + image, max_shrink))) + + # Enlarged images are only used to detect small faces. + if bt > 1: + index = np.where( + np.minimum(det_b[:, 2] - det_b[:, 0] + 1, + det_b[:, 3] - det_b[:, 1] + 1) < 100)[0] + det_b = det_b[index, :] + # Shrinked images are only used to detect big faces. 
+ else: + index = np.where( + np.maximum(det_b[:, 2] - det_b[:, 0] + 1, + det_b[:, 3] - det_b[:, 1] + 1) > 30)[0] + det_b = det_b[index, :] + return det_s, det_b + + +def multi_scale_test_pyramid(exe, compile_program, fetches, image, max_shrink): + # Use image pyramids to detect faces + det_b = detect_face(exe, compile_program, fetches, image, 0.25) + index = np.where( + np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1) + > 30)[0] + det_b = det_b[index, :] + + st = [0.75, 1.25, 1.5, 1.75] + for i in range(len(st)): + if st[i] <= max_shrink: + det_temp = detect_face(exe, compile_program, fetches, image, st[i]) + # Enlarged images are only used to detect small faces. + if st[i] > 1: + index = np.where( + np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, + det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0] + det_temp = det_temp[index, :] + # Shrinked images are only used to detect big faces. + else: + index = np.where( + np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, + det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0] + det_temp = det_temp[index, :] + det_b = np.row_stack((det_b, det_temp)) + return det_b + + +def main(): + """ + Main evaluate function + """ + cfg = load_config(FLAGS.config) + if 'architecture' in cfg: + main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + + if 'eval_feed' not in cfg: + eval_feed = create(main_arch + 'EvalFeed') + else: + eval_feed = create(cfg.eval_feed) + + # define executor + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + # build program + model = create(main_arch) + startup_prog = fluid.Program() + eval_prog = fluid.Program() + with fluid.program_guard(eval_prog, startup_prog): + with fluid.unique_name.guard(): + _, feed_vars = create_feed(eval_feed, use_pyreader=False) + fetches = model.eval(feed_vars) 
+ + eval_prog = eval_prog.clone(True) + + # load model + exe.run(startup_prog) + if 'weights' in cfg: + checkpoint.load_params(exe, eval_prog, cfg.weights) + + assert cfg.metric in ['WIDERFACE'], \ + "unknown metric type {}".format(cfg.metric) + + annotation_file = getattr(eval_feed.dataset, 'annotation', None) + dataset_dir = FLAGS.dataset_dir if FLAGS.dataset_dir else \ + getattr(eval_feed.dataset, 'dataset_dir', None) + img_root_dir = dataset_dir + if FLAGS.eval_mode == "widerface": + image_dir = getattr(eval_feed.dataset, 'image_dir', None) + img_root_dir = os.path.join(dataset_dir, image_dir) + gt_file = os.path.join(dataset_dir, annotation_file) + pred_dir = FLAGS.output_eval if FLAGS.output_eval else 'output/pred' + face_eval_run( + exe, + eval_prog, + fetches, + img_root_dir, + gt_file, + pred_dir=pred_dir, + eval_mode=FLAGS.eval_mode, + multi_scale=FLAGS.multi_scale) + + +if __name__ == '__main__': + parser = ArgsParser() + parser.add_argument( + "-d", + "--dataset_dir", + default=None, + type=str, + help="Dataset path, same as DataFeed.dataset.dataset_dir") + parser.add_argument( + "-f", + "--output_eval", + default=None, + type=str, + help="Evaluation file directory, default is current directory.") + parser.add_argument( + "-e", + "--eval_mode", + default="widerface", + type=str, + help="Evaluation mode, include `widerface` and `fddb`, default is `widerface`." + ) + parser.add_argument( + "--multi_scale", + action='store_true', + default=False, + help="If True it will select `multi_scale` evaluation. Default is `False`, it will select `single-scale` evaluation.") + FLAGS = parser.parse_args() + main() diff --git a/tools/infer.py b/tools/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..ed10814846037dcabdb301e244370848c647a61b --- /dev/null +++ b/tools/infer.py @@ -0,0 +1,269 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import glob + +import numpy as np +from PIL import Image + + +def set_paddle_flags(**kwargs): + for key, value in kwargs.items(): + if os.environ.get(key, None) is None: + os.environ[key] = str(value) + + +# NOTE(paddle-dev): All of these flags should be set before +# `import paddle`. Otherwise, it would not take any effect. +set_paddle_flags( + FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory +) + +from paddle import fluid + +from ppdet.utils.cli import print_total_cfg +from ppdet.core.workspace import load_config, merge_config, create +from ppdet.modeling.model_input import create_feed +from ppdet.data.data_feed import create_reader + +from ppdet.utils.eval_utils import parse_fetches +from ppdet.utils.cli import ArgsParser +from ppdet.utils.check import check_gpu +from ppdet.utils.visualizer import visualize_results +import ppdet.utils.checkpoint as checkpoint + +import logging +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + + +def get_save_image_name(output_dir, image_path): + """ + Get save image name from source image path. 
+ """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + image_name = os.path.split(image_path)[-1] + name, ext = os.path.splitext(image_name) + return os.path.join(output_dir, "{}".format(name)) + ext + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--infer_img or --infer_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + images = [] + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + images.append(infer_img) + return images + + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.extend(glob.glob('{}/*.{}'.format(infer_dir, ext))) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + logger.info("Found {} inference images in total.".format(len(images))) + + return images + + +def main(): + cfg = load_config(FLAGS.config) + + if 'architecture' in cfg: + main_arch = cfg.architecture + else: + raise ValueError("'architecture' not specified in config file.") + + merge_config(FLAGS.opt) + + # check if set use_gpu=True in paddlepaddle cpu version + check_gpu(cfg.use_gpu) + print_total_cfg(cfg) + + if 'test_feed' not in cfg: + test_feed = create(main_arch + 'TestFeed') + else: + test_feed = create(cfg.test_feed) + + test_images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img) + test_feed.dataset.add_images(test_images) + + place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + model = create(main_arch) + + startup_prog = fluid.Program() + infer_prog = fluid.Program() + with fluid.program_guard(infer_prog, 
def main():
    """
    Run inference on the selected images and save visualized results.

    Reads the module-level FLAGS (config, opt, infer_dir, infer_img,
    output_dir, draw_threshold, use_tb, tb_log_dir). Builds the inference
    program for the architecture named in the config, loads `cfg.weights`
    when set, runs every batch produced by the test reader, draws the
    detections on each source image and saves it under FLAGS.output_dir.
    When FLAGS.use_tb is set, the original and the annotated images are
    also written through tb_paddle's SummaryWriter.

    Raises:
        ValueError: if the config does not define 'architecture'.
        AssertionError: if cfg.metric is not 'COCO', 'VOC' or 'WIDERFACE'
            (other assertions may come from get_test_images).
    """
    cfg = load_config(FLAGS.config)

    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    # apply command-line overrides after the config file has been loaded
    merge_config(FLAGS.opt)

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    print_total_cfg(cfg)

    # fall back to the architecture's default '<Arch>TestFeed' when the
    # config does not define its own test_feed
    if 'test_feed' not in cfg:
        test_feed = create(main_arch + 'TestFeed')
    else:
        test_feed = create(cfg.test_feed)

    test_images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img)
    test_feed.dataset.add_images(test_images)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    model = create(main_arch)

    # build the inference graph inside its own program / name scope
    startup_prog = fluid.Program()
    infer_prog = fluid.Program()
    with fluid.program_guard(infer_prog, startup_prog):
        with fluid.unique_name.guard():
            _, feed_vars = create_feed(test_feed, use_pyreader=False)
            test_fetches = model.test(feed_vars)
    infer_prog = infer_prog.clone(True)

    reader = create_reader(test_feed)
    feeder = fluid.DataFeeder(place=place, feed_list=feed_vars.values())

    # run the startup program first, then load trained weights if given
    exe.run(startup_prog)
    if cfg.weights:
        checkpoint.load_params(exe, infer_prog, cfg.weights)

    # parse infer fetches
    assert cfg.metric in ['COCO', 'VOC', 'WIDERFACE'], \
        "unknown metric type {}".format(cfg.metric)
    # extra_keys are fetched in addition to the model outputs; 'im_id' is
    # needed below to map each result back to its image path
    extra_keys = []
    if cfg['metric'] == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg['metric'] == 'VOC' or cfg['metric'] == 'WIDERFACE':
        extra_keys = ['im_id', 'im_shape']
    keys, values, _ = parse_fetches(test_fetches, infer_prog, extra_keys)

    # parse dataset category
    # the helpers are imported lazily per metric; note that mask2out is
    # only imported for the COCO metric
    if cfg.metric == 'COCO':
        from ppdet.utils.coco_eval import bbox2out, mask2out, get_category_info
    if cfg.metric == "VOC":
        from ppdet.utils.voc_eval import bbox2out, get_category_info
    if cfg.metric == "WIDERFACE":
        from ppdet.utils.widerface_eval_utils import bbox2out, get_category_info

    anno_file = getattr(test_feed.dataset, 'annotation', None)
    with_background = getattr(test_feed, 'with_background', True)
    use_default_label = getattr(test_feed, 'use_default_label', False)
    clsid2catid, catid2name = get_category_info(anno_file, with_background,
                                                use_default_label)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # use tb-paddle to log image
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_image_step = 0
        tb_image_frame = 0  # each frame can display ten pictures at most.

    imid2path = reader.imid2path
    for iter_id, data in enumerate(reader()):
        outs = exe.run(infer_prog,
                       feed=feeder.feed(data),
                       fetch_list=values,
                       return_numpy=False)
        # pair every fetched tensor with its sequence lengths, keyed by
        # fetch name
        res = {
            k: (np.array(v), v.recursive_sequence_lengths())
            for k, v in zip(keys, outs)
        }
        logger.info('Infer iter {}'.format(iter_id))

        bbox_results = None
        mask_results = None
        if 'bbox' in res:
            bbox_results = bbox2out([res], clsid2catid, is_bbox_normalized)
        if 'mask' in res:
            mask_results = mask2out([res], clsid2catid,
                                    model.mask_head.resolution)

        # visualize result
        im_ids = res['im_id'][0]
        for im_id in im_ids:
            image_path = imid2path[int(im_id)]
            image = Image.open(image_path).convert('RGB')

            # use tb-paddle to log original image
            if FLAGS.use_tb:
                original_image_np = np.array(image)
                tb_writer.add_image(
                    "original/frame_{}".format(tb_image_frame),
                    original_image_np,
                    tb_image_step,
                    dataformats='HWC')

            image = visualize_results(image,
                                      int(im_id), catid2name,
                                      FLAGS.draw_threshold, bbox_results,
                                      mask_results)

            # use tb-paddle to log image with bbox
            if FLAGS.use_tb:
                infer_image_np = np.array(image)
                tb_writer.add_image(
                    "bbox/frame_{}".format(tb_image_frame),
                    infer_image_np,
                    tb_image_step,
                    dataformats='HWC')
                # after ten images start a new frame tag and restart the
                # step counter
                tb_image_step += 1
                if tb_image_step % 10 == 0:
                    tb_image_step = 0
                    tb_image_frame += 1

            save_name = get_save_image_name(FLAGS.output_dir, image_path)
            logger.info("Detection bbox results save in {}".format(save_name))
            image.save(save_name, quality=95)
def set_paddle_flags(**kwargs):
    """
    Export keyword arguments as environment variables.

    Each keyword becomes a variable of the same name holding `str(value)`.
    Variables that already exist in the environment are left untouched, so
    settings made by the user take precedence over these defaults.
    """
    environ = os.environ
    for flag_name in kwargs:
        if flag_name in environ:
            # already configured by the caller's environment; keep it
            continue
        environ[flag_name] = str(kwargs[flag_name])
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)

from paddle import fluid

from ppdet.experimental import mixed_precision_context
from ppdet.core.workspace import load_config, merge_config, create
from ppdet.data.data_feed import create_reader

from ppdet.utils.cli import print_total_cfg
from ppdet.utils import dist_utils
from ppdet.utils.eval_utils import parse_fetches, eval_run, eval_results
from ppdet.utils.stats import TrainingStats
from ppdet.utils.cli import ArgsParser
from ppdet.utils.check import check_gpu
import ppdet.utils.checkpoint as checkpoint
from ppdet.modeling.model_input import create_feed

import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)


def main():
    """
    Train the detector described by FLAGS.config.

    Reads the module-level FLAGS (config, opt, resume_checkpoint, fp16,
    loss_scale, eval, output_eval, dataset_dir, use_tb, tb_log_dir) and
    sets FLAGS.dist when both PADDLE_TRAINER_ID and PADDLE_TRAINERS_NUM
    are present in the environment. Builds the train program (and the
    eval program when FLAGS.eval is set), restores a checkpoint or
    pretrained weights, then iterates from the start iteration up to
    cfg.max_iters, logging every cfg.log_iter iterations and saving a
    snapshot every cfg.snapshot_iter iterations and at the last one.
    With FLAGS.eval, each snapshot is followed by an evaluation and the
    best-scoring parameters are saved as "best_model".

    Raises:
        ValueError: if the config does not define 'architecture'.
        AssertionError: if --fp16 is used with an 'affine_channel'
            backbone norm type.
    """
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # give every trainer its own seed
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    # apply command-line overrides after the config file has been loaded
    merge_config(FLAGS.opt)

    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # `trainer_id` only exists in distributed mode; the short-circuit on
    # `not FLAGS.dist` keeps it from being read otherwise
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    # fall back to the architecture's default feeds when the config does
    # not define its own
    if 'train_feed' not in cfg:
        train_feed = create(main_arch + 'TrainFeed')
    else:
        train_feed = create(cfg.train_feed)

    if FLAGS.eval:
        if 'eval_feed' not in cfg:
            eval_feed = create(main_arch + 'EvalFeed')
        else:
            eval_feed = create(cfg.eval_feed)

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            train_pyreader, feed_vars = create_feed(train_feed)

            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                train_fetches = model.train(feed_vars)

                loss = train_fetches['loss']
                # the loss is multiplied by the loss-scale variable before
                # minimize() and divided by it again afterwards
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    # the learning rate is fetched last; the loop reads it as outs[-1]
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                eval_pyreader, feed_vars = create_feed(eval_feed)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
        eval_pyreader.decorate_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_box', 'gt_label', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_box']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    # weight restore priority: resume checkpoint, then pretrained weights
    # with BN fusing (only when no parameters are excluded), then plain
    # pretrained weights
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    # ask the reader for the remaining iterations times the device count
    train_reader = create_reader(train_feed, (cfg.max_iters - start_iter) *
                                 devices_num, FLAGS.dataset_dir)
    train_pyreader.decorate_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_pyreader.start()
    start_time = time.time()
    end_time = time.time()

    # snapshots go to <cfg.save_dir>/<config file name up to its first dot>
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    # sliding window of per-iteration durations used for the ETA estimate
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # outs[-1] is the learning rate appended above; the rest line up
        # with train_keys
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # snapshot every cfg.snapshot_iter iterations and at the final
        # iteration; in distributed mode only trainer 0 saves
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_pyreader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(
                    results, eval_feed, cfg.metric, cfg.num_classes, resolution,
                    is_bbox_normalized, FLAGS.output_eval, map_type)

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                # keep a separate copy of the best-scoring parameters
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_pyreader.reset()


def _str2bool(value):
    """
    Convert a command-line string to a bool.

    `type=bool` cannot be used for this: bool("False") is True because any
    non-empty string is truthy. Raises ValueError for unrecognized text,
    which argparse reports as an invalid argument value.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise ValueError("expected a boolean value, got {!r}".format(value))


if __name__ == '__main__':
    parser = ArgsParser()
    parser.add_argument(
        "-r",
        "--resume_checkpoint",
        default=None,
        type=str,
        help="Checkpoint path for resuming training.")
    parser.add_argument(
        "--fp16",
        action='store_true',
        default=False,
        help="Enable mixed precision training.")
    parser.add_argument(
        "--loss_scale",
        default=8.,
        type=float,
        help="Mixed precision training loss scale.")
    parser.add_argument(
        "--eval",
        action='store_true',
        default=False,
        help="Whether to perform evaluation in train")
    parser.add_argument(
        "--output_eval",
        default=None,
        type=str,
        help="Evaluation directory, default is current directory.")
    parser.add_argument(
        "-d",
        "--dataset_dir",
        default=None,
        type=str,
        help="Dataset path, same as DataFeed.dataset.dataset_dir")
    # `--use_tb True` keeps working, `--use_tb False` now really disables
    # logging, and a bare `--use_tb` enables it.
    parser.add_argument(
        "--use_tb",
        type=_str2bool,
        nargs='?',
        const=True,
        default=False,
        help="whether to record the data to Tensorboard.")
    parser.add_argument(
        '--tb_log_dir',
        type=str,
        default="tb_log_dir/scalar",
        help='Tensorboard logging directory for scalar.')
    FLAGS = parser.parse_args()
    main()