[PaddleDetection] add SSD config_demo (#3176)

* add_ssd_config_demo

[PaddleDetection] add SSD config_demo (#3176)
* add_ssd_config_demo
6b1c4dfa · Guanghua Yu · wangguanzhong · 424db1a2 · 6b1c4dfa · 6b1c4dfa
4 changed file
--- a/docs/config_example/ssd_vgg16_300.yml
+++ b/docs/config_example/ssd_vgg16_300.yml
+# Architecture of detection, which is also the prefix of data feed module.
+architecture: SSD
+# Data feed module.
+# Data feed in training.
+train_feed: SSDTrainFeed
+# Data feed in Evaluation.
+eval_feed: SSDEvalFeed
+# Data feed in infer.
+test_feed: SSDTestFeed
+# Use GPU or CPU, true by default.
+use_gpu: true
+# Maximum number of iteration.
+max_iters: 400000
+# Snapshot period. If evaluation is enabled during training, the model is evaluated at each snapshot_iter. 10000 by default.
+snapshot_iter: 10000
+# Smooth the log output in specified iterations, 20 by default.
+log_smooth_window: 20
+# The log in training is displayed once every period.
+log_iter: 20
+# Evaluation method, COCO and VOC are available.
+metric: COCO
+# The path (or URL) of the pretrained weights used to initialize the model before training.
+pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar
+# The directory to save models.
+save_dir: output
+# The path of final model for evaluation and test.
+weights: output/ssd_vgg16_300/model_final
+# Number of classes, 81 for COCO and 21 for VOC.
+num_classes: 81
+# SSD architecture, see https://arxiv.org/abs/1512.02325
+SSD:
+  # backbone instance, defined below.
+  backbone: VGG
+  # `MultiBoxHead` instance, defined below.
+  multi_box_head: MultiBoxHead
+  # fluid.layers.detection_output, Detection Output Layer for SSD.
+  # This operation is to get the detection results by performing following two steps:
+  #   1. Decode input bounding box predictions according to the prior boxes.
+  #   2. Get the final detection results by applying multi-class non maximum suppression (NMS).
+  # Note: this operation doesn't clip the final output bounding boxes to the image window.
+  output_decoder:
+    # The index of background label, the background label will be ignored.
+    # If set to -1, then all categories will be considered.
+    background_label: 0
+    # Number of total bboxes to be kept per image after NMS.
+    keep_top_k: 200
+    # The parameter for adaptive NMS.
+    nms_eta: 1.0
+    # The threshold to be used in NMS.
+    nms_threshold: 0.45
+    # Maximum number of detections to be kept according to the confidences
+    # after filtering detections based on score_threshold.
+    nms_top_k: 400
+    # Threshold to filter out bounding boxes with low confidence score.
+    # If not provided, consider all boxes.
+    score_threshold: 0.01
+# VGG backbone, see https://arxiv.org/abs/1409.1556
+VGG:
+  # the VGG net depth (16 or 19)
+  depth: 16
+  # whether or not extra blocks should be added
+  with_extra_blocks: true
+  # in each extra block, params:
+  # [in_channel, out_channel, padding_size, stride_size, filter_size]
+  extra_block_filters:
+  - [256, 512, 1, 2, 3]
+  - [128, 256, 1, 2, 3]
+  - [128, 256, 0, 1, 3]
+  - [128, 256, 0, 1, 3]
+  # params list of init scale in l2 norm, skip init scale if param is -1.
+  normalizations: [20., -1, -1, -1, -1, -1]
+# fluid.layers.multi_box_head, Generate prior boxes for SSD algorithm.
+# Generate `prior_box` according to the inputs list and other parameters.
+# Each position of the input produces N prior boxes, where N is determined by
+# the count of min_sizes, max_sizes and aspect_ratios. The size of each box
+# lies in the (min_size, max_size) interval, and boxes are generated in sequence
+# according to the aspect_ratios.
+MultiBoxHead:
+  # the base_size is used to get min_size and max_size according to min_ratio and max_ratio.
+  base_size: 300
+  # the aspect ratios of generated prior boxes. The length of input and aspect_ratios must be equal.
+  aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]]
+  # the min ratio of generated prior boxes.
+  min_ratio: 15
+  # the max ratio of generated prior boxes.
+  max_ratio: 90
+  # If len(inputs) <=2, min_sizes must be set up, and the length of min_sizes
+  # should equal to the length of inputs. Default: None.
+  min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0]
+  # If len(inputs) <=2, max_sizes must be set up, and the length of max_sizes
+  # should equal to the length of inputs. Default: None.
+  max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0]
+  # If step_w and step_h are the same, step_w and step_h can be replaced by steps.
+  steps: [8, 16, 32, 64, 100, 300]
+  # Prior boxes center offset. Default: 0.5
+  offset: 0.5
+  # Whether to flip aspect ratios. Default:False.
+  flip: true
+  # The kernel size of conv2d. Default: 1.
+  kernel_size: 3
+  # The padding of conv2d. Default:0.
+  pad: 1
+# Learning rate configuration
+LearningRate:
+  # Base learning rate, 0.01 by default
+  base_lr: 0.001
+  # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default
+  schedulers:
+  # fluid.layers.piecewise_decay
+  # `values` has higher priority; if `values` is null, the learning rate is multiplied by gamma at each stage
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [280000, 360000]
+  # fluid.layers.linear_lr_warmup
+  # Start learning rate equals to base_lr * start_factor
+  - !LinearWarmup
+    start_factor: 0.3333333333333333
+    steps: 500
+# Optimizer module
+OptimizerBuilder:
+  # fluid.optimizer. Training a neural network is in essence an optimization problem.
+  # With forward computation and back propagation, the optimizer uses the back-propagated
+  # gradients to optimize the parameters of the neural network.
+  optimizer:
+    # The Momentum optimizer adds momentum on top of SGD,
+    # reducing the noise in the process of stochastic gradient descent.
+    momentum: 0.9
+    type: Momentum
+  # fluid.regularizer
+  regularizer:
+    # implements the L2 Weight Decay Regularization
+    # Small values of L2 can help prevent over fitting the training data.
+    factor: 0.0005
+    type: L2
+# Data feed module for training
+SSDTrainFeed:
+  # Batch size per device
+  batch_size: 16
+  # list of batch transformations to use
+  batch_transforms: []
+  # The data buffer size
+  bufsize: 10
+  # Dataset module
+  dataset:
+    # Dataset directory
+    dataset_dir: dataset/coco
+    # Annotation file path
+    annotation: annotations/instances_train2017.json
+    # Directory where image files are stored
+    image_dir: train2017
+  # Drop last batch if size is uneven, false by default
+  drop_last: true
+  # List of data fields needed
+  fields: [image, gt_box, gt_label]
+  # list of image dims
+  image_shape: [3, 300, 300]
+  # number of workers processes (or threads)
+  num_workers: 8
+  # List of sample transformations to use
+  sample_transforms:
+  # Transform the image data to numpy format.
+  - !DecodeImage
+    # whether to convert BGR to RGB
+    to_rgb: true  # default: true
+    # whether or not to mixup image and gt_bbox/gt_score
+    with_mixup: false   # default: false
+  # Transform the bounding box's coordinates to [0,1].
+  - !NormalizeBox {}
+  # Modify image brightness, contrast, saturation and hue, reorder channels, etc.
+  - !RandomDistort
+    # brightness_lower/ brightness_upper (float): the brightness
+    # between brightness_lower and brightness_upper
+    brightness_lower: 0.875
+    brightness_upper: 1.125
+    # brightness_prob (float): the probability of changing brightness
+    brightness_prob: 0.5
+    # contrast_lower/ contrast_upper (float): the contrast between
+    # contrast_lower and contrast_upper
+    contrast_lower: 0.5
+    contrast_upper: 1.5
+    # contrast_prob (float): the probability of changing contrast
+    contrast_prob: 0.5
+    # count (int): the number of kinds of distortion to perform
+    count: 4
+    # hue_lower/ hue_upper (float): the hue between hue_lower and hue_upper
+    hue_lower: -18
+    hue_upper: 18
+    # hue_prob (float): the probability of changing hue
+    hue_prob: 0.5
+    # is_order (bool): whether to determine the order of distortion
+    is_order: true
+    # saturation_lower/ saturation_upper (float): the saturation
+    # between saturation_lower and saturation_upper
+    saturation_lower: 0.5
+    saturation_upper: 1.5
+    # saturation_prob (float): the probability of changing saturation
+    saturation_prob: 0.5
+  # Expand the image and modify bounding box.
+  #   Operators:
+  #     1. Scale the image width and height.
+  #     2. Construct new images with new height and width.
+  #     3. Fill the new image with the mean.
+  #     4. Put original image into new image.
+  #     5. Rescale the bounding box.
+  #     6. Determine if the new bbox is satisfied in the new image.
+  - !ExpandImage
+    # max_ratio (float): the ratio of expanding
+    max_ratio: 4
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # prob (float): the probability of expanding image
+    prob: 0.5
+  # Crop the image and modify bounding box.
+  #   Operators:
+  #     1. Scale the image width and height.
+  #     2. Crop the image according to a random sample.
+  #     3. Rescale the bounding box.
+  #     4. Determine if the new bbox is satisfied in the new image.
+  - !CropImage
+    # avoid_no_bbox (bool): whether to avoid the
+    # situation where the box does not appear.
+    avoid_no_bbox: false
+    # batch_sampler (list): Multiple sets of different parameters for cropping.
+    batch_sampler:
+    - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]
+    # satisfy_all (bool): whether all boxes must satisfy.
+    satisfy_all: false
+  # Rescale image to the specified target size, and capped at max_size if max_size != 0.
+  # If target_size is a list, select a scale randomly as the specified target size.
+  - !ResizeImage
+    # Resize method, cv2.INTER_LINEAR(1) by default
+    interp: 1
+    # max_size (int): the max size of image
+    max_size: 0
+    # target_size (int|list): the target size of image's short side,
+    # multi-scale training is adopted when type is list.
+    target_size: 300
+    # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method
+    use_cv2: false
+  # Flip the image and bounding box.
+  #   Operators:
+  #     1. Flip the image numpy.
+  #     2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!)
+  #     3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!)
+  - !RandomFlipImage
+    # is_mask_flip (bool): whether flip the segmentation
+    is_mask_flip: false
+    # is_normalized (bool): whether the bbox scale to [0,1]
+    is_normalized: true
+    # prob (float): the probability of flipping image
+    prob: 0.5
+  # Change the channel
+  - !Permute
+    # The format of image, [H, W, C]/[C, H, W], true by default
+    channel_first: true
+    # to_bgr (bool): confirm whether to convert RGB to BGR
+    to_bgr: true
+  # Normalize the image.
+  #   Operators:
+  #     1.(optional) Scale the image to [0,1]
+  #     2. Each pixel minus mean and is divided by std
+  - !NormalizeImage
+    # The format of image, [H, W, C]/[C, H, W], true by default
+    is_channel_first: true
+    # Whether divide by 255, true by default
+    is_scale: false
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # std (list): the pixel standard deviation
+    std: [1, 1, 1]
+  # Number of samples, -1 represents all samples. -1 by default
+  samples: -1
+  # If samples should be shuffled, true by default
+  shuffle: true
+  # If use multi-process, false by default
+  use_process: true
+# Data feed module for Eval
+SSDEvalFeed:
+  # Batch size per device
+  batch_size: 32
+  # list of batch transformations to use
+  batch_transforms: []
+  # The data buffer size
+  bufsize: 10
+  # Dataset module
+  dataset:
+    # Dataset directory
+    dataset_dir: dataset/coco
+    # Annotation file path
+    annotation: annotations/instances_val2017.json
+    # Directory where image files are stored
+    image_dir: val2017
+  # Drop last batch if size is uneven, false by default
+  drop_last: true
+  # List of data fields needed
+  fields: [image, im_shape, im_id, gt_box, gt_label, is_difficult]
+  # list of image dims
+  image_shape: [3, 300, 300]
+  # number of workers processes (or threads)
+  num_workers: 8
+  # List of sample transformations to use
+  sample_transforms:
+  # Transform the image data to numpy format.
+  - !DecodeImage
+    # whether to convert BGR to RGB
+    to_rgb: true  # default: true
+    # whether or not to mixup image and gt_bbox/gt_score
+    with_mixup: false   # default: false
+  # Transform the bounding box's coordinates to [0,1].
+  - !NormalizeBox {}
+  # Rescale image to the specified target size, and capped at max_size if max_size != 0.
+  # If target_size is a list, select a scale randomly as the specified target size.
+  - !ResizeImage
+    # Resize method, cv2.INTER_LINEAR(1) by default
+    interp: 1
+    # max_size (int): the max size of image
+    max_size: 0
+    # target_size (int|list): the target size of image's short side,
+    # multi-scale training is adopted when type is list.
+    target_size: 300
+    # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method
+    use_cv2: false
+  - !Permute
+    # The format of image, [H, W, C]/[C, H, W], true by default
+    channel_first: true
+    # to_bgr (bool): confirm whether to convert RGB to BGR
+    to_bgr: true
+  # Normalize the image.
+  #   Operators:
+  #     1.(optional) Scale the image to [0,1]
+  #     2. Each pixel minus mean and is divided by std
+  - !NormalizeImage
+    # The format of image, [H, W, C]/[C, H, W], true by default
+    is_channel_first: true
+    # Whether divide by 255, true by default
+    is_scale: false
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # std (list): the pixel standard deviation
+    std: [1, 1, 1]
+  # Number of samples, -1 represents all samples. -1 by default
+  samples: -1
+  # If samples should be shuffled, true by default
+  shuffle: false
+  # If use multi-process, false by default
+  use_process: false
+# Data feed module for test
+SSDTestFeed:
+  # Batch size per device
+  batch_size: 1
+  # list of batch transformations to use
+  batch_transforms: []
+  # The data buffer size
+  bufsize: 10
+  # Dataset module
+  dataset:
+    # Annotation file path
+    annotation: dataset/coco/annotations/instances_val2017.json
+  # Drop last batch if size is uneven, false by default
+  drop_last: false
+  # List of data fields needed
+  fields: [image, im_id]
+  # list of image dims
+  image_shape: [3, 300, 300]
+  # number of workers processes (or threads)
+  num_workers: 8
+  # List of sample transformations to use
+  sample_transforms:
+  # Transform the image data to numpy format.
+  - !DecodeImage
+    # whether to convert BGR to RGB
+    to_rgb: true  # default: true
+    # whether or not to mixup image and gt_bbox/gt_score
+    with_mixup: false   # default: false
+  # Rescale image to the specified target size, and capped at max_size if max_size != 0.
+  # If target_size is a list, select a scale randomly as the specified target size.
+  - !ResizeImage
+    # Resize method, cv2.INTER_LINEAR(1) by default
+    interp: 1
+    # max_size (int): the max size of image
+    max_size: 0
+    # target_size (int|list): the target size of image's short side,
+    # multi-scale training is adopted when type is list.
+    target_size: 300
+    # use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method
+    use_cv2: false
+  - !Permute
+    # The format of image, [H, W, C]/[C, H, W], true by default
+    channel_first: true
+    # to_bgr (bool): confirm whether to convert RGB to BGR
+    to_bgr: true
+  # Normalize the image.
+  #   Operators:
+  #     1.(optional) Scale the image to [0,1]
+  #     2. Each pixel minus mean and is divided by std
+  - !NormalizeImage
+    # The format of image, [H, W, C]/[C, H, W], true by default
+    is_channel_first: true
+    # Whether divide by 255, true by default
+    is_scale: false
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # std (list): the pixel standard deviation
+    std: [1, 1, 1]
+  # Number of samples, -1 represents all samples. -1 by default
+  samples: -1
+  # If samples should be shuffled, true by default
+  shuffle: false
+  # If use multi-process, false by default
+  use_process: false
--- a/ppdet/data/transform/operators.py
+++ b/ppdet/data/transform/operators.py
@@ -82,6 +82,7 @@ class DecodeImage(BaseOperator):
        Args:
            to_rgb (bool): whether to convert BGR to RGB
+            with_mixup (bool): whether or not to mixup image and gt_bbox/gt_score
        """
        super(DecodeImage, self).__init__()
@@ -459,7 +460,7 @@ class ExpandImage(BaseOperator):
    def __init__(self, max_ratio, prob, mean=[127.5, 127.5, 127.5]):
        """
        Args:
-            ratio (float): the ratio of expanding
+            max_ratio (float): the ratio of expanding
            prob (float): the probability of expanding image
            mean (list): the pixel mean
        """

--- a/ppdet/modeling/architectures/ssd.py
+++ b/ppdet/modeling/architectures/ssd.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 from paddle import fluid
 from ppdet.core.workspace import register
-from ppdet.modeling.ops import SSDOutputDecoder, SSDMetric
+from ppdet.modeling.ops import SSDOutputDecoder
 __all__ = ['SSD']
@@ -33,30 +33,25 @@ class SSD(object):
        backbone (object): backbone instance
        multi_box_head (object): `MultiBoxHead` instance
        output_decoder (object): `SSDOutputDecoder` instance
-        metric (object): `SSDMetric` instance for training
        num_classes (int): number of output classes
    """
    __category__ = 'architecture'
-    __inject__ = ['backbone', 'multi_box_head', 'output_decoder', 'metric']
+    __inject__ = ['backbone', 'multi_box_head', 'output_decoder']
    __shared__ = ['num_classes']
    def __init__(self,
                 backbone,
                 multi_box_head='MultiBoxHead',
                 output_decoder=SSDOutputDecoder().__dict__,
-                 metric=SSDMetric().__dict__,
                 num_classes=21):
        super(SSD, self).__init__()
        self.backbone = backbone
        self.multi_box_head = multi_box_head
        self.num_classes = num_classes
        self.output_decoder = output_decoder
-        self.metric = metric
        if isinstance(output_decoder, dict):
            self.output_decoder = SSDOutputDecoder(**output_decoder)
-        if isinstance(metric, dict):
-            self.metric = SSDMetric(**metric)
    def build(self, feed_vars, mode='train'):
        im = feed_vars['image']

--- a/ppdet/modeling/ops.py
+++ b/ppdet/modeling/ops.py
@@ -23,8 +23,7 @@ from ppdet.core.workspace import register, serializable
 __all__ = [
    'AnchorGenerator', 'RPNTargetAssign', 'GenerateProposals', 'MultiClassNMS',
    'BBoxAssigner', 'MaskAssigner', 'RoIAlign', 'RoIPool', 'MultiBoxHead',
-    'SSDOutputDecoder', 'SSDMetric', 'RetinaTargetAssign',
+    'SSDOutputDecoder', 'RetinaTargetAssign', 'RetinaOutputDecoder', 'ConvNorm'
-    'RetinaOutputDecoder', 'ConvNorm'
 ]
@@ -303,22 +302,6 @@ class SSDOutputDecoder(object):
        self.nms_eta = nms_eta
-@register
-@serializable
-class SSDMetric(object):
-    __op__ = fluid.metrics.DetectionMAP
-    __append_doc__ = True
-    def __init__(self,
-                 overlap_threshold=0.5,
-                 evaluate_difficult=False,
-                 ap_version='integral'):
-        super(SSDMetric, self).__init__()
-        self.overlap_threshold = overlap_threshold
-        self.evaluate_difficult = evaluate_difficult
-        self.ap_version = ap_version
 @register
 @serializable
 class RetinaTargetAssign(object):