diff --git a/docs/config_example/ssd_vgg16_300.yml b/docs/config_example/ssd_vgg16_300.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9f812f175ce6990d95469abf29770eaef9c2f934
--- /dev/null
+++ b/docs/config_example/ssd_vgg16_300.yml
@@ -0,0 +1,425 @@
+# Architecture of the detector, which is also the prefix of the data feed modules.
+architecture: SSD
+# Data feed modules.
+# Data feed in training.
+train_feed: SSDTrainFeed
+# Data feed in evaluation.
+eval_feed: SSDEvalFeed
+# Data feed in inference.
+test_feed: SSDTestFeed
+# Use GPU or CPU, true by default.
+use_gpu: true
+# Maximum number of iterations.
+max_iters: 400000
+# Snapshot period. If training and evaluation run together, the model is evaluated at every snapshot_iter. 10000 by default.
+snapshot_iter: 10000
+# Smooth the log output over the specified number of iterations, 20 by default.
+log_smooth_window: 20
+# The training log is displayed once every log_iter iterations.
+log_iter: 20
+# Evaluation method, COCO and VOC are available.
+metric: COCO
+# The path of the pretrained weights used to initialize the network.
+pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar
+# The directory to save models.
+save_dir: output
+# The path of the final model for evaluation and test.
+weights: output/ssd_vgg16_300/model_final
+# Number of classes, 81 for COCO and 21 for VOC.
+num_classes: 81
+
+# SSD architecture, see https://arxiv.org/abs/1512.02325
+SSD:
+  # backbone instance, defined below.
+  backbone: VGG
+  # `MultiBoxHead` instance, defined below.
+  multi_box_head: MultiBoxHead
+
+  # fluid.layers.detection_output, the detection output layer for SSD.
+  # This operation obtains the detection results by performing the following two steps:
+  # 1. Decode the input bounding box predictions according to the prior boxes.
+  # 2. Get the final detection results by applying multi-class non-maximum suppression (NMS).
+  # This operation does not clip the final output bounding boxes to the image window.
+  output_decoder:
+    # The index of the background label; the background label is ignored.
+    # If set to -1, all categories are considered.
+    background_label: 0
+    # Number of total bboxes to be kept per image after NMS.
+    keep_top_k: 200
+    # The parameter for adaptive NMS.
+    nms_eta: 1.0
+    # The threshold to be used in NMS.
+    nms_threshold: 0.45
+    # Maximum number of detections to be kept according to the confidences,
+    # after filtering detections based on score_threshold.
+    nms_top_k: 400
+    # Threshold to filter out bounding boxes with low confidence scores.
+    # If not provided, all boxes are considered.
+    score_threshold: 0.01
+
+# VGG backbone, see https://arxiv.org/abs/1409.1556
+VGG:
+  # the VGG net depth (16 or 19)
+  depth: 16
+  # whether extra blocks should be added
+  with_extra_blocks: true
+  # params of each extra block:
+  # [in_channel, out_channel, padding_size, stride_size, filter_size]
+  extra_block_filters:
+  - [256, 512, 1, 2, 3]
+  - [128, 256, 1, 2, 3]
+  - [128, 256, 0, 1, 3]
+  - [128, 256, 0, 1, 3]
+  # list of initial scales for L2 normalization; -1 skips the initial scale for that input.
+  normalizations: [20., -1, -1, -1, -1, -1]
+
+# fluid.layers.multi_box_head, generates prior boxes for the SSD algorithm.
+# `prior_box` is generated according to the inputs list and the parameters below.
+# Each position of an input produces N prior boxes, where N is determined by
+# the count of min_sizes, max_sizes and aspect_ratios. Box sizes lie in the
+# [min_size, max_size] interval and are generated in sequence according to
+# the aspect_ratios.
+MultiBoxHead:
+  # base_size is used to derive min_size and max_size from min_ratio and max_ratio.
+  base_size: 300
+  # the aspect ratios of the generated prior boxes. The lengths of inputs and aspect_ratios must be equal.
+  aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]]
+  # the min ratio of the generated prior boxes.
+  min_ratio: 15
+  # the max ratio of the generated prior boxes.
+  max_ratio: 90
+  # If len(inputs) <= 2, min_sizes must be set, and the length of min_sizes
+  # should equal the length of inputs. Default: None.
+  min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0]
+  # If len(inputs) <= 2, max_sizes must be set, and the length of max_sizes
+  # should equal the length of inputs. Default: None.
+  max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0]
+  # If step_w and step_h are the same, they can be replaced by steps.
+  steps: [8, 16, 32, 64, 100, 300]
+  # Prior box center offset. Default: 0.5
+  offset: 0.5
+  # Whether to flip aspect ratios. Default: false.
+  flip: true
+  # The kernel size of conv2d. Default: 1.
+  kernel_size: 3
+  # The padding of conv2d. Default: 0.
+  pad: 1
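+
+# Worked example (illustrative note, not part of the original config; assumes
+# the usual prior_box expansion rule where ratio 1.0 is implicit and flipped
+# ratios are added): with flip: true, each position of a feature map yields
+# 1 (min_size) + 1 (sqrt(min_size * max_size)) + 2 * len(aspect_ratios) boxes.
+# For the first 38x38 feature map with aspect_ratios [2.] that is 4 boxes per
+# position, i.e. 38 * 38 * 4 = 5776 prior boxes from that map alone.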
+
+# Learning rate configuration
+LearningRate:
+  # Base learning rate, 0.01 by default.
+  base_lr: 0.001
+  # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default.
+  schedulers:
+  # fluid.layers.piecewise_decay
+  # values takes priority; if values is null, the learning rate is multiplied by gamma at each milestone.
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [280000, 360000]
+  # fluid.layers.linear_lr_warmup
+  # The starting learning rate equals base_lr * start_factor.
+  - !LinearWarmup
+    start_factor: 0.3333333333333333
+    steps: 500
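+
+# Resulting schedule (illustrative, derived from the values above; not part of
+# the original config): the learning rate warms up linearly from base_lr / 3
+# (about 0.000333) to 0.001 over the first 500 iterations, stays at 0.001
+# until iteration 280000, drops to 0.0001 (gamma 0.1), drops again to 0.00001
+# at iteration 360000, and training stops at max_iters 400000.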
+
+# Optimizer module
+OptimizerBuilder:
+  # fluid.optimizer. Training a neural network is in essence an optimization
+  # problem; with forward computation and back-propagation, the optimizer uses
+  # the back-propagated gradients to optimize the parameters of the network.
+  optimizer:
+    # The Momentum optimizer adds momentum on top of SGD, reducing the noise
+    # inherent in stochastic gradient descent.
+    momentum: 0.9
+    type: Momentum
+  # fluid.regularizer
+  regularizer:
+    # Implements L2 weight decay regularization.
+    # Small L2 factors can help prevent overfitting the training data.
+    factor: 0.0005
+    type: L2
+
+# Data feed module for training
+SSDTrainFeed:
+  # Batch size per device
+  batch_size: 16
+  # list of batch transformations to use
+  batch_transforms: []
+  # The data buffer size
+  bufsize: 10
+  # Dataset module
+  dataset:
+    # Dataset directory
+    dataset_dir: dataset/coco
+    # Annotation file path
+    annotation: annotations/instances_train2017.json
+    # Directory where image files are stored
+    image_dir: train2017
+  # Drop the last batch if its size is uneven, false by default
+  drop_last: true
+  # List of data fields needed
+  fields: [image, gt_box, gt_label]
+  # list of image dims
+  image_shape: [3, 300, 300]
+  # number of worker processes (or threads)
+  num_workers: 8
+  # List of sample transformations to use
+  sample_transforms:
+  # Transform the image data to numpy format.
+  - !DecodeImage
+    # whether to convert BGR to RGB
+    to_rgb: true  # default: true
+    # whether or not to mixup the image and gt_bbox/gt_score
+    with_mixup: false  # default: false
+  # Transform the bounding box coordinates to [0, 1].
+  - !NormalizeBox {}
+  # Randomly modify image brightness, contrast, saturation, hue and channel order.
+  - !RandomDistort
+    # brightness_lower / brightness_upper (float): the brightness is sampled
+    # between brightness_lower and brightness_upper
+    brightness_lower: 0.875
+    brightness_upper: 1.125
+    # brightness_prob (float): the probability of changing brightness
+    brightness_prob: 0.5
+    # contrast_lower / contrast_upper (float): the contrast is sampled
+    # between contrast_lower and contrast_upper
+    contrast_lower: 0.5
+    contrast_upper: 1.5
+    # contrast_prob (float): the probability of changing contrast
+    contrast_prob: 0.5
+    # count (int): the number of distortion kinds to apply
+    count: 4
+    # hue_lower / hue_upper (float): the hue is sampled between hue_lower and hue_upper
+    hue_lower: -18
+    hue_upper: 18
+    # hue_prob (float): the probability of changing hue
+    hue_prob: 0.5
+    # is_order (bool): whether to apply the distortions in a fixed order
+    is_order: true
+    # saturation_lower / saturation_upper (float): the saturation is sampled
+    # between saturation_lower and saturation_upper
+    saturation_lower: 0.5
+    saturation_upper: 1.5
+    # saturation_prob (float): the probability of changing saturation
+    saturation_prob: 0.5
+  # Expand the image and modify the bounding boxes.
+  # Operators:
+  # 1. Scale the image width and height.
+  # 2. Construct a new image with the new height and width.
+  # 3. Fill the new image with the mean.
+  # 4. Put the original image into the new image.
+  # 5. Rescale the bounding boxes.
+  # 6. Determine whether each new bbox is still valid in the new image.
+  - !ExpandImage
+    # max_ratio (float): the maximum ratio of expanding
+    max_ratio: 4
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # prob (float): the probability of expanding the image
+    prob: 0.5
+  # Crop the image and modify the bounding boxes.
+  # Operators:
+  # 1. Scale the image width and height.
+  # 2. Crop the image according to a random sample.
+  # 3. Rescale the bounding boxes.
+  # 4. Determine whether each new bbox is still valid in the new image.
+  - !CropImage
+    # avoid_no_bbox (bool): whether to avoid crops in which no box appears.
+    avoid_no_bbox: false
+    # batch_sampler (list): multiple sets of different parameters for cropping.
+    batch_sampler:
+    - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0]
+    - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]
+    # satisfy_all (bool): whether all boxes must satisfy the overlap constraint.
+    satisfy_all: false
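+  # Reading aid (an assumed column order, not stated in the original config;
+  # verify against the CropImage operator): each batch_sampler row appears to be
+  # [max_sample, max_trial, min_scale, max_scale,
+  #  min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap, max_jaccard_overlap],
+  # e.g. the second row would try up to 50 crops of scale 0.3-1.0 and aspect
+  # ratio 0.5-2.0 that overlap a gt box with IoU of at least 0.1.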
+  # Rescale the image to the specified target size, capped at max_size if max_size != 0.
+  # If target_size is a list, a scale is randomly selected as the target size.
+  - !ResizeImage
+    # Resize method, cv2.INTER_LINEAR (1) by default
+    interp: 1
+    # max_size (int): the max size of the image
+    max_size: 0
+    # target_size (int|list): the target size of the image's short side;
+    # multi-scale training is adopted when the type is list.
+    target_size: 300
+    # use_cv2 (bool): use the cv2 interpolation method or the PIL interpolation method
+    use_cv2: false
+  # Flip the image and bounding boxes.
+  # Operators:
+  # 1. Flip the image numpy array.
+  # 2. Transform the bboxes' x coordinates. (Must check whether the coordinates are normalized!)
+  # 3. Transform the segmentations' x coordinates. (Must check whether the coordinates are normalized!)
+  - !RandomFlipImage
+    # is_mask_flip (bool): whether to flip the segmentation masks
+    is_mask_flip: false
+    # is_normalized (bool): whether the bbox coordinates are scaled to [0, 1]
+    is_normalized: true
+    # prob (float): the probability of flipping the image
+    prob: 0.5
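+  # Illustration of step 2 above (an added note, not part of the original
+  # config): with is_normalized: true, a horizontal flip maps a box's x range
+  # to xmin_new = 1 - xmax_old and xmax_new = 1 - xmin_old, while the
+  # y coordinates are unchanged.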
+  # Change the channel layout.
+  - !Permute
+    # The format of the image, [H, W, C] / [C, H, W]; channel-first is true by default
+    channel_first: true
+    # to_bgr (bool): whether to convert RGB to BGR
+    to_bgr: true
+  # Normalize the image.
+  # Operators:
+  # 1. (optional) Scale the image to [0, 1]
+  # 2. Subtract the mean from each pixel and divide by std
+  - !NormalizeImage
+    # The format of the image, [H, W, C] / [C, H, W]; channel-first is true by default
+    is_channel_first: true
+    # Whether to divide by 255, true by default
+    is_scale: false
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # std (list): the pixel standard deviation
+    std: [1, 1, 1]
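+  # Net effect (illustrative, derived from the values above; not part of the
+  # original config): with is_scale: false and std [1, 1, 1], the output is
+  # simply pixel - mean per channel; since Permute sets to_bgr: true, the
+  # mean [104, 117, 123] is applied in BGR order, which matches the
+  # Caffe-style preprocessing of the VGG16 pretrained weights.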
+  # Number of samples; -1 represents all samples. -1 by default
+  samples: -1
+  # Whether samples should be shuffled, true by default
+  shuffle: true
+  # Whether to use multi-process data loading, false by default
+  use_process: true
+
+# Data feed module for evaluation
+SSDEvalFeed:
+  # Batch size per device
+  batch_size: 32
+  # list of batch transformations to use
+  batch_transforms: []
+  # The data buffer size
+  bufsize: 10
+  # Dataset module
+  dataset:
+    # Dataset directory
+    dataset_dir: dataset/coco
+    # Annotation file path
+    annotation: annotations/instances_val2017.json
+    # Directory where image files are stored
+    image_dir: val2017
+  # Drop the last batch if its size is uneven, false by default
+  drop_last: true
+  # List of data fields needed
+  fields: [image, im_shape, im_id, gt_box, gt_label, is_difficult]
+  # list of image dims
+  image_shape: [3, 300, 300]
+  # number of worker processes (or threads)
+  num_workers: 8
+  # List of sample transformations to use
+  sample_transforms:
+  # Transform the image data to numpy format.
+  - !DecodeImage
+    # whether to convert BGR to RGB
+    to_rgb: true  # default: true
+    # whether or not to mixup the image and gt_bbox/gt_score
+    with_mixup: false  # default: false
+  # Transform the bounding box coordinates to [0, 1].
+  - !NormalizeBox {}
+  # Rescale the image to the specified target size, capped at max_size if max_size != 0.
+  # If target_size is a list, a scale is randomly selected as the target size.
+  - !ResizeImage
+    # Resize method, cv2.INTER_LINEAR (1) by default
+    interp: 1
+    # max_size (int): the max size of the image
+    max_size: 0
+    # target_size (int|list): the target size of the image's short side;
+    # multi-scale training is adopted when the type is list.
+    target_size: 300
+    # use_cv2 (bool): use the cv2 interpolation method or the PIL interpolation method
+    use_cv2: false
+  - !Permute
+    # The format of the image, [H, W, C] / [C, H, W]; channel-first is true by default
+    channel_first: true
+    # to_bgr (bool): whether to convert RGB to BGR
+    to_bgr: true
+  # Normalize the image.
+  # Operators:
+  # 1. (optional) Scale the image to [0, 1]
+  # 2. Subtract the mean from each pixel and divide by std
+  - !NormalizeImage
+    # The format of the image, [H, W, C] / [C, H, W]; channel-first is true by default
+    is_channel_first: true
+    # Whether to divide by 255, true by default
+    is_scale: false
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # std (list): the pixel standard deviation
+    std: [1, 1, 1]
+  # Number of samples; -1 represents all samples. -1 by default
+  samples: -1
+  # Whether samples should be shuffled, true by default
+  shuffle: false
+  # Whether to use multi-process data loading, false by default
+  use_process: false
+
+# Data feed module for test
+SSDTestFeed:
+  # Batch size per device
+  batch_size: 1
+  # list of batch transformations to use
+  batch_transforms: []
+  # The data buffer size
+  bufsize: 10
+  # Dataset module
+  dataset:
+    # Annotation file path
+    annotation: dataset/coco/annotations/instances_val2017.json
+  # Drop the last batch if its size is uneven, false by default
+  drop_last: false
+  # List of data fields needed
+  fields: [image, im_id]
+  # list of image dims
+  image_shape: [3, 300, 300]
+  # number of worker processes (or threads)
+  num_workers: 8
+  # List of sample transformations to use
+  sample_transforms:
+  # Transform the image data to numpy format.
+  - !DecodeImage
+    # whether to convert BGR to RGB
+    to_rgb: true  # default: true
+    # whether or not to mixup the image and gt_bbox/gt_score
+    with_mixup: false  # default: false
+  # Rescale the image to the specified target size, capped at max_size if max_size != 0.
+  # If target_size is a list, a scale is randomly selected as the target size.
+  - !ResizeImage
+    # Resize method, cv2.INTER_LINEAR (1) by default
+    interp: 1
+    # max_size (int): the max size of the image
+    max_size: 0
+    # target_size (int|list): the target size of the image's short side;
+    # multi-scale training is adopted when the type is list.
+    target_size: 300
+    # use_cv2 (bool): use the cv2 interpolation method or the PIL interpolation method
+    use_cv2: false
+  - !Permute
+    # The format of the image, [H, W, C] / [C, H, W]; channel-first is true by default
+    channel_first: true
+    # to_bgr (bool): whether to convert RGB to BGR
+    to_bgr: true
+  # Normalize the image.
+  # Operators:
+  # 1. (optional) Scale the image to [0, 1]
+  # 2. Subtract the mean from each pixel and divide by std
+  - !NormalizeImage
+    # The format of the image, [H, W, C] / [C, H, W]; channel-first is true by default
+    is_channel_first: true
+    # Whether to divide by 255, true by default
+    is_scale: false
+    # mean (list): the pixel mean
+    mean: [104, 117, 123]
+    # std (list): the pixel standard deviation
+    std: [1, 1, 1]
+  # Number of samples; -1 represents all samples. -1 by default
+  samples: -1
+  # Whether samples should be shuffled, true by default
+  shuffle: false
+  # Whether to use multi-process data loading, false by default
+  use_process: false
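+
+# Typical usage (a sketch; assumes the standard PaddleDetection entry points
+# tools/train.py and tools/eval.py with the -c config flag; adjust paths as
+# needed for your checkout):
+#   python tools/train.py -c docs/config_example/ssd_vgg16_300.yml
+#   python tools/eval.py -c docs/config_example/ssd_vgg16_300.yml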
diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py
index 5f35f95f5e5473f3ab12e70288b8616711d6a724..6b430b165f25252004fd38a96eb94f6f182957bc 100644
--- a/ppdet/data/transform/operators.py
+++ b/ppdet/data/transform/operators.py
@@ -82,6 +82,7 @@ class DecodeImage(BaseOperator):
 
         Args:
             to_rgb (bool): whether to convert BGR to RGB
+            with_mixup (bool): whether or not to mixup image and gt_bbox/gt_score
         """
         super(DecodeImage, self).__init__()
 
@@ -459,7 +460,7 @@ class ExpandImage(BaseOperator):
     def __init__(self, max_ratio, prob, mean=[127.5, 127.5, 127.5]):
         """
         Args:
-            ratio (float): the ratio of expanding
+            max_ratio (float): the maximum ratio of expanding
            prob (float): the probability of expanding image
             mean (list): the pixel mean
         """
diff --git a/ppdet/modeling/architectures/ssd.py b/ppdet/modeling/architectures/ssd.py
index c0fb68f92d27237abe285de9eaf429bf71ea1a94..18132b20e430d11a3bf7aa24b6b731cc96167ac3 100644
--- a/ppdet/modeling/architectures/ssd.py
+++ b/ppdet/modeling/architectures/ssd.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 from paddle import fluid
 
 from ppdet.core.workspace import register
-from ppdet.modeling.ops import SSDOutputDecoder, SSDMetric
+from ppdet.modeling.ops import SSDOutputDecoder
 
 __all__ = ['SSD']
@@ -33,30 +33,25 @@ class SSD(object):
         backbone (object): backbone instance
         multi_box_head (object): `MultiBoxHead` instance
         output_decoder (object): `SSDOutputDecoder` instance
-        metric (object): `SSDMetric` instance for training
         num_classes (int): number of output classes
     """
 
     __category__ = 'architecture'
-    __inject__ = ['backbone', 'multi_box_head', 'output_decoder', 'metric']
+    __inject__ = ['backbone', 'multi_box_head', 'output_decoder']
     __shared__ = ['num_classes']
 
     def __init__(self,
                  backbone,
                  multi_box_head='MultiBoxHead',
                  output_decoder=SSDOutputDecoder().__dict__,
-                 metric=SSDMetric().__dict__,
                  num_classes=21):
         super(SSD, self).__init__()
         self.backbone = backbone
         self.multi_box_head = multi_box_head
         self.num_classes = num_classes
         self.output_decoder = output_decoder
-        self.metric = metric
         if isinstance(output_decoder, dict):
             self.output_decoder = SSDOutputDecoder(**output_decoder)
-        if isinstance(metric, dict):
-            self.metric = SSDMetric(**metric)
 
     def build(self, feed_vars, mode='train'):
         im = feed_vars['image']
diff --git a/ppdet/modeling/ops.py b/ppdet/modeling/ops.py
index 75ec06bea09f2763f5f864b1c31b04cde4677b2a..56b406a85857679758bfda8006821eab21ebf25b 100644
--- a/ppdet/modeling/ops.py
+++ b/ppdet/modeling/ops.py
@@ -23,8 +23,7 @@ from ppdet.core.workspace import register, serializable
 
 __all__ = [
     'AnchorGenerator', 'RPNTargetAssign', 'GenerateProposals', 'MultiClassNMS',
     'BBoxAssigner', 'MaskAssigner', 'RoIAlign', 'RoIPool', 'MultiBoxHead',
-    'SSDOutputDecoder', 'SSDMetric', 'RetinaTargetAssign',
-    'RetinaOutputDecoder', 'ConvNorm'
+    'SSDOutputDecoder', 'RetinaTargetAssign', 'RetinaOutputDecoder', 'ConvNorm'
 ]
 
@@ -303,22 +302,6 @@ class SSDOutputDecoder(object):
         self.nms_eta = nms_eta
 
 
-@register
-@serializable
-class SSDMetric(object):
-    __op__ = fluid.metrics.DetectionMAP
-    __append_doc__ = True
-
-    def __init__(self,
-                 overlap_threshold=0.5,
-                 evaluate_difficult=False,
-                 ap_version='integral'):
-        super(SSDMetric, self).__init__()
-        self.overlap_threshold = overlap_threshold
-        self.evaluate_difficult = evaluate_difficult
-        self.ap_version = ap_version
-
-
 @register
 @serializable
 class RetinaTargetAssign(object):