# Architecture of detection, which is also the prefix of data feed module
architecture: YOLOv3

# Data feed module.
train_feed: YoloTrainFeed
eval_feed: YoloEvalFeed
test_feed: YoloTestFeed

# Whether to use GPU (true) or CPU (false), true by default.
use_gpu: true

# Maximum number of iterations.
# For the YOLOv3 model, the default is to train for 270 epochs.
max_iters: 500200
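# A rough sanity check of that number (assuming the usual 8-GPU setup for this config):
#   COCO train2017 has ~118k images; with batch_size 8 per device on 8 GPUs the total
#   batch is 64, so one epoch is about 118287 / 64 ≈ 1850 iterations and
#   270 epochs ≈ 270 * 1850 ≈ 500k iterations, which matches max_iters.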

# Number of iterations over which the log output is smoothed, 20 by default.
log_smooth_window: 20

# Interval of iterations at which the training log is displayed.
log_iter: 20

# The directory to save models.
save_dir: output

# Snapshot period. If training and evaluation run at the same time, the model is evaluated every snapshot_iter iterations. 2000 by default.
snapshot_iter: 2000

# Evaluation metric, COCO and VOC are available.
metric: COCO

# The path of pretrained weights. If a URL is provided, it will be downloaded and decompressed automatically.
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/DarkNet53_pretrained.tar
# The path of final model for evaluation and test.
weights: output/yolov3_darknet/model_final

# Number of classes, 80 for COCO and 20 for VOC.
num_classes: 80

# Whether to use the fine-grained YOLOv3 loss. If true, build the YOLOv3 loss with common OPs in Python code;
# if false, use the fluid.layers.yolov3_loss OP to calculate the YOLOv3 loss. The former is better
# for redesigning the YOLOv3 loss, the latter for training with the original YOLOv3 loss.
use_fine_grained_loss: false
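# As a rough reference (not tied to either implementation), the YOLOv3 loss at each of the
# three output scales sums a localization term (x, y, w, h), an objectness term, and a
# per-class classification term; the fine-grained Python implementation rebuilds this sum
# from common OPs so individual terms can be inspected or re-weighted.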


# YOLOv3 architecture, see https://arxiv.org/abs/1804.02767
YOLOv3:
  backbone: DarkNet
  yolo_head: YOLOv3Head

# Backbone module
DarkNet:
  # Batch normalization type in training, sync_bn for synchronized batch normalization
  norm_type: sync_bn
  # L2 weight decay factor of batch normalization layer
  norm_decay: 0.
  # Number of DarkNet convolution layers, only 53 is currently supported.
  depth: 53

# YOLOv3 head module
# Generate bbox output in evaluation and calculate loss in training
# fluid.layers.yolov3_loss / fluid.layers.yolo_box
YOLOv3Head:
  # Anchor masks of the 3 yolo_loss/yolo_box layers, each layer uses 3 anchors.
  anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
  # 9 anchors shared by the 3 yolo_loss/yolo_box layers, generated by running k-means on COCO ground truth boxes.
  anchors: [[10, 13], [16, 30], [33, 23],
            [30, 61], [62, 45], [59, 119],
            [116, 90], [156, 198], [373, 326]]
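  # With a 608 x 608 input the three yolo_box outputs have strides 32, 16 and 8
  # (19x19, 38x38 and 76x76 grids); mask [6, 7, 8] assigns the three largest anchors
  # to the coarsest 19x19 map, [3, 4, 5] the medium anchors to the 38x38 map, and
  # [0, 1, 2] the smallest anchors to the fine 76x76 map.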
  # L2 weight decay factor of batch normalization layer
  norm_decay: 0.
  # Use YOLOv3Loss, which is defined in the YOLOv3Loss section below.
  yolo_loss: YOLOv3Loss
  # fluid.layers.multiclass_nms
  # Non-maximum suppression for output prediction boxes, see multiclass_nms for the following parameters.
  #   1. Select detection bounding boxes with scores larger than score_threshold.
  #   2. Keep the nms_top_k detection bounding boxes with the largest scores.
  #   3. Suppress detection bounding boxes that have high IoU overlap with already selected boxes.
  #   4. Keep the top keep_top_k detection bounding boxes as output.
  nms:
    # Which label is regarded as background and will be ignored, -1 for no background label.
    background_label: -1
    # Number of total bboxes to be kept per image after NMS step.
    keep_top_k: 100
    # IoU threshold for NMS, bbox with IoU over nms_threshold will be suppressed.
    nms_threshold: 0.45
    # Maximum number of detections to be kept according to confidence after filtering detections based on score_threshold.
    nms_top_k: 1000
    # Whether detections are normalized.
    normalized: false
    # Threshold to filter out bounding boxes with low confidence score. 
    score_threshold: 0.01
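    # Putting the values above together: per image, boxes scoring below 0.01 are dropped,
    # at most the top 1000 remaining boxes per class enter NMS, boxes with IoU > 0.45
    # against an already kept box of the same class are suppressed, and finally at most
    # 100 boxes are kept across all classes.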

YOLOv3Loss:
  # Training batch size, used when use_fine_grained_loss is set to true.
  # ATTENTION: this must match the batch size defined in YoloTrainFeed when
  # fine-grained loss is used.
  batch_size: 8
  # Ignore threshold for yolo_loss layer, 0.7 by default.
  # The objectness loss is ignored if a prediction bbox overlaps a ground truth box with IoU over ignore_thresh.
  ignore_thresh: 0.7
  # Whether to use label smoothing in the yolo_loss layer.
  # It is recommended to set this to true only when num_classes is very large.
  label_smooth: false
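  # For reference, label smoothing softens the one-hot classification targets
  # (roughly 1 -> 1 - eps and 0 -> a small eps spread over the other classes),
  # which mainly helps against over-confidence when there are many classes.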

# Learning rate configuration
LearningRate:
  # Base learning rate for training, 1e-3 by default.
  base_lr: 0.001
  # Learning rate schedulers, PiecewiseDecay and LinearWarmup by default
  schedulers:
  # fluid.layers.piecewise_decay
  # The learning rate is decayed by gamma at each milestone.
  - !PiecewiseDecay
    gamma: 0.1
    milestones:
    - 400000
    - 450000
  # fluid.layers.linear_lr_warmup
  # The starting learning rate equals base_lr * start_factor.
  - !LinearWarmup
    start_factor: 0.
    steps: 4000
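  # The resulting schedule: the learning rate ramps linearly from 0 (base_lr * 0.) to
  # 0.001 over the first 4000 iterations, stays at 0.001 until iteration 400000,
  # drops to 0.0001 (x 0.1) until 450000, then to 0.00001 for the remaining iterations.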

# Optimizer module
OptimizerBuilder:
  # fluid.optimizer
  optimizer:
    momentum: 0.9
    type: Momentum
  # fluid.regularizer
  regularizer:
    factor: 0.0005
    type: L2
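  # Roughly, each parameter update is momentum SGD with the L2 penalty folded into the
  # gradient:
  #   velocity = 0.9 * velocity + (gradient + 0.0005 * weight)
  #   weight   = weight - learning_rate * velocity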

# Data feed module for training
YoloTrainFeed:
  # Batch size per device, 8 by default
  batch_size: 8
  # Dataset module
  dataset:
    # Dataset directory.
    dataset_dir: dataset/coco
    # Annotation file path.
    annotation: annotations/instances_train2017.json
    # Directory where image files are stored.
    image_dir: train2017
  # List of data fields needed.
  fields: [image, gt_box, gt_label, gt_score]
  # List of image dims
  image_shape: [3, 608, 608]
  # List of sample transformations to use.
  sample_transforms:
  # read image data and decode to numpy.
  - !DecodeImage
    to_rgb: true
    # YOLOv3 uses image mixup in training.
    with_mixup: true
  # Mixup two images in training, a trick to improve performance.
  - !MixupImage
    alpha: 1.5 # default: 1.5
    beta: 1.5 # default: 1.5
  # Normalize ground truth boxes to the range [0, 1].
  - !NormalizeBox {}
  # Random color distort: brightness, contrast, hue, saturation.
  - !RandomDistort
    brightness_lower: 0.5
    brightness_prob: 0.5
    brightness_upper: 1.5
    contrast_lower: 0.5
    contrast_prob: 0.5
    contrast_upper: 1.5
    count: 4
    hue_lower: -18
    hue_prob: 0.5
    hue_upper: 18
    is_order: false
    saturation_lower: 0.5
    saturation_prob: 0.5
    saturation_upper: 1.5
  # Randomly expand the image and modify the bounding boxes.
  #   Operators:
  #     1. Scale the image width and height.
  #     2. Construct a new image with the new height and width.
  #     3. Fill the new image with the mean.
  #     4. Put the original image into the new image.
  #     5. Rescale the bounding boxes.
  #     6. Determine whether each bbox is still valid in the new image.
  - !ExpandImage
    # max expand ratio, default 4.0.
    max_ratio: 4.0
    mean: [123.675, 116.28, 103.53]
    prob: 0.5
  # Randomly crop the image and modify the bounding boxes.
  #   Operators:
  #     1. Scale the image width and height.
  #     2. Crop the image according to a random sample.
  #     3. Rescale the bounding boxes.
  #     4. Determine whether each bbox is still valid in the new image.
  - !CropImage
    # Recrop the image if there is no bbox in the cropped output.
    avoid_no_bbox: true
    batch_sampler: [[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0],
                    [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
                    [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
                    [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
                    [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
                    [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
                    [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
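    # Each row presumably follows the SSD-style sampler description
    # [max_sample, max_trial, min_scale, max_scale, min_aspect_ratio, max_aspect_ratio,
    #  min_overlap, max_overlap]: e.g. [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0] tries up to
    # 50 random crops covering 30-100% of the image with aspect ratio 0.5-2.0 and keeps
    # one whose overlap with a ground truth box is at least 0.1.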
    # Whether all bboxes must satisfy the IoU constraints.
    satisfy_all: false
  # Interpolate the image to target_size with a randomly chosen interpolation method:
  #   cv2.INTER_NEAREST,
  #   cv2.INTER_LINEAR,
  #   cv2.INTER_AREA,
  #   cv2.INTER_CUBIC,
  #   cv2.INTER_LANCZOS4
  - !RandomInterpImage
    max_size: 0
    target_size: 608
  # Flip the image and bounding boxes.
  #   Operators:
  #     1. Flip the image (numpy array).
  #     2. Transform the bboxes' x coordinates (whether the coordinates are normalized must be checked).
  #     3. Transform the segmentations' x coordinates (whether the coordinates are normalized must be checked).
  - !RandomFlipImage
    is_mask_flip: false
    is_normalized: true
    prob: 0.5
  # Normalize the image.
  #   Operators:
  #     1. (optional) Scale the image to [0, 1].
  #     2. Subtract the mean from each pixel and divide by std.
  - !NormalizeImage
    is_channel_first: false
    is_scale: true
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
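    # With is_scale true, pixels are first divided by 255 and then normalized per
    # channel as (pixel / 255 - mean) / std; e.g. an R value of 128 becomes
    # (128 / 255 - 0.485) / 0.229 ≈ 0.074.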
  # Change data layout to [C, H, W].
  - !Permute
    channel_first: true
    to_bgr: false
  # List of batch transformations to use.
  batch_transforms:
  # Randomly reshape the images in each mini-batch to one of the following sizes.
  - !RandomShape
    sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
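    # All candidate sizes are multiples of 32, the largest output stride of DarkNet-53,
    # so every randomly chosen shape still yields integral 1/32, 1/16 and 1/8 feature
    # maps (e.g. 320 -> 10x10, 608 -> 19x19 at stride 32).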
  # YOLOv3 reads ground truth boxes into a zero-padded tensor holding at most 50 boxes.
  num_max_boxes: 50
  # YOLOv3 reads ground truth labels without treating background as label 0.
  with_background: false
  # Number of samples, -1 represents all samples. -1 by default.
  samples: -1
  # Whether samples should be shuffled, true by default.
  shuffle: true
  # Whether to drop the last batch if it contains fewer images than batch_size.
  drop_last: true
  # Whether to use a multi-process reader in training.
  use_process: true
  # Number of worker processes for the multi-process reader.
  num_workers: 8
  # Buffer size for reader.
  bufsize: 128
  # Number of epochs during which image mixup is applied.
  mixup_epoch: 250

# Data feed module for evaluation
YoloEvalFeed:
  batch_size: 8
  dataset:
    dataset_dir: dataset/coco
    annotation: annotations/instances_val2017.json
    image_dir: val2017
  batch_transforms: []
  fields: [image, im_size, im_id, gt_box, gt_label, is_difficult]
  image_shape: [3, 608, 608]
  sample_transforms:
  - !DecodeImage
    to_rgb: true
    with_mixup: false
  # Rescale the image to the specified target size, capped at max_size if max_size != 0.
  # If target_size is a list, a scale is randomly selected as the target size.
  - !ResizeImage
    interp: 2 # 2 for cv2.INTER_CUBIC
    max_size: 0
    target_size: 608
    use_cv2: true
  - !NormalizeImage
    is_channel_first: false
    is_scale: true
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  - !Permute
    channel_first: true
    to_bgr: false
  num_max_boxes: 50
  samples: -1
  shuffle: false
  drop_last: false
  # Use multi-thread reader in evaluation mode.
  use_process: false
  # Thread number for multi-thread reader.
  num_workers: 8
  with_background: false

# Data feed module for test
YoloTestFeed:
  batch_size: 1
  dataset:
    annotation: dataset/coco/annotations/instances_val2017.json
  batch_transforms: []
  fields: [image, im_size, im_id]
  sample_transforms:
  - !DecodeImage
    to_rgb: true
    with_mixup: false
  - !ResizeImage
    interp: 2
    max_size: 0
    target_size: 608
    use_cv2: true
  - !NormalizeImage
    is_channel_first: false
    is_scale: true
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  - !Permute
    channel_first: true
    to_bgr: false
  num_max_boxes: 50
  samples: -1
  shuffle: false
  drop_last: false
  # Use multi-thread reader in test mode.
  use_process: false
  num_workers: 8
  with_background: false