# SSD detector configuration (VGG backbone, 300x300 input, VOC metric).
#
# The `!Name` tags used below are application-specific YAML tags; this file
# must be read by a loader that registers them.
# NOTE(review): nesting was reconstructed from key order — confirm against
# the consuming project's schema, in particular the placement of `drop_last`
# and `image_shape` at feed level.

# Top-level component selection: each *_feed value names a section below.
architecture: SSD
train_feed: SSDTrainFeed
eval_feed: SSDEvalFeed
test_feed: SSDTestFeed

# Run / schedule settings.
use_gpu: true
max_iters: 120001
snapshot_iter: 10000
log_smooth_window: 20
log_iter: 20

# Evaluation metric.
metric: VOC
map_type: 11point

# Weights and output locations.
pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_caffe_pretrained.tar
save_dir: output
weights: output/ssd_vgg16_300_voc/model_final/

# 20 label classes + 1 background class.
num_classes: 21

# Architecture wiring: `backbone` and `multi_box_head` name sections below.
SSD:
  backbone: VGG
  multi_box_head: MultiBoxHead
  output_decoder:
    background_label: 0
    keep_top_k: 200
    nms_eta: 1.0
    nms_threshold: 0.45
    nms_top_k: 400
    score_threshold: 0.01

VGG:
  depth: 16
  with_extra_blocks: true
  # One entry per feature map (six, matching the six-entry lists in
  # MultiBoxHead).
  # NOTE(review): -1 presumably disables normalization for that map — confirm.
  normalizations: [20., -1, -1, -1, -1, -1]

MultiBoxHead:
  base_size: 300
  aspect_ratios: [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]]
  min_ratio: 20
  max_ratio: 90
  min_sizes: [30.0, 60.0, 111.0, 162.0, 213.0, 264.0]
  max_sizes: [60.0, 111.0, 162.0, 213.0, 264.0, 315.0]
  steps: [8, 16, 32, 64, 100, 300]
  offset: 0.5
  flip: true
  min_max_aspect_ratios_order: true
  kernel_size: 3
  pad: 1

LearningRate:
  base_lr: 0.001
  schedulers:
    - !PiecewiseDecay
      gamma: 0.1
      milestones: [80000, 100000]

OptimizerBuilder:
  optimizer:
    momentum: 0.9
    type: Momentum
  regularizer:
    factor: 0.0005
    type: L2

# Training feed (referenced by `train_feed`).
SSDTrainFeed:
  batch_size: 8
  dataset:
    dataset_dir: dataset/voc
    annotation: VOCdevkit/VOC_all/ImageSets/Main/train.txt
    image_dir: VOCdevkit/VOC_all/JPEGImages
    use_default_label: true
  image_shape: [3, 300, 300]
  sample_transforms:
    - !DecodeImage
      to_rgb: true
      with_mixup: false
    - !NormalizeBox {}
    - !RandomDistort
      brightness_lower: 0.875
      brightness_upper: 1.125
      is_order: true
    - !ExpandImage
      max_ratio: 4
      mean: [104, 117, 123]
      prob: 0.5
    - !CropImage
      avoid_no_bbox: true
      # NOTE(review): columns are presumably [max_sample, max_trial,
      # min_scale, max_scale, min_aspect_ratio, max_aspect_ratio,
      # min_overlap, max_overlap] — confirm against the CropImage operator.
      batch_sampler:
        - [1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
        - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0]
        - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0]
        - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0]
        - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0]
        - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0]
        - [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]
      satisfy_all: false
    - !ResizeImage
      interp: 1
      target_size: 300
      use_cv2: false
    - !RandomFlipImage
      is_normalized: true
    - !Permute
      to_bgr: false
    - !NormalizeImage
      is_scale: false
      mean: [104, 117, 123]
      std: [1, 1, 1]

# Evaluation feed (referenced by `eval_feed`).
SSDEvalFeed:
  batch_size: 32
  dataset:
    dataset_dir: dataset/voc
    annotation: VOCdevkit/VOC_all/ImageSets/Main/val.txt
    image_dir: VOCdevkit/VOC_all/JPEGImages
    use_default_label: true
  drop_last: false
  image_shape: [3, 300, 300]
  sample_transforms:
    - !DecodeImage
      to_rgb: true
      with_mixup: false
    - !NormalizeBox {}
    - !ResizeImage
      interp: 1
      target_size: 300
      use_cv2: false
    - !Permute
      to_bgr: false
    - !NormalizeImage
      is_scale: false
      mean: [104, 117, 123]
      std: [1, 1, 1]

# Test / inference feed (referenced by `test_feed`).
SSDTestFeed:
  batch_size: 1
  dataset:
    use_default_label: true
  drop_last: false
  image_shape: [3, 300, 300]
  sample_transforms:
    - !DecodeImage
      to_rgb: true
      with_mixup: false
    - !ResizeImage
      interp: 1
      max_size: 0
      target_size: 300
      use_cv2: false
    - !Permute
      to_bgr: false
    - !NormalizeImage
      is_scale: false
      mean: [104, 117, 123]
      std: [1, 1, 1]