#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import division
from __future__ import print_function

import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Constant
from paddle.fluid.initializer import Normal
from paddle.fluid.regularizer import L2Decay

from config import cfg

from .darknet import add_DarkNet53_conv_body
from .darknet import conv_bn_layer



def yolo_detection_block(input, channel, is_test=True, name=None):
    """Build one YOLOv3 detection block and return its two outputs.

    The block chains ``conv_bn_layer`` calls: two pairs of a 1x1 layer with
    ``channel`` filters followed by a 3x3 layer with ``channel * 2`` filters,
    then a 1x1 layer with ``channel`` filters (``route``) and finally a 3x3
    layer with ``channel * 2`` filters applied to ``route`` (``tip``).

    Args:
        input: feature map fed to the first ``conv_bn_layer`` call.
        channel (int): filter count of the 1x1 layers; must be even.
        is_test (bool): forwarded unchanged to every ``conv_bn_layer`` call.
        name (str): prefix of the generated layer names.  It is formatted
            into the names as-is, so the default ``None`` yields names that
            start with the literal text ``"None"``.

    Returns:
        tuple: ``(route, tip)`` as described above.

    Raises:
        AssertionError: if ``channel`` is not divisible by 2.
    """
    assert channel % 2 == 0, \
            "channel {} cannot be divided by 2".format(channel)

    conv = input
    for j in range(2):
        conv = conv_bn_layer(
            conv,
            channel,
            filter_size=1,
            stride=1,
            padding=0,
            is_test=is_test,
            name='{}.{}.0'.format(name, j))
        conv = conv_bn_layer(
            conv,
            channel * 2,
            filter_size=3,
            stride=1,
            padding=1,
            is_test=is_test,
            name='{}.{}.1'.format(name, j))

    # `route` is what the caller concatenates with the next block's input;
    # `tip` is what the caller feeds into the output convolution.
    route = conv_bn_layer(
        conv,
        channel,
        filter_size=1,
        stride=1,
        padding=0,
        is_test=is_test,
        name='{}.2'.format(name))
    tip = conv_bn_layer(
        route,
        channel * 2,
        filter_size=3,
        stride=1,
        padding=1,
        is_test=is_test,
        name='{}.tip'.format(name))
    return route, tip


def upsample(input, scale=2, name=None):
    """Upsample ``input`` with ``fluid.layers.resize_nearest``.

    The target size is computed inside the graph from the runtime shape of
    ``input`` and handed to the resize op through ``actual_shape``.

    Args:
        input: feature map to enlarge.  Elements 2 and 3 of its shape are
            treated as height and width (assumes NCHW layout, as the local
            variable names suggest -- confirm against callers).
        scale (int): multiplier applied to both sliced dimensions.
        name (str): optional name forwarded to ``resize_nearest``.

    Returns:
        The output of ``fluid.layers.resize_nearest``.
    """
    # get dynamic upsample output shape
    shape_nchw = fluid.layers.shape(input)
    shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
    shape_hw.stop_gradient = True
    in_shape = fluid.layers.cast(shape_hw, dtype='int32')
    out_shape = in_shape * scale
    out_shape.stop_gradient = True

    # resize by actual_shape
    out = fluid.layers.resize_nearest(
        input=input, scale=scale, actual_shape=out_shape, name=name)
    return out

class YOLOv3(object):
    """Graph builder for a YOLOv3 detector on a DarkNet53 body.

    Typical use: construct, call ``build_model()``, then read ``loss()``
    (training) or ``get_pred()`` (evaluation).  Settings are read from the
    module-level ``cfg`` object.
    """

    def __init__(self, is_train=True, use_random=True):
        """Store the mode flags and initialise the result containers.

        Args:
            is_train (bool): build the training graph (reader + losses) when
                true, otherwise the inference graph (boxes + scores).
            use_random (bool): stored on the instance; not read by any
                method of this class.
        """
        self.is_train = is_train
        self.use_random = use_random
        self.outputs = []
        self.losses = []
        # Created here as well as in build_model() so the attributes exist
        # on every instance.
        self.boxes = []
        self.scores = []
        # Downsample ratio passed to the first output head; each following
        # head uses half the ratio of the previous one.
        self.downsample = 32

    def build_input(self):
        """Create the input variables for the selected mode.

        Training: a ``py_reader`` that yields image, gt box, gt label and
        gt score.  Inference: ``image``, ``im_shape`` and ``im_id`` data
        layers.
        """
        self.image_shape = [3, cfg.input_size, cfg.input_size]
        if self.is_train:
            self.py_reader = fluid.layers.py_reader(
                capacity=64,
                shapes=[[-1] + self.image_shape, [-1, cfg.max_box_num, 4],
                        [-1, cfg.max_box_num], [-1, cfg.max_box_num]],
                lod_levels=[0, 0, 0, 0],
                dtypes=['float32'] * 2 + ['int32'] + ['float32'],
                use_double_buffer=True)
            self.image, self.gtbox, self.gtlabel, self.gtscore = \
                    fluid.layers.read_file(self.py_reader)
        else:
            self.image = fluid.layers.data(
                name='image', shape=self.image_shape, dtype='float32')
            self.im_shape = fluid.layers.data(
                name="im_shape", shape=[2], dtype='int32')
            self.im_id = fluid.layers.data(
                name="im_id", shape=[1], dtype='int32')

    def feeds(self):
        """Return the input variables in feed order for the current mode."""
        if not self.is_train:
            return [self.image, self.im_id, self.im_shape]
        return [self.image, self.gtbox, self.gtlabel, self.gtscore]

    def build_model(self):
        """Build inputs, backbone, detection heads and losses/predictions.

        Fills ``self.outputs`` with one raw head output per backbone block,
        then either ``self.losses`` (training) or ``self.boxes`` and
        ``self.scores`` (inference).  All result lists are reset first and
        ``self.downsample`` is left untouched, so results do not depend on
        state left behind by an earlier call.
        """
        self.build_input()

        self.outputs = []
        self.losses = []
        self.boxes = []
        self.scores = []

        blocks = add_DarkNet53_conv_body(self.image, not self.is_train)
        for i, block in enumerate(blocks):
            if i > 0:
                # `route` is the upsampled transition output produced at the
                # end of the previous iteration.
                block = fluid.layers.concat(input=[route, block], axis=1)
            route, tip = yolo_detection_block(
                block,
                channel=512 // (2**i),
                is_test=(not self.is_train),
                name="yolo_block.{}".format(i))

            # out channel number = mask_num * (5 + class_num)
            num_filters = len(cfg.anchor_masks[i]) * (cfg.class_num + 5)
            block_out = fluid.layers.conv2d(
                input=tip,
                num_filters=num_filters,
                filter_size=1,
                stride=1,
                padding=0,
                act=None,
                param_attr=ParamAttr(
                    initializer=fluid.initializer.Normal(0., 0.02),
                    name="yolo_output.{}.conv.weights".format(i)),
                bias_attr=ParamAttr(
                    initializer=fluid.initializer.Constant(0.0),
                    regularizer=L2Decay(0.),
                    name="yolo_output.{}.conv.bias".format(i)))
            self.outputs.append(block_out)

            if i < len(blocks) - 1:
                route = conv_bn_layer(
                    input=route,
                    ch_out=256 // (2**i),
                    filter_size=1,
                    stride=1,
                    padding=0,
                    is_test=(not self.is_train),
                    name="yolo_transition.{}".format(i))
                # upsample
                route = upsample(route)

        # Track the per-head ratio in a local so that self.downsample keeps
        # its base value and a later build starts from the same ratio.
        downsample = self.downsample
        for i, out in enumerate(self.outputs):
            anchor_mask = cfg.anchor_masks[i]

            if self.is_train:
                loss = fluid.layers.yolov3_loss(
                    x=out,
                    gt_box=self.gtbox,
                    gt_label=self.gtlabel,
                    gt_score=self.gtscore,
                    anchors=cfg.anchors,
                    anchor_mask=anchor_mask,
                    class_num=cfg.class_num,
                    ignore_thresh=cfg.ignore_thresh,
                    downsample_ratio=downsample,
                    use_label_smooth=cfg.label_smooth,
                    name="yolo_loss" + str(i))
                self.losses.append(fluid.layers.reduce_mean(loss))
            else:
                # cfg.anchors is indexed as a flat list with two consecutive
                # entries per anchor; collect the pairs selected by the mask.
                mask_anchors = []
                for m in anchor_mask:
                    mask_anchors.extend(
                        (cfg.anchors[2 * m], cfg.anchors[2 * m + 1]))
                boxes, scores = fluid.layers.yolo_box(
                    x=out,
                    img_size=self.im_shape,
                    anchors=mask_anchors,
                    class_num=cfg.class_num,
                    conf_thresh=cfg.valid_thresh,
                    downsample_ratio=downsample,
                    name="yolo_box" + str(i))
                self.boxes.append(boxes)
                self.scores.append(
                    fluid.layers.transpose(
                        scores, perm=[0, 2, 1]))

            downsample //= 2

    def loss(self):
        """Return the sum of the per-head losses (``0`` if there are none)."""
        return sum(self.losses)

    def get_pred(self):
        """Concatenate per-head boxes and scores and apply multiclass NMS."""
        yolo_boxes = fluid.layers.concat(self.boxes, axis=1)
        yolo_scores = fluid.layers.concat(self.scores, axis=2)
        return fluid.layers.multiclass_nms(
            bboxes=yolo_boxes,
            scores=yolo_scores,
            score_threshold=cfg.valid_thresh,
            nms_top_k=cfg.nms_topk,
            keep_top_k=cfg.nms_posk,
            nms_threshold=cfg.nms_thresh,
            background_label=-1,
            name="multiclass_nms")