Commit 89b6f5fe authored by Kaipeng Deng, committed by GitHub

Merge pull request #23 from heavengate/yolov3

add YOLOv3
@@ -1084,7 +1084,7 @@ class Model(fluid.dygraph.Layer):
        return eval_result

    def predict(self, test_data, batch_size=1, num_workers=0, stack_outputs=True):
        """
        FIXME: add more comments and usage
        Args:
@@ -1097,6 +1097,12 @@ class Model(fluid.dygraph.Layer):
            num_workers (int): the number of subprocesses used to load data, 0 means
                no subprocess is used and data is loaded in the main process. When
                train_data and eval_data are both instances of Dataloader, this
                parameter will be ignored.
            stack_outputs (bool): whether to stack each output field like a batch.
                If an output field of a sample has shape [X, Y] and test_data
                contains N samples, the predicted output field has shape [N, X, Y]
                when stack_outputs is True, and is a length-N list of [X, Y] arrays
                when stack_outputs is False. stack_outputs=False is intended for
                LoDTensor outputs; True is recommended if the outputs contain no
                LoDTensor. Default: True
        """
        if fluid.in_dygraph_mode():
@@ -1123,19 +1129,16 @@ class Model(fluid.dygraph.Layer):
        if not isinstance(test_loader, Iterable):
            loader = test_loader()

        outputs = []
        for data in tqdm.tqdm(loader):
            data = flatten(data)
            outputs.append(self.test(data[:len(self._inputs)]))

        # NOTE: for LoDTensor outputs, we should not stack them,
        # as stacking may lose detail info
        outputs = list(zip(*outputs))
        if stack_outputs:
            outputs = [np.stack(outs, axis=0) for outs in outputs]

        self._test_dataloader = None
        if test_loader is not None and self._adapter._nranks > 1 \
...
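# A quick sketch of the new stack_outputs switch (illustrative only; `model` and
# `test_data` here stand for a prepared Model instance and a dataset of N samples
# whose single output field has shape [X, Y] per sample):
#
#     preds = model.predict(test_data, stack_outputs=True)   # preds[0].shape == (N, X, Y)
#     preds = model.predict(test_data, stack_outputs=False)  # preds[0] is a length-N list of [X, Y] arrays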
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import resnet
from . import darknet
from . import yolov3
from .resnet import *
from .darknet import *
from .yolov3 import *
__all__ = resnet.__all__ \
+ darknet.__all__ \
+ yolov3.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
from model import Model
from .download import get_weights_path
__all__ = ['DarkNet53', 'ConvBNLayer', 'darknet53']
# {num_layers: (url, md5)}
pretrain_infos = {
53: ('https://paddlemodels.bj.bcebos.com/hapi/darknet53.pdparams',
'2506357a5c31e865785112fc614a487d')
}
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act="leaky"):
super(ConvBNLayer, self).__init__()
self.conv = Conv2D(
num_channels=ch_in,
num_filters=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=False,
act=None)
self.batch_norm = BatchNorm(
num_channels=ch_out,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02),
regularizer=L2Decay(0.)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.)))
self.act = act
def forward(self, inputs):
out = self.conv(inputs)
out = self.batch_norm(out)
if self.act == 'leaky':
out = fluid.layers.leaky_relu(x=out, alpha=0.1)
return out
class DownSample(fluid.dygraph.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=2,
padding=1):
super(DownSample, self).__init__()
self.conv_bn_layer = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding)
self.ch_out = ch_out
def forward(self, inputs):
out = self.conv_bn_layer(inputs)
return out
class BasicBlock(fluid.dygraph.Layer):
def __init__(self, ch_in, ch_out):
super(BasicBlock, self).__init__()
self.conv1 = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
padding=0)
self.conv2 = ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out*2,
filter_size=3,
stride=1,
padding=1)
def forward(self, inputs):
conv1 = self.conv1(inputs)
conv2 = self.conv2(conv1)
out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None)
return out
class LayerWarp(fluid.dygraph.Layer):
def __init__(self, ch_in, ch_out, count):
super(LayerWarp, self).__init__()
self.basicblock0 = BasicBlock(ch_in, ch_out)
self.res_out_list = []
for i in range(1, count):
res_out = self.add_sublayer("basic_block_%d" % (i),
BasicBlock(
ch_out*2,
ch_out))
self.res_out_list.append(res_out)
self.ch_out = ch_out
def forward(self, inputs):
y = self.basicblock0(inputs)
for basic_block_i in self.res_out_list:
y = basic_block_i(y)
return y
DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
class DarkNet53(Model):
def __init__(self, num_layers=53, ch_in=3):
super(DarkNet53, self).__init__()
assert num_layers in DarkNet_cfg.keys(), \
"only support num_layers in {} currently" \
.format(DarkNet_cfg.keys())
self.stages = DarkNet_cfg[num_layers]
self.stages = self.stages[0:5]
self.conv0 = ConvBNLayer(
ch_in=ch_in,
ch_out=32,
filter_size=3,
stride=1,
padding=1)
self.downsample0 = DownSample(
ch_in=32,
ch_out=32 * 2)
self.darknet53_conv_block_list = []
self.downsample_list = []
ch_in = [64,128,256,512,1024]
for i, stage in enumerate(self.stages):
conv_block = self.add_sublayer(
"stage_%d" % (i),
LayerWarp(
int(ch_in[i]),
32*(2**i),
stage))
self.darknet53_conv_block_list.append(conv_block)
for i in range(len(self.stages) - 1):
downsample = self.add_sublayer(
"stage_%d_downsample" % i,
DownSample(
ch_in=32*(2**(i+1)),
ch_out=32*(2**(i+2))))
self.downsample_list.append(downsample)
def forward(self, inputs):
out = self.conv0(inputs)
out = self.downsample0(out)
blocks = []
for i, conv_block_i in enumerate(self.darknet53_conv_block_list):
out = conv_block_i(out)
blocks.append(out)
if i < len(self.stages) - 1:
out = self.downsample_list[i](out)
return blocks[-1:-4:-1]
def _darknet(num_layers=53, input_channels=3, pretrained=True):
model = DarkNet53(num_layers, input_channels)
if pretrained:
assert num_layers in pretrain_infos.keys(), \
"DarkNet{} do not have pretrained weights now, " \
"pretrained should be set as False".format(num_layers)
weight_path = get_weights_path(*(pretrain_infos[num_layers]))
assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams"
model.load(weight_path[:-9])
return model
def darknet53(input_channels=3, pretrained=True):
return _darknet(53, input_channels, pretrained)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from model import Model, Loss
from .darknet import darknet53, ConvBNLayer
from .download import get_weights_path
__all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53']
# {num_layers: (url, md5)}
pretrain_infos = {
53: ('https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams',
'aed7dd45124ff2e844ae3bd5ba6c91d2')
}
class YoloDetectionBlock(fluid.dygraph.Layer):
def __init__(self, ch_in, channel):
super(YoloDetectionBlock, self).__init__()
assert channel % 2 == 0, \
"channel {} cannot be divided by 2".format(channel)
self.conv0 = ConvBNLayer(
ch_in=ch_in,
ch_out=channel,
filter_size=1,
stride=1,
padding=0)
self.conv1 = ConvBNLayer(
ch_in=channel,
ch_out=channel*2,
filter_size=3,
stride=1,
padding=1)
self.conv2 = ConvBNLayer(
ch_in=channel*2,
ch_out=channel,
filter_size=1,
stride=1,
padding=0)
self.conv3 = ConvBNLayer(
ch_in=channel,
ch_out=channel*2,
filter_size=3,
stride=1,
padding=1)
self.route = ConvBNLayer(
ch_in=channel*2,
ch_out=channel,
filter_size=1,
stride=1,
padding=0)
self.tip = ConvBNLayer(
ch_in=channel,
ch_out=channel*2,
filter_size=3,
stride=1,
padding=1)
def forward(self, inputs):
out = self.conv0(inputs)
out = self.conv1(out)
out = self.conv2(out)
out = self.conv3(out)
route = self.route(out)
tip = self.tip(route)
return route, tip
class YOLOv3(Model):
def __init__(self, num_classes=80, model_mode='train'):
super(YOLOv3, self).__init__()
self.num_classes = num_classes
assert str.lower(model_mode) in ['train', 'eval', 'test'], \
"model_mode should be 'train' 'eval' or 'test', but got " \
"{}".format(model_mode)
self.model_mode = str.lower(model_mode)
self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
59, 119, 116, 90, 156, 198, 373, 326]
self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
self.valid_thresh = 0.005
self.nms_thresh = 0.45
self.nms_topk = 400
self.nms_posk = 100
self.draw_thresh = 0.5
self.backbone = darknet53(pretrained=(model_mode=='train'))
self.block_outputs = []
self.yolo_blocks = []
self.route_blocks = []
for idx, num_chan in enumerate([1024, 768, 384]):
yolo_block = self.add_sublayer(
"yolo_detecton_block_{}".format(idx),
YoloDetectionBlock(num_chan, 512 // (2**idx)))
self.yolo_blocks.append(yolo_block)
num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5)
block_out = self.add_sublayer(
"block_out_{}".format(idx),
Conv2D(num_channels=1024 // (2**idx),
num_filters=num_filters,
filter_size=1,
act=None,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.))))
self.block_outputs.append(block_out)
if idx < 2:
route = self.add_sublayer(
"route2_{}".format(idx),
ConvBNLayer(ch_in=512 // (2**idx),
ch_out=256 // (2**idx),
filter_size=1,
act='leaky_relu'))
self.route_blocks.append(route)
def forward(self, img_info, inputs):
outputs = []
boxes = []
scores = []
downsample = 32
feats = self.backbone(inputs)
route = None
for idx, feat in enumerate(feats):
if idx > 0:
feat = fluid.layers.concat(input=[route, feat], axis=1)
route, tip = self.yolo_blocks[idx](feat)
block_out = self.block_outputs[idx](tip)
outputs.append(block_out)
if idx < 2:
route = self.route_blocks[idx](route)
route = fluid.layers.resize_nearest(route, scale=2)
if self.model_mode != 'train':
anchor_mask = self.anchor_masks[idx]
mask_anchors = []
for m in anchor_mask:
mask_anchors.append(self.anchors[2 * m])
mask_anchors.append(self.anchors[2 * m + 1])
img_shape = fluid.layers.slice(img_info, axes=[1], starts=[1], ends=[3])
img_id = fluid.layers.slice(img_info, axes=[1], starts=[0], ends=[1])
b, s = fluid.layers.yolo_box(
x=block_out,
img_size=img_shape,
anchors=mask_anchors,
class_num=self.num_classes,
conf_thresh=self.valid_thresh,
downsample_ratio=downsample)
boxes.append(b)
scores.append(fluid.layers.transpose(s, perm=[0, 2, 1]))
downsample //= 2
if self.model_mode == 'train':
return outputs
preds = [img_id[0, :],
fluid.layers.multiclass_nms(
bboxes=fluid.layers.concat(boxes, axis=1),
scores=fluid.layers.concat(scores, axis=2),
score_threshold=self.valid_thresh,
nms_top_k=self.nms_topk,
keep_top_k=self.nms_posk,
nms_threshold=self.nms_thresh,
background_label=-1)]
if self.model_mode == 'test':
return preds
# model_mode == "eval"
return outputs + preds
class YoloLoss(Loss):
def __init__(self, num_classes=80, num_max_boxes=50):
super(YoloLoss, self).__init__()
self.num_classes = num_classes
self.num_max_boxes = num_max_boxes
self.ignore_thresh = 0.7
self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
59, 119, 116, 90, 156, 198, 373, 326]
self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
def forward(self, outputs, labels):
downsample = 32
gt_box, gt_label, gt_score = labels
losses = []
for idx, out in enumerate(outputs):
if idx == 3: break  # only the 3 feature-map outputs carry loss; skip eval-mode prediction outputs
anchor_mask = self.anchor_masks[idx]
loss = fluid.layers.yolov3_loss(
x=out,
gt_box=gt_box,
gt_label=gt_label,
gt_score=gt_score,
anchor_mask=anchor_mask,
downsample_ratio=downsample,
anchors=self.anchors,
class_num=self.num_classes,
ignore_thresh=self.ignore_thresh,
use_label_smooth=True)
loss = fluid.layers.reduce_mean(loss)
losses.append(loss)
downsample //= 2
return losses
def _yolov3_darknet(num_layers=53, num_classes=80,
model_mode='train', pretrained=True):
model = YOLOv3(num_classes, model_mode)
if pretrained:
assert num_layers in pretrain_infos.keys(), \
"YOLOv3-DarkNet{} do not have pretrained weights now, " \
"pretrained should be set as False".format(num_layers)
weight_path = get_weights_path(*(pretrain_infos[num_layers]))
assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams"
model.load(weight_path[:-9])
return model
def yolov3_darknet53(num_classes=80, model_mode='train', pretrained=True):
return _yolov3_darknet(53, num_classes, model_mode, pretrained)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import argparse
import contextlib
import os
import random
import time
from functools import partial
import cv2
import numpy as np
from pycocotools.coco import COCO
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from model import Model, Loss, Input
from resnet import ResNet, ConvBNLayer
import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
# XXX transfer learning
class ResNetBackBone(ResNet):
def __init__(self, depth=50):
super(ResNetBackBone, self).__init__(depth=depth)
delattr(self, 'fc')
def forward(self, inputs):
x = self.conv(inputs)
x = self.pool(x)
outputs = []
for layer in self.layers:
x = layer(x)
outputs.append(x)
return outputs
class YoloDetectionBlock(fluid.dygraph.Layer):
def __init__(self, num_channels, num_filters):
super(YoloDetectionBlock, self).__init__()
assert num_filters % 2 == 0, \
"num_filters {} cannot be divided by 2".format(num_filters)
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='leaky_relu')
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 2,
filter_size=3,
act='leaky_relu')
self.conv2 = ConvBNLayer(
num_channels=num_filters * 2,
num_filters=num_filters,
filter_size=1,
act='leaky_relu')
self.conv3 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 2,
filter_size=3,
act='leaky_relu')
self.route = ConvBNLayer(
num_channels=num_filters * 2,
num_filters=num_filters,
filter_size=1,
act='leaky_relu')
self.tip = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 2,
filter_size=3,
act='leaky_relu')
def forward(self, inputs):
out = self.conv0(inputs)
out = self.conv1(out)
out = self.conv2(out)
out = self.conv3(out)
route = self.route(out)
tip = self.tip(route)
return route, tip
class YOLOv3(Model):
def __init__(self, num_classes=80):
super(YOLOv3, self).__init__()
self.num_classes = num_classes
self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
59, 119, 116, 90, 156, 198, 373, 326]
self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
self.valid_thresh = 0.005
self.nms_thresh = 0.45
self.nms_topk = 400
self.nms_posk = 100
self.draw_thresh = 0.5
self.backbone = ResNetBackBone()
self.block_outputs = []
self.yolo_blocks = []
self.route_blocks = []
for idx, num_chan in enumerate([2048, 1280, 640]):
yolo_block = self.add_sublayer(
"detecton_block_{}".format(idx),
YoloDetectionBlock(num_chan, num_filters=512 // (2**idx)))
self.yolo_blocks.append(yolo_block)
num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5)
block_out = self.add_sublayer(
"block_out_{}".format(idx),
Conv2D(num_channels=1024 // (2**idx),
num_filters=num_filters,
filter_size=1,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.))))
self.block_outputs.append(block_out)
if idx < 2:
route = self.add_sublayer(
"route_{}".format(idx),
ConvBNLayer(num_channels=512 // (2**idx),
num_filters=256 // (2**idx),
filter_size=1,
act='leaky_relu'))
self.route_blocks.append(route)
def forward(self, inputs, img_info):
outputs = []
boxes = []
scores = []
downsample = 32
feats = self.backbone(inputs)
feats = feats[::-1][:len(self.anchor_masks)]
route = None
for idx, feat in enumerate(feats):
if idx > 0:
feat = fluid.layers.concat(input=[route, feat], axis=1)
route, tip = self.yolo_blocks[idx](feat)
block_out = self.block_outputs[idx](tip)
outputs.append(block_out)
if idx < 2:
route = self.route_blocks[idx](route)
route = fluid.layers.resize_nearest(route, scale=2)
if self.mode == 'test':
anchor_mask = self.anchor_masks[idx]
mask_anchors = []
for m in anchor_mask:
mask_anchors.append(self.anchors[2 * m])
mask_anchors.append(self.anchors[2 * m + 1])
img_shape = fluid.layers.slice(img_info, axes=[1], starts=[1], ends=[3])
img_id = fluid.layers.slice(img_info, axes=[1], starts=[0], ends=[1])
b, s = fluid.layers.yolo_box(
x=block_out,
img_size=img_shape,
anchors=mask_anchors,
class_num=self.num_classes,
conf_thresh=self.valid_thresh,
downsample_ratio=downsample)
boxes.append(b)
scores.append(fluid.layers.transpose(s, perm=[0, 2, 1]))
downsample //= 2
if self.mode != 'test':
return outputs
return [img_id, fluid.layers.multiclass_nms(
bboxes=fluid.layers.concat(boxes, axis=1),
scores=fluid.layers.concat(scores, axis=2),
score_threshold=self.valid_thresh,
nms_top_k=self.nms_topk,
keep_top_k=self.nms_posk,
nms_threshold=self.nms_thresh,
background_label=-1)]
class YoloLoss(Loss):
def __init__(self, num_classes=80):
super(YoloLoss, self).__init__()
self.num_classes = num_classes
self.ignore_thresh = 0.7
self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
59, 119, 116, 90, 156, 198, 373, 326]
self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
def forward(self, outputs, labels):
downsample = 32
gt_box, gt_label, gt_score = labels
losses = []
for idx, out in enumerate(outputs):
anchor_mask = self.anchor_masks[idx]
loss = fluid.layers.yolov3_loss(
x=out,
gt_box=gt_box,
gt_label=gt_label,
gt_score=gt_score,
anchor_mask=anchor_mask,
downsample_ratio=downsample,
anchors=self.anchors,
class_num=self.num_classes,
ignore_thresh=self.ignore_thresh,
use_label_smooth=True)
loss = fluid.layers.reduce_mean(loss)
losses.append(loss)
downsample //= 2
return losses
def make_optimizer(parameter_list=None):
base_lr = FLAGS.lr
warm_up_iter = 4000
momentum = 0.9
weight_decay = 5e-4
boundaries = [400000, 450000]
values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
learning_rate = fluid.layers.piecewise_decay(
boundaries=boundaries,
values=values)
learning_rate = fluid.layers.linear_lr_warmup(
learning_rate=learning_rate,
warmup_steps=warm_up_iter,
start_lr=0.0,
end_lr=base_lr)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
regularization=fluid.regularizer.L2Decay(weight_decay),
momentum=momentum,
parameter_list=parameter_list)
return optimizer
def _iou_matrix(a, b):
    # pairwise IoU between two sets of boxes in [x1, y1, x2, y2] format
    tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])   # top-left of intersections
    br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])   # bottom-right of intersections
    area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    area_o = (area_a[:, np.newaxis] + area_b - area_i)  # union area
    return area_i / (area_o + 1e-10)
def _crop_box_with_center_constraint(box, crop):
cropped_box = box.copy()
cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
cropped_box[:, :2] -= crop[:2]
cropped_box[:, 2:] -= crop[:2]
centers = (box[:, :2] + box[:, 2:]) / 2
valid = np.logical_and(
crop[:2] <= centers, centers < crop[2:]).all(axis=1)
valid = np.logical_and(
valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
return cropped_box, np.where(valid)[0]
def random_crop(inputs):
aspect_ratios = [.5, 2.]
thresholds = [.0, .1, .3, .5, .7, .9]
scaling = [.3, 1.]
img, img_ids, gt_box, gt_label = inputs
h, w = img.shape[:2]
if len(gt_box) == 0:
return inputs
np.random.shuffle(thresholds)
for thresh in thresholds:
found = False
for i in range(50):
scale = np.random.uniform(*scaling)
min_ar, max_ar = aspect_ratios
ar = np.random.uniform(max(min_ar, scale**2),
min(max_ar, scale**-2))
crop_h = int(h * scale / np.sqrt(ar))
crop_w = int(w * scale * np.sqrt(ar))
crop_y = np.random.randint(0, h - crop_h)
crop_x = np.random.randint(0, w - crop_w)
crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32))
if iou.max() < thresh:
continue
cropped_box, valid_ids = _crop_box_with_center_constraint(
gt_box, np.array(crop_box, dtype=np.float32))
if valid_ids.size > 0:
found = True
break
if found:
x1, y1, x2, y2 = crop_box
img = img[y1:y2, x1:x2, :]
gt_box = np.take(cropped_box, valid_ids, axis=0)
gt_label = np.take(gt_label, valid_ids, axis=0)
return img, img_ids, gt_box, gt_label
return inputs
# XXX mix up, color distort and random expand are skipped for simplicity
def sample_transform(inputs, mode='train', num_max_boxes=50):
if mode == 'train':
img, img_id, gt_box, gt_label = random_crop(inputs)
else:
img, img_id, gt_box, gt_label = inputs
h, w = img.shape[:2]
# random flip
if mode == 'train' and np.random.uniform(0., 1.) > .5:
img = img[:, ::-1, :]
if len(gt_box) > 0:
swap = gt_box.copy()
gt_box[:, 0] = w - swap[:, 2] - 1
gt_box[:, 2] = w - swap[:, 0] - 1
if len(gt_label) == 0:
gt_box = np.zeros([num_max_boxes, 4], dtype=np.float32)
gt_label = np.zeros([num_max_boxes], dtype=np.int32)
return img, img_id, gt_box, gt_label
gt_box = gt_box[:num_max_boxes, :]
gt_label = gt_label[:num_max_boxes, 0]
# normalize boxes
gt_box /= np.array([w, h] * 2, dtype=np.float32)
gt_box[:, 2:] = gt_box[:, 2:] - gt_box[:, :2]
gt_box[:, :2] = gt_box[:, :2] + gt_box[:, 2:] / 2.
pad = num_max_boxes - gt_label.size
gt_box = np.pad(gt_box, ((0, pad), (0, 0)), mode='constant')
gt_label = np.pad(gt_label, ((0, pad)), mode='constant')
return img, img_id, gt_box, gt_label
def batch_transform(batch, mode='train'):
if mode == 'train':
d = np.random.choice(
[320, 352, 384, 416, 448, 480, 512, 544, 576, 608])
interp = np.random.choice(range(5))
else:
d = 608
interp = cv2.INTER_CUBIC
# transpose batch
imgs, img_ids, gt_boxes, gt_labels = list(zip(*batch))
img_shapes = np.array([[im.shape[0], im.shape[1]] for im in imgs]).astype('int32')
imgs = np.array([cv2.resize(
img, (d, d), interpolation=interp) for img in imgs])
# transpose, permute and normalize
imgs = imgs.astype(np.float32)[..., ::-1]
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.120, 57.375], dtype=np.float32)
invstd = 1. / std
imgs -= mean
imgs *= invstd
imgs = imgs.transpose((0, 3, 1, 2))
img_ids = np.array(img_ids)
img_info = np.concatenate([img_ids, img_shapes], axis=1)
gt_boxes = np.array(gt_boxes)
gt_labels = np.array(gt_labels)
# XXX since mix up is not used, scores are all ones
gt_scores = np.ones_like(gt_labels, dtype=np.float32)
return [imgs, img_info], [gt_boxes, gt_labels, gt_scores]
def coco2017(root_dir, mode='train'):
json_path = os.path.join(
root_dir, 'annotations/instances_{}2017.json'.format(mode))
coco = COCO(json_path)
img_ids = coco.getImgIds()
imgs = coco.loadImgs(img_ids)
class_map = {v: i + 1 for i, v in enumerate(coco.getCatIds())}
samples = []
for img in imgs:
img_path = os.path.join(
root_dir, '{}2017'.format(mode), img['file_name'])
file_path = img_path
width = img['width']
height = img['height']
ann_ids = coco.getAnnIds(imgIds=img['id'], iscrowd=False)
anns = coco.loadAnns(ann_ids)
gt_box = []
gt_label = []
for ann in anns:
x1, y1, w, h = ann['bbox']
x2 = x1 + w - 1
y2 = y1 + h - 1
x1 = np.clip(x1, 0, width - 1)
x2 = np.clip(x2, 0, width - 1)
y1 = np.clip(y1, 0, height - 1)
y2 = np.clip(y2, 0, height - 1)
if ann['area'] <= 0 or x2 < x1 or y2 < y1:
continue
gt_label.append(ann['category_id'])
gt_box.append([x1, y1, x2, y2])
gt_box = np.array(gt_box, dtype=np.float32)
gt_label = np.array([class_map[cls] for cls in gt_label],
dtype=np.int32)[:, np.newaxis]
im_id = np.array([img['id']], dtype=np.int32)
if gt_label.size == 0 and not mode == 'train':
continue
samples.append((file_path, im_id.copy(), gt_box.copy(), gt_label.copy()))
def iterator():
if mode == 'train':
np.random.shuffle(samples)
for file_path, im_id, gt_box, gt_label in samples:
img = cv2.imread(file_path)
yield img, im_id, gt_box, gt_label
return iterator
# XXX coco metrics not included for simplicity
def run(model, loader, mode='train'):
total_loss = 0.
total_time = 0.
device_ids = list(range(FLAGS.num_devices))
start = time.time()
for idx, batch in enumerate(loader()):
losses = getattr(model, mode)(batch[0], batch[1])
total_loss += np.sum(losses)
if idx > 1: # skip first two steps
total_time += time.time() - start
if idx % 10 == 0:
logger.info("{:04d}: loss {:0.3f} time: {:0.3f}".format(
idx, total_loss / (idx + 1), total_time / max(1, (idx - 1))))
start = time.time()
def main():
@contextlib.contextmanager
def null_guard():
yield
epoch = FLAGS.epoch
batch_size = FLAGS.batch_size
guard = fluid.dygraph.guard() if FLAGS.dynamic else null_guard()
train_loader = fluid.io.xmap_readers(
batch_transform,
paddle.batch(
fluid.io.xmap_readers(
sample_transform,
coco2017(FLAGS.data, 'train'),
process_num=8,
buffer_size=4 * batch_size),
batch_size=batch_size,
drop_last=True),
process_num=2, buffer_size=4)
val_sample_transform = partial(sample_transform, mode='val')
val_batch_transform = partial(batch_transform, mode='val')
val_loader = fluid.io.xmap_readers(
val_batch_transform,
paddle.batch(
fluid.io.xmap_readers(
val_sample_transform,
coco2017(FLAGS.data, 'val'),
process_num=8,
buffer_size=4 * batch_size),
batch_size=1),
process_num=2, buffer_size=4)
if not os.path.exists('yolo_checkpoints'):
os.mkdir('yolo_checkpoints')
with guard:
NUM_CLASSES = 7
NUM_MAX_BOXES = 50
model = YOLOv3(num_classes=NUM_CLASSES)
# XXX transfer learning
if FLAGS.pretrain_weights is not None:
model.backbone.load(FLAGS.pretrain_weights)
if FLAGS.weights is not None:
model.load(FLAGS.weights)
optim = make_optimizer(parameter_list=model.parameters())
anno_path = os.path.join(FLAGS.data, 'annotations', 'instances_val2017.json')
inputs = [Input([None, 3, None, None], 'float32', name='image'),
Input([None, 3], 'int32', name='img_info')]
labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'),
Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'),
Input([None, NUM_MAX_BOXES], 'float32', name='gt_score')]
model.prepare(optim,
YoloLoss(num_classes=NUM_CLASSES),
# For YOLOv3, output variable in train/eval is different,
# which is not supported by metric, add by callback later?
# metrics=COCOMetric(anno_path, with_background=False)
inputs=inputs,
labels=labels)
for e in range(epoch):
logger.info("======== train epoch {} ========".format(e))
run(model, train_loader)
model.save('yolo_checkpoints/{:02d}'.format(e))
logger.info("======== eval epoch {} ========".format(e))
run(model, val_loader, mode='eval')
# should be called in fit()
for metric in model._metrics:
metric.accumulate()
metric.reset()
if __name__ == '__main__':
parser = argparse.ArgumentParser("Yolov3 Training on COCO")
parser.add_argument('data', metavar='DIR', help='path to COCO dataset')
parser.add_argument(
"-d", "--dynamic", action='store_true', help="enable dygraph mode")
parser.add_argument(
"-e", "--epoch", default=300, type=int, help="number of epoch")
parser.add_argument(
'--lr', '--learning-rate', default=0.001, type=float, metavar='LR',
help='initial learning rate')
parser.add_argument(
"-b", "--batch_size", default=64, type=int, help="batch size")
parser.add_argument(
"-n", "--num_devices", default=8, type=int, help="number of devices")
parser.add_argument(
"-p", "--pretrain_weights", default=None, type=str,
help="path to pretrained weights")
parser.add_argument(
"-w", "--weights", default=None, type=str,
help="path to model weights")
FLAGS = parser.parse_args()
assert FLAGS.data, "error: must provide data path"
main()
# YOLOv3 Object Detection Model

---

## Contents

- [Introduction](#introduction)
- [Quick Start](#quick-start)
- [References](#references)

## Introduction

[YOLOv3](https://arxiv.org/abs/1804.02767) is a single-stage detector proposed by [Joseph Redmon](https://arxiv.org/search/cs?searchtype=author&query=Redmon%2C+J) and [Ali Farhadi](https://arxiv.org/search/cs?searchtype=author&query=Farhadi%2C+A). Compared with traditional detection methods of the same accuracy, its inference speed is nearly twice as fast.

Traditional object detection methods work in two stages: the first stage generates candidate boxes, and the second stage classifies them and refines their coordinates. YOLO instead treats detection as a single-stage regression problem over box locations and class probabilities, which lets it run about twice as fast. On top of YOLO, YOLOv3 introduces multi-scale prediction, which greatly improves detection accuracy on small objects.

[YOLOv3](https://arxiv.org/abs/1804.02767) is a one-stage, end-to-end object detector. Its detection principle is illustrated below:
<p align="center">
<img src="image/YOLOv3.jpg" height=400 width=600 hspace='10'/> <br />
YOLOv3 detection principle
</p>
YOLOv3 divides the input image into S\*S grid cells, and each cell predicts B bounding boxes. Each bounding box prediction contains the location (x, y, w, h), a confidence score, and probabilities for C classes, so the number of channels in the YOLOv3 output layer is B\*(5 + C). The YOLOv3 loss likewise has three parts: localization error, confidence error, and classification error.
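As a quick check of the channel count above, with COCO values (B = 3 boxes per cell, C = 80 classes; the snippet is purely illustrative):

```python
B, C = 3, 80
channels = B * (5 + C)  # x, y, w, h, confidence plus C class probabilities per box
print(channels)  # 255
```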
The YOLOv3 network structure is shown below:
<p align="center">
<img src="image/YOLOv3_structure.jpg" height=400 width=400 hspace='10'/> <br />
YOLOv3 network structure
</p>
The YOLOv3 network consists of a base feature extraction network, multi-scale feature fusion layers, and output layers.

1. Feature extraction network. YOLOv3 uses [DarkNet53](https://arxiv.org/abs/1612.08242) for feature extraction. DarkNet53 is essentially a fully convolutional network that replaces pooling layers with stride-2 convolutions and adds residual units to avoid gradient vanishing when the network gets very deep.
2. Feature fusion layers. To fix earlier YOLO versions' insensitivity to small objects, YOLOv3 detects on feature maps at three scales, 13\*13, 26\*26 and 52\*52, for large, medium and small objects respectively. The fusion layers take the three DarkNet feature maps as input and, following the FPN (feature pyramid networks) idea, fuse the feature maps across scales through a series of convolution layers and upsampling.
3. Output layers. These are also fully convolutional; the last convolution layer has 255 filters: 3\*(80+4+1)=255, where 3 is the number of bounding boxes per grid cell, 4 the box coordinates, 1 the confidence score, and 80 the number of COCO classes. The per-scale output shapes are sketched below.
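The following sketch (illustrative variable names, not taken from the code) checks the per-scale head shapes for a 416\*416 input:

```python
num_anchors, num_classes = 3, 80
for stride in (32, 16, 8):   # downsample ratios of the three detection scales
    size = 416 // stride     # 13, 26, 52
    print((num_anchors * (5 + num_classes), size, size))  # per-image head output: (255, S, S)
```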
## Quick Start

### Installation

#### Install PaddlePaddle

This project requires PaddlePaddle 1.7 or above (or an appropriate develop version); see the [installation guide](http://www.paddlepaddle.org/#quick-start).

#### Code download and environment setup

Clone the repository and set the `PYTHONPATH` environment variable:
```bash
git clone https://github.com/PaddlePaddle/hapi
cd hapi
export PYTHONPATH=$PYTHONPATH:`pwd`
cd yolov3
```
#### Install COCO-API

[COCO-API](https://github.com/cocodataset/cocoapi) must be downloaded and installed before training:
```bash
git clone https://github.com/cocodataset/cocoapi.git
cd cocoapi/PythonAPI
# if cython is not installed
pip install Cython
# Install into global site-packages
make install
# Alternatively, if you do not have permissions or prefer
# not to install the COCO API into global site-packages
python setup.py install --user
```
### Data Preparation

The model supports data loading and accuracy evaluation in COCO format. We also provide a Pascal VOC dataset converted to COCO format, which can be downloaded with:
```bash
python dataset/download_voc.py
```
The dataset directory structure is as follows:
```
dataset/voc/
├── annotations
│   ├── instances_train2017.json
│   ├── instances_val2017.json
| ...
├── train2017
│   ├── 1013.jpg
│   ├── 1014.jpg
| ...
├── val2017
│   ├── 2551.jpg
│   ├── 2552.jpg
| ...
```
### Training

Once the data is ready, training and evaluation can be started with the `main.py` script. It alternates training and evaluation every epoch and saves checkpoints under `yolo_checkpoint` by default.

YOLOv3 is trained with a total batch size of 64; the following uses 4 Tesla P40 cards with a per-card batch size of 16. For both static graph and dynamic graph multi-card training, `--batch_size` is the batch size per card, i.e. the total batch size is `--batch_size` multiplied by the number of cards.

`main.py` script arguments can be listed with:
```bash
python main.py --help
```
#### Static graph training

Multi-card training:
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --data=<path/to/dataset> --batch_size=16
```
#### Dynamic graph training

Dynamic graph training only requires adding the `-d` flag.

Multi-card training:
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --data=<path/to/dataset> --batch_size=16 -d
```
### Evaluation

The YOLOv3 model outputs LoDTensors, so evaluation only supports a batch size of 1. Evaluation can be run in either of the following two ways.

1. Evaluate with the Paddle-released [YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams) weights, downloaded automatically:
```bash
python main.py --data=dataset/voc --eval_only
```
2. Load a checkpoint for accuracy evaluation:
```bash
python main.py --data=dataset/voc --eval_only --weights=yolo_checkpoint/no_mixup/final
```
Evaluation can also run in dynamic graph mode by adding the `-d` flag.

#### Evaluation accuracy

Model weights trained on a small 10-class dataset are available at [YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams); the evaluation accuracy is as follows:
```text
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.503
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.779
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.562
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.190
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.390
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.578
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.405
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.591
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.599
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.294
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.506
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.670
```
### Inference and Visualization

Inference can be run in either of the following two ways.

1. Run inference with the Paddle-released [YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams) weights, downloaded automatically:
```bash
python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.jpg
```
2. Load a checkpoint for inference:
```bash
python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.jpg --weights=yolo_checkpoint/no_mixup/final
```
The visualized inference results are saved under the directory specified by `--output_dir` (default `./output`).

Inference prints detection result logs like the following:
```text
2020-04-02 08:26:47,268-INFO: detect bicycle at [116.14993, 127.278336, 579.7716, 438.44214] score: 0.97
2020-04-02 08:26:47,273-INFO: detect dog at [127.44086, 215.71997, 316.04276, 539.7584] score: 0.99
2020-04-02 08:26:47,274-INFO: detect car at [475.42343, 80.007484, 687.16095, 171.27374] score: 0.98
2020-04-02 08:26:47,274-INFO: Detection bbox results save in output/dog.jpg
```
## References
- [You Only Look Once: Unified, Real-Time Object Detection](https://arxiv.org/abs/1506.02640v5), Joseph Redmon, Santosh Divvala, Ross Girshick, Ali Farhadi.
- [YOLOv3: An Incremental Improvement](https://arxiv.org/abs/1804.02767v1), Joseph Redmon, Ali Farhadi.
- [Bag of Freebies for Training Object Detection Neural Networks](https://arxiv.org/abs/1902.04103v3), Zhi Zhang, Tong He, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import os
import cv2
import numpy as np
from pycocotools.coco import COCO
from paddle.fluid.io import Dataset
import logging
logger = logging.getLogger(__name__)
__all__ = ['COCODataset']
class COCODataset(Dataset):
"""
Load dataset with MS-COCO format.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): COCO annotation file path.
sample_num (int): number of samples to load, -1 means all.
with_background (bool): whether load background as a class,
default True.
transform (callable): callable transform to perform on samples,
default None.
mixup (bool): whether to return image mixup samples, default False.
alpha (float): alpha factor of beta distribution to generate
mixup score, used only when mixup is True, default 1.5
beta (float): beta factor of beta distribution to generate
mixup score, used only when mixup is True, default 1.5
"""
def __init__(self,
dataset_dir='',
image_dir='',
anno_path='',
sample_num=-1,
with_background=True,
transform=None,
mixup=False,
alpha=1.5,
beta=1.5):
# roidbs is list of dict whose structure is:
# {
# 'im_file': im_fname, # image file name
# 'im_id': im_id, # image id
# 'h': im_h, # height of image
# 'w': im_w, # width
# 'is_crowd': is_crowd,
# 'gt_class': gt_class,
# 'gt_bbox': gt_bbox,
# 'gt_score': gt_score,
# 'difficult': difficult
# }
self._anno_path = os.path.join(dataset_dir, anno_path)
self._image_dir = os.path.join(dataset_dir, image_dir)
assert os.path.exists(self._anno_path), \
    "anno_path {} does not exist".format(anno_path)
assert os.path.exists(self._image_dir), \
    "image_dir {} does not exist".format(image_dir)
self._sample_num = sample_num
self._with_background = with_background
self._transform = transform
self._mixup = mixup
self._alpha = alpha
self._beta = beta
# load in dataset roidbs
self._load_roidb_and_cname2cid()
def _load_roidb_and_cname2cid(self):
assert self._anno_path.endswith('.json'), \
    'invalid coco annotation file: ' + self._anno_path
coco = COCO(self._anno_path)
img_ids = coco.getImgIds()
cat_ids = coco.getCatIds()
records = []
ct = 0
# when with_background = True, mapping category to classid, like:
# background:0, first_class:1, second_class:2, ...
catid2clsid = dict({
catid: i + int(self._with_background)
for i, catid in enumerate(cat_ids)
})
cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in catid2clsid.items()
})
for img_id in img_ids:
img_anno = coco.loadImgs(img_id)[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
ins_anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
instances = coco.loadAnns(ins_anno_ids)
bboxes = []
for inst in instances:
x, y, box_w, box_h = inst['bbox']
x1 = max(0, x)
y1 = max(0, y)
x2 = min(im_w - 1, x1 + max(0, box_w - 1))
y2 = min(im_h - 1, y1 + max(0, box_h - 1))
if inst['area'] > 0 and x2 >= x1 and y2 >= y1:
inst['clean_bbox'] = [x1, y1, x2, y2]
bboxes.append(inst)
else:
logger.warning(
'Found an invalid bbox in annotations: im_id: {}, '
'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
img_id, float(inst['area']), x1, y1, x2, y2))
num_bbox = len(bboxes)
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
gt_score = np.ones((num_bbox, 1), dtype=np.float32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
difficult = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
for i, box in enumerate(bboxes):
catid = box['category_id']
gt_class[i][0] = catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
is_crowd[i][0] = box['iscrowd']
if 'segmentation' in box:
gt_poly[i] = box['segmentation']
im_fname = os.path.join(self._image_dir,
im_fname) if self._image_dir else im_fname
coco_rec = {
'im_file': im_fname,
'im_id': np.array([img_id]),
'h': im_h,
'w': im_w,
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_score': gt_score,
'gt_poly': gt_poly,
}
records.append(coco_rec)
ct += 1
if self._sample_num > 0 and ct >= self._sample_num:
break
assert len(records) > 0, 'no coco record found in %s' % (self._anno_path)
logger.info('{} samples in file {}'.format(ct, self._anno_path))
self._roidbs, self._cname2cid = records, cname2cid
@property
def num_classes(self):
return len(self._cname2cid)
def __len__(self):
return len(self._roidbs)
def _getitem_by_index(self, idx):
roidb = self._roidbs[idx]
with open(roidb['im_file'], 'rb') as f:
data = np.frombuffer(f.read(), dtype='uint8')
im = cv2.imdecode(data, 1)
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
im_info = np.array([roidb['im_id'][0], roidb['h'], roidb['w']], dtype='int32')
gt_bbox = roidb['gt_bbox']
gt_class = roidb['gt_class']
gt_score = roidb['gt_score']
return im_info, im, gt_bbox, gt_class, gt_score
def __getitem__(self, idx):
im_info, im, gt_bbox, gt_class, gt_score = self._getitem_by_index(idx)
if self._mixup:
mixup_idx = idx + np.random.randint(1, self.__len__())
mixup_idx %= self.__len__()
_, mixup_im, mixup_bbox, mixup_class, _ = \
self._getitem_by_index(mixup_idx)
im, gt_bbox, gt_class, gt_score = \
self._mixup_image(im, gt_bbox, gt_class, mixup_im,
mixup_bbox, mixup_class)
if self._transform:
im_info, im, gt_bbox, gt_class, gt_score = \
self._transform(im_info, im, gt_bbox, gt_class, gt_score)
return [im_info, im, gt_bbox, gt_class, gt_score]
def _mixup_image(self, img1, bbox1, class1, img2, bbox2, class2):
factor = np.random.beta(self._alpha, self._beta)
factor = max(0.0, min(1.0, factor))
if factor >= 1.0:
return img1, bbox1, class1, np.ones_like(class1, dtype="float32")
if factor <= 0.0:
return img2, bbox2, class2, np.ones_like(class2, dtype="float32")
h = max(img1.shape[0], img2.shape[0])
w = max(img1.shape[1], img2.shape[1])
img = np.zeros((h, w, img1.shape[2]), 'float32')
img[:img1.shape[0], :img1.shape[1], :] = \
img1.astype('float32') * factor
img[:img2.shape[0], :img2.shape[1], :] += \
img2.astype('float32') * (1.0 - factor)
gt_bbox = np.concatenate((bbox1, bbox2), axis=0)
gt_class = np.concatenate((class1, class2), axis=0)
score1 = np.ones_like(class1, dtype="float32") * factor
score2 = np.ones_like(class2, dtype="float32") * (1.0 - factor)
gt_score = np.concatenate((score1, score2), axis=0)
return img, gt_bbox, gt_class, gt_score
@property
def mixup(self):
return self._mixup
@mixup.setter
def mixup(self, value):
if not isinstance(value, bool):
raise ValueError("mixup should be a boolean")
logger.info("{} set mixup to {}".format(self, value))
self._mixup = value
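# A minimal usage sketch of COCODataset with mixup (hedged example, not part of
# the original file; paths follow the dataset layout described in the README):
#
#     dataset = COCODataset(dataset_dir='dataset/voc',
#                           anno_path='annotations/instances_train2017.json',
#                           image_dir='train2017',
#                           with_background=False,
#                           mixup=True)
#     im_info, im, gt_bbox, gt_class, gt_score = dataset[0]
#     dataset.mixup = False  # disable mixup for the last epochs, as main.py does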
def pascalvoc_label(with_background=True):
labels_map = {
'aeroplane': 1,
'bicycle': 2,
'bird': 3,
'boat': 4,
'bottle': 5,
'bus': 6,
'car': 7,
'cat': 8,
'chair': 9,
'cow': 10,
'diningtable': 11,
'dog': 12,
'horse': 13,
'motorbike': 14,
'person': 15,
'pottedplant': 16,
'sheep': 17,
'sofa': 18,
'train': 19,
'tvmonitor': 20
}
if not with_background:
labels_map = {k: v - 1 for k, v in labels_map.items()}
return labels_map
@@ -17,8 +17,6 @@ import json
from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO

import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -26,12 +24,13 @@ logger = logging.getLogger(__name__)
__all__ = ['COCOMetric']

OUTFILE = './bbox.json'

# COCOMetric behavior differs from the Metric defined in the high
# level API: COCOMetric can only accumulate at the end of an epoch,
# so we do not implement COCOMetric as a high level API Metric
class COCOMetric():
    """
    Metric for MS-COCO dataset, only supports update with batch
    size 1.
@@ -43,26 +42,24 @@ class COCOMetric(Metric):
    """

    def __init__(self, anno_path, with_background=True, **kwargs):
        self.anno_path = anno_path
        self.with_background = with_background
        self.bbox_results = []

        self.coco_gt = COCO(anno_path)
        cat_ids = self.coco_gt.getCatIds()
        self.clsid2catid = dict(
            {i + int(with_background): catid
             for i, catid in enumerate(cat_ids)})

    def update(self, img_id, bboxes):
        assert img_id.shape[0] == 1, \
            "COCOMetric can only update with batch size = 1"
        if bboxes.shape[1] != 6:
            # no bbox detected in this batch
            return

        img_id = int(img_id)
        for i in range(bboxes.shape[0]):
            dt = bboxes[i, :]
            clsid, score, xmin, ymin, xmax, ymax = dt.tolist()
@@ -72,7 +69,7 @@ class COCOMetric(Metric):
            h = ymax - ymin + 1
            bbox = [xmin, ymin, w, h]
            coco_res = {
                'image_id': img_id,
                'category_id': catid,
                'bbox': bbox,
                'score': score
@@ -83,30 +80,30 @@ class COCOMetric(Metric):
        self.bbox_results = []

    def accumulate(self):
        if len(self.bbox_results) == 0:
            logger.warning("The number of valid bbox detected is zero.\n \
                Please use a reasonable model and check input data.\n \
                stop COCOMetric accumulate!")
            return [0.0]
        with open(OUTFILE, 'w') as f:
            json.dump(self.bbox_results, f)

        map_stats = self.cocoapi_eval(OUTFILE, 'bbox', coco_gt=self.coco_gt)
        # flush coco evaluation result
        sys.stdout.flush()
        self.result = map_stats[0]
        return [self.result]

    def cocoapi_eval(self, jsonfile, style, coco_gt=None, anno_file=None):
        assert coco_gt is not None or anno_file is not None
        if coco_gt is None:
            coco_gt = COCO(anno_file)
        logger.info("Start evaluation...")
        coco_dt = coco_gt.loadRes(jsonfile)
        coco_eval = COCOeval(coco_gt, coco_dt, style)
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        return coco_eval.stats
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,3 +12,35 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import os.path as osp
import sys
import tarfile
from download import _download
import logging
logger = logging.getLogger(__name__)
DATASETS = {
'voc': [
('https://paddlemodels.bj.bcebos.com/hapi/voc.tar',
'9faeb7fd997aeea843092fd608d5bcb4', ),
],
}
def download_decompress_file(data_dir, url, md5):
logger.info("Downloading from {}".format(url))
tar_file = _download(url, data_dir, md5)
logger.info("Decompressing {}".format(tar_file))
with tarfile.open(tar_file) as tf:
tf.extractall(path=data_dir)
os.remove(tar_file)
if __name__ == "__main__":
data_dir = osp.split(osp.realpath(sys.argv[0]))[0]
for name, infos in DATASETS.items():
for info in infos:
download_decompress_file(data_dir, *info)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import os
import argparse
import numpy as np
from PIL import Image
from paddle import fluid
from paddle.fluid.optimizer import Momentum
from paddle.fluid.io import DataLoader
from model import Model, Input, set_device
from models import yolov3_darknet53, YoloLoss
from coco import COCODataset
from transforms import *
from visualizer import draw_bbox
import logging
logger = logging.getLogger(__name__)
IMAGE_MEAN = [0.485, 0.456, 0.406]
IMAGE_STD = [0.229, 0.224, 0.225]
def get_save_image_name(output_dir, image_path):
"""
Get save image name from source image path.
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
image_name = os.path.split(image_path)[-1]
name, ext = os.path.splitext(image_name)
return os.path.join(output_dir, "{}".format(name)) + ext
def load_labels(label_list, with_background=True):
idx = int(with_background)
cat2name = {}
with open(label_list) as f:
for line in f.readlines():
line = line.strip()
if line:
cat2name[idx] = line
idx += 1
return cat2name
def main():
device = set_device(FLAGS.device)
fluid.enable_dygraph(device) if FLAGS.dynamic else None
inputs = [Input([None, 3], 'int32', name='img_info'),
Input([None, 3, None, None], 'float32', name='image')]
cat2name = load_labels(FLAGS.label_list, with_background=False)
model = yolov3_darknet53(num_classes=len(cat2name),
model_mode='test',
pretrained=FLAGS.weights is None)
model.prepare(inputs=inputs, device=FLAGS.device)
if FLAGS.weights is not None:
model.load(FLAGS.weights, reset_optimizer=True)
# image preprocess
orig_img = Image.open(FLAGS.infer_image).convert('RGB')
w, h = orig_img.size
img = orig_img.resize((608, 608), Image.BICUBIC)
img = np.array(img).astype('float32') / 255.0
img -= np.array(IMAGE_MEAN)
img /= np.array(IMAGE_STD)
img = img.transpose((2, 0, 1))[np.newaxis, :]
img_info = np.array([0, h, w]).astype('int32')[np.newaxis, :]
_, bboxes = model.test([img_info, img])
vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold)
save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image)
logger.info("Detection bbox results save in {}".format(save_name))
vis_img.save(save_name, quality=95)
if __name__ == '__main__':
parser = argparse.ArgumentParser("Yolov3 Inference")
parser.add_argument(
"--device", type=str, default='gpu', help="device to use, gpu or cpu")
parser.add_argument(
"-d", "--dynamic", action='store_true', help="enable dygraph mode")
parser.add_argument(
"--label_list", type=str, default=None,
help="path to category label list file")
parser.add_argument(
"-t", "--draw_threshold", type=float, default=0.5,
help="threshold to reserve the result for visualization")
parser.add_argument(
"-i", "--infer_image", type=str, default=None,
help="image path for inference")
parser.add_argument(
"-o", "--output_dir", type=str, default='output',
help="directory to save inference result if --visualize is set")
parser.add_argument(
"-w", "--weights", default=None, type=str,
help="path to weights for inference")
FLAGS = parser.parse_args()
assert os.path.isfile(FLAGS.infer_image), \
"infer_image {} not a file".format(FLAGS.infer_image)
assert os.path.isfile(FLAGS.label_list), \
"label_list {} not a file".format(FLAGS.label_list)
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import argparse
import contextlib
import os
import numpy as np
from paddle import fluid
from paddle.fluid.optimizer import Momentum
from paddle.fluid.io import DataLoader
from model import Model, Input, set_device
from distributed import DistributedBatchSampler
from models import yolov3_darknet53, YoloLoss
from coco_metric import COCOMetric
from coco import COCODataset
from transforms import *
NUM_MAX_BOXES = 50
def make_optimizer(step_per_epoch, parameter_list=None):
base_lr = FLAGS.lr
warm_up_iter = 1000
momentum = 0.9
weight_decay = 5e-4
boundaries = [step_per_epoch * e for e in [200, 250]]
values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
learning_rate = fluid.layers.piecewise_decay(
boundaries=boundaries,
values=values)
learning_rate = fluid.layers.linear_lr_warmup(
learning_rate=learning_rate,
warmup_steps=warm_up_iter,
start_lr=0.0,
end_lr=base_lr)
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
regularization=fluid.regularizer.L2Decay(weight_decay),
momentum=momentum,
parameter_list=parameter_list)
return optimizer
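# For example (hedged, assuming FLAGS.lr = 0.001 and 500 steps per epoch):
#     boundaries = [100000, 125000]           # steps at epochs 200 and 250
#     values     = [0.001, 0.0001, 0.00001]   # 10x decay at each boundary
# preceded by a linear warmup from 0.0 to 0.001 over the first 1000 steps.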
def main():
device = set_device(FLAGS.device)
fluid.enable_dygraph(device) if FLAGS.dynamic else None
inputs = [Input([None, 3], 'int32', name='img_info'),
Input([None, 3, None, None], 'float32', name='image')]
labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'),
Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'),
Input([None, NUM_MAX_BOXES], 'float32', name='gt_score')]
if not FLAGS.eval_only: # training mode
train_transform = Compose([ColorDistort(),
RandomExpand(),
RandomCrop(),
RandomFlip(),
NormalizeBox(),
PadBox(),
BboxXYXY2XYWH()])
train_collate_fn = BatchCompose([RandomShape(), NormalizeImage()])
dataset = COCODataset(dataset_dir=FLAGS.data,
anno_path='annotations/instances_train2017.json',
image_dir='train2017',
with_background=False,
mixup=True,
transform=train_transform)
batch_sampler = DistributedBatchSampler(dataset,
batch_size=FLAGS.batch_size,
shuffle=True,
drop_last=True)
loader = DataLoader(dataset,
batch_sampler=batch_sampler,
places=device,
num_workers=FLAGS.num_workers,
return_list=True,
collate_fn=train_collate_fn)
else: # evaluation mode
eval_transform = Compose([ResizeImage(target_size=608),
NormalizeBox(),
PadBox(),
BboxXYXY2XYWH()])
eval_collate_fn = BatchCompose([NormalizeImage()])
dataset = COCODataset(dataset_dir=FLAGS.data,
anno_path='annotations/instances_val2017.json',
image_dir='val2017',
with_background=False,
transform=eval_transform)
# batch_size must be 1 in evaluation for YOLOv3, since the
# predicted bbox is a LoDTensor with a varying number of boxes per image
batch_sampler = DistributedBatchSampler(dataset,
batch_size=1,
shuffle=False,
drop_last=False)
loader = DataLoader(dataset,
batch_sampler=batch_sampler,
places=device,
num_workers=FLAGS.num_workers,
return_list=True,
collate_fn=eval_collate_fn)
pretrained = FLAGS.eval_only and FLAGS.weights is None
model = yolov3_darknet53(num_classes=dataset.num_classes,
model_mode='eval' if FLAGS.eval_only else 'train',
pretrained=pretrained)
if FLAGS.pretrain_weights is not None:
model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters())
model.prepare(optim,
YoloLoss(num_classes=dataset.num_classes),
inputs=inputs, labels=labels,
device=FLAGS.device)
# NOTE: we implement the COCO metric of the YOLOv3 model here, separately
# from the 'prepare' and 'fit' framework, for the following reasons:
# 1. YOLOv3 network structure differs between 'train' and 'eval' mode;
# in 'eval' mode, the output is the predicted bbox rather than the
# feature maps used for YoloLoss calculation
# 2. COCO metric behavior also differs from the defined Metric interface,
# as the COCO metric should not accumulate on each iteration but only
# once at the end of an epoch
if FLAGS.eval_only:
if FLAGS.weights is not None:
model.load(FLAGS.weights, reset_optimizer=True)
preds = model.predict(loader, stack_outputs=False)
_, _, _, img_ids, bboxes = preds
anno_path = os.path.join(FLAGS.data, 'annotations/instances_val2017.json')
coco_metric = COCOMetric(anno_path=anno_path, with_background=False)
for img_id, bbox in zip(img_ids, bboxes):
coco_metric.update(img_id, bbox)
coco_metric.accumulate()
coco_metric.reset()
return
if FLAGS.resume is not None:
model.load(FLAGS.resume)
model.fit(train_data=loader,
epochs=FLAGS.epoch - FLAGS.no_mixup_epoch,
save_dir="yolo_checkpoint/mixup",
save_freq=10)
# do not use the image mixup transform in the last FLAGS.no_mixup_epoch epochs
dataset.mixup = False
model.fit(train_data=loader,
epochs=FLAGS.no_mixup_epoch,
save_dir="yolo_checkpoint/no_mixup",
save_freq=5)
if __name__ == '__main__':
parser = argparse.ArgumentParser("Yolov3 Training on COCO")
parser.add_argument(
"--data", type=str, default='dataset/coco',
help="path to dataset directory")
parser.add_argument(
"--device", type=str, default='gpu', help="device to use, gpu or cpu")
parser.add_argument(
"-d", "--dynamic", action='store_true', help="enable dygraph mode")
parser.add_argument(
"--eval_only", action='store_true', help="run evaluation only")
parser.add_argument(
"-e", "--epoch", default=300, type=int, help="number of epoch")
parser.add_argument(
"--no_mixup_epoch", default=30, type=int,
help="number of the last N epoch without image mixup")
parser.add_argument(
'--lr', '--learning-rate', default=0.001, type=float, metavar='LR',
help='initial learning rate')
parser.add_argument(
"-b", "--batch_size", default=8, type=int, help="batch size")
parser.add_argument(
"-j", "--num_workers", default=4, type=int, help="reader worker number")
parser.add_argument(
"-p", "--pretrain_weights", default=None, type=str,
help="path to pretrained weights")
parser.add_argument(
"-r", "--resume", default=None, type=str,
help="path to model weights")
parser.add_argument(
"-w", "--weights", default=None, type=str,
help="path to weights for evaluation")
FLAGS = parser.parse_args()
assert FLAGS.data, "error: must provide data path"
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import cv2
import traceback
import numpy as np
import logging
logger = logging.getLogger(__name__)
__all__ = ['ColorDistort', 'RandomExpand', 'RandomCrop', 'RandomFlip',
'NormalizeBox', 'PadBox', 'RandomShape', 'NormalizeImage',
'BboxXYXY2XYWH', 'ResizeImage', 'Compose', 'BatchCompose']
class Compose(object):
def __init__(self, transforms=[]):
self.transforms = transforms
def __call__(self, *data):
for f in self.transforms:
try:
data = f(*data)
except Exception as e:
stack_info = traceback.format_exc()
logger.info("fail to perform transform [{}] with error: "
"{} and stack:\n{}".format(f, e, str(stack_info)))
raise e
return data
class BatchCompose(object):
def __init__(self, transforms=[]):
self.transforms = transforms
def __call__(self, data):
for f in self.transforms:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.info("fail to perform batch transform [{}] with error: "
"{} and stack:\n{}".format(f, e, str(stack_info)))
raise e
# sample list to batch data
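# e.g. two samples [im0, box0, label0] and [im1, box1, label1] become
# field-wise batches [(im0, im1), (box0, box1), (label0, label1)]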
batch = list(zip(*data))
return batch
class ColorDistort(object):
"""Random color distortion.
Args:
hue (list): hue settings, in [lower, upper, probability] format.
saturation (list): saturation settings, in [lower, upper, probability] format.
contrast (list): contrast settings, in [lower, upper, probability] format.
brightness (list): brightness settings, in [lower, upper, probability] format.
random_apply (bool): whether to apply transforms in random (YOLO) or fixed (SSD) order.
"""
def __init__(self,
hue=[-18, 18, 0.5],
saturation=[0.5, 1.5, 0.5],
contrast=[0.5, 1.5, 0.5],
brightness=[0.5, 1.5, 0.5],
random_apply=True):
self.hue = hue
self.saturation = saturation
self.contrast = contrast
self.brightness = brightness
self.random_apply = random_apply
def apply_hue(self, img):
low, high, prob = self.hue
if np.random.uniform(0., 1.) < prob:
return img
img = img.astype(np.float32)
# XXX works, but results differ from the HSV version
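# The shift is implemented as a hue rotation in YIQ color space:
# tyiq maps RGB -> YIQ, bt rotates the I/Q (chroma) plane by delta * pi,
# and ityiq maps back to RGB; the three are folded into a single 3x3
# matrix t applied to every pixel via np.dot(img, t) below.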
delta = np.random.uniform(low, high)
u = np.cos(delta * np.pi)
w = np.sin(delta * np.pi)
bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]])
tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321],
[0.211, -0.523, 0.311]])
ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647],
[1.0, -1.107, 1.705]])
t = np.dot(np.dot(ityiq, bt), tyiq).T
img = np.dot(img, t)
return img
def apply_saturation(self, img):
low, high, prob = self.saturation
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = img.astype(np.float32)
gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32)
gray = gray.sum(axis=2, keepdims=True)
gray *= (1.0 - delta)
img *= delta
img += gray
return img
def apply_contrast(self, img):
low, high, prob = self.contrast
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = img.astype(np.float32)
img *= delta
return img
def apply_brightness(self, img):
low, high, prob = self.brightness
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = img.astype(np.float32)
img += delta
return img
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
if self.random_apply:
distortions = np.random.permutation([
self.apply_brightness, self.apply_contrast,
self.apply_saturation, self.apply_hue
])
for func in distortions:
im = func(im)
return [im_info, im, gt_bbox, gt_class, gt_score]
im = self.apply_brightness(im)
if np.random.randint(0, 2):
im = self.apply_contrast(im)
im = self.apply_saturation(im)
im = self.apply_hue(im)
else:
im = self.apply_saturation(im)
im = self.apply_hue(im)
im = self.apply_contrast(im)
return [im_info, im, gt_bbox, gt_class, gt_score]
class RandomExpand(object):
"""Random expand the canvas.
Args:
ratio (float): maximum expansion ratio.
prob (float): probability to expand.
fill_value (list): color value used to fill the canvas, in RGB order.
"""
def __init__(self, ratio=4., prob=0.5, fill_value=[123.675, 116.28, 103.53]):
assert ratio > 1.01, "expand ratio must be larger than 1.01"
self.ratio = ratio
self.prob = prob
self.fill_value = fill_value
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
if np.random.uniform(0., 1.) < self.prob:
return [im_info, im, gt_bbox, gt_class, gt_score]
height, width, _ = im.shape
expand_ratio = np.random.uniform(1., self.ratio)
h = int(height * expand_ratio)
w = int(width * expand_ratio)
if h <= height or w <= width:
return [im_info, im, gt_bbox, gt_class, gt_score]
y = np.random.randint(0, h - height)
x = np.random.randint(0, w - width)
canvas = np.ones((h, w, 3), dtype=np.uint8)
canvas *= np.array(self.fill_value, dtype=np.uint8)
canvas[y:y + height, x:x + width, :] = im.astype(np.uint8)
gt_bbox += np.array([x, y, x, y], dtype=np.float32)
return [im_info, canvas, gt_bbox, gt_class, gt_score]
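# NOTE: e.g. a 100x100 image with expand_ratio 2.0 is pasted at a random
# offset (x, y) inside a 200x200 canvas filled with fill_value, and all
# gt boxes are shifted by (x, y) to stay aligned (illustration only).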
class RandomCrop(object):
"""Random crop image and bboxes.
Args:
aspect_ratio (list): aspect ratio of the cropped region, in [min, max] format.
thresholds (list): IoU thresholds used to decide a valid bbox crop.
scaling (list): ratio between the cropped region and the original image, in [min, max] format.
num_attempts (int): number of tries before giving up.
allow_no_crop (bool): allow returning the input unchanged, without cropping.
cover_all_box (bool): ensure all bboxes are covered in the final crop.
"""
def __init__(self,
aspect_ratio=[.5, 2.],
thresholds=[.0, .1, .3, .5, .7, .9],
scaling=[.3, 1.],
num_attempts=50,
allow_no_crop=True,
cover_all_box=False):
self.aspect_ratio = aspect_ratio
self.thresholds = thresholds
self.scaling = scaling
self.num_attempts = num_attempts
self.allow_no_crop = allow_no_crop
self.cover_all_box = cover_all_box
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
if len(gt_bbox) == 0:
return [im_info, im, gt_bbox, gt_class, gt_score]
# NOTE: The original method attempts to generate one candidate for each
# threshold and then randomly samples one from the resulting list.
# Here a short-circuit approach is taken instead: randomly choose a
# threshold, attempt to find a valid crop, and simply return the first
# one found.
# The probability is not exactly the same, somewhat resembling the
# "Monty Hall" problem: actually carrying out all the attempts would
# affect observability (just like opening doors in the "Monty Hall"
# game).
thresholds = list(self.thresholds)
if self.allow_no_crop:
thresholds.append('no_crop')
np.random.shuffle(thresholds)
for thresh in thresholds:
if thresh == 'no_crop':
return [im_info, im, gt_bbox, gt_class, gt_score]
h, w, _ = im.shape
found = False
for i in range(self.num_attempts):
scale = np.random.uniform(*self.scaling)
min_ar, max_ar = self.aspect_ratio
aspect_ratio = np.random.uniform(
max(min_ar, scale**2), min(max_ar, scale**-2))
crop_h = int(h * scale / np.sqrt(aspect_ratio))
crop_w = int(w * scale * np.sqrt(aspect_ratio))
crop_y = np.random.randint(0, h - crop_h)
crop_x = np.random.randint(0, w - crop_w)
crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
iou = self._iou_matrix(
gt_bbox, np.array(
[crop_box], dtype=np.float32))
if iou.max() < thresh:
continue
if self.cover_all_box and iou.min() < thresh:
continue
cropped_box, valid_ids = self._crop_box_with_center_constraint(
gt_bbox, np.array(
crop_box, dtype=np.float32))
if valid_ids.size > 0:
found = True
break
if found:
im = self._crop_image(im, crop_box)
gt_bbox = np.take(cropped_box, valid_ids, axis=0)
gt_class = np.take(gt_class, valid_ids, axis=0)
gt_score = np.take(gt_score, valid_ids, axis=0)
return [im_info, im, gt_bbox, gt_class, gt_score]
return [im_info, im, gt_bbox, gt_class, gt_score]
def _iou_matrix(self, a, b):
tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
area_o = (area_a[:, np.newaxis] + area_b - area_i)
return area_i / (area_o + 1e-10)
def _crop_box_with_center_constraint(self, box, crop):
cropped_box = box.copy()
cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
cropped_box[:, :2] -= crop[:2]
cropped_box[:, 2:] -= crop[:2]
centers = (box[:, :2] + box[:, 2:]) / 2
valid = np.logical_and(crop[:2] <= centers,
centers < crop[2:]).all(axis=1)
valid = np.logical_and(
valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
return cropped_box, np.where(valid)[0]
def _crop_image(self, img, crop):
x1, y1, x2, y2 = crop
return img[y1:y2, x1:x2, :]
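# NOTE: a quick sanity check of _iou_matrix above (illustration only):
# RandomCrop()._iou_matrix(np.array([[0., 0., 10., 10.]]),
#                          np.array([[5., 5., 15., 15.]]))
# intersection area = 5 * 5 = 25, union = 100 + 100 - 25 = 175,
# so the returned IoU is 25 / 175 ~= 0.143.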
class RandomFlip(object):
def __init__(self, prob=0.5, is_normalized=False):
"""
Args:
prob (float): the probability of flipping image
is_normalized (bool): whether the bbox scale to [0,1]
"""
self.prob = prob
self.is_normalized = is_normalized
if not (isinstance(self.prob, float) and
isinstance(self.is_normalized, bool)):
raise TypeError("{}: input type is invalid.".format(self))
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
"""Filp the image and bounding box.
Operators:
1. Flip the image numpy.
2. Transform the bboxes' x coordinates.
(Must judge whether the coordinates are normalized!)
"""
if not isinstance(im, np.ndarray):
raise TypeError("{}: image is not a numpy array.".format(self))
if len(im.shape) != 3:
raise ImageError("{}: image is not 3-dimensional.".format(self))
height, width, _ = im.shape
if np.random.uniform(0, 1) < self.prob:
im = im[:, ::-1, :]
if gt_bbox.shape[0] > 0:
oldx1 = gt_bbox[:, 0].copy()
oldx2 = gt_bbox[:, 2].copy()
if self.is_normalized:
gt_bbox[:, 0] = 1 - oldx2
gt_bbox[:, 2] = 1 - oldx1
else:
gt_bbox[:, 0] = width - oldx2 - 1
gt_bbox[:, 2] = width - oldx1 - 1
if gt_bbox.shape[0] != 0 and (
gt_bbox[:, 2] < gt_bbox[:, 0]).all():
m = "{}: invalid box, x2 should be greater than x1".format(
self)
raise ValueError(m)
return [im_info, im, gt_bbox, gt_class, gt_score]
class NormalizeBox(object):
"""Transform the bounding box's coornidates to [0,1]."""
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
height, width, _ = im.shape
for i in range(gt_bbox.shape[0]):
gt_bbox[i][0] = gt_bbox[i][0] / width
gt_bbox[i][1] = gt_bbox[i][1] / height
gt_bbox[i][2] = gt_bbox[i][2] / width
gt_bbox[i][3] = gt_bbox[i][3] / height
return [im_info, im, gt_bbox, gt_class, gt_score]
class PadBox(object):
def __init__(self, num_max_boxes=50):
"""
Pad the bboxes with zeros if the number of bboxes is less than num_max_boxes.
Args:
num_max_boxes (int): the max number of bboxes
"""
self.num_max_boxes = num_max_boxes
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
gt_num = min(self.num_max_boxes, len(gt_bbox))
num_max = self.num_max_boxes
pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
if gt_num > 0:
pad_bbox[:gt_num, :] = gt_bbox[:gt_num, :]
gt_bbox = pad_bbox
pad_class = np.zeros((num_max), dtype=np.int32)
if gt_num > 0:
pad_class[:gt_num] = gt_class[:gt_num, 0]
gt_class = pad_class
pad_score = np.zeros((num_max), dtype=np.float32)
if gt_num > 0:
pad_score[:gt_num] = gt_score[:gt_num, 0]
gt_score = pad_score
return [im_info, im, gt_bbox, gt_class, gt_score]
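# NOTE: e.g. with num_max_boxes=50 and 3 ground-truth boxes, the returned
# gt_bbox has shape (50, 4) with rows 3..49 all zeros, and gt_class and
# gt_score are likewise zero-padded to length 50 (illustration only).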
class BboxXYXY2XYWH(object):
"""
Convert bbox XYXY format to XYWH format.
"""
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
gt_bbox[:, 2:4] = gt_bbox[:, 2:4] - gt_bbox[:, :2]
gt_bbox[:, :2] = gt_bbox[:, :2] + gt_bbox[:, 2:4] / 2.
return [im_info, im, gt_bbox, gt_class, gt_score]
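# NOTE: e.g. the XYXY box [10., 10., 30., 50.] becomes the XYWH box
# [20., 30., 20., 40.]: width 20, height 40, centered at (20, 30).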
class RandomShape(object):
"""
Randomly reshape a batch. If random_inter is True, also randomly
select an interpolation algorithm from [cv2.INTER_NEAREST, cv2.INTER_LINEAR,
cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4]. If random_inter is
False, use cv2.INTER_NEAREST.
Args:
sizes (list): list of int, a size is randomly chosen from it for each batch
random_inter (bool): whether to randomly select the interpolation method, default True.
"""
def __init__(self,
sizes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608],
random_inter=True):
self.sizes = sizes
self.random_inter = random_inter
self.interps = [
cv2.INTER_NEAREST,
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
] if random_inter else []
def __call__(self, samples):
shape = np.random.choice(self.sizes)
method = np.random.choice(self.interps) if self.random_inter \
else cv2.INTER_NEAREST
for i in range(len(samples)):
im = samples[i][1]
h, w = im.shape[:2]
scale_x = float(shape) / w
scale_y = float(shape) / h
im = cv2.resize(
im, None, None, fx=scale_x, fy=scale_y, interpolation=method)
samples[i][1] = im
return samples
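# NOTE: e.g. with the default sizes, one value (say 416) is drawn per
# batch and every image in the batch is resized to 416x416, so multi-scale
# training varies the input resolution across batches (illustration only).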
class NormalizeImage(object):
def __init__(self,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
scale=True,
channel_first=True):
"""
Args:
mean (list): the pixel mean
std (list): the pixel standard deviation
scale (bool): whether scale image to [0, 1]
channel_first (bool): whether to change [h, w, c] to [c, h, w]
"""
self.mean = mean
self.std = std
self.scale = scale
self.channel_first = channel_first
if not (isinstance(self.mean, list) and isinstance(self.std, list) and
isinstance(self.scale, bool)):
raise TypeError("{}: input type is invalid.".format(self))
from functools import reduce
if reduce(lambda x, y: x * y, self.std) == 0:
raise ValueError('{}: std is invalid!'.format(self))
def __call__(self, samples):
"""Normalize the image.
Operators:
1. (optional) Scale the image to [0,1]
2. Each pixel minus mean and is divided by std
3. (optional) permute channel
"""
for i in range(len(samples)):
im = samples[i][1]
im = im.astype(np.float32, copy=False)
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
if self.scale:
im = im / 255.0
im -= mean
im /= std
if self.channel_first:
im = im.transpose((2, 0, 1))
samples[i][1] = im
return samples
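# NOTE: with the defaults above, a raw R-channel value of 255 maps to
# (255 / 255.0 - 0.485) / 0.229 ~= 2.249, i.e. standard ImageNet-style
# normalization (illustration only).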
def _iou_matrix(a, b):
tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
area_o = (area_a[:, np.newaxis] + area_b - area_i)
return area_i / (area_o + 1e-10)
def _crop_box_with_center_constraint(box, crop):
cropped_box = box.copy()
cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
cropped_box[:, :2] -= crop[:2]
cropped_box[:, 2:] -= crop[:2]
centers = (box[:, :2] + box[:, 2:]) / 2
valid = np.logical_and(
crop[:2] <= centers, centers < crop[2:]).all(axis=1)
valid = np.logical_and(
valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
return cropped_box, np.where(valid)[0]
def random_crop(inputs):
aspect_ratios = [.5, 2.]
thresholds = [.0, .1, .3, .5, .7, .9]
scaling = [.3, 1.]
img, img_ids, gt_box, gt_label = inputs
h, w = img.shape[:2]
if len(gt_box) == 0:
return inputs
np.random.shuffle(thresholds)
for thresh in thresholds:
found = False
for i in range(50):
scale = np.random.uniform(*scaling)
min_ar, max_ar = aspect_ratios
ar = np.random.uniform(max(min_ar, scale**2),
min(max_ar, scale**-2))
crop_h = int(h * scale / np.sqrt(ar))
crop_w = int(w * scale * np.sqrt(ar))
crop_y = np.random.randint(0, h - crop_h)
crop_x = np.random.randint(0, w - crop_w)
crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32))
if iou.max() < thresh:
continue
cropped_box, valid_ids = _crop_box_with_center_constraint(
gt_box, np.array(crop_box, dtype=np.float32))
if valid_ids.size > 0:
found = True
break
if found:
x1, y1, x2, y2 = crop_box
img = img[y1:y2, x1:x2, :]
gt_box = np.take(cropped_box, valid_ids, axis=0)
gt_label = np.take(gt_label, valid_ids, axis=0)
return img, img_ids, gt_box, gt_label
return inputs
class ResizeImage(object):
def __init__(self,
target_size=0,
interp=cv2.INTER_CUBIC):
"""
Rescale the image to the specified target size. Note that both height
and width are resized to target_size, so the output image is square.
If target_size is a list, a size is randomly selected from it as the
target size, enabling multi-scale training.
Args:
target_size (int|list): the target image size; multi-scale
training is adopted when the type is list.
interp (int): the interpolation method
"""
self.interp = int(interp)
if not (isinstance(target_size, int) or isinstance(target_size, list)):
raise TypeError(
"Type of target_size is invalid. Must be Integer or List, now is {}".
format(type(target_size)))
self.target_size = target_size
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
"""Resize the image (numpy array)."""
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
raise ValueError('{}: image is not 3-dimensional.'.format(self))
# randomly select a size when target_size is a list (multi-scale)
target_size = int(np.random.choice(self.target_size)) \
if isinstance(self.target_size, list) else self.target_size
im_shape = im.shape
im_scale_x = float(target_size) / float(im_shape[1])
im_scale_y = float(target_size) / float(im_shape[0])
im = cv2.resize(
im,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
return [im_info, im, gt_bbox, gt_class, gt_score]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import numpy as np
from PIL import Image, ImageDraw
import logging
logger = logging.getLogger(__name__)
__all__ = ['draw_bbox']
def color_map(num_classes):
color_map = num_classes * [0, 0, 0]
for i in range(0, num_classes):
j = 0
lab = i
while lab:
color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
j += 1
lab >>= 3
color_map = np.array(color_map).reshape(-1, 3)
return color_map
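# NOTE: e.g. color_map(3) yields [[0, 0, 0], [128, 0, 0], [0, 128, 0]]:
# the bits of each class id are spread across the R/G/B channels from the
# high bit down, producing visually distinct colors (illustration only).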
def draw_bbox(image, catid2name, bboxes, threshold):
"""
Draw bbox on image
"""
bboxes = np.array(bboxes)
if bboxes.ndim != 2 or bboxes.shape[1] != 6:
logger.info("No bbox detected")
return image
draw = ImageDraw.Draw(image)
catid2color = {}
color_list = color_map(len(catid2name))
for bbox in bboxes:
catid, score, xmin, ymin, xmax, ymax = bbox
if score < threshold:
continue
if catid not in catid2color:
idx = np.random.randint(len(color_list))
catid2color[catid] = color_list[idx]
color = tuple(catid2color[catid])
# draw bbox
draw.line(
[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
(xmin, ymin)],
width=2,
fill=color)
logger.info("detect {} at {} score: {:.2f}".format(
catid2name[int(catid)], [xmin, ymin, xmax, ymax], score))
# draw label
text = "{} {:.2f}".format(catid2name[catid], score)
tw, th = draw.textsize(text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
return image
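# A minimal usage sketch of draw_bbox (illustration only; the image path
# and label mapping below are assumptions, not part of this module):
def _draw_bbox_demo(image_path='demo.jpg'):
    image = Image.open(image_path).convert('RGB')
    catid2name = {0: 'person'}
    # each row: [catid, score, xmin, ymin, xmax, ymax]
    bboxes = np.array([[0., 0.9, 20., 30., 120., 200.]])
    return draw_bbox(image, catid2name, bboxes, threshold=0.5)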