diff --git a/yolov3.py b/yolov3.py new file mode 100644 index 0000000000000000000000000000000000000000..1d1cad446a27031ff875e194cda3ad4a7efea012 --- /dev/null +++ b/yolov3.py @@ -0,0 +1,540 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import argparse +import contextlib +import os +import random +import time + +import cv2 +import numpy as np +from pycocotools.coco import COCO + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.regularizer import L2Decay + +from model import Model, Loss, shape_hints +from resnet import ResNet, ConvBNLayer + + +# XXX transfer learning +class ResNetBackBone(ResNet): + def __init__(self, depth=50): + super(ResNetBackBone, self).__init__(depth=depth) + delattr(self, 'fc') + + def forward(self, inputs): + x = self.conv(inputs) + x = self.pool(x) + outputs = [] + for layer in self.layers: + x = layer(x) + outputs.append(x) + return outputs + + +class YoloDetectionBlock(fluid.dygraph.Layer): + def __init__(self, num_channels, num_filters): + super(YoloDetectionBlock, self).__init__() + + assert num_filters % 2 == 0, \ + "num_filters {} cannot be divided by 2".format(num_filters) + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='leaky_relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2, + filter_size=3, + act='leaky_relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters * 2, + num_filters=num_filters, + filter_size=1, + act='leaky_relu') + self.conv3 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2, + filter_size=3, + act='leaky_relu') + self.route = ConvBNLayer( + num_channels=num_filters * 2, + num_filters=num_filters, + filter_size=1, + act='leaky_relu') + self.tip = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2, + filter_size=3, + act='leaky_relu') + + def forward(self, inputs): + out = self.conv0(inputs) + out = self.conv1(out) + out = self.conv2(out) + out = self.conv3(out) + route = self.route(out) + tip = self.tip(route) + return route, tip + + +class YOLOv3(Model): + def __init__(self): + super(YOLOv3, self).__init__() + self.num_classes = 80 + self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, + 59, 119, 116, 90, 156, 198, 373, 326] + self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + self.valid_thresh = 0.005 + self.nms_topk = 400 + self.nms_posk = 100 + self.draw_thresh = 0.5 + + self.backbone = ResNetBackBone() + self.block_outputs = [] + self.yolo_blocks = [] + self.route_blocks = [] + + for idx, num_chan in enumerate([2048, 1280, 640]): + yolo_block = self.add_sublayer( + "detecton_block_{}".format(idx), + YoloDetectionBlock(num_chan, num_filters=512 // (2**idx))) + self.yolo_blocks.append(yolo_block) + + num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5) + + block_out = self.add_sublayer( + "block_out_{}".format(idx), + Conv2D(num_channels=1024 // (2**idx), + num_filters=num_filters, + filter_size=1, + param_attr=ParamAttr( + initializer=fluid.initializer.Normal(0., 0.02)), + bias_attr=ParamAttr( + initializer=fluid.initializer.Constant(0.0), + regularizer=L2Decay(0.)))) + self.block_outputs.append(block_out) + if idx < 2: + route = self.add_sublayer( + "route_{}".format(idx), + ConvBNLayer(num_channels=512 // (2**idx), + num_filters=256 // (2**idx), + filter_size=1, + act='leaky_relu')) + self.route_blocks.append(route) + + @shape_hints(inputs=[None, 3, None, None]) + def forward(self, inputs, im_shape): + outputs = [] + boxes = [] + scores = [] + downsample = 32 + + feats = self.backbone(inputs) + feats = feats[::-1][:len(self.anchor_masks)] + route = None + for idx, feat in enumerate(feats): + if idx > 0: + feat = fluid.layers.concat(input=[route, feat], axis=1) + route, tip = self.yolo_blocks[idx](feat) + block_out = self.block_outputs[idx](tip) + + if idx < 2: + route = self.route_blocks[idx](route) + route = fluid.layers.resize_nearest(route, scale=2) + + anchor_mask = self.anchor_masks[idx] + mask_anchors = [] + for m in anchor_mask: + mask_anchors.append(self.anchors[2 * m]) + mask_anchors.append(self.anchors[2 * m + 1]) + b, s = fluid.layers.yolo_box( + x=block_out, + img_size=im_shape, + anchors=mask_anchors, + class_num=self.num_classes, + conf_thresh=self.valid_thresh, + downsample_ratio=downsample) + + outputs.append(block_out) + boxes.append(b) + scores.append(fluid.layers.transpose(s, perm=[0, 2, 1])) + + downsample //= 2 + + if self.mode != 'test': + return outputs + + return fluid.layers.multiclass_nms( + bboxes=fluid.layers.concat(boxes, axis=1), + scores=fluid.layers.concat(scores, axis=2), + score_threshold=self.valid_thresh, + nms_top_k=self.nms_topk, + keep_top_k=self.nms_posk, + nms_threshold=self.nms_thresh, + background_label=-1) + + +class YoloLoss(Loss): + def __init__(self, num_classes=80, num_max_boxes=50): + super(YoloLoss, self).__init__() + self.num_classes = num_classes + self.num_max_boxes = num_max_boxes + self.ignore_thresh = 0.7 + self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, + 59, 119, 116, 90, 156, 198, 373, 326] + self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + + def forward(self, outputs, labels): + downsample = 32 + gt_box, gt_label, gt_score = labels + losses = [] + + for idx, out in enumerate(outputs): + anchor_mask = self.anchor_masks[idx] + loss = fluid.layers.yolov3_loss( + x=out, + gt_box=gt_box, + gt_label=gt_label, + gt_score=gt_score, + anchor_mask=anchor_mask, + downsample_ratio=downsample, + anchors=self.anchors, + class_num=self.num_classes, + ignore_thresh=self.ignore_thresh, + use_label_smooth=True) + losses.append(loss) + downsample //= 2 + return losses + + def infer_shape(self, _): + return [ + [None, self.num_max_boxes, 4], + [None, self.num_max_boxes], + [None, self.num_max_boxes] + ] + + def infer_dtype(self, _): + return ['float32', 'int32', 'float32'] + + +def make_optimizer(parameter_list=None): + base_lr = 0.001 + boundaries = [400000, 450000] + warm_up_iter = 4000 + momentum = 0.9 + weight_decay = 5e-4 + values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)] + + lr = fluid.layers.piecewise_decay( + boundaries=boundaries, + values=values) + + lr = fluid.layers.linear_lr_warmup( + learning_rate=lr, + warmup_steps=warm_up_iter, + start_lr=0.0, + end_lr=base_lr) + + optimizer = fluid.optimizer.Momentum( + learning_rate=lr, + regularization=fluid.regularizer.L2Decay(weight_decay), + momentum=momentum, + parameter_list=parameter_list) + return optimizer + + +def _iou_matrix(a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, np.newaxis] + area_b - area_i) + return area_i / (area_o + 1e-10) + + +def _crop_box_with_center_constraint(box, crop): + cropped_box = box.copy() + cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) + cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) + cropped_box[:, :2] -= crop[:2] + cropped_box[:, 2:] -= crop[:2] + centers = (box[:, :2] + box[:, 2:]) / 2 + valid = np.logical_and( + crop[:2] <= centers, centers < crop[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + return cropped_box, np.where(valid)[0] + + +def random_crop(inputs): + aspect_ratios = [.5, 2.] + thresholds = [.0, .1, .3, .5, .7, .9] + scaling = [.3, 1.] + + img, gt_box, gt_label = inputs + h, w = img.shape[:2] + + if len(gt_box) == 0: + return inputs + + np.random.shuffle(thresholds) + for thresh in thresholds: + found = False + for i in range(50): + scale = np.random.uniform(*scaling) + min_ar, max_ar = aspect_ratios + ar = np.random.uniform(max(min_ar, scale**2), + min(max_ar, scale**-2)) + crop_h = int(h * scale / np.sqrt(ar)) + crop_w = int(w * scale * np.sqrt(ar)) + crop_y = np.random.randint(0, h - crop_h) + crop_x = np.random.randint(0, w - crop_w) + crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] + iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + cropped_box, valid_ids = _crop_box_with_center_constraint( + gt_box, np.array(crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + x1, y1, x2, y2 = crop_box + img = img[y1:y2, x1:x2, :] + gt_box = np.take(cropped_box, valid_ids, axis=0) + gt_label = np.take(gt_label, valid_ids, axis=0) + return img, gt_box, gt_label + + return inputs + + +# XXX mix up, color distort and random expand are skipped for simplicity +def sample_transform(inputs, mode='train', num_max_boxes=50): + if mode == 'train': + img, gt_box, gt_label = random_crop(inputs) + else: + img, gt_box, gt_label = inputs + + h, w = img.shape[:2] + # random flip + if mode == 'train' and np.random.uniform(0., 1.) > .5: + img = img[:, ::-1, :] + if len(gt_box) > 0: + swap = gt_box.copy() + gt_box[:, 0] = w - swap[:, 2] - 1 + gt_box[:, 2] = w - swap[:, 0] - 1 + + if len(gt_label) == 0: + gt_box = np.zeros([num_max_boxes, 4], dtype=np.float32) + gt_label = np.zeros([num_max_boxes, 1], dtype=np.int32) + return img, gt_box, gt_label + + gt_box = gt_box[:num_max_boxes, :] + gt_label = gt_label[:num_max_boxes, 0] + # normalize boxes + gt_box /= np.array([w, h] * 2, dtype=np.float32) + gt_box[:, 2:] = gt_box[:, 2:] - gt_box[:, :2] + gt_box[:, :2] = gt_box[:, :2] + gt_box[:, 2:] / 2. + + pad = num_max_boxes - gt_label.size + gt_box = np.pad(gt_box, ((0, pad), (0, 0)), mode='constant') + gt_label = np.pad(gt_label, [(0, pad)], mode='constant') + + return img, gt_box, gt_label + + +def batch_transform(batch, mode='train'): + if mode == 'train': + d = np.random.choice( + [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]) + interp = np.random.choice(range(5)) + else: + d = 608 + interp = cv2.INTER_CUBIC + # transpose batch + imgs, gt_boxes, gt_labels = list(zip(*batch)) + imgs = np.array([cv2.resize( + img, (d, d), interpolation=interp) for img in imgs]) + + # transpose, permute and normalize + imgs = imgs.astype(np.float32)[..., ::-1] + mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) + std = np.array([58.395, 57.120, 57.375], dtype=np.float32) + invstd = 1. / std + imgs -= mean + imgs *= invstd + imgs = imgs.transpose((0, 3, 1, 2)) + + im_shapes = np.full([len(imgs), 2], d, dtype=np.int32) + gt_boxes = np.array(gt_boxes) + gt_labels = np.array(gt_labels) + # XXX since mix up is not used, scores are all 1s + gt_scores = np.ones_like(gt_labels, dtype=np.float32) + return [imgs, im_shapes], [gt_boxes, gt_labels, gt_scores] + + +def coco2017(root_dir, mode='train'): + json_path = os.path.join( + root_dir, 'annotations/instances_{}2017.json'.format(mode)) + coco = COCO(json_path) + img_ids = coco.getImgIds() + imgs = coco.loadImgs(img_ids) + class_map = {v: i + 1 for i, v in enumerate(coco.getCatIds())} + samples = [] + + for img in imgs: + img_path = os.path.join( + root_dir, '{}2017'.format(mode), img['file_name']) + file_path = img_path + width = img['width'] + height = img['height'] + ann_ids = coco.getAnnIds(imgIds=img['id'], iscrowd=False) + anns = coco.loadAnns(ann_ids) + + gt_box = [] + gt_label = [] + + for ann in anns: + x1, y1, w, h = ann['bbox'] + x2 = x1 + w - 1 + y2 = y1 + h - 1 + x1 = np.clip(x1, 0, width - 1) + x2 = np.clip(x2, 0, width - 1) + y1 = np.clip(y1, 0, height - 1) + y2 = np.clip(y2, 0, height - 1) + if ann['area'] <= 0 or x2 < x1 or y2 < y1: + continue + gt_label.append(ann['category_id']) + gt_box.append([x1, y1, x2, y2]) + + gt_box = np.array(gt_box, dtype=np.float32) + gt_label = np.array([class_map[cls] for cls in gt_label], + dtype=np.int32)[:, np.newaxis] + + if gt_label.size == 0 and not mode == 'train': + continue + samples.append((file_path, gt_box.copy(), gt_label.copy())) + + def iterator(): + if mode == 'train': + random.shuffle(samples) + for file_path, gt_box, gt_label in samples: + img = cv2.imread(file_path) + yield img, gt_box, gt_label + + return iterator + + +# XXX coco metrics not included for simplicity +def run(model, loader, mode='train'): + total_loss = 0.0 + total_time = 0. + device_ids = list(range(FLAGS.num_devices)) + start = time.time() + for idx, batch in enumerate(loader()): + outputs, losses = getattr(model, mode)( + batch[0], batch[1], device='gpu', device_ids=device_ids) + + total_loss += np.sum(losses) + if idx > 1: # skip first two step + total_time += time.time() - start + if idx % 10 == 0: + print("{:04d}: loss {:0.3f} time: {:0.3f}".format( + idx, total_loss / (idx + 1), total_time / (idx - 1))) + start = time.time() + + +def main(): + @contextlib.contextmanager + def null_guard(): + yield + + epoch = FLAGS.epoch + batch_size = FLAGS.batch_size + if FLAGS.dynamic: + guard = fluid.dygraph.guard() + else: + guard = null_guard() + + train_loader = fluid.io.xmap_readers( + lambda batch: batch_transform(batch, 'train'), + paddle.batch( + fluid.io.xmap_readers( + lambda inputs: sample_transform(inputs, 'train'), + coco2017(FLAGS.data, 'train'), + process_num=8, + buffer_size=4 * batch_size), + batch_size=batch_size, + drop_last=True), + process_num=2, buffer_size=4) + + val_loader = fluid.io.xmap_readers( + lambda batch: batch_transform(batch, 'train'), + paddle.batch( + fluid.io.xmap_readers( + lambda inputs: sample_transform(inputs, 'val'), + coco2017(FLAGS.data, 'val'), + process_num=8, + buffer_size=4 * batch_size), + batch_size=batch_size), + process_num=2, buffer_size=4) + + if not os.path.exists('yolo_checkpoints'): + os.mkdir('yolo_checkpoints') + + with guard: + model = YOLOv3() + # XXX transfer learning + if FLAGS.weights is not None: + model.backbone.load(FLAGS.weights) + optim = make_optimizer(parameter_list=model.parameters()) + model.prepare(optim, YoloLoss()) + + for e in range(epoch): + print("======== train epoch {} ========".format(e)) + run(model, train_loader) + model.save('checkpoints/{:02d}'.format(e)) + print("======== eval epoch {} ========".format(e)) + run(model, val_loader, mode='eval') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser("Yolov3 Training on COCO") + parser.add_argument('data', metavar='DIR', help='path to COCO dataset') + parser.add_argument( + "-e", "--epoch", default=300, type=int, help="number of epoch") + parser.add_argument( + "-b", "--batch_size", default=32, type=int, help="batch size") + parser.add_argument( + "-n", "--num_devices", default=8, type=int, help="number of devices") + parser.add_argument( + "-d", "--dynamic", action='store_true', help="enable dygraph mode") + parser.add_argument( + "-w", "--weights", default=None, type=str, + help="path to pretrained weights") + FLAGS = parser.parse_args() + main()