diff --git a/examples/yolov3/main.py b/examples/yolov3/main.py
index dea9eba5429a2878038aef11a9ca404696b2f7a8..100695b1e25fc5687ea8b9c3ae46f1731d33fa58 100644
--- a/examples/yolov3/main.py
+++ b/examples/yolov3/main.py
@@ -27,7 +27,7 @@ from paddle.io import DataLoader
 
 from hapi.model import Model, Input, set_device
 from hapi.distributed import DistributedBatchSampler
-from hapi.vision.transforms import Compose, BatchCompose
+from hapi.vision.transforms import BatchCompose
 
 from modeling import yolov3_darknet53, YoloLoss
 from coco import COCODataset
@@ -43,10 +43,9 @@ def make_optimizer(step_per_epoch, parameter_list=None):
     momentum = 0.9
     weight_decay = 5e-4
     boundaries = [step_per_epoch * e for e in [200, 250]]
-    values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
+    values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)]
     learning_rate = fluid.layers.piecewise_decay(
-        boundaries=boundaries,
-        values=values)
+        boundaries=boundaries, values=values)
     learning_rate = fluid.layers.linear_lr_warmup(
         learning_rate=learning_rate,
         warmup_steps=warm_up_iter,
@@ -63,77 +62,85 @@ def main():
     device = set_device(FLAGS.device)
     fluid.enable_dygraph(device) if FLAGS.dynamic else None
-
-    inputs = [Input([None, 1], 'int64', name='img_id'),
-              Input([None, 2], 'int32', name='img_shape'),
-              Input([None, 3, None, None], 'float32', name='image')]
-    labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'),
-              Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'),
-              Input([None, NUM_MAX_BOXES], 'float32', name='gt_score')]
-
-    if not FLAGS.eval_only: # training mode
-        train_transform = Compose([ColorDistort(),
-                                   RandomExpand(),
-                                   RandomCrop(),
-                                   RandomFlip(),
-                                   NormalizeBox(),
-                                   PadBox(),
-                                   BboxXYXY2XYWH()])
+
+    inputs = [
+        Input(
+            [None, 1], 'int64', name='img_id'), Input(
+                [None, 2], 'int32', name='img_shape'), Input(
+                    [None, 3, None, None], 'float32', name='image')
+    ]
+    labels = [
+        Input(
+            [None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'), Input(
+                [None, NUM_MAX_BOXES], 'int32', name='gt_label'), Input(
+                    [None, NUM_MAX_BOXES], 'float32', name='gt_score')
+    ]
+
+    if not FLAGS.eval_only:  # training mode
+        train_transform = Compose([
+            ColorDistort(), RandomExpand(), RandomCrop(), RandomFlip(),
+            NormalizeBox(), PadBox(), BboxXYXY2XYWH()
+        ])
         train_collate_fn = BatchCompose([RandomShape(), NormalizeImage()])
-        dataset = COCODataset(dataset_dir=FLAGS.data,
-                              anno_path='annotations/instances_train2017.json',
-                              image_dir='train2017',
-                              with_background=False,
-                              mixup=True,
-                              transform=train_transform)
-        batch_sampler = DistributedBatchSampler(dataset,
-                                                batch_size=FLAGS.batch_size,
-                                                shuffle=True,
-                                                drop_last=True)
-        loader = DataLoader(dataset,
-                            batch_sampler=batch_sampler,
-                            places=device,
-                            num_workers=FLAGS.num_workers,
-                            return_list=True,
-                            collate_fn=train_collate_fn)
-    else: # evaluation mode
-        eval_transform = Compose([ResizeImage(target_size=608),
-                                  NormalizeBox(),
-                                  PadBox(),
-                                  BboxXYXY2XYWH()])
+        dataset = COCODataset(
+            dataset_dir=FLAGS.data,
+            anno_path='annotations/instances_train2017.json',
+            image_dir='train2017',
+            with_background=False,
+            mixup=True,
+            transform=train_transform)
+        batch_sampler = DistributedBatchSampler(
+            dataset, batch_size=FLAGS.batch_size, shuffle=True, drop_last=True)
+        loader = DataLoader(
+            dataset,
+            batch_sampler=batch_sampler,
+            places=device,
+            num_workers=FLAGS.num_workers,
+            return_list=True,
+            collate_fn=train_collate_fn)
+    else:  # evaluation mode
+        eval_transform = Compose([
+            ResizeImage(target_size=608), NormalizeBox(), PadBox(),
+            BboxXYXY2XYWH()
+        ])
         eval_collate_fn = BatchCompose([NormalizeImage()])
-        dataset = COCODataset(dataset_dir=FLAGS.data,
-                              anno_path='annotations/instances_val2017.json',
-                              image_dir='val2017',
-                              with_background=False,
-                              transform=eval_transform)
+        dataset = COCODataset(
+            dataset_dir=FLAGS.data,
+            anno_path='annotations/instances_val2017.json',
+            image_dir='val2017',
+            with_background=False,
+            transform=eval_transform)
         # batch_size can only be 1 in evaluation for YOLOv3
         # prediction bbox is a LoDTensor
-        batch_sampler = DistributedBatchSampler(dataset,
-                                                batch_size=1,
-                                                shuffle=False,
-                                                drop_last=False)
-        loader = DataLoader(dataset,
-                            batch_sampler=batch_sampler,
-                            places=device,
-                            num_workers=FLAGS.num_workers,
-                            return_list=True,
-                            collate_fn=eval_collate_fn)
+        batch_sampler = DistributedBatchSampler(
+            dataset, batch_size=1, shuffle=False, drop_last=False)
+        loader = DataLoader(
+            dataset,
+            batch_sampler=batch_sampler,
+            places=device,
+            num_workers=FLAGS.num_workers,
+            return_list=True,
+            collate_fn=eval_collate_fn)
 
     pretrained = FLAGS.eval_only and FLAGS.weights is None
-    model = yolov3_darknet53(num_classes=dataset.num_classes,
-                             model_mode='eval' if FLAGS.eval_only else 'train',
-                             pretrained=pretrained)
+    model = yolov3_darknet53(
+        num_classes=dataset.num_classes,
+        model_mode='eval' if FLAGS.eval_only else 'train',
+        pretrained=pretrained)
 
     if FLAGS.pretrain_weights and not FLAGS.eval_only:
-        model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
+        model.load(
+            FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
 
-    optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters())
+    optim = make_optimizer(
+        len(batch_sampler), parameter_list=model.parameters())
 
-    model.prepare(optim,
-                  YoloLoss(num_classes=dataset.num_classes),
-                  inputs=inputs, labels=labels,
-                  device=FLAGS.device)
+    model.prepare(
+        optim,
+        YoloLoss(num_classes=dataset.num_classes),
+        inputs=inputs,
+        labels=labels,
+        device=FLAGS.device)
 
     # NOTE: we implement COCO metric of YOLOv3 model here, separately
     # from 'prepare' and 'fit' framework for follwing reason:
@@ -149,7 +156,8 @@ def main():
         preds = model.predict(loader, stack_outputs=False)
         _, _, _, img_ids, bboxes = preds
 
-        anno_path = os.path.join(FLAGS.data, 'annotations/instances_val2017.json')
+        anno_path = os.path.join(FLAGS.data,
+                                 'annotations/instances_val2017.json')
         coco_metric = COCOMetric(anno_path=anno_path, with_background=False)
         for img_id, bbox in zip(img_ids, bboxes):
             coco_metric.update(img_id, bbox)
@@ -176,7 +184,9 @@ def main():
 if __name__ == '__main__':
     parser = argparse.ArgumentParser("Yolov3 Training on VOC")
     parser.add_argument(
-        "--data", type=str, default='dataset/voc',
+        "--data",
+        type=str,
+        default='dataset/voc',
         help="path to dataset directory")
     parser.add_argument(
         "--device", type=str, default='gpu', help="device to use, gpu or cpu")
@@ -187,23 +197,38 @@ if __name__ == '__main__':
     parser.add_argument(
         "-e", "--epoch", default=300, type=int, help="number of epoch")
     parser.add_argument(
-        "--no_mixup_epoch", default=30, type=int,
+        "--no_mixup_epoch",
+        default=30,
+        type=int,
         help="number of the last N epoch without image mixup")
     parser.add_argument(
-        '--lr', '--learning-rate', default=0.001, type=float, metavar='LR',
+        '--lr',
+        '--learning-rate',
+        default=0.001,
+        type=float,
+        metavar='LR',
         help='initial learning rate')
     parser.add_argument(
         "-b", "--batch_size", default=8, type=int, help="batch size")
     parser.add_argument(
-        "-j", "--num_workers", default=4, type=int, help="reader worker number")
+        "-j",
+        "--num_workers",
+        default=4,
+        type=int,
+        help="reader worker number")
     parser.add_argument(
-        "-p", "--pretrain_weights", default=None, type=str,
+        "-p",
+        "--pretrain_weights",
+        default=None,
+        type=str,
         help="path to pretrained weights")
     parser.add_argument(
-        "-r", "--resume", default=None, type=str,
-        help="path to model weights")
+        "-r", "--resume", default=None, type=str, help="path to model weights")
     parser.add_argument(
-        "-w", "--weights", default=None, type=str,
+        "-w",
+        "--weights",
+        default=None,
+        type=str,
         help="path to weights for evaluation")
     FLAGS = parser.parse_args()
     assert FLAGS.data, "error: must provide data path"
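Note on examples/yolov3/main.py: `make_optimizer` pairs `fluid.layers.piecewise_decay` with `fluid.layers.linear_lr_warmup`, so the effective schedule is a linear ramp followed by 10x drops at epochs 200 and 250. Below is a minimal pure-Python sketch of that schedule; only the `boundaries`/`values` expressions come from the patch, while `step_per_epoch`, the warmup length, and the warmup start value are illustrative:

    def lr_at(step, step_per_epoch=500, base_lr=0.001,
              warm_up_iter=4000, warm_up_start=0.0):
        # mirrors make_optimizer: decay 10x at epochs 200 and 250
        boundaries = [step_per_epoch * e for e in [200, 250]]
        values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)]
        if step < warm_up_iter:
            # linear warmup from the start value up to base_lr
            return warm_up_start + (base_lr - warm_up_start) * step / warm_up_iter
        for boundary, value in zip(boundaries, values):
            if step < boundary:
                return value
        return values[-1]

    print(lr_at(0), lr_at(100 * 500), lr_at(260 * 500))
    # -> 0.0, 0.001, ~1e-05 with the illustrative defaults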
diff --git a/examples/yolov3/modeling.py b/examples/yolov3/modeling.py
index be462f5afbca8b987775e63e52a7950d2c3d60fd..0b74bf93449a3eba2be47126525db40b434e89fe 100644
--- a/examples/yolov3/modeling.py
+++ b/examples/yolov3/modeling.py
@@ -73,6 +73,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
         out = fluid.layers.leaky_relu(x=out, alpha=0.1)
         return out
 
+
 class YoloDetectionBlock(fluid.dygraph.Layer):
     def __init__(self, ch_in, channel):
         super(YoloDetectionBlock, self).__init__()
@@ -81,38 +82,34 @@ class YoloDetectionBlock(fluid.dygraph.Layer):
             "channel {} cannot be divided by 2".format(channel)
 
         self.conv0 = ConvBNLayer(
-            ch_in=ch_in,
-            ch_out=channel,
-            filter_size=1,
-            stride=1,
-            padding=0)
+            ch_in=ch_in, ch_out=channel, filter_size=1, stride=1, padding=0)
         self.conv1 = ConvBNLayer(
             ch_in=channel,
-            ch_out=channel*2,
+            ch_out=channel * 2,
             filter_size=3,
             stride=1,
             padding=1)
         self.conv2 = ConvBNLayer(
-            ch_in=channel*2,
+            ch_in=channel * 2,
             ch_out=channel,
             filter_size=1,
             stride=1,
             padding=0)
         self.conv3 = ConvBNLayer(
             ch_in=channel,
-            ch_out=channel*2,
+            ch_out=channel * 2,
             filter_size=3,
             stride=1,
             padding=1)
         self.route = ConvBNLayer(
-            ch_in=channel*2,
+            ch_in=channel * 2,
             ch_out=channel,
             filter_size=1,
             stride=1,
             padding=0)
         self.tip = ConvBNLayer(
             ch_in=channel,
-            ch_out=channel*2,
+            ch_out=channel * 2,
             filter_size=3,
             stride=1,
             padding=1)
@@ -149,8 +146,10 @@ class YOLOv3(Model):
             "model_mode should be 'train' 'eval' or 'test', but got " \
             "{}".format(model_mode)
         self.model_mode = str.lower(model_mode)
-        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
-                        59, 119, 116, 90, 156, 198, 373, 326]
+        self.anchors = [
+            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
+            373, 326
+        ]
         self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
         self.valid_thresh = 0.005
         self.nms_thresh = 0.45
@@ -158,7 +157,10 @@ class YOLOv3(Model):
         self.nms_posk = 100
         self.draw_thresh = 0.5
 
-        self.backbone = darknet53(pretrained=(model_mode=='train'))
+        self.backbone = darknet53(
+            pretrained=(model_mode == 'train'),
+            with_pool=False,
+            num_classes=-1)
         self.block_outputs = []
         self.yolo_blocks = []
         self.route_blocks = []
@@ -173,32 +175,46 @@ class YOLOv3(Model):
 
             block_out = self.add_sublayer(
                 "block_out_{}".format(idx),
-                Conv2D(num_channels=1024 // (2**idx),
-                       num_filters=num_filters,
-                       filter_size=1,
-                       act=None,
-                       param_attr=ParamAttr(
-                           initializer=fluid.initializer.Normal(0., 0.02)),
-                       bias_attr=ParamAttr(
-                           initializer=fluid.initializer.Constant(0.0),
-                           regularizer=L2Decay(0.))))
+                Conv2D(
+                    num_channels=1024 // (2**idx),
+                    num_filters=num_filters,
+                    filter_size=1,
+                    act=None,
+                    param_attr=ParamAttr(
+                        initializer=fluid.initializer.Normal(0., 0.02)),
+                    bias_attr=ParamAttr(
+                        initializer=fluid.initializer.Constant(0.0),
+                        regularizer=L2Decay(0.))))
             self.block_outputs.append(block_out)
             if idx < 2:
                 route = self.add_sublayer(
                     "route2_{}".format(idx),
-                    ConvBNLayer(ch_in=512 // (2**idx),
-                                ch_out=256 // (2**idx),
-                                filter_size=1,
-                                act='leaky_relu'))
+                    ConvBNLayer(
+                        ch_in=512 // (2**idx),
+                        ch_out=256 // (2**idx),
+                        filter_size=1,
+                        act='leaky_relu'))
                 self.route_blocks.append(route)
 
+    def extract_feats(self, inputs):
+        out = self.backbone.conv0(inputs)
+        out = self.backbone.downsample0(out)
+        blocks = []
+        for i, conv_block_i in enumerate(
+                self.backbone.darknet53_conv_block_list):
+            out = conv_block_i(out)
+            blocks.append(out)
+            if i < len(self.backbone.stages) - 1:
+                out = self.backbone.downsample_list[i](out)
+        return blocks[-1:-4:-1]
+
     def forward(self, img_id, img_shape, inputs):
         outputs = []
         boxes = []
         scores = []
         downsample = 32
 
-        feats = self.backbone(inputs)
+        feats = self.extract_feats(inputs)
         route = None
         for idx, feat in enumerate(feats):
             if idx > 0:
@@ -233,15 +249,18 @@ class YOLOv3(Model):
         if self.model_mode == 'train':
             return outputs
 
-        preds = [img_id,
-                 fluid.layers.multiclass_nms(
-                     bboxes=fluid.layers.concat(boxes, axis=1),
-                     scores=fluid.layers.concat(scores, axis=2),
-                     score_threshold=self.valid_thresh,
-                     nms_top_k=self.nms_topk,
-                     keep_top_k=self.nms_posk,
-                     nms_threshold=self.nms_thresh,
-                     background_label=-1)]
+        preds = [
+            img_id, fluid.layers.multiclass_nms(
+                bboxes=fluid.layers.concat(
+                    boxes, axis=1),
+                scores=fluid.layers.concat(
+                    scores, axis=2),
+                score_threshold=self.valid_thresh,
+                nms_top_k=self.nms_topk,
+                keep_top_k=self.nms_posk,
+                nms_threshold=self.nms_thresh,
+                background_label=-1)
+        ]
 
         if self.model_mode == 'test':
             return preds
@@ -249,14 +268,17 @@ class YOLOv3(Model):
         # model_mode == "eval"
         return outputs + preds
 
+
 class YoloLoss(Loss):
     def __init__(self, num_classes=80, num_max_boxes=50):
         super(YoloLoss, self).__init__()
         self.num_classes = num_classes
         self.num_max_boxes = num_max_boxes
         self.ignore_thresh = 0.7
-        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
-                        59, 119, 116, 90, 156, 198, 373, 326]
+        self.anchors = [
+            10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
+            373, 326
+        ]
         self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
 
     def forward(self, outputs, labels):
@@ -265,7 +287,7 @@ class YoloLoss(Loss):
 
         losses = []
         for idx, out in enumerate(outputs):
-            if idx == 3: break # debug
+            if idx == 3: break  # debug
             anchor_mask = self.anchor_masks[idx]
             loss = fluid.layers.yolov3_loss(
                 x=out,
@@ -284,8 +306,10 @@ class YoloLoss(Loss):
         return losses
 
 
-def _yolov3_darknet(num_layers=53, num_classes=80,
-                    model_mode='train', pretrained=True):
+def _yolov3_darknet(num_layers=53,
+                    num_classes=80,
+                    model_mode='train',
+                    pretrained=True):
     model = YOLOv3(num_classes, model_mode)
     if pretrained:
         assert num_layers in pretrain_infos.keys(), \
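Note on examples/yolov3/modeling.py: since the backbone is now built with `with_pool=False, num_classes=-1`, `YOLOv3` no longer calls `self.backbone(inputs)`; the new `extract_feats` walks the backbone's sublayers itself and hands the detection head the last three stage outputs, deepest (coarsest) first, via `blocks[-1:-4:-1]`. A quick check of that slice with hypothetical stage names:

    # hypothetical stage outputs C1..C5; the head consumes C5, C4, C3
    blocks = ['C1', 'C2', 'C3', 'C4', 'C5']
    print(blocks[-1:-4:-1])  # ['C5', 'C4', 'C3']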
diff --git a/examples/yolov3/transforms.py b/examples/yolov3/transforms.py
index 8d81c274dfb574bac52855cda95c970e4c8a444f..4eca95a95d692cbe9e9db654cf727e289361ff5f 100644
--- a/examples/yolov3/transforms.py
+++ b/examples/yolov3/transforms.py
@@ -20,6 +20,7 @@ import traceback
 import numpy as np
 
 __all__ = [
+    "Compose",
     'ColorDistort',
     'RandomExpand',
     'RandomCrop',
@@ -33,6 +34,37 @@ __all__ = [
 ]
 
 
+class Compose(object):
+    """Composes several transforms together.
+
+    Args:
+        transforms (list of ``Transform`` objects): list of transforms to compose.
+
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, *data):
+        for f in self.transforms:
+            try:
+                data = f(*data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                print("fail to perform transform [{}] with error: "
+                      "{} and stack:\n{}".format(f, e, str(stack_info)))
+                raise e
+        return data
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + '('
+        for t in self.transforms:
+            format_string += '\n'
+            format_string += '    {0}'.format(t)
+        format_string += '\n)'
+        return format_string
+
+
 class ColorDistort(object):
     """Random color distortion.
 
@@ -147,7 +179,10 @@ class RandomExpand(object):
         fill_value (list): color value used to fill the canvas. in RGB order.
     """
 
-    def __init__(self, ratio=4., prob=0.5, fill_value=[123.675, 116.28, 103.53]):
+    def __init__(self,
+                 ratio=4.,
+                 prob=0.5,
+                 fill_value=[123.675, 116.28, 103.53]):
         assert ratio > 1.01, "expand ratio must be larger than 1.01"
         self.ratio = ratio
         self.prob = prob
@@ -493,8 +528,7 @@ def _crop_box_with_center_constraint(box, crop):
     cropped_box[:, :2] -= crop[:2]
     cropped_box[:, 2:] -= crop[:2]
     centers = (box[:, :2] + box[:, 2:]) / 2
-    valid = np.logical_and(
-        crop[:2] <= centers, centers < crop[2:]).all(axis=1)
+    valid = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1)
     valid = np.logical_and(
         valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
     return cropped_box, np.where(valid)[0]
@@ -517,8 +551,8 @@ def random_crop(inputs):
         for i in range(50):
             scale = np.random.uniform(*scaling)
             min_ar, max_ar = aspect_ratios
-            ar = np.random.uniform(max(min_ar, scale**2),
-                                   min(max_ar, scale**-2))
+            ar = np.random.uniform(
+                max(min_ar, scale**2), min(max_ar, scale**-2))
             crop_h = int(h * scale / np.sqrt(ar))
             crop_w = int(w * scale * np.sqrt(ar))
             crop_y = np.random.randint(0, h - crop_h)
@@ -529,7 +563,8 @@
                 continue
 
             cropped_box, valid_ids = _crop_box_with_center_constraint(
-                gt_box, np.array(crop_box, dtype=np.float32))
+                gt_box, np.array(
+                    crop_box, dtype=np.float32))
             if valid_ids.size > 0:
                 found = True
                 break
@@ -545,9 +580,7 @@
 
 
 class ResizeImage(object):
-    def __init__(self,
-                 target_size=0,
-                 interp=cv2.INTER_CUBIC):
+    def __init__(self, target_size=0, interp=cv2.INTER_CUBIC):
         """
         Rescale image to the specified target size.
         If target_size is list, selected a scale randomly as the specified
@@ -574,8 +607,8 @@ class ResizeImage(object):
             raise ImageError('{}: image is not 3-dimensional.'.format(self))
         im_scale_x = float(self.target_size) / float(im.shape[1])
         im_scale_y = float(self.target_size) / float(im.shape[0])
-        resize_w = self.target_size
-        resize_h = self.target_size
+        resize_w = self.target_size
+        resize_h = self.target_size
 
         im = cv2.resize(
             im,
@@ -586,4 +619,3 @@ class ResizeImage(object):
             interpolation=self.interp)
 
         return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
-
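Note on examples/yolov3/transforms.py: unlike a torchvision-style compose that maps a single image through each step, the new `Compose` threads the whole sample tuple (`im_id, im_shape, im, gt_bbox, gt_class, gt_score`) through every transform, so image and box transforms can share one pipeline. A self-contained sketch of that calling convention; the two toy transforms are illustrative stand-ins for `ColorDistort`, `PadBox`, etc.:

    class Scale(object):
        """Toy transform: scales the image, passes boxes through."""

        def __call__(self, im, boxes):
            return im * 2, boxes

    class Shift(object):
        """Toy transform: shifts every box by one pixel."""

        def __call__(self, im, boxes):
            return im, [(x + 1, y + 1) for x, y in boxes]

    def compose(transforms, *data):
        # the same threading as Compose.__call__: each transform returns
        # the full tuple, which is unpacked into the next transform
        for f in transforms:
            data = f(*data)
        return data

    print(compose([Scale(), Shift()], 1.0, [(0, 0)]))  # (2.0, [(1, 1)])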
+ + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, *data): + for f in self.transforms: + try: + data = f(*data) + except Exception as e: + stack_info = traceback.format_exc() + print("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info))) + raise e + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += ' {0}'.format(t) + format_string += '\n)' + return format_string + + class ColorDistort(object): """Random color distortion. @@ -147,7 +179,10 @@ class RandomExpand(object): fill_value (list): color value used to fill the canvas. in RGB order. """ - def __init__(self, ratio=4., prob=0.5, fill_value=[123.675, 116.28, 103.53]): + def __init__(self, + ratio=4., + prob=0.5, + fill_value=[123.675, 116.28, 103.53]): assert ratio > 1.01, "expand ratio must be larger than 1.01" self.ratio = ratio self.prob = prob @@ -493,8 +528,7 @@ def _crop_box_with_center_constraint(box, crop): cropped_box[:, :2] -= crop[:2] cropped_box[:, 2:] -= crop[:2] centers = (box[:, :2] + box[:, 2:]) / 2 - valid = np.logical_and( - crop[:2] <= centers, centers < crop[2:]).all(axis=1) + valid = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1) valid = np.logical_and( valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) return cropped_box, np.where(valid)[0] @@ -517,8 +551,8 @@ def random_crop(inputs): for i in range(50): scale = np.random.uniform(*scaling) min_ar, max_ar = aspect_ratios - ar = np.random.uniform(max(min_ar, scale**2), - min(max_ar, scale**-2)) + ar = np.random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) crop_h = int(h * scale / np.sqrt(ar)) crop_w = int(w * scale * np.sqrt(ar)) crop_y = np.random.randint(0, h - crop_h) @@ -529,7 +563,8 @@ def random_crop(inputs): continue cropped_box, valid_ids = _crop_box_with_center_constraint( - gt_box, np.array(crop_box, dtype=np.float32)) + gt_box, np.array( + crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break @@ -545,9 +580,7 @@ def random_crop(inputs): class ResizeImage(object): - def __init__(self, - target_size=0, - interp=cv2.INTER_CUBIC): + def __init__(self, target_size=0, interp=cv2.INTER_CUBIC): """ Rescale image to the specified target size. 
diff --git a/hapi/vision/models/darknet.py b/hapi/vision/models/darknet.py
index 08e4171ada84ec16cc149f23f3a41691c2fb97d1..5525b6c0489c993669de5d675b25518dc74a6ca6 100755
--- a/hapi/vision/models/darknet.py
+++ b/hapi/vision/models/darknet.py
@@ -156,7 +156,7 @@ class DarkNet(Model):
             .format(DarkNet_cfg.keys())
         self.stages = DarkNet_cfg[num_layers]
         self.stages = self.stages[0:5]
-        self.num_classes = 1000
+        self.num_classes = num_classes
         self.with_pool = True
         ch_in = 3
         self.conv0 = ConvBNLayer(
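Note on hapi/vision/models/darknet.py: `DarkNet` previously hardcoded `self.num_classes = 1000`; it now honors the constructor argument, which is what lets `YOLOv3` request a backbone without a usable classifier head (the visible hunk still hardcodes `self.with_pool = True`, so the `with_pool` argument is presumably wired up outside this hunk). A usage sketch matching the call added in modeling.py, assuming `darknet53` is exported from `hapi.vision.models`:

    from hapi.vision.models import darknet53

    # pretrained backbone weights, no classifier head, no global pooling
    backbone = darknet53(pretrained=True, with_pool=False, num_classes=-1)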