diff --git a/examples/image_classification/README.MD b/examples/image_classification/README.MD
index 5b50370dd4b2ad76e62f0e99877849f5fe2fed8f..9f2c58539edea7a3fbc183f050469f53ba311e30 100644
--- a/examples/image_classification/README.MD
+++ b/examples/image_classification/README.MD
@@ -85,8 +85,9 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --arch
 | [vgg16](https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams) | 71.92 | 90.65 |
 | [mobilenet_v1](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams) | 71.16 | 89.89 |
 | [mobilenet_v2](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams) | 72.30 | 90.74 |
+| [darknet53](https://paddle-hapi.bj.bcebos.com/models/darknet53.pdparams) | 78.43 | 94.24 |
 
-For the parameters used to reproduce the models above, refer to the scripts under `scripts`.
+For the parameters used to reproduce some of the models above, refer to the scripts under `scripts`. Note that darknet53 must be given inputs of image size 256 for prediction, i.e. ```--image-size 256```.
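+
+For example, a darknet53 evaluation-style run might look like the following (an illustrative sketch: `--image-size` and `--resize-short-size` are the flags added in this change, while the `--arch` flag and the dataset-path argument are assumed to follow main.py's existing interface):
+
+```bash
+python main.py --arch darknet53 --image-size 256 --resize-short-size 256 <path/to/imagenet>
+```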
 
 ## References
diff --git a/examples/image_classification/imagenet_dataset.py b/examples/image_classification/imagenet_dataset.py
index 6572df01440a36c21330cc905da045e03ff79700..25dcc338e20e53e75f3637ea8f8e3d492a1240e1 100644
--- a/examples/image_classification/imagenet_dataset.py
+++ b/examples/image_classification/imagenet_dataset.py
@@ -24,7 +24,11 @@ from paddle import fluid
 
 
 class ImageNetDataset(DatasetFolder):
-    def __init__(self, path, mode='train'):
+    def __init__(self,
+                 path,
+                 mode='train',
+                 image_size=224,
+                 resize_short_size=256):
         super(ImageNetDataset, self).__init__(path)
         self.mode = mode
 
@@ -32,13 +36,14 @@ class ImageNetDataset(DatasetFolder):
             mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375])
         if self.mode == 'train':
             self.transform = transforms.Compose([
-                transforms.RandomResizedCrop(224),
+                transforms.RandomResizedCrop(image_size),
                 transforms.RandomHorizontalFlip(),
                 transforms.Permute(mode='CHW'), normalize
             ])
         else:
             self.transform = transforms.Compose([
-                transforms.Resize(256), transforms.CenterCrop(224),
+                transforms.Resize(resize_short_size),
+                transforms.CenterCrop(image_size),
                 transforms.Permute(mode='CHW'), normalize
             ])
 
@@ -46,7 +51,7 @@ class ImageNetDataset(DatasetFolder):
         img_path, label = self.samples[idx]
         img = cv2.imread(img_path).astype(np.float32)
         label = np.array([label])
-        return self.transform(img, label)
+        return self.transform(img), label
 
     def __len__(self):
         return len(self.samples)
diff --git a/examples/image_classification/main.py b/examples/image_classification/main.py
index 76360df91cd64a66e2e288c90a37ac667cdc3eea..64396a6042f80cfbd53ff775ab95c41330894a9a 100644
--- a/examples/image_classification/main.py
+++ b/examples/image_classification/main.py
@@ -18,8 +18,6 @@ from __future__ import print_function
 import argparse
 import contextlib
 import os
-import sys
-sys.path.append('../')
 import time
 import math
 
@@ -89,8 +87,16 @@ def main():
     labels = [Input([None, 1], 'int64', name='label')]
 
     train_dataset = ImageNetDataset(
-        os.path.join(FLAGS.data, 'train'), mode='train')
-    val_dataset = ImageNetDataset(os.path.join(FLAGS.data, 'val'), mode='val')
+        os.path.join(FLAGS.data, 'train'),
+        mode='train',
+        image_size=FLAGS.image_size,
+        resize_short_size=FLAGS.resize_short_size)
+
+    val_dataset = ImageNetDataset(
+        os.path.join(FLAGS.data, 'val'),
+        mode='val',
+        image_size=FLAGS.image_size,
+        resize_short_size=FLAGS.resize_short_size)
 
     optim = make_optimizer(
         np.ceil(
@@ -176,6 +182,13 @@ if __name__ == '__main__':
     parser.add_argument(
         "--weight-decay", default=1e-4, type=float, help="weight decay")
     parser.add_argument("--momentum", default=0.9, type=float, help="momentum")
+    parser.add_argument(
+        "--image-size", default=224, type=int, help="input image size")
+    parser.add_argument(
+        "--resize-short-size",
+        default=256,
+        type=int,
+        help="short-side size of the aspect-ratio-preserving resize")
     FLAGS = parser.parse_args()
     assert FLAGS.data, "error: must provide data path"
     main()
diff --git a/examples/yolov3/main.py b/examples/yolov3/main.py
index dea9eba5429a2878038aef11a9ca404696b2f7a8..3f52087fa5e4d460894891eaabb1c18f0f003eed 100644
--- a/examples/yolov3/main.py
+++ b/examples/yolov3/main.py
@@ -27,7 +27,7 @@ from paddle.io import DataLoader
 
 from hapi.model import Model, Input, set_device
 from hapi.distributed import DistributedBatchSampler
-from hapi.vision.transforms import Compose, BatchCompose
+from hapi.vision.transforms import BatchCompose
 
 from modeling import yolov3_darknet53, YoloLoss
 from coco import COCODataset
@@ -43,10 +43,9 @@ def make_optimizer(step_per_epoch, parameter_list=None):
     momentum = 0.9
     weight_decay = 5e-4
     boundaries = [step_per_epoch * e for e in [200, 250]]
-    values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)]
+    values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)]
     learning_rate = fluid.layers.piecewise_decay(
-        boundaries=boundaries,
-        values=values)
+        boundaries=boundaries, values=values)
     learning_rate = fluid.layers.linear_lr_warmup(
         learning_rate=learning_rate,
         warmup_steps=warm_up_iter,
@@ -63,77 +62,88 @@ def make_optimizer(step_per_epoch, parameter_list=None):
 
 def main():
     device = set_device(FLAGS.device)
     fluid.enable_dygraph(device) if FLAGS.dynamic else None
-
-    inputs = [Input([None, 1], 'int64', name='img_id'),
-              Input([None, 2], 'int32', name='img_shape'),
-              Input([None, 3, None, None], 'float32', name='image')]
-    labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'),
-              Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'),
-              Input([None, NUM_MAX_BOXES], 'float32', name='gt_score')]
-
-    if not FLAGS.eval_only:  # training mode
-        train_transform = Compose([ColorDistort(),
-                                   RandomExpand(),
-                                   RandomCrop(),
-                                   RandomFlip(),
-                                   NormalizeBox(),
-                                   PadBox(),
-                                   BboxXYXY2XYWH()])
+
+    inputs = [
+        Input(
+            [None, 1], 'int64', name='img_id'), Input(
+                [None, 2], 'int32', name='img_shape'), Input(
+                    [None, 3, None, None], 'float32', name='image')
+    ]
+
+    labels = [
+        Input(
+            [None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'), Input(
+                [None, NUM_MAX_BOXES], 'int32', name='gt_label'), Input(
+                    [None, NUM_MAX_BOXES], 'float32', name='gt_score')
+    ]
+
+    if not FLAGS.eval_only:  # training mode
+        train_transform = Compose([
+            ColorDistort(), RandomExpand(), RandomCrop(), RandomFlip(),
+            NormalizeBox(), PadBox(), BboxXYXY2XYWH()
+        ])
+
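+        # NOTE: RandomShape is applied in the batch-level collate_fn below
+        # rather than in the per-sample Compose above, since multi-scale
+        # training needs every image within a batch resized to one randomly
+        # chosen target shape.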
         train_collate_fn = BatchCompose([RandomShape(), NormalizeImage()])
-        dataset = COCODataset(dataset_dir=FLAGS.data,
-                              anno_path='annotations/instances_train2017.json',
-                              image_dir='train2017',
-                              with_background=False,
-                              mixup=True,
-                              transform=train_transform)
-        batch_sampler = DistributedBatchSampler(dataset,
-                                                batch_size=FLAGS.batch_size,
-                                                shuffle=True,
-                                                drop_last=True)
-        loader = DataLoader(dataset,
-                            batch_sampler=batch_sampler,
-                            places=device,
-                            num_workers=FLAGS.num_workers,
-                            return_list=True,
-                            collate_fn=train_collate_fn)
-    else:  # evaluation mode
-        eval_transform = Compose([ResizeImage(target_size=608),
-                                  NormalizeBox(),
-                                  PadBox(),
-                                  BboxXYXY2XYWH()])
+        dataset = COCODataset(
+            dataset_dir=FLAGS.data,
+            anno_path='annotations/instances_train2017.json',
+            image_dir='train2017',
+            with_background=False,
+            mixup=True,
+            transform=train_transform)
+        batch_sampler = DistributedBatchSampler(
+            dataset, batch_size=FLAGS.batch_size, shuffle=True, drop_last=True)
+        loader = DataLoader(
+            dataset,
+            batch_sampler=batch_sampler,
+            places=device,
+            num_workers=FLAGS.num_workers,
+            return_list=True,
+            collate_fn=train_collate_fn)
+    else:  # evaluation mode
+        eval_transform = Compose([
+            ResizeImage(target_size=608), NormalizeBox(), PadBox(),
+            BboxXYXY2XYWH()
+        ])
+
         eval_collate_fn = BatchCompose([NormalizeImage()])
-        dataset = COCODataset(dataset_dir=FLAGS.data,
-                              anno_path='annotations/instances_val2017.json',
-                              image_dir='val2017',
-                              with_background=False,
-                              transform=eval_transform)
+        dataset = COCODataset(
+            dataset_dir=FLAGS.data,
+            anno_path='annotations/instances_val2017.json',
+            image_dir='val2017',
+            with_background=False,
+            transform=eval_transform)
         # batch_size can only be 1 in evaluation for YOLOv3
         # prediction bbox is a LoDTensor
-        batch_sampler = DistributedBatchSampler(dataset,
-                                                batch_size=1,
-                                                shuffle=False,
-                                                drop_last=False)
-        loader = DataLoader(dataset,
-                            batch_sampler=batch_sampler,
-                            places=device,
-                            num_workers=FLAGS.num_workers,
-                            return_list=True,
-                            collate_fn=eval_collate_fn)
+        batch_sampler = DistributedBatchSampler(
+            dataset, batch_size=1, shuffle=False, drop_last=False)
+        loader = DataLoader(
+            dataset,
+            batch_sampler=batch_sampler,
+            places=device,
+            num_workers=FLAGS.num_workers,
+            return_list=True,
+            collate_fn=eval_collate_fn)
 
     pretrained = FLAGS.eval_only and FLAGS.weights is None
-    model = yolov3_darknet53(num_classes=dataset.num_classes,
-                             model_mode='eval' if FLAGS.eval_only else 'train',
-                             pretrained=pretrained)
+    model = yolov3_darknet53(
+        num_classes=dataset.num_classes,
+        model_mode='eval' if FLAGS.eval_only else 'train',
+        pretrained=pretrained)
 
     if FLAGS.pretrain_weights and not FLAGS.eval_only:
-        model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
+        model.load(
+            FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
 
-    optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters())
+    optim = make_optimizer(
+        len(batch_sampler), parameter_list=model.parameters())
 
-    model.prepare(optim,
-                  YoloLoss(num_classes=dataset.num_classes),
-                  inputs=inputs, labels=labels,
-                  device=FLAGS.device)
+    model.prepare(
+        optim,
+        YoloLoss(num_classes=dataset.num_classes),
+        inputs=inputs,
+        labels=labels,
+        device=FLAGS.device)
 
     # NOTE: we implement COCO metric of YOLOv3 model here, separately
     # from 'prepare' and 'fit' framework for the following reason:
@@ -149,7 +159,8 @@ def main():
         preds = model.predict(loader, stack_outputs=False)
         _, _, _, img_ids, bboxes = preds
 
-        anno_path = os.path.join(FLAGS.data, 'annotations/instances_val2017.json')
+        anno_path = os.path.join(FLAGS.data,
+                                 'annotations/instances_val2017.json')
         coco_metric = COCOMetric(anno_path=anno_path, with_background=False)
         for img_id, bbox in zip(img_ids, bboxes):
             coco_metric.update(img_id, bbox)
@@ -176,7 +187,9 @@ def main():
 if __name__ == '__main__':
     parser = argparse.ArgumentParser("Yolov3 Training on VOC")
     parser.add_argument(
-        "--data", type=str, default='dataset/voc',
+        "--data",
+        type=str,
+        default='dataset/voc',
         help="path to dataset directory")
     parser.add_argument(
         "--device", type=str, default='gpu', help="device to use, gpu or cpu")
@@ -187,23 +200,38 @@ if __name__ == '__main__':
     parser.add_argument(
         "-e", "--epoch", default=300, type=int, help="number of epochs")
     parser.add_argument(
-        "--no_mixup_epoch", default=30, type=int,
+        "--no_mixup_epoch",
+        default=30,
+        type=int,
         help="number of the last N epochs 
without image mixup") parser.add_argument( - '--lr', '--learning-rate', default=0.001, type=float, metavar='LR', + '--lr', + '--learning-rate', + default=0.001, + type=float, + metavar='LR', help='initial learning rate') parser.add_argument( "-b", "--batch_size", default=8, type=int, help="batch size") parser.add_argument( - "-j", "--num_workers", default=4, type=int, help="reader worker number") + "-j", + "--num_workers", + default=4, + type=int, + help="reader worker number") parser.add_argument( - "-p", "--pretrain_weights", default=None, type=str, + "-p", + "--pretrain_weights", + default=None, + type=str, help="path to pretrained weights") parser.add_argument( - "-r", "--resume", default=None, type=str, - help="path to model weights") + "-r", "--resume", default=None, type=str, help="path to model weights") parser.add_argument( - "-w", "--weights", default=None, type=str, + "-w", + "--weights", + default=None, + type=str, help="path to weights for evaluation") FLAGS = parser.parse_args() assert FLAGS.data, "error: must provide data path" diff --git a/examples/yolov3/modeling.py b/examples/yolov3/modeling.py index be462f5afbca8b987775e63e52a7950d2c3d60fd..0b74bf93449a3eba2be47126525db40b434e89fe 100644 --- a/examples/yolov3/modeling.py +++ b/examples/yolov3/modeling.py @@ -73,6 +73,7 @@ class ConvBNLayer(fluid.dygraph.Layer): out = fluid.layers.leaky_relu(x=out, alpha=0.1) return out + class YoloDetectionBlock(fluid.dygraph.Layer): def __init__(self, ch_in, channel): super(YoloDetectionBlock, self).__init__() @@ -81,38 +82,34 @@ class YoloDetectionBlock(fluid.dygraph.Layer): "channel {} cannot be divided by 2".format(channel) self.conv0 = ConvBNLayer( - ch_in=ch_in, - ch_out=channel, - filter_size=1, - stride=1, - padding=0) + ch_in=ch_in, ch_out=channel, filter_size=1, stride=1, padding=0) self.conv1 = ConvBNLayer( ch_in=channel, - ch_out=channel*2, + ch_out=channel * 2, filter_size=3, stride=1, padding=1) self.conv2 = ConvBNLayer( - ch_in=channel*2, + ch_in=channel * 2, ch_out=channel, filter_size=1, stride=1, padding=0) self.conv3 = ConvBNLayer( ch_in=channel, - ch_out=channel*2, + ch_out=channel * 2, filter_size=3, stride=1, padding=1) self.route = ConvBNLayer( - ch_in=channel*2, + ch_in=channel * 2, ch_out=channel, filter_size=1, stride=1, padding=0) self.tip = ConvBNLayer( ch_in=channel, - ch_out=channel*2, + ch_out=channel * 2, filter_size=3, stride=1, padding=1) @@ -149,8 +146,10 @@ class YOLOv3(Model): "model_mode should be 'train' 'eval' or 'test', but got " \ "{}".format(model_mode) self.model_mode = str.lower(model_mode) - self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, - 59, 119, 116, 90, 156, 198, 373, 326] + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] self.valid_thresh = 0.005 self.nms_thresh = 0.45 @@ -158,7 +157,10 @@ class YOLOv3(Model): self.nms_posk = 100 self.draw_thresh = 0.5 - self.backbone = darknet53(pretrained=(model_mode=='train')) + self.backbone = darknet53( + pretrained=(model_mode == 'train'), + with_pool=False, + num_classes=-1) self.block_outputs = [] self.yolo_blocks = [] self.route_blocks = [] @@ -173,32 +175,46 @@ class YOLOv3(Model): block_out = self.add_sublayer( "block_out_{}".format(idx), - Conv2D(num_channels=1024 // (2**idx), - num_filters=num_filters, - filter_size=1, - act=None, - param_attr=ParamAttr( - initializer=fluid.initializer.Normal(0., 0.02)), - bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0), - 
regularizer=L2Decay(0.)))) + Conv2D( + num_channels=1024 // (2**idx), + num_filters=num_filters, + filter_size=1, + act=None, + param_attr=ParamAttr( + initializer=fluid.initializer.Normal(0., 0.02)), + bias_attr=ParamAttr( + initializer=fluid.initializer.Constant(0.0), + regularizer=L2Decay(0.)))) self.block_outputs.append(block_out) if idx < 2: route = self.add_sublayer( "route2_{}".format(idx), - ConvBNLayer(ch_in=512 // (2**idx), - ch_out=256 // (2**idx), - filter_size=1, - act='leaky_relu')) + ConvBNLayer( + ch_in=512 // (2**idx), + ch_out=256 // (2**idx), + filter_size=1, + act='leaky_relu')) self.route_blocks.append(route) + def extract_feats(self, inputs): + out = self.backbone.conv0(inputs) + out = self.backbone.downsample0(out) + blocks = [] + for i, conv_block_i in enumerate( + self.backbone.darknet53_conv_block_list): + out = conv_block_i(out) + blocks.append(out) + if i < len(self.backbone.stages) - 1: + out = self.backbone.downsample_list[i](out) + return blocks[-1:-4:-1] + def forward(self, img_id, img_shape, inputs): outputs = [] boxes = [] scores = [] downsample = 32 - feats = self.backbone(inputs) + feats = self.extract_feats(inputs) route = None for idx, feat in enumerate(feats): if idx > 0: @@ -233,15 +249,18 @@ class YOLOv3(Model): if self.model_mode == 'train': return outputs - preds = [img_id, - fluid.layers.multiclass_nms( - bboxes=fluid.layers.concat(boxes, axis=1), - scores=fluid.layers.concat(scores, axis=2), - score_threshold=self.valid_thresh, - nms_top_k=self.nms_topk, - keep_top_k=self.nms_posk, - nms_threshold=self.nms_thresh, - background_label=-1)] + preds = [ + img_id, fluid.layers.multiclass_nms( + bboxes=fluid.layers.concat( + boxes, axis=1), + scores=fluid.layers.concat( + scores, axis=2), + score_threshold=self.valid_thresh, + nms_top_k=self.nms_topk, + keep_top_k=self.nms_posk, + nms_threshold=self.nms_thresh, + background_label=-1) + ] if self.model_mode == 'test': return preds @@ -249,14 +268,17 @@ class YOLOv3(Model): # model_mode == "eval" return outputs + preds + class YoloLoss(Loss): def __init__(self, num_classes=80, num_max_boxes=50): super(YoloLoss, self).__init__() self.num_classes = num_classes self.num_max_boxes = num_max_boxes self.ignore_thresh = 0.7 - self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, - 59, 119, 116, 90, 156, 198, 373, 326] + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] def forward(self, outputs, labels): @@ -265,7 +287,7 @@ class YoloLoss(Loss): losses = [] for idx, out in enumerate(outputs): - if idx == 3: break # debug + if idx == 3: break # debug anchor_mask = self.anchor_masks[idx] loss = fluid.layers.yolov3_loss( x=out, @@ -284,8 +306,10 @@ class YoloLoss(Loss): return losses -def _yolov3_darknet(num_layers=53, num_classes=80, - model_mode='train', pretrained=True): +def _yolov3_darknet(num_layers=53, + num_classes=80, + model_mode='train', + pretrained=True): model = YOLOv3(num_classes, model_mode) if pretrained: assert num_layers in pretrain_infos.keys(), \ diff --git a/examples/yolov3/transforms.py b/examples/yolov3/transforms.py index 8d81c274dfb574bac52855cda95c970e4c8a444f..4eca95a95d692cbe9e9db654cf727e289361ff5f 100644 --- a/examples/yolov3/transforms.py +++ b/examples/yolov3/transforms.py @@ -20,6 +20,7 @@ import traceback import numpy as np __all__ = [ + "Compose", 'ColorDistort', 'RandomExpand', 'RandomCrop', @@ -33,6 +34,37 @@ __all__ = [ ] +class Compose(object): + """Composes several 
transforms together. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, *data): + for f in self.transforms: + try: + data = f(*data) + except Exception as e: + stack_info = traceback.format_exc() + print("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info))) + raise e + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += ' {0}'.format(t) + format_string += '\n)' + return format_string + + class ColorDistort(object): """Random color distortion. @@ -147,7 +179,10 @@ class RandomExpand(object): fill_value (list): color value used to fill the canvas. in RGB order. """ - def __init__(self, ratio=4., prob=0.5, fill_value=[123.675, 116.28, 103.53]): + def __init__(self, + ratio=4., + prob=0.5, + fill_value=[123.675, 116.28, 103.53]): assert ratio > 1.01, "expand ratio must be larger than 1.01" self.ratio = ratio self.prob = prob @@ -493,8 +528,7 @@ def _crop_box_with_center_constraint(box, crop): cropped_box[:, :2] -= crop[:2] cropped_box[:, 2:] -= crop[:2] centers = (box[:, :2] + box[:, 2:]) / 2 - valid = np.logical_and( - crop[:2] <= centers, centers < crop[2:]).all(axis=1) + valid = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1) valid = np.logical_and( valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) return cropped_box, np.where(valid)[0] @@ -517,8 +551,8 @@ def random_crop(inputs): for i in range(50): scale = np.random.uniform(*scaling) min_ar, max_ar = aspect_ratios - ar = np.random.uniform(max(min_ar, scale**2), - min(max_ar, scale**-2)) + ar = np.random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) crop_h = int(h * scale / np.sqrt(ar)) crop_w = int(w * scale * np.sqrt(ar)) crop_y = np.random.randint(0, h - crop_h) @@ -529,7 +563,8 @@ def random_crop(inputs): continue cropped_box, valid_ids = _crop_box_with_center_constraint( - gt_box, np.array(crop_box, dtype=np.float32)) + gt_box, np.array( + crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break @@ -545,9 +580,7 @@ def random_crop(inputs): class ResizeImage(object): - def __init__(self, - target_size=0, - interp=cv2.INTER_CUBIC): + def __init__(self, target_size=0, interp=cv2.INTER_CUBIC): """ Rescale image to the specified target size. 
If target_size is a list, a scale is selected randomly as the specified
@@ -574,8 +607,8 @@ class ResizeImage(object):
             raise ImageError('{}: image is not 3-dimensional.'.format(self))
         im_scale_x = float(self.target_size) / float(im.shape[1])
         im_scale_y = float(self.target_size) / float(im.shape[0])
-        resize_w = self.target_size
-        resize_h = self.target_size
+        resize_w = self.target_size
+        resize_h = self.target_size
 
         im = cv2.resize(
             im,
@@ -586,4 +619,3 @@ class ResizeImage(object):
             interpolation=self.interp)
 
         return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
-
diff --git a/hapi/datasets/folder.py b/hapi/datasets/folder.py
index 23f2c9592915e3e83d596c9cc3679eca306a4bd5..c13710ea033dd62b665d60967d3acc91cb84c4ef 100644
--- a/hapi/datasets/folder.py
+++ b/hapi/datasets/folder.py
@@ -150,7 +150,7 @@ class DatasetFolder(Dataset):
         path, target = self.samples[index]
         sample = self.loader(path)
         if self.transform is not None:
-            sample, target = self.transform(sample, target)
+            sample = self.transform(sample)
         return sample, target
 
diff --git a/hapi/model.py b/hapi/model.py
index 3593f00acaa9f2763e01cf139e1ccdb06d339d55..b9dc4ca441e8c2531df633b946e5c4da30bffa44 100644
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -1135,7 +1135,7 @@ class Model(fluid.dygraph.Layer):
                 test_data,
                 batch_size=1,
                 num_workers=0,
-                stack_outputs=True):
+                stack_outputs=False):
         """
         FIXME: add more comments and usage
         Args:
@@ -1183,20 +1183,29 @@ class Model(fluid.dygraph.Layer):
             loader = test_loader()
 
         outputs = []
+        count = 0
         for data in tqdm.tqdm(loader):
             data = flatten(data)
-            outputs.append(self.test_batch(data[:len(self._inputs)]))
+            out = to_list(self.test_batch(data[:len(self._inputs)]))
+            outputs.append(out)
+            count += out[0].shape[0]
+
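+        # NOTE: in multi-card prediction the distributed batch sampler pads
+        # the dataset so that every rank runs the same number of batches;
+        # `count` tracks how many samples were actually produced so the
+        # padded duplicates can be trimmed from the last batch below.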
+        if test_loader is not None and self._adapter._nranks > 1 \
+                and isinstance(test_loader, DataLoader) \
+                and count > len(test_loader.dataset):
+            size = outputs[-1][0].shape[0] - (count - len(test_loader.dataset))
+            outputs[-1] = [o[:size] for o in outputs[-1]]
 
         # NOTE: for lod tensor output, we should not stack outputs
-        # for stacking may loss its detail info
+        # for stacking may lose its detail info
+        outputs = list(zip(*outputs))
+
         if stack_outputs:
-            outputs = [np.stack(outs, axis=0) for outs in outputs]
+            outputs = [np.vstack(outs) for outs in outputs]
 
         self._test_dataloader = None
 
-        if test_loader is not None and self._adapter._nranks > 1 \
-            and isinstance(test_loader, DataLoader):
-            outputs = [o[:len(test_loader.dataset)] for o in outputs]
+
         return outputs
 
     def _run_one_epoch(self,
diff --git a/hapi/vision/models/darknet.py b/hapi/vision/models/darknet.py
index df0588846c075009f28a23596682fb7287579672..5525b6c0489c993669de5d675b25518dc74a6ca6 100755
--- a/hapi/vision/models/darknet.py
+++ b/hapi/vision/models/darknet.py
@@ -12,11 +12,12 @@
 #See the License for the specific language governing permissions and
 #limitations under the License.
 
+import math
 import paddle.fluid as fluid
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.regularizer import L2Decay
 
-from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
+from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Pool2D, Linear
 
 from hapi.model import Model
 from hapi.download import get_weights_path
@@ -25,8 +26,8 @@ __all__ = ['DarkNet', 'darknet53']
 
 # {num_layers: (url, md5)}
 pretrain_infos = {
-    53: ('https://paddlemodels.bj.bcebos.com/hapi/darknet53.pdparams',
-         '2506357a5c31e865785112fc614a487d')
+    53: ('https://paddle-hapi.bj.bcebos.com/models/darknet53.pdparams',
+         'ca506a90e2efecb9a2093f8ada808708')
 }
 
@@ -66,17 +67,14 @@ class ConvBNLayer(fluid.dygraph.Layer):
     def forward(self, inputs):
         out = self.conv(inputs)
         out = self.batch_norm(out)
         if self.act == 'leaky':
             out = fluid.layers.leaky_relu(x=out, alpha=0.1)
         return out
 
+
 class DownSample(fluid.dygraph.Layer):
-    def __init__(self,
-                 ch_in,
-                 ch_out,
-                 filter_size=3,
-                 stride=2,
-                 padding=1):
+    def __init__(self, ch_in, ch_out, filter_size=3, stride=2, padding=1):
 
         super(DownSample, self).__init__()
 
@@ -87,46 +85,45 @@ class DownSample(fluid.dygraph.Layer):
             stride=stride,
             padding=padding)
         self.ch_out = ch_out
+
     def forward(self, inputs):
         out = self.conv_bn_layer(inputs)
         return out
 
+
 class BasicBlock(fluid.dygraph.Layer):
     def __init__(self, ch_in, ch_out):
         super(BasicBlock, self).__init__()
 
         self.conv1 = ConvBNLayer(
-            ch_in=ch_in,
-            ch_out=ch_out,
-            filter_size=1,
-            stride=1,
-            padding=0)
+            ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, padding=0)
         self.conv2 = ConvBNLayer(
             ch_in=ch_out,
-            ch_out=ch_out*2,
+            ch_out=ch_out * 2,
             filter_size=3,
             stride=1,
             padding=1)
+
     def forward(self, inputs):
         conv1 = self.conv1(inputs)
         conv2 = self.conv2(conv1)
         out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None)
         return out
 
+
 class LayerWarp(fluid.dygraph.Layer):
     def __init__(self, ch_in, ch_out, count):
-        super(LayerWarp,self).__init__()
+        super(LayerWarp, self).__init__()
 
         self.basicblock0 = BasicBlock(ch_in, ch_out)
         self.res_out_list = []
-        for i in range(1,count):
+        for i in range(1, count):
             res_out = self.add_sublayer("basic_block_%d" % (i),
-                                        BasicBlock(
-                                            ch_out*2,
-                                            ch_out))
+                                        BasicBlock(ch_out * 2, ch_out))
             self.res_out_list.append(res_out)
         self.ch_out = ch_out
-    def forward(self,inputs):
+
+    def forward(self, inputs):
         y = self.basicblock0(inputs)
         for basic_block_i in self.res_out_list:
             y = basic_block_i(y)
@@ -142,61 +139,82 @@ class DarkNet(Model):
 
     Args:
         num_layers (int): layer number of DarkNet, only 53 supported currently, default: 53.
-        ch_in (int): channel number of input data, default 3.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the last fc layer
+                           will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
     """
 
-    def __init__(self, num_layers=53, ch_in=3):
+    def __init__(self,
+                 num_layers=53,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
         super(DarkNet, self).__init__()
         assert num_layers in DarkNet_cfg.keys(), \
             "only support num_layers in {} currently" \
             .format(DarkNet_cfg.keys())
         self.stages = DarkNet_cfg[num_layers]
         self.stages = self.stages[0:5]
-
+        self.num_classes = num_classes
+        self.with_pool = with_pool
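+        # NOTE: num_classes <= 0 together with with_pool=False disables the
+        # classification head, so the network can also serve as a bare
+        # feature extractor (the YOLOv3 model in this change constructs it
+        # with num_classes=-1, with_pool=False).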
""" - def __init__(self, num_layers=53, ch_in=3): + def __init__(self, + num_layers=53, + num_classes=1000, + with_pool=True, + classifier_activation='softmax'): super(DarkNet, self).__init__() assert num_layers in DarkNet_cfg.keys(), \ "only support num_layers in {} currently" \ .format(DarkNet_cfg.keys()) self.stages = DarkNet_cfg[num_layers] self.stages = self.stages[0:5] - + self.num_classes = num_classes + self.with_pool = True + ch_in = 3 self.conv0 = ConvBNLayer( - ch_in=ch_in, - ch_out=32, - filter_size=3, - stride=1, - padding=1) + ch_in=ch_in, ch_out=32, filter_size=3, stride=1, padding=1) - self.downsample0 = DownSample( - ch_in=32, - ch_out=32 * 2) + self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2) self.darknet53_conv_block_list = [] self.downsample_list = [] - ch_in = [64,128,256,512,1024] + ch_in = [64, 128, 256, 512, 1024] for i, stage in enumerate(self.stages): - conv_block = self.add_sublayer( - "stage_%d" % (i), - LayerWarp( - int(ch_in[i]), - 32*(2**i), - stage)) + conv_block = self.add_sublayer("stage_%d" % (i), + LayerWarp( + int(ch_in[i]), 32 * (2**i), + stage)) self.darknet53_conv_block_list.append(conv_block) + for i in range(len(self.stages) - 1): downsample = self.add_sublayer( "stage_%d_downsample" % i, DownSample( - ch_in = 32*(2**(i+1)), - ch_out = 32*(2**(i+2)))) + ch_in=32 * (2**(i + 1)), ch_out=32 * (2**(i + 2)))) self.downsample_list.append(downsample) - def forward(self,inputs): - + if self.with_pool: + self.global_pool = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + if self.num_classes > 0: + stdv = 1.0 / math.sqrt(32 * (2**(i + 2))) + self.fc_input_dim = 32 * (2**(i + 2)) + + self.fc = Linear( + self.fc_input_dim, + num_classes, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + out = self.conv0(inputs) out = self.downsample0(out) - blocks = [] + for i, conv_block_i in enumerate(self.darknet53_conv_block_list): out = conv_block_i(out) - blocks.append(out) if i < len(self.stages) - 1: out = self.downsample_list[i](out) - return blocks[-1:-4:-1] + + if self.with_pool: + out = self.global_pool(out) + if self.num_classes > 0: + out = fluid.layers.reshape(out, shape=[-1, self.fc_input_dim]) + out = self.fc(out) + return out -def _darknet(num_layers=53, input_channels=3, pretrained=True): - model = DarkNet(num_layers, input_channels) +def _darknet(num_layers=53, pretrained=False, **kwargs): + model = DarkNet(num_layers, **kwargs) if pretrained: assert num_layers in pretrain_infos.keys(), \ "DarkNet{} do not have pretrained weights now, " \ @@ -208,7 +226,7 @@ def _darknet(num_layers=53, input_channels=3, pretrained=True): return model -def darknet53(input_channels=3, pretrained=True): +def darknet53(pretrained=False, **kwargs): """DarkNet 53-layer model Args: @@ -216,4 +234,4 @@ def darknet53(input_channels=3, pretrained=True): pretrained (bool): If True, returns a model pre-trained on ImageNet, default True. """ - return _darknet(53, input_channels, pretrained) + return _darknet(53, pretrained, **kwargs) diff --git a/hapi/vision/models/lenet.py b/hapi/vision/models/lenet.py new file mode 100644 index 0000000000000000000000000000000000000000..0f88bc91cb130f1432ecf29e6aae10755be1392d --- /dev/null +++ b/hapi/vision/models/lenet.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
     """
-    return _darknet(53, input_channels, pretrained)
+    return _darknet(53, pretrained, **kwargs)
diff --git a/hapi/vision/models/lenet.py b/hapi/vision/models/lenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f88bc91cb130f1432ecf29e6aae10755be1392d
--- /dev/null
+++ b/hapi/vision/models/lenet.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle.fluid as fluid
+
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
+from paddle.fluid.dygraph.container import Sequential
+from hapi.model import Model
+
+__all__ = ['LeNet']
+
+
+class LeNet(Model):
+    """LeNet model from
+    `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_
+
+    Args:
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the last fc layer
+                           will not be defined. Default: 10.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
+    """
+
+    def __init__(self, num_classes=10, classifier_activation='softmax'):
+        super(LeNet, self).__init__()
+        self.num_classes = num_classes
+        self.features = Sequential(
+            Conv2D(
+                1, 6, 3, stride=1, padding=1),
+            Pool2D(2, 'max', 2),
+            Conv2D(
+                6, 16, 5, stride=1, padding=0),
+            Pool2D(2, 'max', 2))
+
+        if num_classes > 0:
+            self.fc = Sequential(
+                Linear(400, 120),
+                Linear(120, 84),
+                Linear(
+                    84, num_classes, act=classifier_activation))
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.fc(x)
+        return x
diff --git a/hapi/vision/transforms/transforms.py b/hapi/vision/transforms/transforms.py
index 3d974171ce0d6f5a80f2af6a272a4250d771fb4d..87e49862489c1d9284b9d3e6d018e0de2f183bcb 100644
--- a/hapi/vision/transforms/transforms.py
+++ b/hapi/vision/transforms/transforms.py
@@ -64,10 +64,10 @@ class Compose(object):
     def __init__(self, transforms):
         self.transforms = transforms
 
-    def __call__(self, *data):
+    def __call__(self, data):
         for f in self.transforms:
             try:
-                data = f(*data)
+                data = f(data)
             except Exception as e:
                 stack_info = traceback.format_exc()
                 print("fail to perform transform [{}] with error: "
                       "{} and stack:\n{}".format(f, e, str(stack_info)))
@@ -130,8 +130,8 @@ class Resize(object):
         self.size = size
         self.interpolation = interpolation
 
-    def __call__(self, img, lbl):
-        return F.resize(img, self.size, self.interpolation), lbl
+    def __call__(self, img):
+        return F.resize(img, self.size, self.interpolation)
 
 
 class RandomResizedCrop(object):
@@ -193,10 +193,10 @@ class RandomResizedCrop(object):
         y = (height - h) // 2
         return x, y, w, h
 
-    def __call__(self, img, lbl):
+    def __call__(self, img):
         x, y, w, h = self._get_params(img)
         cropped_img = img[y:y + h, x:x + w]
-        return F.resize(cropped_img, self.output_size, self.interpolation), lbl
+        return F.resize(cropped_img, self.output_size, self.interpolation)
 
 
 class CenterCropResize(object):
@@ -224,10 +224,10 @@ class CenterCropResize(object):
         y = (w + 1 - c) // 2
         return c, x, y
 
-    def __call__(self, img, lbl):
+    def __call__(self, img):
         c, x, y = self._get_params(img)
         cropped_img = img[x:x + c, y:y + c, :]
-        return F.resize(cropped_img, self.size, self.interpolation), lbl
+        return F.resize(cropped_img, self.size, self.interpolation)
 
 
 class CenterCrop(object):
@@ -251,10 +251,10 @@ class CenterCrop(object):
         y = int(round((h - th) / 2.0))
         return x, y
 
-    def __call__(self, img, lbl):
+    def __call__(self, img):
         x, y = self._get_params(img)
         th, tw = self.output_size
-        return img[y:y + th, x:x + tw], lbl
+ return img[y:y + th, x:x + tw] class RandomHorizontalFlip(object): @@ -267,10 +267,10 @@ class RandomHorizontalFlip(object): def __init__(self, prob=0.5): self.prob = prob - def __call__(self, img, lbl): + def __call__(self, img): if np.random.random() < self.prob: - return F.flip(img, code=1), lbl - return img, lbl + return F.flip(img, code=1) + return img class RandomVerticalFlip(object): @@ -283,10 +283,10 @@ class RandomVerticalFlip(object): def __init__(self, prob=0.5): self.prob = prob - def __call__(self, img, lbl): + def __call__(self, img): if np.random.random() < self.prob: - return F.flip(img, code=0), lbl - return img, lbl + return F.flip(img, code=0) + return img class Normalize(object): @@ -311,8 +311,8 @@ class Normalize(object): self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1) self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1) - def __call__(self, img, lbl): - return (img - self.mean) / self.std, lbl + def __call__(self, img): + return (img - self.mean) / self.std class Permute(object): @@ -333,12 +333,12 @@ class Permute(object): self.mode = mode self.to_rgb = to_rgb - def __call__(self, img, lbl): + def __call__(self, img): if self.to_rgb: img = img[..., ::-1] if self.mode == "CHW": - return img.transpose((2, 0, 1)), lbl - return img, lbl + return img.transpose((2, 0, 1)) + return img class GaussianNoise(object): @@ -354,11 +354,11 @@ class GaussianNoise(object): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) - def __call__(self, img, lbl): + def __call__(self, img): dtype = img.dtype noise = np.random.normal(self.mean, self.std, img.shape) * 255 img = img + noise.astype(np.float32) - return np.clip(img, 0, 255).astype(dtype), lbl + return np.clip(img, 0, 255).astype(dtype) class BrightnessTransform(object): @@ -374,15 +374,15 @@ class BrightnessTransform(object): raise ValueError("brightness value should be non-negative") self.value = value - def __call__(self, img, lbl): + def __call__(self, img): if self.value == 0: - return img, lbl + return img dtype = img.dtype img = img.astype(np.float32) alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) img = img * alpha - return img.clip(0, 255).astype(dtype), lbl + return img.clip(0, 255).astype(dtype) class ContrastTransform(object): @@ -398,16 +398,16 @@ class ContrastTransform(object): raise ValueError("contrast value should be non-negative") self.value = value - def __call__(self, img, lbl): + def __call__(self, img): if self.value == 0: - return img, lbl + return img dtype = img.dtype img = img.astype(np.float32) alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) img = img * alpha + cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean() * ( 1 - alpha) - return img.clip(0, 255).astype(dtype), lbl + return img.clip(0, 255).astype(dtype) class SaturationTransform(object): @@ -423,9 +423,9 @@ class SaturationTransform(object): raise ValueError("saturation value should be non-negative") self.value = value - def __call__(self, img, lbl): + def __call__(self, img): if self.value == 0: - return img, lbl + return img dtype = img.dtype img = img.astype(np.float32) @@ -433,7 +433,7 @@ class SaturationTransform(object): gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray_img = gray_img[..., np.newaxis] img = img * alpha + gray_img * (1 - alpha) - return img.clip(0, 255).astype(dtype), lbl + return img.clip(0, 255).astype(dtype) class HueTransform(object): @@ -449,9 +449,9 @@ class HueTransform(object): raise ValueError("hue value 
should be in [0.0, 0.5]") self.value = value - def __call__(self, img, lbl): + def __call__(self, img): if self.value == 0: - return img, lbl + return img dtype = img.dtype img = img.astype(np.uint8) @@ -464,7 +464,7 @@ class HueTransform(object): with np.errstate(over="ignore"): h += np.uint8(alpha * 255) hsv_img = cv2.merge([h, s, v]) - return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype), lbl + return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) class ColorJitter(object): @@ -499,5 +499,5 @@ class ColorJitter(object): random.shuffle(transforms) self.transforms = Compose(transforms) - def __call__(self, img, lbl): - return self.transforms(img, lbl) + def __call__(self, img): + return self.transforms(img) diff --git a/mnist.py b/mnist.py index 39f323ac6454ed7dd06359017703401321428611..4e6240c2d5783b820a8f33f3d75064bd1d495693 100644 --- a/mnist.py +++ b/mnist.py @@ -24,10 +24,10 @@ import numpy as np from paddle import fluid from paddle.fluid.optimizer import Momentum from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear -from vision.datasets import MNIST as MnistDataset +from hapi.datasets.mnist import MNIST as MnistDataset -from model import Model, CrossEntropy, Input, set_device -from metrics import Accuracy +from hapi.model import Model, CrossEntropy, Input, set_device +from hapi.metrics import Accuracy class SimpleImgConvPool(fluid.dygraph.Layer): diff --git a/tests/test_model.py b/tests/test_model.py index 7fe414c0c914b561cc78083f1fe89b0c79e77da2..3aea2d1353e2e414d35e9b6714bdb0985d1249c7 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -190,7 +190,8 @@ class TestModel(unittest.TestCase): eval_result = model.evaluate(val_dataset, batch_size=batch_size) - output = model.predict(test_dataset, batch_size=batch_size) + output = model.predict( + test_dataset, batch_size=batch_size, stack_outputs=True) np.testing.assert_equal(output[0].shape[0], len(test_dataset))