def get_dir(devkit_dir, year, type):
    """Return the path of sub-directory ``type`` inside ``VOC<year>``.

    Args:
        devkit_dir: Root directory of the extracted VOCdevkit tree.
        year: Dataset year as a string, e.g. ``'2007'``.
        type: Relative sub-directory name, e.g. ``'Annotations'``.

    Returns:
        ``<devkit_dir>/VOC<year>/<type>`` joined with ``os.path.join``.
    """
    return osp.join(devkit_dir, 'VOC' + year, type)


def _require_file(path):
    """Raise AssertionError when ``path`` is not an existing regular file."""
    # Raised explicitly (instead of using an ``assert`` statement) so the
    # check is not stripped when Python runs with -O.  The exception type is
    # kept identical to what the original ``assert`` produced.
    if not os.path.isfile(path):
        raise AssertionError('file %s not found.' % path)


def walk_dir(devkit_dir, year):
    """Collect (image path, annotation path) pairs for one VOC year.

    Every ``<name>_trainval.txt`` / ``<name>_test.txt`` file found under
    ``ImageSets/Main`` is read; the first whitespace-separated token of each
    line is taken as an image id.  An id is emitted only once, into the list
    of the first file in which it is seen.

    Args:
        devkit_dir: Root directory of the extracted VOCdevkit tree.
        year: Dataset year as a string, e.g. ``'2007'``.

    Returns:
        A ``(trainval_list, test_list)`` tuple; each list holds
        ``(img_path, ann_path)`` tuples.

    Raises:
        AssertionError: If the ``.xml`` or ``.jpg`` file of a listed id is
            missing.
    """
    filelist_dir = get_dir(devkit_dir, year, 'ImageSets/Main')
    annotation_dir = get_dir(devkit_dir, year, 'Annotations')
    img_dir = get_dir(devkit_dir, year, 'JPEGImages')
    trainval_list = []
    test_list = []
    added = set()

    for root, dirs, files in os.walk(filelist_dir):
        # os.walk yields entries in arbitrary order; sorting makes the
        # resulting lists reproducible between runs and machines.
        dirs.sort()
        for fname in sorted(files):
            # Raw strings keep ``\.`` a regex escape; ``$`` rejects names
            # such as ``cat_test.txt.bak``.
            if re.match(r'[a-z]+_trainval\.txt$', fname):
                img_ann_list = trainval_list
            elif re.match(r'[a-z]+_test\.txt$', fname):
                img_ann_list = test_list
            else:
                continue
            # Join with the directory actually being walked so that files
            # in nested directories resolve correctly.
            fpath = osp.join(root, fname)
            with open(fpath) as flist:
                for line in flist:
                    fields = line.split()
                    if not fields:
                        # Blank line: nothing to index.
                        continue
                    name_prefix = fields[0]
                    if name_prefix in added:
                        continue
                    added.add(name_prefix)
                    ann_path = osp.join(annotation_dir, name_prefix + '.xml')
                    img_path = osp.join(img_dir, name_prefix + '.jpg')
                    _require_file(ann_path)
                    _require_file(img_path)
                    img_ann_list.append((img_path, ann_path))

    return trainval_list, test_list


def _write_filelist(path, pairs):
    """Write one ``"<img_path> <ann_path>"`` line per pair to ``path``."""
    with open(path, 'w') as fout:
        fout.writelines(img + ' ' + ann + '\n' for img, ann in pairs)


def prepare_filelist(devkit_dir, years, output_dir):
    """Write ``trainval.txt`` and ``test.txt`` for the given VOC years.

    The train/val pairs of all years are concatenated and shuffled with
    ``random.shuffle``; the test pairs keep their collection order.

    Args:
        devkit_dir: Root directory of the extracted VOCdevkit tree.
        years: Iterable of year strings, e.g. ``['2007', '2012']``.
        output_dir: Existing directory that receives the two list files.

    Raises:
        AssertionError: Propagated from ``walk_dir`` when a listed image or
            annotation file is missing.
    """
    trainval_list = []
    test_list = []
    for year in years:
        trainval, test = walk_dir(devkit_dir, year)
        trainval_list.extend(trainval)
        test_list.extend(test)
    random.shuffle(trainval_list)
    _write_filelist(osp.join(output_dir, 'trainval.txt'), trainval_list)
    _write_filelist(osp.join(output_dir, 'test.txt'), test_list)
def jaccard_overlap(sample_bbox, object_bbox):
    """Return the Jaccard overlap (intersection over union) of two boxes.

    Args:
        sample_bbox: Object with ``xmin``, ``ymin``, ``xmax``, ``ymax``.
        object_bbox: Object with ``xmin``, ``ymin``, ``xmax``, ``ymax``.

    Returns:
        A float; ``0.0`` when the boxes do not overlap or when the union
        has no area.
    """
    if sample_bbox.xmin >= object_bbox.xmax or \
            sample_bbox.xmax <= object_bbox.xmin or \
            sample_bbox.ymin >= object_bbox.ymax or \
            sample_bbox.ymax <= object_bbox.ymin:
        return 0.0
    intersect_width = (min(sample_bbox.xmax, object_bbox.xmax) -
                       max(sample_bbox.xmin, object_bbox.xmin))
    intersect_height = (min(sample_bbox.ymax, object_bbox.ymax) -
                        max(sample_bbox.ymin, object_bbox.ymin))
    intersect_size = intersect_width * intersect_height
    sample_bbox_size = ((sample_bbox.xmax - sample_bbox.xmin) *
                        (sample_bbox.ymax - sample_bbox.ymin))
    object_bbox_size = ((object_bbox.xmax - object_bbox.xmin) *
                        (object_bbox.ymax - object_bbox.ymin))
    union_size = sample_bbox_size + object_bbox_size - intersect_size
    if union_size <= 0:
        # Two crossing zero-area boxes pass the disjointness test above but
        # have an empty union; report no overlap instead of dividing 0 by 0.
        return 0.0
    # float() keeps true division for integer coordinates on Python 2.
    return float(intersect_size) / union_size


def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
    """Tell whether ``sample_bbox`` meets the sampler's overlap limits.

    A bound equal to 0 is treated as "not set".  With both bounds unset the
    sample is always accepted; otherwise it is accepted as soon as one
    labelled box has an overlap inside every bound that is set.

    Args:
        sampler: Object with ``min_jaccard_overlap`` and
            ``max_jaccard_overlap`` attributes.
        sample_bbox: Candidate box.
        bbox_labels: Rows whose items 1..4 are xmin, ymin, xmax, ymax.

    Returns:
        True when the sample is acceptable, False otherwise.
    """
    min_overlap = sampler.min_jaccard_overlap
    max_overlap = sampler.max_jaccard_overlap
    if min_overlap == 0 and max_overlap == 0:
        return True
    for label in bbox_labels:
        object_bbox = bbox(label[1], label[2], label[3], label[4])
        overlap = jaccard_overlap(sample_bbox, object_bbox)
        if min_overlap != 0 and overlap < min_overlap:
            continue
        if max_overlap != 0 and overlap > max_overlap:
            continue
        return True
    return False


def generate_batch_samples(batch_sampler, bbox_labels, image_width,
                           image_height):
    """Draw candidate crop boxes for every sampler in ``batch_sampler``.

    Each sampler gets at most ``max_trial`` attempts and contributes at
    most ``max_sample`` accepted boxes.

    Args:
        batch_sampler: Iterable of sampler objects.
        bbox_labels: Ground-truth rows passed to the constraint check.
        image_width: Unused; kept for interface compatibility.
        image_height: Unused; kept for interface compatibility.

    Returns:
        List of accepted boxes, in the order they were drawn.
    """
    sampled_bbox = []
    for sampler in batch_sampler:
        found = 0
        for _ in range(sampler.max_trial):
            if found >= sampler.max_sample:
                break
            sample_bbox = generate_sample(sampler)
            if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
                sampled_bbox.append(sample_bbox)
                found += 1
    return sampled_bbox


def clip_bbox(src_bbox):
    """Clamp all four coordinates of ``src_bbox`` to [0, 1] in place.

    Returns:
        The same (mutated) ``src_bbox`` object.
    """
    src_bbox.xmin = max(min(src_bbox.xmin, 1.0), 0.0)
    src_bbox.ymin = max(min(src_bbox.ymin, 1.0), 0.0)
    src_bbox.xmax = max(min(src_bbox.xmax, 1.0), 0.0)
    src_bbox.ymax = max(min(src_bbox.ymax, 1.0), 0.0)
    return src_bbox


def meet_emit_constraint(src_bbox, sample_bbox):
    """Return True when the centre of ``src_bbox`` lies in ``sample_bbox``.

    The borders of ``sample_bbox`` count as inside.
    """
    center_x = (src_bbox.xmax + src_bbox.xmin) / 2
    center_y = (src_bbox.ymax + src_bbox.ymin) / 2
    return (sample_bbox.xmin <= center_x <= sample_bbox.xmax and
            sample_bbox.ymin <= center_y <= sample_bbox.ymax)


def transform_labels(bbox_labels, sample_bbox):
    """Re-express ground-truth boxes in the coordinates of ``sample_bbox``.

    Boxes whose centre falls outside the sample are dropped; the remaining
    ones are projected into the sample, clipped to [0, 1] and kept only if
    ``bbox_area`` of the clipped box is positive.

    Args:
        bbox_labels: Rows laid out as
            ``[label, xmin, ymin, xmax, ymax, difficult]``.
        sample_bbox: Crop box the labels are projected into.

    Returns:
        List of rows with the same layout, coordinates relative to the
        sample.  Empty when the sample has no width or no height.
    """
    sample_width = sample_bbox.xmax - sample_bbox.xmin
    sample_height = sample_bbox.ymax - sample_bbox.ymin
    if sample_width <= 0 or sample_height <= 0:
        # Nothing can be projected into an empty crop; this also avoids a
        # division by zero below.
        return []

    sample_labels = []
    for label in bbox_labels:
        object_bbox = bbox(label[1], label[2], label[3], label[4])
        if not meet_emit_constraint(object_bbox, sample_bbox):
            continue
        proj_bbox = clip_bbox(
            bbox((object_bbox.xmin - sample_bbox.xmin) / sample_width,
                 (object_bbox.ymin - sample_bbox.ymin) / sample_height,
                 (object_bbox.xmax - sample_bbox.xmin) / sample_width,
                 (object_bbox.ymax - sample_bbox.ymin) / sample_height))
        if bbox_area(proj_bbox) > 0:
            sample_labels.append([
                label[0],
                float(proj_bbox.xmin),
                float(proj_bbox.ymin),
                float(proj_bbox.xmax),
                float(proj_bbox.ymax),
                label[5],
            ])
    return sample_labels
def conv_bn_layer(input,
                  filter_size,
                  num_filters,
                  stride,
                  padding,
                  channels=None,
                  num_groups=1,
                  act='relu',
                  use_cudnn=True):
    """Apply a bias-free 2-D convolution followed by batch normalization.

    The convolution itself has no activation; ``act`` is applied by the
    batch-norm layer.

    Args:
        input: Input variable of the convolution.
        filter_size: Convolution kernel size.
        num_filters: Number of output channels.
        stride: Convolution stride.
        padding: Convolution padding.
        channels: Unused; kept so existing callers that pass it still work.
        num_groups: Value forwarded as ``groups`` to ``conv2d``.
        act: Activation name handed to ``batch_norm``.
        use_cudnn: Forwarded to ``conv2d``.

    Returns:
        The output variable of the batch-norm layer.
    """
    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        groups=num_groups,
        act=None,
        use_cudnn=use_cudnn,
        param_attr=parameter_attr,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv, act=act)


def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride,
                        scale):
    """Build a 3x3 grouped convolution followed by a 1x1 convolution.

    All channel and group counts are multiplied by ``scale`` and truncated
    to ``int``.

    Args:
        input: Input variable.
        num_filters1: Unscaled output channels of the 3x3 convolution.
        num_filters2: Unscaled output channels of the 1x1 convolution.
        num_groups: Unscaled group count of the 3x3 convolution.
        stride: Stride of the 3x3 convolution (the 1x1 one uses stride 1).
        scale: Width multiplier.

    Returns:
        Output variable of the 1x1 convolution block.
    """
    depthwise_conv = conv_bn_layer(
        input=input,
        filter_size=3,
        num_filters=int(num_filters1 * scale),
        stride=stride,
        padding=1,
        num_groups=int(num_groups * scale),
        use_cudnn=False)

    pointwise_conv = conv_bn_layer(
        input=depthwise_conv,
        filter_size=1,
        num_filters=int(num_filters2 * scale),
        stride=1,
        padding=0)
    return pointwise_conv


def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale):
    """Build a 1x1 convolution followed by a strided 3x3 convolution.

    All channel and group counts are multiplied by ``scale`` and truncated
    to ``int``.

    Args:
        input: Input variable.
        num_filters1: Unscaled output channels of the 1x1 convolution.
        num_filters2: Unscaled output channels of the 3x3 convolution.
        num_groups: Unscaled group count used by both convolutions.
        stride: Stride of the 3x3 convolution.
        scale: Width multiplier.

    Returns:
        Output variable of the 3x3 convolution block.
    """
    pointwise_conv = conv_bn_layer(
        input=input,
        filter_size=1,
        num_filters=int(num_filters1 * scale),
        stride=1,
        num_groups=int(num_groups * scale),
        padding=0)

    # The stride used to be hard-coded to 2 here, silently ignoring the
    # ``stride`` argument.  Every caller in this file passes stride=2, so
    # honouring the argument leaves the existing network unchanged.
    normal_conv = conv_bn_layer(
        input=pointwise_conv,
        filter_size=3,
        num_filters=int(num_filters2 * scale),
        stride=stride,
        num_groups=int(num_groups * scale),
        padding=1)
    return normal_conv
def train(train_file_list,
          val_file_list,
          data_args,
          learning_rate,
          batch_size,
          num_passes,
          model_save_dir='model',
          init_model_path=None):
    """Build the MobileNet-SSD training program and run it.

    The average SSD loss is printed for every batch, and an inference model
    is saved under ``<model_save_dir>/<pass_id>`` on every tenth pass
    (including pass 0).

    Args:
        train_file_list: Path of the list file consumed by ``reader.train``.
        val_file_list: Path of the list file consumed by ``reader.test``.
        data_args: Settings object providing ``resize_h`` / ``resize_w``;
            it is also handed to the readers.
        learning_rate: Base learning rate of the exponential decay.
        batch_size: Number of samples per batch.
        num_passes: Number of passes over the training data.
        model_save_dir: Directory under which inference models are saved.
        init_model_path: Accepted but currently not used by this function.
    """
    image_shape = [3, data_args.resize_h, data_args.resize_w]

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='float32', lod_level=1)

    mbox_locs, mbox_confs, box, box_var = mobile_net(image, image_shape)
    nmsed_out = fluid.layers.detection_output(mbox_locs, mbox_confs, box,
                                              box_var)
    loss = fluid.layers.ssd_loss(mbox_locs, mbox_confs, gt_box, gt_label, box,
                                 box_var)
    avg_loss = fluid.layers.mean(x=loss)

    # ``global_step`` was referenced below without ever being defined, so
    # building the optimizer raised NameError.  Create the step counter
    # explicitly and share it between the decay schedule and the optimizer.
    # NOTE(review): assumes this Fluid release provides
    # ``fluid.layers.create_global_var`` and that the optimizer accepts a
    # ``global_step`` keyword which it increments on every update - confirm
    # against the installed Paddle version.
    global_step = fluid.layers.create_global_var(
        shape=[1], value=0, dtype='float32', force_cpu=True)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.learning_rate_decay.exponential_decay(
            learning_rate=learning_rate,
            global_step=global_step,
            decay_steps=40000,
            decay_rate=0.1,
            staircase=True),
        momentum=0.9,
        global_step=global_step,
        regularization=fluid.regularizer.L2Decay(5 * 1e-5))
    optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    # The validation list was previously ignored and the training list was
    # read here instead.  The reader is built but not consumed by the loop
    # below.
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, gt_box, gt_label])

    for pass_id in range(num_passes):
        for batch_id, data in enumerate(train_reader()):
            avg_loss_v = exe.run(fluid.default_main_program(),
                                 feed=feeder.feed(data),
                                 fetch_list=[avg_loss])
            print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id,
                                                         avg_loss_v[0]))
        if pass_id % 10 == 0:
            model_path = os.path.join(model_save_dir, str(pass_id))
            # Single-argument call form: valid on both Python 2 and 3.
            print('save models to %s' % (model_path))
            fluid.io.save_inference_model(model_path, ['image'], [nmsed_out],
                                          exe)


if __name__ == '__main__':
    data_args = reader.Settings(
        data_dir='./data',
        label_file='label_list',
        resize_h=300,
        resize_w=300,
        mean_value=[104, 117, 124])
    train(
        train_file_list='./data/trainval.txt',
        val_file_list='./data/test.txt',
        data_args=data_args,
        learning_rate=0.001,
        batch_size=32,
        num_passes=300)
+ +import image_util +from paddle.utils.image_util import * +import random +from PIL import Image +import numpy as np +import xml.etree.ElementTree +import os + + +class Settings(object): + def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value): + self._data_dir = data_dir + self._label_list = [] + label_fpath = os.path.join(data_dir, label_file) + for line in open(label_fpath): + self._label_list.append(line.strip()) + + self._resize_height = resize_h + self._resize_width = resize_w + self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype( + 'float32') + + @property + def data_dir(self): + return self._data_dir + + @property + def label_list(self): + return self._label_list + + @property + def resize_h(self): + return self._resize_height + + @property + def resize_w(self): + return self._resize_width + + @property + def img_mean(self): + return self._img_mean + + +def _reader_creator(settings, file_list, mode, shuffle): + def reader(): + with open(file_list) as flist: + lines = [line.strip() for line in flist] + if shuffle: + random.shuffle(lines) + for line in lines: + if mode == 'train' or mode == 'test': + img_path, label_path = line.split() + img_path = os.path.join(settings.data_dir, img_path) + label_path = os.path.join(settings.data_dir, label_path) + elif mode == 'infer': + img_path = os.path.join(settings.data_dir, line) + + img = Image.open(img_path) + img_width, img_height = img.size + img = np.array(img) + + # layout: label | xmin | ymin | xmax | ymax | difficult + if mode == 'train' or mode == 'test': + bbox_labels = [] + root = xml.etree.ElementTree.parse(label_path).getroot() + for object in root.findall('object'): + bbox_sample = [] + # start from 1 + bbox_sample.append( + float( + settings.label_list.index( + object.find('name').text))) + bbox = object.find('bndbox') + difficult = float(object.find('difficult').text) + bbox_sample.append( + float(bbox.find('xmin').text) / img_width) + bbox_sample.append( + 
float(bbox.find('ymin').text) / img_height) + bbox_sample.append( + float(bbox.find('xmax').text) / img_width) + bbox_sample.append( + float(bbox.find('ymax').text) / img_height) + bbox_sample.append(difficult) + bbox_labels.append(bbox_sample) + + sample_labels = bbox_labels + if mode == 'train': + batch_sampler = [] + # hard-code here + batch_sampler.append( + image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, + 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, + 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, + 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, + 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, + 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, + 1.0)) + """ random crop """ + sampled_bbox = image_util.generate_batch_samples( + batch_sampler, bbox_labels, img_width, img_height) + + if len(sampled_bbox) > 0: + idx = int(random.uniform(0, len(sampled_bbox))) + img, sample_labels = image_util.crop_image( + img, bbox_labels, sampled_bbox[idx], img_width, + img_height) + + img = Image.fromarray(img) + img = img.resize((settings.resize_w, settings.resize_h), + Image.ANTIALIAS) + img = np.array(img) + + if mode == 'train': + mirror = int(random.uniform(0, 2)) + if mirror == 1: + img = img[:, ::-1, :] + for i in xrange(len(sample_labels)): + tmp = sample_labels[i][1] + sample_labels[i][1] = 1 - sample_labels[i][3] + sample_labels[i][3] = 1 - tmp + + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + + img = img.astype('float32') + img -= settings.img_mean + img = img.flatten() + + sample_labels = np.array(sample_labels) + if mode == 'train' or mode == 'test': + if mode == 'train' and len(sample_labels) == 0: continue + yield img.astype( + 'float32' + ), sample_labels[:, 
1:5], sample_labels[:, 0].astype('int') + elif mode == 'infer': + yield img.astype('float32') + + return reader + + +def train(settings, file_list, shuffle=True): + return _reader_creator(settings, file_list, 'train', shuffle) + + +def test(settings, file_list): + return _reader_creator(settings, file_list, 'test', False) + + +def infer(settings, file_list): + return _reader_creator(settings, file_list, 'infer', False)