add fluid mobilenet ssd

46b5c460 · gaoyuan · 40cc7e4f · 46b5c460 · 46b5c460 · 46b5c460
5 changed files
--- a/fluid/object_detection/data/label_list
+++ b/fluid/object_detection/data/label_list
+background
+aeroplane
+bicycle
+bird
+boat
+bottle
+bus
+car
+cat
+chair
+cow
+diningtable
+dog
+horse
+motorbike
+person
+pottedplant
+sheep
+sofa
+train
+tvmonitor
--- a/fluid/object_detection/data/prepare_voc_data.py
+++ b/fluid/object_detection/data/prepare_voc_data.py
+import os
+import os.path as osp
+import re
+import random
+
+# Root of the VOC devkit tree that is scanned, relative to the working directory.
+devkit_dir = './VOCdevkit'
+# Dataset years to scan; each one is looked up as <devkit_dir>/VOC<year>.
+years = ['2007', '2012']
+
+
+def get_dir(devkit_dir, year, type):
+    """Return the path of sub-directory `type` inside `<devkit_dir>/VOC<year>`.
+
+    `year` must be a string because it is concatenated onto the 'VOC' prefix.
+    """
+    year_dir = 'VOC' + year
+    return osp.join(devkit_dir, year_dir, type)
+
+
+def walk_dir(devkit_dir, year):
+    """Collect (image, annotation) path pairs for one VOC year.
+
+    Every `<name>_trainval.txt` and `<name>_test.txt` list found under
+    `ImageSets/Main` is read. The first whitespace-separated token of each
+    line is used as the image id. An id is recorded only once per call, in
+    whichever list it is first encountered.
+
+    Args:
+        devkit_dir: root directory of the VOC devkit.
+        year: dataset year as a string, e.g. '2007'.
+
+    Returns:
+        A tuple (trainval_list, test_list); each is a list of
+        (img_path, ann_path) tuples.
+
+    Raises:
+        AssertionError: if the annotation or image file of an id is missing.
+    """
+    filelist_dir = get_dir(devkit_dir, year, 'ImageSets/Main')
+    annotation_dir = get_dir(devkit_dir, year, 'Annotations')
+    img_dir = get_dir(devkit_dir, year, 'JPEGImages')
+    trainval_list = []
+    test_list = []
+    added = set()
+
+    for root, _, files in os.walk(filelist_dir):
+        for fname in files:
+            # Raw strings keep the escaped dot a regex escape, not a string one.
+            if re.match(r'[a-z]+_trainval\.txt', fname):
+                img_ann_list = trainval_list
+            elif re.match(r'[a-z]+_test\.txt', fname):
+                img_ann_list = test_list
+            else:
+                continue
+            # Join with the directory os.walk is visiting so that list files
+            # found in nested directories resolve to the right path.
+            fpath = osp.join(root, fname)
+            # The context manager closes each list file once it has been read.
+            with open(fpath) as flist:
+                for line in flist:
+                    fields = line.split()
+                    if not fields:
+                        # Blank line: there is no image id to read.
+                        continue
+                    name_prefix = fields[0]
+                    if name_prefix in added:
+                        continue
+                    added.add(name_prefix)
+                    ann_path = osp.join(annotation_dir, name_prefix + '.xml')
+                    img_path = osp.join(img_dir, name_prefix + '.jpg')
+                    assert os.path.isfile(ann_path), \
+                        'file %s not found.' % ann_path
+                    assert os.path.isfile(img_path), \
+                        'file %s not found.' % img_path
+                    img_ann_list.append((img_path, ann_path))
+
+    return trainval_list, test_list
+
+
+def prepare_filelist(devkit_dir, years, output_dir):
+    """Write `trainval.txt` and `test.txt` into `output_dir`.
+
+    The pairs returned by `walk_dir` for every year are concatenated. The
+    trainval pairs are shuffled; the test pairs keep their discovery order.
+    Each output line is `<img_path> <ann_path>`.
+    """
+    trainval_list, test_list = [], []
+    for year in years:
+        year_trainval, year_test = walk_dir(devkit_dir, year)
+        trainval_list += year_trainval
+        test_list += year_test
+    random.shuffle(trainval_list)
+
+    outputs = (('trainval.txt', trainval_list), ('test.txt', test_list))
+    for out_name, pairs in outputs:
+        with open(osp.join(output_dir, out_name), 'w') as fout:
+            for img_path, ann_path in pairs:
+                fout.write(img_path + ' ' + ann_path + '\n')
+
+
+# Executed whenever this module is run or imported: writes trainval.txt and
+# test.txt into the current working directory.
+prepare_filelist(devkit_dir, years, '.')
--- a/fluid/object_detection/image_util.py
+++ b/fluid/object_detection/image_util.py
+from PIL import Image
+import numpy as np
+import random
+import math
+
+
+class sampler():
+    """Plain container for the parameters of one random-crop sampler.
+
+    `max_sample` and `max_trial` bound how many boxes are kept and how many
+    draws are attempted; the scale and aspect-ratio ranges drive the random
+    box generation; the jaccard-overlap bounds are the acceptance criteria
+    (a bound of 0 disables that side of the check).
+    """
+
+    def __init__(self, max_sample, max_trial, min_scale, max_scale,
+                 min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap,
+                 max_jaccard_overlap):
+        # Sampling budget.
+        self.max_sample, self.max_trial = max_sample, max_trial
+        # Range of the box scale.
+        self.min_scale, self.max_scale = min_scale, max_scale
+        # Range of the box aspect ratio.
+        self.min_aspect_ratio, self.max_aspect_ratio = (min_aspect_ratio,
+                                                        max_aspect_ratio)
+        # Accepted overlap range against the ground-truth boxes.
+        self.min_jaccard_overlap, self.max_jaccard_overlap = (
+            min_jaccard_overlap, max_jaccard_overlap)
+
+
+class bbox():
+    """Axis-aligned box described by its (xmin, ymin) and (xmax, ymax) corners."""
+
+    def __init__(self, xmin, ymin, xmax, ymax):
+        self.xmin, self.ymin = xmin, ymin
+        self.xmax, self.ymax = xmax, ymax
+
+
+def bbox_area(src_bbox):
+    """Return (xmax - xmin) * (ymax - ymin) of `src_bbox`.
+
+    No clamping is applied, so an inverted box yields a negative value.
+    """
+    return (src_bbox.xmax - src_bbox.xmin) * (src_bbox.ymax - src_bbox.ymin)
+
+
+def generate_sample(sampler):
+    """Draw one random crop box from the ranges stored in `sampler`.
+
+    A scale is drawn first, then an aspect ratio restricted to
+    [scale**2, 1 / scale**2] intersected with the sampler's own range, then
+    the top-left corner so that the box ends at or before coordinate 1.
+
+    Returns:
+        A `bbox` holding the sampled corners.
+    """
+    scale = random.uniform(sampler.min_scale, sampler.max_scale)
+    squared = scale**2.0
+    ratio_low = max(sampler.min_aspect_ratio, squared)
+    ratio_high = min(sampler.max_aspect_ratio, 1 / squared)
+    aspect_ratio = random.uniform(ratio_low, ratio_high)
+
+    ratio_root = aspect_ratio**0.5
+    width = scale * ratio_root
+    height = scale / ratio_root
+
+    # The corner is drawn from [0, 1 - extent] on each axis.
+    left = random.uniform(0, 1 - width)
+    top = random.uniform(0, 1 - height)
+    return bbox(left, top, left + width, top + height)
+
+
+def jaccard_overlap(sample_bbox, object_bbox):
+    """Return intersection area divided by union area of the two boxes.
+
+    Boxes that are disjoint, or that only touch along an edge, give 0.
+    """
+    disjoint_x = (sample_bbox.xmin >= object_bbox.xmax or
+                  sample_bbox.xmax <= object_bbox.xmin)
+    disjoint_y = (sample_bbox.ymin >= object_bbox.ymax or
+                  sample_bbox.ymax <= object_bbox.ymin)
+    if disjoint_x or disjoint_y:
+        return 0
+
+    inter_width = (min(sample_bbox.xmax, object_bbox.xmax) -
+                   max(sample_bbox.xmin, object_bbox.xmin))
+    inter_height = (min(sample_bbox.ymax, object_bbox.ymax) -
+                    max(sample_bbox.ymin, object_bbox.ymin))
+    intersection = inter_width * inter_height
+    union = bbox_area(sample_bbox) + bbox_area(object_bbox) - intersection
+    return intersection / union
+
+
+def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
+    """Tell whether `sample_bbox` is acceptable for `sampler`.
+
+    With both overlap bounds equal to 0 every box is accepted. Otherwise the
+    box is accepted as soon as one labelled object has a jaccard overlap that
+    respects every non-zero bound. Columns 1-4 of each `bbox_labels` row are
+    read as xmin, ymin, xmax, ymax.
+    """
+    min_overlap = sampler.min_jaccard_overlap
+    max_overlap = sampler.max_jaccard_overlap
+    if min_overlap == 0 and max_overlap == 0:
+        return True
+
+    for label in bbox_labels:
+        object_bbox = bbox(label[1], label[2], label[3], label[4])
+        overlap = jaccard_overlap(sample_bbox, object_bbox)
+        too_small = min_overlap != 0 and overlap < min_overlap
+        too_large = max_overlap != 0 and overlap > max_overlap
+        if not (too_small or too_large):
+            return True
+    return False
+
+
+def generate_batch_samples(batch_sampler, bbox_labels, image_width,
+                           image_height):
+    """Collect candidate crop boxes from every sampler in `batch_sampler`.
+
+    Each sampler draws up to `max_trial` random boxes and keeps those that
+    pass `satisfy_sample_constraint`, stopping once `max_sample` were kept.
+
+    Args:
+        batch_sampler: iterable of sampler objects.
+        bbox_labels: ground-truth rows forwarded to the constraint check.
+        image_width: unused; kept for interface compatibility.
+        image_height: unused; kept for interface compatibility.
+
+    Returns:
+        List of accepted boxes, in sampler order.
+    """
+    sampled_bbox = []
+    for sampler in batch_sampler:
+        found = 0
+        for _ in range(sampler.max_trial):
+            if found >= sampler.max_sample:
+                break
+            sample_bbox = generate_sample(sampler)
+            if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
+                sampled_bbox.append(sample_bbox)
+                found += 1
+    return sampled_bbox
+
+
+def clip_bbox(src_bbox):
+    """Clamp the four coordinates of `src_bbox` to [0, 1] in place.
+
+    Returns the same (mutated) object that was passed in.
+    """
+    for attr in ('xmin', 'ymin', 'xmax', 'ymax'):
+        value = getattr(src_bbox, attr)
+        setattr(src_bbox, attr, max(min(value, 1.0), 0.0))
+    return src_bbox
+
+
+def meet_emit_constraint(src_bbox, sample_bbox):
+    """Return True when the center of `src_bbox` lies inside `sample_bbox`.
+
+    The edges of `sample_bbox` count as inside.
+    """
+    center_x = (src_bbox.xmax + src_bbox.xmin) / 2
+    center_y = (src_bbox.ymax + src_bbox.ymin) / 2
+    if not (sample_bbox.xmin <= center_x <= sample_bbox.xmax):
+        return False
+    if not (sample_bbox.ymin <= center_y <= sample_bbox.ymax):
+        return False
+    return True
+
+
+def transform_labels(bbox_labels, sample_bbox):
+    """Re-express ground-truth boxes relative to the crop `sample_bbox`.
+
+    Rows of `bbox_labels` are read as
+    [label, xmin, ymin, xmax, ymax, difficult]. An object is kept only when
+    its center falls inside the crop and its projected box, after clamping
+    to [0, 1], still has positive area.
+
+    Returns:
+        List of rows in the same layout with coordinates relative to the crop.
+    """
+    sample_labels = []
+    for label in bbox_labels:
+        object_bbox = bbox(label[1], label[2], label[3], label[4])
+        if not meet_emit_constraint(object_bbox, sample_bbox):
+            continue
+
+        sample_width = sample_bbox.xmax - sample_bbox.xmin
+        sample_height = sample_bbox.ymax - sample_bbox.ymin
+        # Shift to the crop origin, rescale by the crop size, then clamp.
+        proj_bbox = clip_bbox(
+            bbox((object_bbox.xmin - sample_bbox.xmin) / sample_width,
+                 (object_bbox.ymin - sample_bbox.ymin) / sample_height,
+                 (object_bbox.xmax - sample_bbox.xmin) / sample_width,
+                 (object_bbox.ymax - sample_bbox.ymin) / sample_height))
+
+        if bbox_area(proj_bbox) > 0:
+            sample_labels.append([
+                label[0],
+                float(proj_bbox.xmin),
+                float(proj_bbox.ymin),
+                float(proj_bbox.xmax),
+                float(proj_bbox.ymax),
+                label[5],
+            ])
+    return sample_labels
+
+
+def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
+    """Crop `img` to `sample_bbox` and remap the labels to the crop.
+
+    `sample_bbox` is clamped to [0, 1] in place, scaled by the image size and
+    truncated to integer pixel bounds. `img` is sliced as [rows, columns].
+
+    Returns:
+        (cropped image, labels relative to the crop).
+    """
+    clipped = clip_bbox(sample_bbox)
+    left, right = (int(clipped.xmin * image_width),
+                   int(clipped.xmax * image_width))
+    top, bottom = (int(clipped.ymin * image_height),
+                   int(clipped.ymax * image_height))
+    return img[top:bottom, left:right], transform_labels(bbox_labels, clipped)
--- a/fluid/object_detection/mobilenet_ssd_fluid.py
+++ b/fluid/object_detection/mobilenet_ssd_fluid.py
+import os
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+import reader
+
+# Parameter attribute (MSRA initializer) shared by every convolution that is
+# built through conv_bn_layer.
+parameter_attr = ParamAttr(initializer=MSRA())
+
+
+def conv_bn_layer(input,
+                  filter_size,
+                  num_filters,
+                  stride,
+                  padding,
+                  channels=None,
+                  num_groups=1,
+                  act='relu',
+                  use_cudnn=True):
+    """Bias-free convolution followed by batch normalization.
+
+    The activation `act` is applied by the batch-norm layer, not by the
+    convolution. `channels` is accepted but not used.
+    """
+    conv_kwargs = dict(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=padding,
+        groups=num_groups,
+        act=None,
+        use_cudnn=use_cudnn,
+        param_attr=parameter_attr,
+        bias_attr=False)
+    conv_out = fluid.layers.conv2d(**conv_kwargs)
+    return fluid.layers.batch_norm(input=conv_out, act=act)
+
+
+def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride,
+                        scale):
+    """Grouped 3x3 conv+BN followed by a 1x1 conv+BN.
+
+    `num_filters1`, `num_filters2` and `num_groups` are multiplied by `scale`
+    and truncated to int. `stride` applies to the 3x3 convolution only, which
+    is built with cuDNN disabled.
+    """
+    dw_filters = int(num_filters1 * scale)
+    dw_groups = int(num_groups * scale)
+    pw_filters = int(num_filters2 * scale)
+
+    # 3x3 grouped convolution.
+    dw_out = conv_bn_layer(
+        input=input,
+        filter_size=3,
+        num_filters=dw_filters,
+        stride=stride,
+        padding=1,
+        num_groups=dw_groups,
+        use_cudnn=False)
+
+    # 1x1 convolution mixing the channels.
+    return conv_bn_layer(
+        input=dw_out,
+        filter_size=1,
+        num_filters=pw_filters,
+        stride=1,
+        padding=0)
+
+
+def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale):
+    """1x1 conv+BN followed by a strided 3x3 conv+BN.
+
+    Args:
+        input: input feature map.
+        num_filters1: output channels of the 1x1 conv, before scaling.
+        num_filters2: output channels of the 3x3 conv, before scaling.
+        num_groups: group count of both convolutions, before scaling.
+        stride: stride of the 3x3 convolution.
+        scale: multiplier applied to the filter and group counts.
+
+    Returns:
+        Output of the 3x3 conv+BN layer.
+    """
+    # A scale below 1 would truncate a group count of 1 down to 0, which is
+    # not a valid number of groups; never go below one group.
+    groups = max(1, int(num_groups * scale))
+
+    pointwise_conv = conv_bn_layer(
+        input=input,
+        filter_size=1,
+        num_filters=int(num_filters1 * scale),
+        stride=1,
+        num_groups=groups,
+        padding=0)
+
+    # Honor the caller's stride instead of a hard-coded 2; every caller in
+    # this file passes stride=2, so their result is unchanged.
+    normal_conv = conv_bn_layer(
+        input=pointwise_conv,
+        filter_size=3,
+        num_filters=int(num_filters2 * scale),
+        stride=stride,
+        num_groups=groups,
+        padding=1)
+    return normal_conv
+
+
+def mobile_net(img, img_shape, scale=1.0):
+    """Build the MobileNet backbone, the extra blocks and the multi-box head.
+
+    Args:
+        img: input image variable.
+        img_shape: image shape; element 2 is passed as the head's `base_size`.
+        scale: multiplier applied to the filter counts.
+
+    Returns:
+        (mbox_locs, mbox_confs, box, box_var) as produced by
+        `fluid.layers.multi_box_head`.
+
+    The spatial sizes in the comments below assume a 300x300 input.
+    """
+    # 300x300
+    tmp = conv_bn_layer(
+        img,
+        filter_size=3,
+        channels=3,
+        num_filters=int(32 * scale),
+        stride=2,
+        padding=1)
+
+    # (num_filters1, num_filters2, num_groups, stride), in build order.
+    backbone_cfg = [
+        (32, 64, 32, 1),  # 150x150
+        (64, 128, 64, 2),
+        (128, 128, 128, 1),  # 75x75
+        (128, 256, 128, 2),
+        (256, 256, 256, 1),  # 38x38
+        (256, 512, 256, 2),
+    ] + [(512, 512, 512, 1)] * 5  # 19x19
+    for filters1, filters2, groups, stride in backbone_cfg:
+        tmp = depthwise_separable(
+            tmp,
+            num_filters1=filters1,
+            num_filters2=filters2,
+            num_groups=groups,
+            stride=stride,
+            scale=scale)
+    module11 = tmp
+
+    tmp = depthwise_separable(
+        tmp,
+        num_filters1=512,
+        num_filters2=1024,
+        num_groups=512,
+        stride=2,
+        scale=scale)
+
+    # 10x10
+    module13 = depthwise_separable(
+        tmp,
+        num_filters1=1024,
+        num_filters2=1024,
+        num_groups=1024,
+        stride=1,
+        scale=scale)
+
+    # Extra blocks are chained, each one feeding on the previous feature map
+    # (original size comments: 5x5, 3x3, 2x2 after the 2nd, 3rd, 4th block).
+    feature_maps = [module11, module13]
+    extra_cfg = [(256, 512), (128, 256), (128, 256), (64, 128)]
+    for filters1, filters2 in extra_cfg:
+        feature_maps.append(
+            extra_block(
+                feature_maps[-1],
+                num_filters1=filters1,
+                num_filters2=filters2,
+                num_groups=1,
+                stride=2,
+                scale=scale))
+
+    mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head(
+        inputs=feature_maps,
+        image=img,
+        num_classes=21,
+        min_ratio=20,
+        max_ratio=90,
+        aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2., 3.], [2., 3.]],
+        base_size=img_shape[2],
+        offset=0.5,
+        flip=True,
+        clip=True)
+
+    return mbox_locs, mbox_confs, box, box_var
+
+
+def train(train_file_list,
+          val_file_list,
+          data_args,
+          learning_rate,
+          batch_size,
+          num_passes,
+          model_save_dir='model',
+          init_model_path=None):
+    """Build the MobileNet-SSD training program and run it.
+
+    Args:
+        train_file_list: list file consumed by the training reader.
+        val_file_list: list file consumed by the test reader.
+        data_args: `reader.Settings` carrying the resize shape and data dir.
+        learning_rate: initial learning rate of the exponential decay.
+        batch_size: number of samples per batch.
+        num_passes: number of passes over the training data.
+        model_save_dir: directory under which inference models are saved.
+        init_model_path: accepted but currently unused.
+
+    An inference model is saved every 10th pass (pass 0 included) under
+    `<model_save_dir>/<pass_id>`.
+    """
+    image_shape = [3, data_args.resize_h, data_args.resize_w]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    # NOTE(review): the reader yields integer labels while this input is
+    # declared float32 -- confirm which dtype ssd_loss expects.
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='float32', lod_level=1)
+
+    mbox_locs, mbox_confs, box, box_var = mobile_net(image, image_shape)
+    nmsed_out = fluid.layers.detection_output(mbox_locs, mbox_confs, box,
+                                              box_var)
+    loss = fluid.layers.ssd_loss(mbox_locs, mbox_confs, gt_box, gt_label, box,
+                                 box_var)
+    avg_loss = fluid.layers.mean(x=loss)
+
+    # The decay schedule needs a step-counter variable; it was referenced
+    # here without ever being defined (NameError).
+    # NOTE(review): assumes the installed fluid provides create_global_var and
+    # that the optimizer accepts `global_step` to increment it on every
+    # minimize step -- confirm against the fluid version in use.
+    global_step = fluid.layers.create_global_var(
+        shape=[1], value=0.0, dtype='float32', persistable=True)
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=fluid.learning_rate_decay.exponential_decay(
+            learning_rate=learning_rate,
+            global_step=global_step,
+            decay_steps=40000,
+            decay_rate=0.1,
+            staircase=True),
+        momentum=0.9,
+        global_step=global_step,
+        regularization=fluid.regularizer.L2Decay(5 * 1e-5))
+    opts = optimizer.minimize(avg_loss)
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = paddle.batch(
+        reader.train(data_args, train_file_list), batch_size=batch_size)
+    # Evaluation data comes from the validation list, not the training list.
+    # The reader is created here but not consumed by the loop below.
+    test_reader = paddle.batch(
+        reader.test(data_args, val_file_list), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, gt_box, gt_label])
+
+    for pass_id in range(num_passes):
+        for batch_id, data in enumerate(train_reader()):
+            avg_loss_v = exe.run(fluid.default_main_program(),
+                                 feed=feeder.feed(data),
+                                 fetch_list=[avg_loss])
+            print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id,
+                                                         avg_loss_v[0]))
+        if pass_id % 10 == 0:
+            model_path = os.path.join(model_save_dir, str(pass_id))
+            # Function-call form prints the same text on Python 2 and 3.
+            print('save models to %s' % (model_path))
+            fluid.io.save_inference_model(model_path, ['image'], [nmsed_out],
+                                          exe)
+
+
+# Script entry point: configure the reader for 300x300 inputs and start
+# training from the list files under ./data.
+if __name__ == '__main__':
+    data_args = reader.Settings(
+        data_dir='./data',
+        label_file='label_list',
+        resize_h=300,
+        resize_w=300,
+        # Per-channel mean that the reader subtracts from every image.
+        mean_value=[104, 117, 124])
+    train(
+        train_file_list='./data/trainval.txt',
+        val_file_list='./data/test.txt',
+        data_args=data_args,
+        learning_rate=0.001,
+        batch_size=32,
+        num_passes=300)
--- a/fluid/object_detection/reader.py
+++ b/fluid/object_detection/reader.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import image_util
+from paddle.utils.image_util import *
+import random
+from PIL import Image
+import numpy as np
+import xml.etree.ElementTree
+import os
+
+
+class Settings(object):
+    """Read-only configuration shared by the data readers.
+
+    Args:
+        data_dir: directory that list-file entries are relative to.
+        label_file: file inside `data_dir` with one class name per line; the
+            line number (from 0) is the class index.
+        resize_h: height images are resized to.
+        resize_w: width images are resized to.
+        mean_value: per-channel mean, stored as a float32 array of shape
+            (len(mean_value), 1, 1).
+    """
+
+    def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value):
+        self._data_dir = data_dir
+        label_fpath = os.path.join(data_dir, label_file)
+        # The context manager closes the label file once it has been read.
+        with open(label_fpath) as flabel:
+            self._label_list = [line.strip() for line in flabel]
+
+        self._resize_height = resize_h
+        self._resize_width = resize_w
+        self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
+            'float32')
+
+    @property
+    def data_dir(self):
+        return self._data_dir
+
+    @property
+    def label_list(self):
+        return self._label_list
+
+    @property
+    def resize_h(self):
+        return self._resize_height
+
+    @property
+    def resize_w(self):
+        return self._resize_width
+
+    @property
+    def img_mean(self):
+        return self._img_mean
+
+
+def _load_bbox_labels(settings, label_path, img_width, img_height):
+    """Parse one annotation XML file into normalized box rows.
+
+    Each row is [label, xmin, ymin, xmax, ymax, difficult]; the label is the
+    index of the object name in `settings.label_list` and the coordinates are
+    divided by the image width/height.
+    """
+    bbox_labels = []
+    root = xml.etree.ElementTree.parse(label_path).getroot()
+    for obj in root.findall('object'):
+        bndbox = obj.find('bndbox')
+        bbox_labels.append([
+            float(settings.label_list.index(obj.find('name').text)),
+            float(bndbox.find('xmin').text) / img_width,
+            float(bndbox.find('ymin').text) / img_height,
+            float(bndbox.find('xmax').text) / img_width,
+            float(bndbox.find('ymax').text) / img_height,
+            float(obj.find('difficult').text),
+        ])
+    return bbox_labels
+
+
+def _train_batch_samplers():
+    """Return the hard-coded crop samplers used for training augmentation."""
+    # Whole image: scale 1, aspect ratio 1, no overlap constraint.
+    samplers = [image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0)]
+    # Random crops with an increasing minimum overlap.
+    for min_overlap in (0.1, 0.3, 0.5, 0.7, 0.9):
+        samplers.append(
+            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, min_overlap, 0.0))
+    # Random crops constrained by a maximum overlap of 1.0 only.
+    samplers.append(image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0))
+    return samplers
+
+
+def _reader_creator(settings, file_list, mode, shuffle):
+    """Create a sample generator over the entries of `file_list`.
+
+    Args:
+        settings: `Settings` with data dir, label list, resize shape and mean.
+        file_list: text file; each line is `<image> <annotation>` in
+            'train'/'test' mode, or just `<image>` in 'infer' mode.
+        mode: one of 'train', 'test', 'infer'.
+        shuffle: whether to shuffle the entries on every pass.
+
+    Returns:
+        A no-argument generator function. In 'train'/'test' mode it yields
+        (flattened CHW float32 image, boxes[N, 4], int labels[N]); in 'infer'
+        mode it yields the flattened image only. Samples left without any
+        box are skipped.
+
+    Raises:
+        ValueError: if `mode` is not one of the supported values.
+    """
+    if mode not in ('train', 'test', 'infer'):
+        raise ValueError('unsupported reader mode: %s' % mode)
+    with_labels = mode != 'infer'
+
+    def reader():
+        # Read the whole list up front so the file is not held open while the
+        # generator is suspended between yields.
+        with open(file_list) as flist:
+            lines = [line.strip() for line in flist]
+        if shuffle:
+            random.shuffle(lines)
+
+        for line in lines:
+            if not line:
+                continue
+            if with_labels:
+                img_path, label_path = line.split()
+                img_path = os.path.join(settings.data_dir, img_path)
+                label_path = os.path.join(settings.data_dir, label_path)
+            else:
+                img_path = os.path.join(settings.data_dir, line)
+
+            img = Image.open(img_path)
+            # Mirroring and mean subtraction below need three channels.
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+            img_width, img_height = img.size
+            img = np.array(img)
+
+            # layout: label | xmin | ymin | xmax | ymax | difficult
+            sample_labels = []
+            if with_labels:
+                bbox_labels = _load_bbox_labels(settings, label_path,
+                                                img_width, img_height)
+                sample_labels = bbox_labels
+                if mode == 'train':
+                    # Random crop.
+                    sampled_bbox = image_util.generate_batch_samples(
+                        _train_batch_samplers(), bbox_labels, img_width,
+                        img_height)
+                    if len(sampled_bbox) > 0:
+                        # random.uniform may return its upper bound, so clamp
+                        # the index to the last valid position.
+                        idx = min(
+                            int(random.uniform(0, len(sampled_bbox))),
+                            len(sampled_bbox) - 1)
+                        img, sample_labels = image_util.crop_image(
+                            img, bbox_labels, sampled_bbox[idx], img_width,
+                            img_height)
+
+            img = Image.fromarray(img)
+            img = img.resize((settings.resize_w, settings.resize_h),
+                             Image.ANTIALIAS)
+            img = np.array(img)
+
+            if mode == 'train' and int(random.uniform(0, 2)) == 1:
+                # Horizontal flip of the image and of the box x-coordinates.
+                img = img[:, ::-1, :]
+                for label in sample_labels:
+                    label[1], label[3] = 1 - label[3], 1 - label[1]
+
+            # HWC -> CHW, then subtract the per-channel mean.
+            img = img.transpose((2, 0, 1)).astype('float32')
+            img -= settings.img_mean
+            img = img.flatten()
+
+            if not with_labels:
+                yield img
+                continue
+            if len(sample_labels) == 0:
+                # Nothing to slice boxes/labels from; skip the sample.
+                continue
+            sample_labels = np.array(sample_labels)
+            yield img, sample_labels[:, 1:5], sample_labels[:, 0].astype('int')
+
+    return reader
+
+
+def train(settings, file_list, shuffle=True):
+    """Return a reader over `file_list` in 'train' mode."""
+    return _reader_creator(
+        settings=settings, file_list=file_list, mode='train', shuffle=shuffle)
+
+
+def test(settings, file_list):
+    """Return an unshuffled reader over `file_list` in 'test' mode."""
+    return _reader_creator(
+        settings=settings, file_list=file_list, mode='test', shuffle=False)
+
+
+def infer(settings, file_list):
+    """Return an unshuffled reader over `file_list` in 'infer' mode."""
+    return _reader_creator(
+        settings=settings, file_list=file_list, mode='infer', shuffle=False)