arrange_sample.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# function:
#    operators to process sample,
#    eg: decode/resize/crop image

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import numpy as np
from .operators import BaseOperator, register_op

logger = logging.getLogger(__name__)


@register_op
class ArrangeRCNN(BaseOperator):
    """
    Transform dict to tuple format needed for training.

    Args:
        is_mask (bool): whether to use include mask data
    """

    def __init__(self, is_mask=False):
        super(ArrangeRCNN, self).__init__()
        self.is_mask = is_mask
        assert isinstance(self.is_mask, bool), "wrong type for is_mask"

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing following items
                (image, im_info, im_id, gt_bbox, gt_class, is_crowd, gt_masks)
        """
        im = sample['image']
        gt_bbox = sample['gt_bbox']
        gt_class = sample['gt_class']
        keys = list(sample.keys())
        if 'is_crowd' in keys:
            is_crowd = sample['is_crowd']
        else:
            raise KeyError("The dataset doesn't have 'is_crowd' key.")
        if 'im_info' in keys:
            im_info = sample['im_info']
        else:
            raise KeyError("The dataset doesn't have 'im_info' key.")
        im_id = sample['im_id']

        outs = (im, im_info, im_id, gt_bbox, gt_class, is_crowd)
        gt_masks = []
        if self.is_mask and len(sample['gt_poly']) != 0 \
                and 'is_crowd' in keys:
            valid = True
            segms = sample['gt_poly']
            assert len(segms) == is_crowd.shape[0]
            for i in range(len(sample['gt_poly'])):
                segm, iscrowd = segms[i], is_crowd[i]
                gt_segm = []
                if iscrowd:
                    gt_segm.append([[0, 0]])
                else:
                    for poly in segm:
                        if len(poly) == 0:
                            valid = False
                            break
                        gt_segm.append(np.array(poly).reshape(-1, 2))
                if (not valid) or len(gt_segm) == 0:
                    break
                gt_masks.append(gt_segm)
            outs = outs + (gt_masks, )
        return outs


@register_op
class ArrangeEvalRCNN(BaseOperator):
    """
    Transform dict to the tuple format needed for evaluation.
    """

    def __init__(self):
        super(ArrangeEvalRCNN, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_info, im_id, im_shape, gt_bbox,
                    gt_class, difficult)
        """
        im = sample['image']
        keys = list(sample.keys())
        if 'im_info' in keys:
            im_info = sample['im_info']
        else:
            raise KeyError("The dataset doesn't have 'im_info' key.")
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        # For rcnn models in eval and infer stage, original image size
        # is needed to clip the bounding boxes. And box clip op in
        # bbox prediction needs im_info as input in format of [N, 3],
        # so im_shape is appended by 1 to match dimension.
        im_shape = np.array((h, w, 1), dtype=np.float32)
        gt_bbox = sample['gt_bbox']
        gt_class = sample['gt_class']
        difficult = sample['difficult']
        outs = (im, im_info, im_id, im_shape, gt_bbox, gt_class, difficult)
        return outs


@register_op
class ArrangeTestRCNN(BaseOperator):
    """
    Transform dict to the tuple format needed for training.
    """

    def __init__(self):
        super(ArrangeTestRCNN, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, im_info, im_id)
        """
        im = sample['image']
        keys = list(sample.keys())
        if 'im_info' in keys:
            im_info = sample['im_info']
        else:
            raise KeyError("The dataset doesn't have 'im_info' key.")
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        # For rcnn models in eval and infer stage, original image size
        # is needed to clip the bounding boxes. And box clip op in
        # bbox prediction needs im_info as input in format of [N, 3],
        # so im_shape is appended by 1 to match dimension.
        im_shape = np.array((h, w, 1), dtype=np.float32)
        outs = (im, im_info, im_id, im_shape)
        return outs


@register_op
class ArrangeSSD(BaseOperator):
    """
    Transform dict to tuple format needed for training.
    """

    def __init__(self):
        super(ArrangeSSD, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                    (image, gt_bbox, gt_class, difficult)
        """
        im = sample['image']
        gt_bbox = sample['gt_bbox']
        gt_class = sample['gt_class']
        outs = (im, gt_bbox, gt_class)
        return outs


@register_op
class ArrangeEvalSSD(BaseOperator):
    """
    Transform dict to tuple format needed for training.
    """

    def __init__(self, fields):
        super(ArrangeEvalSSD, self).__init__()
        self.fields = fields

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items: (image)
        """
        outs = []
        if len(sample['gt_bbox']) != len(sample['gt_class']):
            raise ValueError("gt num mismatch: bbox and class.")
        for field in self.fields:
            if field == 'im_shape':
                h = sample['h']
                w = sample['w']
                im_shape = np.array((h, w))
                outs.append(im_shape)
            elif field == 'is_difficult':
                outs.append(sample['difficult'])
            elif field == 'gt_box':
                outs.append(sample['gt_bbox'])
            elif field == 'gt_label':
                outs.append(sample['gt_class'])
            else:
                outs.append(sample[field])

        outs = tuple(outs)

        return outs


@register_op
class ArrangeTestSSD(BaseOperator):
    """
    Transform dict to tuple format needed for training.

    Args:
        is_mask (bool): whether to use include mask data
    """

    def __init__(self):
        super(ArrangeTestSSD, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items: (image)
        """
        im = sample['image']
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        im_shape = np.array((h, w))
        outs = (im, im_id, im_shape)
        return outs


@register_op
class ArrangeYOLO(BaseOperator):
    """
    Transform dict to the tuple format needed for training.
    """

    def __init__(self):
        super(ArrangeYOLO, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                (image, gt_bbox, gt_class, gt_score,
                 is_crowd, im_info, gt_masks)
        """
        im = sample['image']
        if len(sample['gt_bbox']) != len(sample['gt_class']):
            raise ValueError("gt num mismatch: bbox and class.")
        if len(sample['gt_bbox']) != len(sample['gt_score']):
            raise ValueError("gt num mismatch: bbox and score.")
        gt_bbox = np.zeros((50, 4), dtype=im.dtype)
        gt_class = np.zeros((50, ), dtype=np.int32)
        gt_score = np.zeros((50, ), dtype=im.dtype)
        gt_num = min(50, len(sample['gt_bbox']))
        if gt_num > 0:
            gt_bbox[:gt_num, :] = sample['gt_bbox'][:gt_num, :]
            gt_class[:gt_num] = sample['gt_class'][:gt_num, 0]
            gt_score[:gt_num] = sample['gt_score'][:gt_num, 0]
        # parse [x1, y1, x2, y2] to [x, y, w, h]
        gt_bbox[:, 2:4] = gt_bbox[:, 2:4] - gt_bbox[:, :2]
        gt_bbox[:, :2] = gt_bbox[:, :2] + gt_bbox[:, 2:4] / 2.
        outs = (im, gt_bbox, gt_class, gt_score)
        return outs


@register_op
class ArrangeEvalYOLO(BaseOperator):
    """
    Transform dict to the tuple format needed for evaluation.
    """

    def __init__(self):
        super(ArrangeEvalYOLO, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                (image, im_shape, im_id, gt_bbox, gt_class,
                 difficult)
        """
        im = sample['image']
        if len(sample['gt_bbox']) != len(sample['gt_class']):
            raise ValueError("gt num mismatch: bbox and class.")
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        im_shape = np.array((h, w))
        gt_bbox = np.zeros((50, 4), dtype=im.dtype)
        gt_class = np.zeros((50, ), dtype=np.int32)
        difficult = np.zeros((50, ), dtype=np.int32)
        gt_num = min(50, len(sample['gt_bbox']))
        if gt_num > 0:
            gt_bbox[:gt_num, :] = sample['gt_bbox'][:gt_num, :]
            gt_class[:gt_num] = sample['gt_class'][:gt_num, 0]
            difficult[:gt_num] = sample['difficult'][:gt_num, 0]
        outs = (im, im_shape, im_id, gt_bbox, gt_class, difficult)
        return outs


@register_op
class ArrangeTestYOLO(BaseOperator):
    """
    Transform dict to the tuple format needed for inference.
    """

    def __init__(self):
        super(ArrangeTestYOLO, self).__init__()

    def __call__(self, sample, context=None):
        """
        Args:
            sample: a dict which contains image
                    info and annotation info.
            context: a dict which contains additional info.
        Returns:
            sample: a tuple containing the following items:
                (image, gt_bbox, gt_class, gt_score, is_crowd,
                 im_info, gt_masks)
        """
        im = sample['image']
        im_id = sample['im_id']
        h = sample['h']
        w = sample['w']
        im_shape = np.array((h, w))
        outs = (im, im_shape, im_id)
        return outs