# vision.py

#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid.layer_helper import LayerHelper
from ...fluid.framework import core, in_dygraph_mode

from ...fluid.layers import affine_channel  #DEFINE_ALIAS
from ...fluid.layers import affine_grid  #DEFINE_ALIAS
from ...fluid.layers import anchor_generator  #DEFINE_ALIAS
from ...fluid.layers import bipartite_match  #DEFINE_ALIAS
from ...fluid.layers import box_clip  #DEFINE_ALIAS
from ...fluid.layers import box_coder  #DEFINE_ALIAS
from ...fluid.layers import box_decoder_and_assign  #DEFINE_ALIAS
from ...fluid.layers import collect_fpn_proposals  #DEFINE_ALIAS
from ...fluid.layers import deformable_roi_pooling  #DEFINE_ALIAS
from ...fluid.layers import density_prior_box  #DEFINE_ALIAS
from ...fluid.layers import detection_output  #DEFINE_ALIAS
from ...fluid.layers import distribute_fpn_proposals  #DEFINE_ALIAS
from ...fluid.layers import generate_mask_labels  #DEFINE_ALIAS
from ...fluid.layers import generate_proposal_labels  #DEFINE_ALIAS
from ...fluid.layers import generate_proposals  #DEFINE_ALIAS
from ...fluid.layers import grid_sampler  #DEFINE_ALIAS
from ...fluid.layers import image_resize  #DEFINE_ALIAS
from ...fluid.layers import prior_box  #DEFINE_ALIAS
from ...fluid.layers import prroi_pool  #DEFINE_ALIAS
from ...fluid.layers import psroi_pool  #DEFINE_ALIAS
from ...fluid.layers import resize_bilinear  #DEFINE_ALIAS
from ...fluid.layers import resize_nearest  #DEFINE_ALIAS
from ...fluid.layers import resize_trilinear  #DEFINE_ALIAS
from ...fluid.layers import roi_align  #DEFINE_ALIAS
from ...fluid.layers import roi_pool  #DEFINE_ALIAS
from ...fluid.layers import space_to_depth  #DEFINE_ALIAS
from ...fluid.layers import yolo_box  #DEFINE_ALIAS
from ...fluid.layers import yolov3_loss  #DEFINE_ALIAS

from ...fluid.layers import fsp_matrix  #DEFINE_ALIAS
from ...fluid.layers import image_resize_short  #DEFINE_ALIAS
# from ...fluid.layers import pixel_shuffle  #DEFINE_ALIAS
from ...fluid.layers import retinanet_detection_output  #DEFINE_ALIAS
from ...fluid.layers import retinanet_target_assign  #DEFINE_ALIAS
from ...fluid.layers import roi_perspective_transform  #DEFINE_ALIAS
from ...fluid.layers import shuffle_channel  #DEFINE_ALIAS

# Public API of this module. Every function defined below that is meant to be
# used by callers (including `grid_sample` and `pixel_shuffle`) must be listed
# here so that `from ... import *` and API tooling can see it.
__all__ = [
    'affine_channel',
    'affine_grid',
    'anchor_generator',
    'bipartite_match',
    'box_clip',
    'box_coder',
    'box_decoder_and_assign',
    'collect_fpn_proposals',
    #       'deformable_conv',
    'deformable_roi_pooling',
    'density_prior_box',
    'detection_output',
    'distribute_fpn_proposals',
    'fsp_matrix',
    'generate_mask_labels',
    'generate_proposal_labels',
    'generate_proposals',
    'grid_sample',
    'grid_sampler',
    'image_resize',
    'image_resize_short',
    #       'multi_box_head',
    'pixel_shuffle',
    'prior_box',
    'prroi_pool',
    'psroi_pool',
    'resize_bilinear',
    'resize_nearest',
    'resize_trilinear',
    'retinanet_detection_output',
    'retinanet_target_assign',
    'roi_align',
    'roi_perspective_transform',
    'roi_pool',
    'shuffle_channel',
    'space_to_depth',
    'yolo_box',
    'yolov3_loss'
]

from ...fluid import core, dygraph_utils
from ...fluid.framework import Variable, in_dygraph_mode
from ...device import get_cudnn_version
import numpy as np


def grid_sample(x,
                grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=True,
                name=None):
    """
    This operation samples input X by using bilinear interpolation or
    nearest interpolation based on flow field grid, which is usually
    generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
    is the concatenation of (x, y) coordinates with shape [N, H, W] each,
    where x is indexing the 4th dimension (in width dimension) of input
    data x and y is indexing the 3rd dimension (in height dimension),
    finally results is the bilinear interpolation or nearest value of 4 nearest corner
    points. The output tensor shape will be [N, C, H, W].

    .. code-block:: text

        Step 1:
        Get (x, y) grid coordinates and scale to [0, H-1/W-1].

            grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
            grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)

        Step 2:
        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
        interpolate point value by 4 nearest points or nearest interpolate point value
        by nearest point.

          wn ------- y_n ------- en
          |           |           |
          |          d_n          |
          |           |           |
         x_w --d_w-- grid--d_e-- x_e
          |           |           |
          |          d_s          |
          |           |           |
          ws ------- y_s ------- es

        For bilinear interpolation:
        x_w = floor(x)              // west side x coord
        x_e = x_w + 1               // east side x coord
        y_n = floor(y)              // north side y coord
        y_s = y_n + 1               // south side y coord
        d_w = grid_x - x_w          // distance to west side
        d_e = x_e - grid_x          // distance to east side
        d_n = grid_y - y_n          // distance to north side
        d_s = y_s - grid_y          // distance to south side
        wn = X[:, :, y_n, x_w]      // north-west point value
        en = X[:, :, y_n, x_e]      // north-east point value
        ws = X[:, :, y_s, x_w]      // south-west point value
        es = X[:, :, y_s, x_e]      // south-east point value
        output = wn * d_e * d_s + en * d_w * d_s
               + ws * d_e * d_n + es * d_w * d_n

    Args:
        x(Tensor): The input tensor, which is a 4-d tensor with shape
                     [N, C, H, W], N is the batch size, C is the channel
                     number, H and W is the feature height and width.
                     The data type is float32 or float64.
        grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
                        data type is float32 or float64.
        mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
                         Default: 'bilinear'.
        padding_mode(str, optional): The padding method used when source index
                   is out of input images. It can be 'zeros', 'reflect' and 'border'.
                   Default: 'zeros'.
        align_corners(bool, optional): If `align_corners` is true, it will projects
                   -1 and 1 to the centers of the corner pixels. Otherwise, it will
                   projects -1 and 1 to the image edges. Default: True.
        name(str, optional): For detailed information, please refer
                             to :ref:`api_guide_Name`. Usually name is no need to set and
                             None by default.

    Returns:
        Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid
        and `grid_W` is the width of grid. The data type is same as input tensor.

    Raises:
        ValueError: If `x` or `grid` is not a Variable, if `mode` is not one of
            'bilinear' / 'nearest', if `padding_mode` is not one of
            'zeros' / 'reflect' / 'border', or if `align_corners` is not a bool.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn.functional as F
            import numpy as np

            # shape=[1, 1, 3, 3]
            x = np.array([[[[-0.6,  0.8, -0.5],
                            [-0.5,  0.2,  1.2],
                            [ 1.4,  0.3, -0.2]]]]).astype("float64")

            # grid shape = [1, 3, 4, 2]
            grid = np.array(
                         [[[[ 0.2,  0.3],
                            [-0.4, -0.3],
                            [-0.9,  0.3],
                            [-0.9, -0.6]],
                           [[ 0.4,  0.1],
                            [ 0.9, -0.8],
                            [ 0.4,  0.5],
                            [ 0.5, -0.2]],
                           [[ 0.1, -0.8],
                            [-0.3, -1. ],
                            [ 0.7,  0.4],
                            [ 0.2,  0.8]]]]).astype("float64")

            paddle.disable_static()
            x = paddle.to_tensor(x)
            grid = paddle.to_tensor(grid)
            y_t = F.grid_sample(
                x,
                grid,
                mode='bilinear',
                padding_mode='border',
                align_corners=True)
            print(y_t.numpy())

            # output shape = [1, 1, 3, 4]
            # [[[[ 0.34   0.016  0.086 -0.448]
            #    [ 0.55  -0.076  0.35   0.59 ]
            #    [ 0.596  0.38   0.52   0.24 ]]]]
    """
    # Argument validation is shared by the dygraph and static-graph paths and
    # runs in a fixed order so the first reported error stays predictable.
    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler')
    check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
                             'grid_sampler')
    if not isinstance(x, Variable):
        raise ValueError("The x should be a Variable")
    if not isinstance(grid, Variable):
        raise ValueError("The grid should be a Variable")

    _modes = ['bilinear', 'nearest']
    _padding_modes = ['zeros', 'reflect', 'border']
    if mode not in _modes:
        raise ValueError(
            "The mode of grid sample function should be in {}, but got: {}".
            format(_modes, mode))
    if padding_mode not in _padding_modes:
        raise ValueError(
            "The padding mode of grid sample function should be in {}, but got: {}".
            format(_padding_modes, padding_mode))
    if not isinstance(align_corners, bool):
        raise ValueError("The align corners should be bool, but got: {}".format(
            align_corners))

    # cuDNN is only requested for the single configuration below
    # (aligned corners + bilinear + zero padding) and only when a cuDNN
    # version is reported for the current device.
    cudnn_version = get_cudnn_version()
    use_cudnn = (cudnn_version is not None and align_corners and
                 mode == 'bilinear' and padding_mode == 'zeros')

    if in_dygraph_mode():
        # Dygraph fast path: call the operator directly with a flat
        # (name, value, ...) attribute list; no LayerHelper is needed.
        return core.ops.grid_sampler(x, grid, 'mode', mode, 'padding_mode',
                                     padding_mode, 'align_corners',
                                     align_corners, 'use_cudnn', use_cudnn)

    # Static-graph path. The call arguments are passed explicitly rather than
    # through **locals() so validation temporaries do not leak into the
    # helper's kwargs.
    helper = LayerHelper(
        "grid_sample",
        x=x,
        grid=grid,
        mode=mode,
        padding_mode=padding_mode,
        align_corners=align_corners,
        name=name)
    out = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='grid_sampler',
        inputs={'X': x,
                'Grid': grid},
        attrs={
            'mode': mode,
            'padding_mode': padding_mode,
            'align_corners': align_corners,
            'use_cudnn': use_cudnn
        },
        outputs={'Output': out})
    return out


def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
    """
    Apply the pixel shuffle operator to ``x``.

    See more details in :ref:`api_nn_vision_PixelShuffle` .

    Parameters:
        x(Tensor): 4-D tensor, the data type should be float32 or float64.
        upscale_factor(int): factor to increase spatial resolution.
        data_format (str): The data format of the input and output data. An
            optional string from: "NCHW", "NHWC". The default is "NCHW". When
            it is "NCHW", the data is stored in the order of:
            [batch_size, input_channels, input_height, input_width].
        name (str, optional): The default value is None.  Normally there is
            no need for user to set this property.

    Returns:
        Out(tensor): Reshaped tensor according to the new dimension.

    Raises:
        TypeError: If ``upscale_factor`` is not an int.
        ValueError: If ``data_format`` is neither "NCHW" nor "NHWC".
        ValueError: If the square of upscale_factor cannot divide the channels
            of input (this check is not performed in this function; it is
            expected to come from the underlying operator).

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn.functional as F
            import numpy as np
            x = np.random.randn(2, 9, 4, 4).astype(np.float32)
            paddle.disable_static()
            x_var = paddle.to_tensor(x)
            out_var = F.pixel_shuffle(x_var, 3)
            out = out_var.numpy()
            print(out.shape)
            # (2, 1, 12, 12)
    """
    # The execution mode is queried once and reused for both decisions below.
    dygraph = in_dygraph_mode()

    # Input tensor type/dtype is only verified when building a static graph.
    if not dygraph:
        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
                                 'pixel_shuffle')

    if not isinstance(upscale_factor, int):
        raise TypeError("upscale factor must be int type")

    if data_format not in ("NCHW", "NHWC"):
        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'."
                         "But recevie Attr(data_format): {} ".format(
                             data_format))

    if dygraph:
        # Dygraph path: invoke the operator directly with flat attributes.
        return core.ops.pixel_shuffle(x, "upscale_factor", upscale_factor,
                                      "data_format", data_format)

    # Static-graph path: register the op and return its output variable.
    helper = LayerHelper(
        "pixel_shuffle",
        x=x,
        upscale_factor=upscale_factor,
        data_format=data_format,
        name=name)
    result = helper.create_variable_for_type_inference(dtype=x.dtype)
    op_attrs = {"upscale_factor": upscale_factor, "data_format": data_format}
    helper.append_op(
        type="pixel_shuffle",
        inputs={"X": x},
        outputs={"Out": result},
        attrs=op_attrs)
    return result