utils.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code is based on: https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import paddle
import paddle.nn.functional as F

IMAGE_MEAN = paddle.to_tensor([0.48145466, 0.4578275, 0.40821073])
IMAGE_STD = paddle.to_tensor([0.26862954, 0.26130258, 0.27577711])


def normalize_image(img):
    return (img - IMAGE_MEAN) / IMAGE_STD


def unnormalize_image(x):
    return x * IMAGE_STD + IMAGE_MEAN


def resize_posemb(posemb, target_size):
    """Resizes position embeddings to new resolution."""
    if target_size == posemb.shape[1]:
        return posemb

    gs_old = int(np.sqrt(posemb.shape[1]))
    gs_new = int(np.sqrt(target_size))

    posemb_tok = None
    if gs_old**2 == posemb.shape[1]:
        posemb_grid = posemb
    elif gs_old**2 == posemb.shape[1] - 1:
        posemb_tok, posemb_grid = posemb[:, :1], posemb[:, 1:]
    else:
        raise ValueError(
            'Posemb shape must be a perfect square (maybe with CLS token), but '
            f'got posemb of shape {posemb.shape}.')

    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).transpose(
        [0, 3, 1, 2])
    posemb_grid = F.interpolate(
        posemb_grid, size=gs_new, mode='bilinear', align_corners=False)
    posemb_grid = posemb_grid.transpose([0, 2, 3, 1]).reshape(1, gs_new[0] *
                                                              gs_new[1], -1)
    if posemb_tok is not None:
        posemb = paddle.concat([posemb_tok, posemb], axis=1)

    return posemb


def seq2img(original_img, features):
    """Reshapes 1D sequence to 2D image features."""
    if original_img.shape[2] == original_img.shape[3]:
        h = w = int(np.sqrt(features.shape[2]))
    else:
        stride = np.ceil(
            np.sqrt(original_img.shape[2] * original_img.shape[3] /
                    features.shape[2]))
        h = np.ceil(original_img.shape[2] / stride)
        w = np.ceil(original_img.shape[3] / stride)
    return features.reshape([features.shape[0], -1, int(h), int(w)])


def normalized_grid_corner_coordinates(feature_map, padding_mask):
    """Computes normalized xy corner coords from feature_map or padding_mask."""
    # Note 1: it computes not the centers of grid patches, but the patch corner
    # coordinates (for a grid patch from 0 to 0.1, it returns 0.1 not 0.05).
    # Note 2: behavior is quite different for feature_map and padding_mask inputs.
    if padding_mask is None:
        assert len(feature_map.shape) == 4  # [B, C, H, W]
        _, _, h, w = paddle.shape(feature_map)
        shift_x = paddle.arange(1, w + 1)
        shift_y = paddle.arange(1, h + 1)
        shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
        # [H, W, 2]
        xy = paddle.cast(
            paddle.stack(
                [shift_x, shift_y], axis=-1), dtype='float32')
        xy = xy / paddle.concat([w, h])
    else:
        assert len(padding_mask.shape) == 3  # [B, H, W]
        padding_mask = padding_mask.cast(paddle.float32)
        y = paddle.cumsum(padding_mask, axis=1)
        x = paddle.cumsum(padding_mask, axis=2)
        # [B, H, W, 2]
        xy = paddle.stack(
            [x / (x[:, :, -1:] + 1e-6), y / (y[:, -1:] + 1e-6)], axis=-1)

    return xy.reshape(xy.shape[:-3] + [-1, 2])


def compute_box_bias(feature_map, padding_mask, kind='both'):
    """Computes spatial bias for grid."""
    # The box center is biased to its position on the feature grid:
    xy = normalized_grid_corner_coordinates(feature_map, padding_mask)
    xy = paddle.clip(xy, 0.0, 1.0)

    if kind in ['both', 'location']:
        # Unnormalize xy (i.e., apply logit function/sigmoid^-1).
        xy_bias = logit(xy)
    else:
        xy_bias = paddle.zeros_like(xy)

    if kind in ['both', 'size']:
        # The box size is biased to the patch size:
        wh_bias = logit(paddle.full_like(xy_bias, 1.0 / feature_map.shape[-1]))
    else:
        wh_bias = paddle.zeros_like(xy_bias)

    return paddle.concat([xy_bias, wh_bias], axis=-1)


def logit(x, eps=1e-4):
    """Logit (inverse sigmoid) function (https://en.wikipedia.org/wiki/Logit)."""
    return paddle.log(x + eps) - paddle.log1p(-x + eps)