# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code is based on: https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.ops import get_act_fn

from ..utils import compute_box_bias

__all__ = ['PredictorMLP', 'ClassPredictor', 'OWLViTHead']


@register
class PredictorMLP(nn.Layer):
    """FFN block for predicting continuous outputs, e.g. bounding box coordinates.  

    Attributes:
      out_dim: Size of output of this mlp.
      num_layers: Number of layers.
      mlp_dim: Size of hidden dimension of dense layers.
      hidden_activation: Activation function of hidden layers.
      out_activation: Activation of the output.
      dtype: Data type, e.g. jnp.float32.

    """

    def __init__(self,
                 in_dim,
                 out_dim,
                 num_layers,
                 mlp_dim,
                 hidden_activation,
                 out_activation=None):
        super().__init__()

        layers = []
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(in_dim, mlp_dim))
            in_dim = mlp_dim

        layers.append(nn.Linear(in_dim, out_dim))
        self.mlp = nn.LayerList(layers)
        self.num_layers = num_layers
        self.hidden_activation = get_act_fn(hidden_activation)
        self.out_activation = get_act_fn(out_activation)

    def forward(self, inputs):
        x = inputs
        for i in range(self.num_layers - 1):
            x = self.mlp[i](x)
            x = self.hidden_activation(x)

        x = self.mlp[-1](x)
        x = self.out_activation(x)

        return x


@register
class ClassPredictor(nn.Layer):
    """Open-vocabulary instance class predictor."""

    def __init__(self, in_dim, out_dim, normalize):
        super().__init__()
        self.normalize = normalize
        self.out_dim = out_dim
        self.proj = nn.Linear(in_dim, out_dim)
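        # Learned per-patch shift and scale applied to the image-query similarity logits.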
        self.logit_shift = nn.Linear(in_dim, 1)
        self.logit_scale = nn.Linear(in_dim, 1)

    def forward(self, x, query_embeddings=None, query_mask=None):
        """Computes class prediction logits.

        Query embeddings from a text encoder define the classification label space.

        Args:
            x: Image features [batch_size, num_patches, emb_dim].
            query_embeddings: The embeddings to classify against, of shape
                [batch_size, num_queries, out_dim]. If not specified, only the
                image class embeddings will be returned.
            query_mask: Mask indicating whether a query is real (1) or padding (0),
                of shape [batch_size, num_queries].

        Returns:
            A dict with key 'class_embeddings' if query_embeddings is None,
            otherwise a tuple of (pred_logits, image_class_emb).
        """
        image_class_emb = self.proj(x)
        if query_embeddings is None:
            return {"class_embeddings": image_class_emb}

        if self.normalize:
            image_class_emb /= image_class_emb.norm(
                p=2, axis=-1, keepdim=True) + 1e-6
            query_embeddings /= query_embeddings.norm(
                p=2, axis=-1, keepdim=True) + 1e-6

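        # Similarity logits between every image patch and every query embedding.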
        pred_logits = paddle.matmul(
            x=image_class_emb, y=query_embeddings, transpose_y=True)

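        # Shift and scale the logits per patch; ELU(x) + 1 keeps the scale positive.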
        logit_shift = self.logit_shift(x)
        logit_scale = F.elu(self.logit_scale(x)) + 1
        pred_logits = (logit_shift + pred_logits) * logit_scale

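        # Mask out padded queries with a large negative logit.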
        if query_mask is not None:
            if len(query_mask.shape) > 1:
                query_mask = query_mask.unsqueeze(-2)
            pred_logits = paddle.where(query_mask == 0,
                                       paddle.full_like(pred_logits, -1e6),
                                       pred_logits)

        return pred_logits, image_class_emb


@register
class OWLViTHead(nn.Layer):

    __inject__ = ['class_head', 'bbox_head', 'loss']

    def __init__(self, class_head, bbox_head, loss, box_bias='both'):
        super().__init__()

        self.class_head = class_head
        self.bbox_head = bbox_head
        self.box_bias = box_bias
        self.loss = loss

    def box_predictor(self, image_features, feature_map):
        """Predicts bounding boxes from image features.

        Args:
            image_features: Feature tokens extracted from the image, returned by
                the `embedder` function.
            feature_map: A spatial re-arrangement of image_features, also returned
                by the `embedder` function.

        Returns:
            Predicted boxes of shape [batch_size, num_patches, 4] in cxcywh
            format, normalized to [0, 1].
        """
        # Bounding box detection head [b, num_patches, 4].
        pred_boxes = self.bbox_head(image_features)
        # We compute the location of each token on the grid and use it to compute
        # a bias for the bbox prediction, i.e., each token is biased towards
        # predicting its location on the grid as the center.
        pred_boxes += compute_box_bias(feature_map, kind=self.box_bias)
        pred_boxes = F.sigmoid(pred_boxes)
        return pred_boxes

    def class_predictor(self,
                        image_features,
                        query_embeddings=None,
                        query_mask=None):
        """Applies the class head to the image features.

        Args:
            image_features: Feature tokens extracted by the image embedder.
            query_embeddings: Optional list of text (or image) embeddings. If no
                embeddings are provided, no logits will be computed and only the
                class embeddings for the image will be returned.
            query_mask: Must be provided with query_embeddings. A mask indicating
                which query embeddings are valid.

        Returns:
            A dict with the class embeddings if query_embeddings is None,
            otherwise a tuple of (pred_logits, class_embeddings).
        """
        return self.class_head(image_features, query_embeddings, query_mask)

    def forward(self, feature_map, query_embeddings, targets=None):
        b, c, h, w = feature_map.shape
        # Flatten spatial dims and move channels last: [b, c, h, w] -> [b, h * w, c].
        image_features = paddle.reshape(feature_map,
                                        (b, c, h * w)).transpose((0, 2, 1))
        pred_boxes = self.box_predictor(image_features, feature_map)

        # Treat queries whose first component is not positive as padding.
        query_mask = (query_embeddings[..., 0] > 0).cast(paddle.float32)
        pred_logits, image_class_emb = self.class_predictor(
            image_features, query_embeddings, query_mask)

        if self.training:
            return self.get_loss([pred_boxes, pred_logits], targets)
        else:
            return self.get_pred(pred_boxes, pred_logits)

    def get_loss(self, head_outs, gt_meta):
        return self.loss(head_outs, gt_meta)
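
    # NOTE: `forward` calls `self.get_pred` at inference time, but no such method
    # is defined in this file. The following is a minimal sketch of a plausible
    # implementation (an assumption, not the original code): it converts the
    # logits to per-query scores and returns them together with the boxes for
    # downstream post-processing.
    def get_pred(self, pred_boxes, pred_logits):
        scores = F.sigmoid(pred_logits)
        return {'bbox': pred_boxes, 'scores': scores}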