# models.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code is based on: https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import OrderedDict
import numpy as np

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant

from ppdet.modeling.initializer import zeros_, normal_
from ppdet.core.workspace import register

from .layers import *

__all__ = ['ModifiedResNet', 'VisionTransformer', 'TextEncoder']


@register
class ModifiedResNet(nn.Layer):
    """
    ResNet variant that differs from torchvision's ResNet in three ways:
    - The stem has 3 convolutions instead of 1 and is followed by an average
      pool rather than a max pool.
    - Strided convolutions are anti-aliased: an avgpool is prepended to
      convolutions with stride > 1.
    - The final pooling layer is a QKV attention instead of an average pool.

    Args:
        layers (list|tuple): Block counts; ``layers[0]`` .. ``layers[3]`` give
            the number of Bottleneck blocks in ``layer1`` .. ``layer4``.
        output_dim (int): Stored on the instance and forwarded to
            ``AttentionPool2D`` as its last argument.
        heads (int): Forwarded to ``AttentionPool2D``.
        input_resolution (int): Stored on the instance;
            ``input_resolution // 32`` is forwarded to ``AttentionPool2D``
            as its first argument. Default: 224.
        width (int): Output channels of the stem; the four stages are built
            with ``width``, ``2 * width``, ``4 * width`` and ``8 * width``
            planes. Default: 64.
    """

    def __init__(self,
                 layers,
                 output_dim,
                 heads,
                 input_resolution=224,
                 width=64):
        super().__init__()

        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # Stem: three 3x3 convolutions (only the first one is strided), each
        # followed by batch norm and ReLU, then a 2x2 average pool.
        stem_width = width // 2
        self.conv1 = nn.Conv2D(
            3, stem_width, kernel_size=3, stride=2, padding=1, bias_attr=False)
        self.bn1 = nn.BatchNorm2D(stem_width)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2D(
            stem_width, stem_width, kernel_size=3, padding=1, bias_attr=False)
        self.bn2 = nn.BatchNorm2D(stem_width)
        self.relu2 = nn.ReLU()
        self.conv3 = nn.Conv2D(
            stem_width, width, kernel_size=3, padding=1, bias_attr=False)
        self.bn3 = nn.BatchNorm2D(width)
        self.relu3 = nn.ReLU()
        self.avgpool = nn.AvgPool2D(2)

        # Residual stages. ``_inplanes`` is construction-time state: every
        # ``_make_layer`` call reads it and then advances it for the next one.
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(2 * width, layers[1], stride=2)
        self.layer3 = self._make_layer(4 * width, layers[2], stride=2)
        self.layer4 = self._make_layer(8 * width, layers[3], stride=2)

        # Attention pooling head over the ResNet feature dimension.
        feature_dim = 32 * width
        self.attnpool = AttentionPool2D(input_resolution // 32, feature_dim,
                                        heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage of ``blocks`` Bottleneck blocks.

        Only the first block receives ``stride``. ``self._inplanes`` is
        updated to ``planes * Bottleneck.expansion`` before the remaining
        blocks are created.
        """
        head_block = Bottleneck(self._inplanes, planes, stride)
        self._inplanes = planes * Bottleneck.expansion
        tail_blocks = [
            Bottleneck(self._inplanes, planes) for _ in range(blocks - 1)
        ]
        return nn.Sequential(head_block, *tail_blocks)

    def forward(self, x):
        """Run stem, average pool, the four stages and the attention pool."""
        # Match the input dtype to the weights of the first convolution.
        out = x.cast(self.conv1.weight.dtype)

        stem = (
            (self.conv1, self.bn1, self.relu1),
            (self.conv2, self.bn2, self.relu2),
            (self.conv3, self.bn3, self.relu3), )
        for conv, norm, act in stem:
            out = act(norm(conv(out)))
        out = self.avgpool(out)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)

        return self.attnpool(out)


@register
class VisionTransformer(nn.Layer):
    """
    Vision Transformer image encoder: a strided convolution turns the image
    into patch embeddings, a learned class embedding is prepended, learned
    positional embeddings are added, and the sequence is run through
    ``Transformer`` between two layer norms.

    Args:
        input_resolution (int): Image side length used to size the positional
            embedding table, which has
            ``(input_resolution // patch_size) ** 2 + 1`` rows.
        patch_size (int): Kernel size and stride of the patch-embedding
            convolution.
        width (int): Channel width of the patch embeddings and of the
            transformer.
        layers (int): Forwarded to ``Transformer``.
        heads (int): Forwarded to ``Transformer``.
        output_dim (int|None): When not None, the class token is layer-normed
            and projected to this size by a bias-free linear layer. When
            None, every token is layer-normed and no projection is applied.
            Default: None.
        stochastic_droplayer_rate (float): Forwarded to ``Transformer``.
            Default: 0.0.
    """

    def __init__(self,
                 input_resolution,
                 patch_size,
                 width,
                 layers,
                 heads,
                 output_dim=None,
                 stochastic_droplayer_rate=0.0):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.width = width

        # Patch embedding. Paddle spells the "no bias" switch ``bias_attr``.
        self.conv1 = nn.Conv2D(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias_attr=False)

        scale = width**-0.5
        self.class_embedding = self.create_parameter(
            shape=[width], attr=ParamAttr(initializer=Normal(std=scale)))
        # One row per patch plus one row for the class token.
        self.positional_embedding = self.create_parameter(
            shape=[(input_resolution // patch_size)**2 + 1, width],
            attr=ParamAttr(initializer=Normal(std=scale)))

        self.ln_pre = LayerNorm(width)
        # The drop rate is passed by keyword (as TextEncoder does) so that it
        # cannot land in another positional slot of Transformer.
        self.transformer = Transformer(
            width,
            layers,
            heads,
            stochastic_droplayer_rate=stochastic_droplayer_rate)
        self.ln_post = LayerNorm(width)
        if output_dim is not None:
            self.proj = nn.Linear(width, output_dim, bias_attr=False)

    def forward(self, x):
        """Encode a batch of images.

        Args:
            x (Tensor): Input fed to the patch-embedding convolution
                (3 input channels).

        Returns:
            tuple: ``(x, feature)``. ``feature`` is the raw output of
            ``self.transformer``. ``x`` is the layer-normed and projected
            class token when ``output_dim`` is set, otherwise the layer-normed
            full token sequence.
        """
        x = self.conv1(x)
        # Flatten the spatial dims and move channels last:
        # [N, C, H, W] -> [N, C, H*W] -> [N, H*W, C].
        x = x.reshape([x.shape[0], x.shape[1], -1])
        x = x.transpose([0, 2, 1])

        # Broadcast the [C] class embedding to one [1, C] token per sample.
        class_embedding = self.class_embedding.cast(x.dtype) + paddle.zeros(
            [x.shape[0], 1, x.shape[-1]], dtype=x.dtype)
        x = paddle.concat([class_embedding, x], axis=1)
        x = x + self.positional_embedding.cast(x.dtype)
        x = self.ln_pre(x)

        # Keep the un-normalized transformer output as the second return value.
        # Assumes Transformer preserves the [N, tokens, C] layout - TODO confirm.
        x = feature = self.transformer(x)
        if self.output_dim is not None:
            # Index 0 on the token axis is the class token prepended above.
            x = self.ln_post(x[:, 0, :])
            x = self.proj(x)
        else:
            x = self.ln_post(x)

        return x, feature


@register
class TextEncoder(nn.Layer):
    """
    Text encoder: token embedding plus learned positional embedding, a
    causally masked ``Transformer``, a final layer norm, and a bias-free
    linear projection of the end-of-text token feature.

    Args:
        context_length (int): Sequence length. Sizes both the attention mask
            and the positional embedding table, so inputs are expected to be
            padded to exactly this length.
        vocab_size (int): Number of rows of the token embedding table.
        transformer_width (int): Width of the token embeddings and of the
            transformer.
        transformer_heads (int): Forwarded to ``Transformer`` as ``heads``.
        transformer_layers (int): Forwarded to ``Transformer`` as ``layers``.
        stochastic_droplayer_rate (float): Forwarded to ``Transformer``.
        embed_dim (int|None): Output size of ``text_projection``. When None
            it falls back to ``transformer_width``. Default: None.
    """

    def __init__(self,
                 context_length,
                 vocab_size,
                 transformer_width,
                 transformer_heads,
                 transformer_layers,
                 stochastic_droplayer_rate,
                 embed_dim=None):
        super().__init__()
        self.context_length = context_length
        if embed_dim is None:
            # No projection size given: keep the transformer width.
            embed_dim = transformer_width

        # build_attention_mask() reads self.context_length, set just above.
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            stochastic_droplayer_rate=stochastic_droplayer_rate,
            attn_mask=self.build_attention_mask())

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        # One row per position so it can be added to the
        # [batch_size, n_ctx, d_model] token embeddings in forward().
        self.positional_embedding = self.create_parameter(
            shape=[context_length, transformer_width],
            attr=ParamAttr(initializer=Constant(0.0)))
        self.ln_final = LayerNorm(transformer_width)
        self.text_projection = nn.Linear(
            transformer_width, embed_dim, bias_attr=False)
        self.logit_scale = self.create_parameter(
            shape=[], attr=ParamAttr(initializer=Constant(np.log(1. / 0.07))))

    def build_attention_mask(self):
        """Return the additive causal mask of shape [context_length, context_length].

        Entries strictly above the main diagonal are ``-inf`` and all other
        entries are 0, so a position may attend to itself and to earlier
        positions only.
        """
        mask = paddle.full((self.context_length, self.context_length),
                           float("-inf"))
        # diagonal=1 keeps the main diagonal at 0. With the default diagonal=0
        # the first row would be entirely -inf and its softmax would be NaN.
        mask = paddle.triu(mask, diagonal=1)
        return mask

    def forward(self, text):
        """Encode token ids and return the projected end-of-text feature.

        Args:
            text (Tensor): Token ids; assumed to be an integer tensor of shape
                [batch_size, context_length] - TODO confirm against callers.

        Returns:
            Tensor: One projected feature per sequence, taken at the position
            of the largest token id in that sequence.
        """
        x = self.token_embedding(text)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding.cast(x.dtype)
        x = self.transformer(x)
        x = self.ln_final(x).cast(x.dtype)

        # x.shape = [batch_size, text_length, transformer.width]
        # take features from the eot embedding (eot_token is the highest
        # number in each sequence)
        batch_idx = paddle.arange(x.shape[0])
        seq_idx = text.argmax(axis=-1)
        # Pair each batch index with its eot position for gather_nd.
        gather_idx = paddle.stack([batch_idx, seq_idx], axis=1)
        x = paddle.gather_nd(x, gather_idx)
        x = self.text_projection(x)

        return x