from torch import nn from torchvision import models from .tab_mlp import MLP from ..wdtypes import * # noqa: F403 def conv_layer( ni: int, nf: int, ks: int = 3, stride: int = 1, maxpool: bool = True, adaptiveavgpool: bool = False, ): layer = nn.Sequential( nn.Conv2d(ni, nf, kernel_size=ks, bias=True, stride=stride, padding=ks // 2), nn.BatchNorm2d(nf, momentum=0.01), nn.LeakyReLU(negative_slope=0.1, inplace=True), ) if maxpool: layer.add_module("maxpool", nn.MaxPool2d(2, 2)) if adaptiveavgpool: layer.add_module("adaptiveavgpool", nn.AdaptiveAvgPool2d(output_size=(1, 1))) return layer class DeepImage(nn.Module): r""" Standard image classifier/regressor using a pretrained network (in particular ResNets) or a sequence of 4 convolution layers. If ``pretrained=False`` the `'backbone'` of :obj:`DeepImage` will be a sequence of 4 convolutional layers comprised by: ``Conv2d -> BatchNorm2d -> LeakyReLU``. The 4th one will also add a final ``AdaptiveAvgPool2d`` operation. If ``pretrained=True`` the `'backbone'` will be ResNets. ResNets have 9 `'components'` before the last dense layers. The first 4 are: ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d``. Then there are 4 additional resnet blocks comprised by a series of convolutions and then the final ``AdaptiveAvgPool2d``. Overall, ``4+4+1=9``. The parameter ``freeze_n`` sets the number of layers to be frozen. For example, ``freeze_n=6`` will freeze all but the last 3 layers. In addition to all of the above, there is the option to add a fully connected set of dense layers (referred as `imagehead`) on top of the stack of CNNs Parameters ---------- pretrained: bool, default = True Indicates whether or not we use a pretrained Resnet network or a series of conv layers (see conv_layer function) resnet_architecture: int, default = 18 The resnet architecture. One of 18, 34 or 50 freeze_n: int, default = 6 number of layers to freeze. Must be less than or equal to 8. If 8 the entire 'backbone' of the nwtwork will be frozen head_layers_dim: List, Optional List with the sizes of the stacked dense layers in the head e.g: [128, 64] head_dropout: List, Optional List with the dropout between the dense layers. e.g: [0.5, 0.5]. head_batchnorm: bool, Optional, default = False Boolean indicating whether or not to include batch normalizatin in the dense layers that form the imagehead Attributes ---------- backbone: :obj:`nn.Sequential` Sequential stack of CNNs comprising the 'backbone' of the network imagehead: :obj:`nn.Sequential` Sequential stack of dense layers comprising the FC-Head (aka imagehead) output_dim: :obj:`int` The output dimension of the model. This is a required attribute neccesary to build the WideDeep class Example -------- >>> import torch >>> from pytorch_widedeep.models import DeepImage >>> X_img = torch.rand((2,3,224,224)) >>> model = DeepImage(head_layers_dim=[512, 64, 8]) >>> out = model(X_img) """ def __init__( self, pretrained: bool = True, resnet_architecture: int = 18, freeze_n: int = 6, head_layers_dim: Optional[List[int]] = None, head_activation: Optional[str] = "relu", head_dropout: Optional[Union[float, List[float]]] = None, head_batchnorm: Optional[bool] = False, head_batchnorm_last: Optional[bool] = False, head_linear_first: Optional[bool] = False, ): super(DeepImage, self).__init__() self.pretrained = pretrained self.resnet_architecture = resnet_architecture self.freeze_n = freeze_n self.head_layers_dim = head_layers_dim self.head_activation = head_activation self.head_dropout = head_dropout self.head_batchnorm = head_batchnorm self.head_batchnorm_last = head_batchnorm_last self.head_linear_first = head_linear_first if pretrained: vision_model = self.select_resnet_architecture(resnet_architecture) backbone_layers = list(vision_model.children())[:-1] self.backbone = self._build_backbone(backbone_layers, freeze_n) else: self.backbone = self._conv_nn() # the output_dim attribute will be used as input_dim when "merging" the models self.output_dim = 512 if self.head_layers_dim is not None: assert self.head_layers_dim[0] == self.output_dim, ( "The output dimension from the backbone ({}) is not consistent with " "the expected input dimension ({}) of the fc-head".format( self.output_dim, self.head_layers_dim[0] ) ) self.imagehead = MLP( head_layers_dim, head_activation, head_dropout, head_batchnorm, head_batchnorm_last, head_linear_first, ) self.output_dim = head_layers_dim[-1] def forward(self, x: Tensor) -> Tensor: # type: ignore r"""Forward pass connecting the `'backbone'` with the `'head layers'`""" x = self.backbone(x) x = x.view(x.size(0), -1) if self.head_layers_dim is not None: out = self.imagehead(x) return out else: return x @staticmethod def select_resnet_architecture(resnet_architecture: int): if resnet_architecture == 18: return models.resnet18(pretrained=True) elif resnet_architecture == 34: return models.resnet34(pretrained=True) elif resnet_architecture == 50: return models.resnet50(pretrained=True) def _conv_nn(self): return nn.Sequential( conv_layer(3, 64, 3), conv_layer(64, 128, 1, maxpool=False), conv_layer(128, 256, 1, maxpool=False), conv_layer(256, 512, 1, maxpool=False, adaptiveavgpool=True), ) def _build_backbone(self, backbone_layers, freeze_n): """ Builds the backbone layers """ if freeze_n > 8: raise ValueError( "freeze_n' must be less than or equal to 8 for resnet architectures" ) frozen_layers = [] trainable_layers = backbone_layers[freeze_n:] for layer in backbone_layers[:freeze_n]: for param in layer.parameters(): param.requires_grad = False frozen_layers.append(layer) trainable_and_frozen_layers = frozen_layers + trainable_layers return nn.Sequential(*trainable_and_frozen_layers)