import torch

from ..wdtypes import *
from .deep_dense import dense_layer

from torch import nn
from torchvision import models


def conv_layer(ni: int, nf: int, ks: int = 3, stride: int = 1,
               maxpool: bool = True, adaptiveavgpool: bool = False):
    r"""Build a Conv2d -> BatchNorm2d -> LeakyReLU block.

    Parameters
    ----------
    ni: number of input channels
    nf: number of output channels (filters)
    ks: kernel size; the padding is set to ``ks // 2``
    stride: stride of the convolution
    maxpool: if True a ``MaxPool2d(2, 2)`` is appended under the name
        'maxpool'
    adaptiveavgpool: if True an ``AdaptiveAvgPool2d`` with output size
        ``(1, 1)`` is appended under the name 'adaptiveavgpool'

    Returns
    -------
    ``nn.Sequential`` holding the layers described above
    """
    layer = nn.Sequential(
        nn.Conv2d(ni, nf, kernel_size=ks, bias=True, stride=stride,
                  padding=ks // 2),
        nn.BatchNorm2d(nf, momentum=0.01),
        nn.LeakyReLU(negative_slope=0.1, inplace=True),
    )
    if maxpool:
        layer.add_module('maxpool', nn.MaxPool2d(2, 2))
    if adaptiveavgpool:
        layer.add_module('adaptiveavgpool',
                         nn.AdaptiveAvgPool2d(output_size=(1, 1)))
    return layer


class DeepImage(nn.Module):
    r"""
    Standard image classifier/regressor using a pretrained network freezing
    some of the first layers, or all layers. I use Resnets which have 9
    "components" before the last dense layers. The first 4 are:
    conv->batchnorm->relu->maxpool. After that we have 4 additional 'layers'
    (resnet blocks) (so 4+4=8) comprised by a series of convolutions and then
    the final AdaptiveAvgPool2d (8+1=9). The parameter freeze sets the layers
    to be frozen. For example, freeze=6 will freeze all but the last 2 Layers
    and AdaptiveAvgPool2d layer. If freeze='all' it freezes the entire
    network. In addition, there is the option to add a Fully Connected (FC)
    set of dense layers (FC-Head, referred as 'imagehead') on top of the
    stack of CNNs.

    Parameters
    ----------
    pretrained: boolean that indicates whether or not we use a pretrained
        Resnet network or a series of conv layers (see conv_layer function)
    resnet: int indicating the resnet architecture. One of 18, 34 or 50.
        Only used when ``pretrained`` is True
    freeze: int or string indicating the number of layers to freeze. If int
        must be less than 8. Any string value freezes the whole backbone
        ('all' is the documented value). Only used when ``pretrained`` is
        True
    head_layers: optional list with the sizes of the stacked dense layers in
        the head e.g: [512, 64]. The first element must be equal to the
        output dimension of the backbone
    head_dropout: optional list with the dropout between the dense layers.
        e.g: [0.5, 0.5]. Entry ``i`` is applied to the i-th dense layer.
        Defaults to no dropout
    head_batchnorm: Optional Boolean indicating whether or not to include
        batch normalization in the dense layers that form the imagehead

    Attributes
    ----------
    backbone: Sequential stack of CNNs comprising the 'backbone' of the
        network
    imagehead: Sequential stack of dense layers comprising the FC-Head (aka
        imagehead). Only present when ``head_layers`` is not None
    output_dim: integer containing the output dimension of the model. This
        is a required attribute necessary to build the WideDeep class

    Raises
    ------
    ValueError: if ``pretrained`` is True and ``resnet`` is not one of 18,
        34 or 50
    TypeError: if ``pretrained`` is True and ``freeze`` is neither a string
        nor an int
    AssertionError: if ``freeze`` is an int that is not less than 8, or if
        ``head_layers[0]`` differs from the output dimension of the backbone

    Example
    --------
    >>> import torch
    >>> from pytorch_widedeep.models import DeepImage
    >>> X_img = torch.rand((2,3,224,224))
    >>> model = DeepImage(head_layers=[512, 64, 8])
    >>> model(X_img)
    tensor([[ 7.7234e-02,  8.0923e-02,  2.3077e-01, -5.1122e-03, -4.3018e-03,
              3.1193e-01,  3.0780e-01,  6.5098e-01],
            [ 4.6191e-02,  6.7856e-02, -3.0163e-04, -3.7670e-03, -2.1437e-03,
              1.5416e-01,  3.9227e-01,  5.5048e-01]],
           grad_fn=<...>)
    """

    def __init__(self,
                 pretrained: bool = True,
                 resnet: int = 18,
                 freeze: Union[str, int] = 6,
                 head_layers: Optional[List[int]] = None,
                 head_dropout: Optional[List[float]] = None,
                 head_batchnorm: Optional[bool] = False):
        super(DeepImage, self).__init__()

        self.head_layers = head_layers

        # the output_dim attribute will be used as input_dim when "merging"
        # the models
        if pretrained:
            self.backbone, self.output_dim = self._pretrained_backbone(
                resnet, freeze)
        else:
            self.backbone = nn.Sequential(
                conv_layer(3, 64, 3),
                conv_layer(64, 128, 1, maxpool=False),
                conv_layer(128, 256, 1, maxpool=False),
                conv_layer(256, 512, 1, maxpool=False, adaptiveavgpool=True),
            )
            # number of filters of the last conv_layer above
            self.output_dim = 512

        if self.head_layers is not None:
            assert self.head_layers[0] == self.output_dim, (
                "The output dimension from the backbone ({}) is not consistent with "
                "the expected input dimension ({}) of the fc-head".format(
                    self.output_dim, self.head_layers[0]))
            if not head_dropout:
                head_dropout = [0.] * len(head_layers)
            self.imagehead = nn.Sequential()
            # consecutive pairs of head_layers are the (input, output) sizes
            # of each dense layer
            for i in range(1, len(head_layers)):
                self.imagehead.add_module(
                    'dense_layer_{}'.format(i - 1),
                    dense_layer(head_layers[i - 1], head_layers[i],
                                head_dropout[i - 1], head_batchnorm)
                )
            self.output_dim = head_layers[-1]

    @staticmethod
    def _pretrained_backbone(resnet: int, freeze: Union[str, int]):
        r"""Return ``(backbone, output_dim)`` for a pretrained Resnet.

        The backbone is every child module of the Resnet but the last one
        (the final dense layer), with the first ``freeze`` children (or all
        of them if ``freeze`` is a string) excluded from gradient updates.
        """
        # NOTE(review): recent torchvision releases prefer the `weights=`
        # argument over `pretrained=`; `pretrained=True` is kept so older
        # torchvision versions keep working -- confirm the targeted version.
        architectures = {
            18: models.resnet18,
            34: models.resnet34,
            50: models.resnet50,
        }
        if resnet not in architectures:
            raise ValueError(
                "'resnet' must be one of {}. Got {}".format(
                    sorted(architectures), resnet))
        vision_model = architectures[resnet](pretrained=True)

        # The width of the backbone output is the input width of the dense
        # layer that is dropped below. It is not the same for every
        # architecture (e.g. it is larger for resnet50 than for resnet18),
        # so it is read from the network instead of being hard-coded.
        output_dim = vision_model.fc.in_features

        backbone_layers = list(vision_model.children())[:-1]

        if isinstance(freeze, str):
            layers_to_freeze = backbone_layers
        elif isinstance(freeze, int):
            assert freeze < 8, "freeze' must be less than 8 when using resnet architectures"
            layers_to_freeze = backbone_layers[:freeze]
        else:
            raise TypeError(
                "'freeze' must be a string or an int. Got {}".format(
                    type(freeze).__name__))

        for layer in layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False

        return nn.Sequential(*backbone_layers), output_dim

    def forward(self, x: Tensor) -> Tensor:
        r"""Run the backbone, flatten everything but the first (batch)
        dimension and, if ``head_layers`` was given, apply the imagehead.
        """
        x = self.backbone(x)
        x = x.view(x.size(0), -1)
        if self.head_layers is not None:
            return self.imagehead(x)
        return x