Commit e2b8ccbe authored by J jrzaurin

Added documentation for the model components. Rearranged modules to expose utils.

Parent 333ffc9c
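The "expose utils" part of this commit moves the utils modules up one package level (the relative imports further down change from .utils.* / ...wdtypes to ..utils.* / ..wdtypes). A minimal sketch of what the exposed imports would then look like; the exact public paths are an assumption inferred from those relative-import changes, not something this diff states:

# Assumed import paths after the re-arrangement (pytorch_widedeep.utils is inferred, not shown in the diff)
from pytorch_widedeep.utils.text_utils import pad_sequences, build_embeddings_matrix
from pytorch_widedeep.utils.fastai_transforms import Tokenizer, Vocab
from pytorch_widedeep.utils.image_utils import AspectAwarePreprocessor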
......@@ -31,7 +31,6 @@ if __name__ == '__main__':
('occupation',10),('native_country',10)]
continuous_cols = ["age","hours_per_week"]
target = 'income_label'
target = df[target].values
prepare_wide = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
X_wide = prepare_wide.fit_transform(df)
......@@ -42,7 +41,7 @@ if __name__ == '__main__':
output_dim=1)
deepdense = DeepDense(
hidden_layers=[64,32],
dropout=[0.5],
dropout=[0.2,0.2],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols)
......@@ -56,14 +55,14 @@ if __name__ == '__main__':
optimizers = {'wide': wide_opt, 'deepdense':deep_opt}
schedulers = {'wide': wide_sch, 'deepdense':deep_sch}
initializers = {'wide': KaimingNormal, 'deepdense':XavierNormal}
callbacks = [LRHistory, EarlyStopping, ModelCheckpoint(filepath='../model_weights/wd_out')]
callbacks = [LRHistory(n_epochs=10), EarlyStopping, ModelCheckpoint(filepath='../model_weights/wd_out')]
metrics = [BinaryAccuracy]
model.compile(
method='logistic',
initializers=initializers,
method='binary',
optimizers=optimizers,
lr_schedulers=schedulers,
initializers=initializers,
callbacks=callbacks,
metrics=metrics)
......
......@@ -49,7 +49,7 @@ if __name__ == '__main__':
output_dim=1)
deepdense = DeepDense(
hidden_layers=[64,32],
dropout=[0.5],
dropout=[0.2,0.2],
deep_column_idx=prepare_deep.deep_column_idx,
embed_input=prepare_deep.embeddings_input,
continuous_cols=continuous_cols)
......@@ -58,31 +58,26 @@ if __name__ == '__main__':
hidden_dim=64,
n_layers=3,
rnn_dropout=0.5,
spatial_dropout=0.5,
padding_idx=1,
embedding_matrix=text_processor.embedding_matrix
)
deepimage = DeepImage(pretrained=True, head_layers=None)
model = WideDeep(wide=wide, deepdense=deepdense, deeptext=deeptext,
deepimage=deepimage, head_layers=[256, 128, 64])
# pdb.set_trace()
deepimage=deepimage)
wide_opt = torch.optim.Adam(model.wide.parameters())
deep_opt = torch.optim.Adam(model.deepdense.parameters())
text_opt = RAdam(model.deeptext.parameters())
img_opt = RAdam(model.deepimage.parameters())
head_opt = torch.optim.Adam(model.head.parameters())
wide_sch = torch.optim.lr_scheduler.StepLR(wide_opt, step_size=5)
deep_sch = torch.optim.lr_scheduler.StepLR(deep_opt, step_size=3)
text_sch = torch.optim.lr_scheduler.StepLR(text_opt, step_size=5)
img_sch = torch.optim.lr_scheduler.StepLR(img_opt, step_size=3)
head_sch = torch.optim.lr_scheduler.StepLR(head_opt, step_size=5)
optimizers = {'wide': wide_opt, 'deepdense':deep_opt, 'deeptext':text_opt, 'deepimage': img_opt, 'head': head_opt}
schedulers = {'wide': wide_sch, 'deepdense':deep_sch, 'deeptext':text_sch, 'deepimage': img_sch, 'head': head_sch}
initializers = {'wide': KaimingNormal, 'deepdense':KaimingNormal, 'deeptext':KaimingNormal, 'deepimage':KaimingNormal,
'head': KaimingNormal}
optimizers = {'wide': wide_opt, 'deepdense':deep_opt, 'deeptext':text_opt, 'deepimage': img_opt}
schedulers = {'wide': wide_sch, 'deepdense':deep_sch, 'deeptext':text_sch, 'deepimage': img_sch}
initializers = {'wide': KaimingNormal, 'deepdense':KaimingNormal, 'deeptext':KaimingNormal, 'deepimage':KaimingNormal}
mean = [0.406, 0.456, 0.485] #BGR
std = [0.225, 0.224, 0.229] #BGR
transforms = [ToTensor, Normalize(mean=mean, std=std)]
......
......@@ -132,7 +132,9 @@ class History(Callback):
class LRHistory(Callback):
def __init__(self, n_epochs):
super(LRHistory, self).__init__()
self.n_epochs = n_epochs
def on_epoch_begin(self, epoch:int, logs:Optional[Dict]=None):
if epoch==0 and self.model.lr_scheduler:
......@@ -165,7 +167,7 @@ class LRHistory(Callback):
("_").join(['lr', str(group_idx)]),[]).append(group['lr'])
def on_epoch_end(self, epoch:int, logs:Optional[Dict]=None):
if self.model.lr_scheduler:
if epoch != (self.n_epochs-1) and self.model.lr_scheduler:
if self.model.lr_scheduler.__class__.__name__ == 'MultipleLRScheduler':
for model_name, opt in self.model.optimizer._optimizers.items():
if model_name in self.model.lr_scheduler._schedulers:
......
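With this change LRHistory must be told the total number of epochs at construction time (it stops appending learning rates once the final epoch is reached). A minimal sketch mirroring the adult script above; the epoch count is assumed to be the same one later used for training:

n_epochs = 10
callbacks = [LRHistory(n_epochs=n_epochs),
             EarlyStopping,
             ModelCheckpoint(filepath='../model_weights/wd_out')]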
......@@ -22,6 +22,54 @@ def dense_layer(inp:int, out:int, dropout:float, batchnorm=False):
class DeepDense(nn.Module):
r"""Dense branch of the deep side of the model. This class combines embedding
representations of the categorical features with numerical (aka
continuous) features. These are then passed through a series of dense
layers.
Parameters
----------
deep_column_idx: Dict containing the index of the columns that will be
passed through the DeepDense model. Required to slice the tensors. e.g.
{'education': 0, 'relationship': 1, 'workclass': 2, ...}
hidden_layers: List with the number of neurons per dense layer. e.g: [64,32]
dropout: Optional List with the dropout between the dense layers.
e.g: [0.5,0.5]
batchnorm: Optional Boolean indicating whether or not to include batch
normalization in the dense layers
embed_input: Optional List of Tuples with the column name, number of
unique values and embedding dimension. e.g. [(education, 11, 32), ...]
continuous_cols: Optional List with the names of the numeric (aka
continuous) columns
**Either embed_input or continuous_cols (or both) should be passed to the
model
Attributes
----------
dense: nn.Sequential model of dense layers that will receive the
concatenation of the embeddings and the continuous columns
embed_layers: nn.ModuleDict with the embedding layers
output_dim: integer containing the output dimension of the model. This
attribute is necessary to build the WideDeep class
Example
--------
>>> import torch
>>> from pytorch_widedeep.models import DeepDense
>>> X_deep = torch.cat((torch.empty(5, 4).random_(4), torch.rand(5, 1)), axis=1)
>>> colnames = ['a', 'b', 'c', 'd', 'e']
>>> embed_input = [(u,i,j) for u,i,j in zip(colnames[:4], [4]*4, [8]*4)]
>>> deep_column_idx = {k:v for v,k in enumerate(colnames)}
>>> model = DeepDense(hidden_layers=[8,4], deep_column_idx=deep_column_idx, embed_input=embed_input)
>>> model(X_deep)
tensor([[ 3.4470e-02, -2.0089e-03, 4.7983e-02, 3.3500e-01],
[ 1.4329e-02, -1.3800e-03, -3.3617e-04, 4.1046e-01],
[-3.3546e-04, 3.2413e-02, -4.1198e-03, 4.8717e-01],
[-6.7882e-04, 7.9103e-03, -1.9960e-03, 4.2134e-01],
[ 6.7187e-02, -1.2821e-03, -3.0960e-04, 3.6123e-01]],
grad_fn=<LeakyReluBackward1>)
"""
def __init__(self,
deep_column_idx:Dict[str,int],
hidden_layers:List[int],
......@@ -50,7 +98,7 @@ class DeepDense(nn.Module):
# Dense Layers
input_dim = emb_inp_dim + cont_inp_dim
hidden_layers = [input_dim] + hidden_layers
dropout = [0.] + dropout if dropout is not None else [0.]*(len(hidden_layers)-1)
if not dropout: dropout = [0.]*len(hidden_layers)
batchnorm = batchnorm if batchnorm is not None else False
self.dense = nn.Sequential()
for i in range(1, len(hidden_layers)):
......
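The docstring example above uses embeddings only. A minimal sketch of the mixed case, embeddings plus one continuous column, reusing the docstring's toy data and the keyword names from the adult script; treating column 'e' as continuous is the only assumption:

import torch
from pytorch_widedeep.models import DeepDense

# four categorical columns (integer codes 0-3) followed by one continuous column
X_deep = torch.cat((torch.empty(5, 4).random_(4), torch.rand(5, 1)), dim=1)
colnames = ['a', 'b', 'c', 'd', 'e']
embed_input = [(u, i, j) for u, i, j in zip(colnames[:4], [4]*4, [8]*4)]
deep_column_idx = {k: v for v, k in enumerate(colnames)}
model = DeepDense(hidden_layers=[8, 4], dropout=[0.2, 0.2],
                  deep_column_idx=deep_column_idx, embed_input=embed_input,
                  continuous_cols=['e'])
out = model(X_deep)   # shape (5, 4): the size of the last hidden layer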
......@@ -20,6 +20,52 @@ def conv_layer(ni:int, nf:int, ks:int=3, stride:int=1, maxpool:bool=True,
class DeepImage(nn.Module):
r"""
Standard image classifier/regressor using a pretrained network, freezing
some of the first layers or all of them. I use Resnets, which have 9
"components" before the last dense layers.
The first 4 are: conv->batchnorm->relu->maxpool.
After that come 4 additional 'layers' (resnet blocks), so 4+4=8, each
composed of a series of convolutions, followed by the final
AdaptiveAvgPool2d (8+1=9). The parameter freeze sets the number of layers
to be frozen. For example, freeze=6 will freeze all but the last 2 layers
and the AdaptiveAvgPool2d layer. If freeze='all' the entire network is
frozen. In addition, there is the option to add a Fully Connected (FC) set
of dense layers (the FC-Head, referred to as 'imagehead') on top of the
backbone
Parameters
----------
pretrained: boolean that indicates whether or not we use a pretrained Resnet network
or a series of conv layers (see conv_layer function)
resnet: int indicating the resnet architecture. One of 18, 34 or 50
freeze: int or string indicating the number of layers to freeze. If an int,
it must be less than 8
head_layers: optional list with the sizes of the stacked dense layers in the head
e.g: [128, 64]
head_dropout: optional list with the dropout between the dense layers.
e.g: [0.5, 0.5].
head_batchnorm: Optional Boolean indicating whether or not to include batch
normalization in the dense layers that form the imagehead
Attributes
----------
backbone: Sequential stack of CNNs comprising the 'backbone' of the network
imagehead: Sequential stack of dense layers comprising the FC-Head (aka imagehead)
output_dim: integer containing the output dimension of the model. This
attribute is necessary to build the WideDeep class
Example
--------
>>> import torch
>>> from pytorch_widedeep.models import DeepImage
>>> X_img = torch.rand((2,3,224,224))
>>> model = DeepImage(head_layers=[512, 64, 8])
>>> model(X_img)
tensor([[ 7.7234e-02, 8.0923e-02, 2.3077e-01, -5.1122e-03, -4.3018e-03,
3.1193e-01, 3.0780e-01, 6.5098e-01],
[ 4.6191e-02, 6.7856e-02, -3.0163e-04, -3.7670e-03, -2.1437e-03,
1.5416e-01, 3.9227e-01, 5.5048e-01]], grad_fn=<LeakyReluBackward1>)
"""
def __init__(self,
pretrained:bool=True,
......@@ -29,20 +75,6 @@ class DeepImage(nn.Module):
head_dropout:Optional[List[float]]=None,
head_batchnorm:Optional[bool] = False):
super(DeepImage, self).__init__()
"""
Standard image classifier/regressor using a pretrained network
freezing some of the first layers (or all layers).
I use Resnets which have 9 "components" before the last dense layers.
The first 4 are: conv->batchnorm->relu->maxpool.
After that we have 4 additional 'layers' (so 4+4=8) comprised by a
series of convolutions and then the final AdaptiveAvgPool2d (8+1=9).
The parameter freeze sets the last layer to be frozen. For example,
freeze=6 will freeze all but the last 2 Layers and AdaptiveAvgPool2d
layer. If freeze='all' it freezes the entire network.
"""
self.head_layers = head_layers
......@@ -86,9 +118,14 @@ class DeepImage(nn.Module):
self.output_dim = 512
if self.head_layers is not None:
self.head = nn.Sequential()
assert self.head_layers[0]==self.output_dim, (
"The output dimension from the backbone ({}) is not consistent with "
"the expected input dimension ({}) of the fc-head".format(
self.output_dim, self.head_layers[0]))
if not head_dropout: head_dropout = [0.]*len(head_layers)
self.imagehead = nn.Sequential()
for i in range(1, len(head_layers)):
self.head.add_module(
self.imagehead.add_module(
'dense_layer_{}'.format(i-1),
dense_layer(head_layers[i-1], head_layers[i], head_dropout[i-1], head_batchnorm)
)
......@@ -98,7 +135,7 @@ class DeepImage(nn.Module):
x = self.backbone(x)
x = x.view(x.size(0), -1)
if self.head_layers is not None:
out = self.head(x)
out = self.imagehead(x)
return out
else:
return x
\ No newline at end of file
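A sketch of the freezing behaviour described in the DeepImage docstring, assuming the default pretrained resnet backbone whose output dimension is 512 and adding the optional imagehead:

import torch
from pytorch_widedeep.models import DeepImage

# freeze=6 leaves the last two resnet blocks and the AdaptiveAvgPool2d layer trainable
model = DeepImage(pretrained=True, freeze=6,
                  head_layers=[512, 256, 64], head_dropout=[0.5, 0.5])
X_img = torch.rand(2, 3, 224, 224)
out = model(X_img)   # shape (2, 64): the size of the last dense layer in the imagehead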
......@@ -4,22 +4,68 @@ import warnings
from torch import nn
from ..wdtypes import *
from .deep_dense import dense_layer
class DeepText(nn.Module):
r"""Standard text classifier/regressor comprised by a stack of RNNs (LSTMs).
In addition, there is the option to add a Fully Connected (FC) set of dense
layers (FC-Head, referred as 'texthead') on top of the stack of RNNs
Parameters
----------
vocab_size: number of words in the vocabulary
hidden_dim: number of features in the hidden state h of the LSTM
n_layers: number of recurrent layers
rnn_dropout: dropout layer on the outputs of each LSTM layer except the last
layer
bidirectional: boolean indicating whether the stacked RNNs are bidirectional
padding_idx: index of the padding token in the padded-tokenised sequences.
default: 1. I use the fastai tokenizer where the token index 0 is
reserved for the unknown word token
embed_dim: optional integer indicating the dimension of the word embedding matrix
embedding_matrix: optional array with pretrained word embeddings
head_layers: optional list with the sizes of the stacked dense layers in the head
e.g: [128, 64]
head_dropout: optional list with the dropout between the dense layers.
e.g: [0.5, 0.5].
head_batchnorm: Optional Boolean indicating whether or not to include batch
normalization in the dense layers that form the texthead
Attributes
----------
word_embed: Module with the word embedding matrix
rnn: Module with the stack of LSTMs
texthead: optional Sequential stack of dense layers
output_dim: integer containing the output dimension of the model. This
attribute is necessary to build the WideDeep class
Example
--------
>>> import torch
>>> from pytorch_widedeep.models import DeepText
>>> X_text = torch.cat((torch.zeros([5,1]), torch.empty(5, 4).random_(1,4)), axis=1)
>>> model = DeepText(vocab_size=4, hidden_dim=4, n_layers=1, padding_idx=0, embed_dim=4)
>>> model(X_text)
tensor([[ 0.0315, 0.0393, -0.0618, -0.0561],
[-0.0674, 0.0297, -0.1118, -0.0668],
[-0.0446, 0.0814, -0.0921, -0.0338],
[-0.0844, 0.0681, -0.1016, -0.0464],
[-0.0268, 0.0294, -0.0988, -0.0666]], grad_fn=<SelectBackward>)
"""
def __init__(self,
vocab_size:int,
hidden_dim:int=64,
n_layers:int=3,
rnn_dropout:float=0.,
padding_idx:int=1,
bidirectional:bool=False,
padding_idx:int=1,
embed_dim:Optional[int]=None,
embedding_matrix:Optional[np.ndarray]=None):
embedding_matrix:Optional[np.ndarray]=None,
head_layers:Optional[List[int]] = None,
head_dropout:Optional[List[float]]=None,
head_batchnorm:Optional[bool] = False):
super(DeepText, self).__init__()
"""
Standard Text Classifier/Regressor with a stack of RNNs.
"""
if embed_dim is not None and embedding_matrix is not None and not embed_dim==embedding_matrix.shape[1]:
warnings.warn(
......@@ -29,7 +75,7 @@ class DeepText(nn.Module):
embedding_matrix.shape[1]), UserWarning)
self.bidirectional = bidirectional
self.word_embed_dropout = nn.Dropout2d(spatial_dropout)
self.head_layers = head_layers
# Pre-trained Embeddings
if isinstance(embedding_matrix, np.ndarray):
......@@ -50,6 +96,20 @@ class DeepText(nn.Module):
# the output_dim attribute will be used as input_dim when "merging" the models
self.output_dim = hidden_dim*2 if bidirectional else hidden_dim
if self.head_layers is not None:
assert self.head_layers[0]==self.output_dim, (
"The output dimension from the stack or RNNs ({}) is not consistent with "
"the expected input dimension ({}) of the fc-head".format(
self.output_dim, self.head_layers[0]))
if not head_dropout: head_dropout = [0.]*len(head_layers)
self.texthead = nn.Sequential()
for i in range(1, len(head_layers)):
self.texthead.add_module(
'dense_layer_{}'.format(i-1),
dense_layer(head_layers[i-1], head_layers[i], head_dropout[i-1], head_batchnorm)
)
self.output_dim = head_layers[-1]
def forward(self, X:Tensor)->Tensor:
embed = self.word_embed(X.long())
......@@ -58,4 +118,8 @@ class DeepText(nn.Module):
last_h = torch.cat((h[-2], h[-1]), dim = 1)
else:
last_h = h[-1]
return last_h
if self.head_layers is not None:
out = self.texthead(last_h)
return out
else:
return last_h
\ No newline at end of file
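A sketch of the new optional FC-Head ('texthead') on top of the stack of RNNs, reusing the toy inputs from the DeepText docstring; note that head_layers[0] has to match the RNN output dimension (hidden_dim, or hidden_dim*2 if bidirectional):

import torch
from pytorch_widedeep.models import DeepText

X_text = torch.cat((torch.zeros([5, 1]), torch.empty(5, 4).random_(1, 4)), dim=1)
model = DeepText(vocab_size=4, hidden_dim=8, n_layers=1, padding_idx=0, embed_dim=4,
                 head_layers=[8, 4], head_dropout=[0.5])
out = model(X_text)   # shape (5, 4): the size of the last dense layer in the texthead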
......@@ -4,6 +4,30 @@ from torch import nn
from ..wdtypes import *
class Wide(nn.Module):
r"""simple linear layer between the one-hot encoded wide input and the output
neuron.
Parameters
----------
wide_dim: number of features (columns) of the one-hot encoded wide input
output_dim: size of the output tensor
Attributes
----------
wide_linear: the linear layer that comprises the wide branch of the model
Example
--------
>>> import torch
>>> from pytorch_widedeep.models import Wide
>>> X = torch.empty(4, 4).random_(2)
>>> wide = Wide(wide_dim=X.size(1), output_dim=1)
>>> wide(X)
tensor([[-0.8841],
[-0.8633],
[-1.2713],
[-0.4762]], grad_fn=<AddmmBackward>)
"""
def __init__(self,wide_dim:int, output_dim:int=1):
super(Wide, self).__init__()
self.wide_linear = nn.Linear(wide_dim, output_dim)
......
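In practice wide_dim is the number of columns of the one-hot encoded matrix produced by the WidePreprocessor; a sketch, assuming the X_wide matrix built in the scripts above:

X_wide = prepare_wide.fit_transform(df)              # one-hot encoded wide matrix
wide = Wide(wide_dim=X_wide.shape[1], output_dim=1)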
from ._preprocessors import WidePreprocessor
from ._preprocessors import DeepPreprocessor
from ._preprocessors import TextPreprocessor
from ._preprocessors import ImagePreprocessor
from ._preprocessors import ImagePreprocessor
\ No newline at end of file
......@@ -10,9 +10,9 @@ from scipy.sparse import csc_matrix
from tqdm import tqdm
from ..wdtypes import *
from .utils.dense_utils import *
from .utils.text_utils import *
from .utils.image_utils import *
from ..utils.dense_utils import *
from ..utils.text_utils import *
from ..utils.image_utils import *
class BasePreprocessor(object):
......@@ -184,7 +184,8 @@ class TextPreprocessor(BasePreprocessor):
if self.verbose:
print("The vocabulary contains {} words".format(len(self.vocab.stoi)))
if self.word_vectors_path is not None:
self.embedding_matrix = build_embeddings_matrix(self.vocab, self.word_vectors_path)
self.embedding_matrix = build_embeddings_matrix(self.vocab, self.word_vectors_path,
self.min_freq)
return padded_seq
def fit_transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
......
import numpy as np
import pandas as pd
from ...wdtypes import *
from ..wdtypes import *
pd.options.mode.chained_assignment = None
......
......@@ -8,7 +8,7 @@ way I avoid the numerous fastai dependencies.
Credit for the code here to Jeremy Howard and the fastai team
'''
from ...wdtypes import *
from ..wdtypes import *
import sys
import os
......
......@@ -11,7 +11,7 @@ import numpy as np
import imutils
import cv2
from ...wdtypes import *
from ..wdtypes import *
class AspectAwarePreprocessor:
......
......@@ -4,7 +4,7 @@ import html
import os
import re
from ...wdtypes import *
from ..wdtypes import *
from .fastai_transforms import Tokenizer, Vocab
from gensim.utils import tokenize
......@@ -35,7 +35,8 @@ def pad_sequences(seq:List[int], maxlen:int, pad_first:bool=True, pad_idx:int=1)
return res
def build_embeddings_matrix(vocab:Vocab, word_vectors_path:str, verbose:int=1) -> np.ndarray:
def build_embeddings_matrix(vocab:Vocab, word_vectors_path:str, min_freq:int,
verbose:int=1) -> np.ndarray:
if not os.path.isfile(word_vectors_path):
raise FileNotFoundError("{} not found".format(word_vectors_path))
......@@ -68,7 +69,8 @@ def build_embeddings_matrix(vocab:Vocab, word_vectors_path:str, verbose:int=1) -
embedding_matrix[i] = mean_word_vector
if verbose:
print('{} words in the vocabulary had {} vectors and appear more than the min frequency'.format(found_words, word_vectors_path))
print('{} words in the vocabulary had {} vectors and appear more than {} times'.format(
found_words, word_vectors_path, min_freq))
return embedding_matrix
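build_embeddings_matrix now also receives min_freq, so the verbose message can report how often the retained words appear. A usage sketch matching the new signature; the word-vectors path is hypothetical:

embedding_matrix = build_embeddings_matrix(
    vocab=text_processor.vocab,              # Vocab built by the TextPreprocessor
    word_vectors_path='glove.6B.100d.txt',   # hypothetical path to pretrained word vectors
    min_freq=5,                              # same min_freq used to build the vocabulary
)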
import sys
import scipy
from torch.nn import Module
from torch import Tensor
......@@ -10,9 +9,7 @@ from torch.optim.lr_scheduler import _LRScheduler
from pathlib import PosixPath
from typing import (List, Any, Union, Dict, Callable, Optional, Tuple,
Generator, Collection, Iterable)
sparse_matrix = Union[scipy.sparse.csr.csr_matrix]
from scipy.sparse.csr import csr_matrix as sparse_matrix
SimpleNamespace = type(sys.implementation)
ListRules = Collection[Callable[[str],str]]
......