Commit 1d1ea3c5 authored by J jrzaurin

Refined the documentation for the WarmUp class. Fixed an issue regarding the model type. Added a test for the warm up functionalities.
Parent 0802b33a
......@@ -3,37 +3,36 @@ import torch
from ..metrics import Metric
from ..wdtypes import *
from ._wdmodel_type import WDModel
from tqdm import tqdm, trange
from torch import nn
use_cuda = torch.cuda.is_available()
import pdb
class WarmUp(object):
r"""Class containing the 'warm up' methods to be applied to the individual
models before the joined training. There are 3 warm up routines:
r"""
'Warm up' methods to be applied to the individual models before the joined
training. There are 3 warm up routines available:
1) Warm up all trainable layers at once
2) Gradual warm up inspired by the work of Felbo et al., 2017
3) Gradual warm up inspired by the work of Howard & Ruder 2018
The structure of the code in this class is highly customised to be
instantiated within the class WideDeep. This is not ideal, but represents a
compromise towards implementing a 'warm up' functionality for the current
overall structure of the package without having to re-structure most of the
existing code.
The structure of the code in this class is designed to be instantiated within
the class WideDeep. This is not ideal, but represents a compromise towards
implementing a 'warm up' functionality for the current overall structure of
the package without having to re-structure most of the existing code.
Parameters
----------
activation_fn: Any
any function with the same structure as '_activation_fn' in the main class
WideDeep
WideDeep at pytorch_widedeep.models.wide_deep
loss_fn: Any
any function with the same structure as '_loss_fn' in the main class WideDeep
at pytorch_widedeep.models.wide_deep
metric: Metric
object of class Metric (see metrics.Metric)
object of class Metric (see Metric in pytorch_widedeep.metrics)
method: str
one of 'binary', 'regression' or 'multiclass'
verbose: Boolean
......@@ -47,30 +46,31 @@ class WarmUp(object):
self.method = method
self.verbose = verbose
def warm_all(self, model:WDModel, model_name:str, loader:DataLoader, n_epochs:int,
def warm_all(self, model:nn.Module, model_name:str, loader:DataLoader, n_epochs:int,
max_lr:float):
r"""
Warm up all trainable layers in a model using a one cycle triangular
learning rate. This is referred to as Slanted Triangular learning rate in Jeremy
Howard & Sebastian Ruder 2018 (https://arxiv.org/abs/1801.06146). The cycle
is described as follows:
Warm up all trainable layers in a model using a one-cycle learning rate
with a triangular pattern. This is referred to as Slanted Triangular learning
rate in Jeremy Howard & Sebastian Ruder 2018
(https://arxiv.org/abs/1801.06146). The cycle is described as follows:
1-The learning rate will gradually increase for 10% of the training steps
from max_lr/10 to max_lr.
2-It will then gradually decrease to max_lr/10 for the remaining 90%.
The optimizer used in the process is AdamW (not optional).
2-It will then gradually decrease to max_lr/10 for the remaining 90% of the
steps.
The optimizer used in the process is AdamW.
Parameters:
----------
model: WDModel
WDModel object containing one of the WideDeep model components (wide,
model: nn.Module
nn.Module object containing one of the WideDeep model components (wide,
deepdense, deeptext or deepimage)
model_name: Str
string indicating the model name to access the corresponding parameters.
One of 'wide', 'deepdense', 'deeptext' or 'deepimage'
loader: DataLoader
Pytorch DataLoader containing the data to warm up with.
Pytorch DataLoader containing the data used to warm up
n_epochs: Int
number of epochs used to warm up the model.
number of epochs used to warm up the model
max_lr: Float
maximum learning rate value during the triangular cycle.
"""
......@@ -85,7 +85,7 @@ class WarmUp(object):
self._warm(model, model_name, loader, optimizer, scheduler, n_epochs=n_epochs)
def warm_gradual(self, model:WDModel, model_name:str, loader:DataLoader,
def warm_gradual(self, model:nn.Module, model_name:str, loader:DataLoader,
max_lr:float, layers:List[nn.Module], routine:str):
r"""
Warm up certain layers within the model following a gradual warm up routine.
......@@ -94,12 +94,11 @@ class WarmUp(object):
Howard & Sebastian Ruder 2018 ULMFit paper
(https://arxiv.org/abs/1801.06146).
As in the case of the 'warm_all' method, a one cycle triangular learning
rate is used. In both Felbo's and Howard's routines a gradually decreasing
learning rate is used as we go deeper into the network. The 'closest' layer
to the output neuron(s) will use a maximum learning rate of 'max_lr'. The
learning rate will then decrease by a factor of 2.5 per layer, i.e.:
max_lrs = [0.01] + [0.01/(2.5*n) for n in range(1, len(layers))]
A one cycle triangular learning rate is used. In both Felbo's and Howard's
routines a gradually decreasing learning rate is used as we go deeper into
the network. The 'closest' layer to the output neuron(s) will use a maximum
learning rate of 'max_lr'. The learning rate will then decrease by a factor
of 2.5 per layer.
1) The 'Felbo' routine:
warm up the first layer in 'layers' for one epoch. Then warm up the next
......@@ -107,13 +106,13 @@ class WarmUp(object):
Repeat
2) The 'Howard' routine:
warm up the first layer in 'layers' for one epoch. Then warm the next layer
in the model for one epoch while keep the already warmed up layer(s)
in the model for one epoch while keeping the already warmed up layer(s)
trainable. Repeat.
Parameters:
----------
model: WDModel
WDModel object containing one of the WideDeep model components (wide,
model: nn.Module
nn.Module object containing one of the WideDeep model components (wide,
deepdense, deeptext or deepimage)
model_name: Str
string indicating the model name to access the corresponding parameters.
......@@ -122,15 +121,12 @@ class WarmUp(object):
Pytorch DataLoader containing the data to warm up with.
max_lr: Float
maximum learning rate value during the triangular cycle for the layer
'closest' to the output neuron(s). Deeper layers in the model will be
trained with a gradual descending learning rate. The descending factor is
fixed and is 2.5, i.e:
max_lrs = [0.01] + [0.01/(2.5*n) for n in range(1, len(layers))]
closest to the output neuron(s). Deeper layers in 'model' will be trained
with a gradually descending learning rate. The descending factor is fixed
and is 2.5.
layers: List
List of nn.Module objects containing the layers that will be warmed up.
This must be in 'WARM-UP ORDER', i.e. the closest layer to the output
neuron(s) must be the first element of the list while the deepest layer
that will be warmed up must be the last element in the list
This must be in 'WARM-UP ORDER'.
routine: str
one of 'howard' or 'felbo'
"""
......@@ -139,10 +135,10 @@ class WarmUp(object):
original_setup = {}
for n,p in model.named_parameters(): original_setup[n] = p.requires_grad
# decreasing learning rates
# gradually decreasing learning rates
max_lrs = [0.01] + [0.01/(2.5*n) for n in range(1, len(layers))]
# freezing the layers that have to be warmed up gradually
# freezing the layers that will be warmed up gradually
for layer in layers:
for p in layer.parameters(): p.requires_grad=False
......@@ -169,12 +165,12 @@ class WarmUp(object):
# back to the original setup
for n,p in model.named_parameters(): p.requires_grad = original_setup[n]
# If felbo we train the whole model for one last epoch
# If 'felbo' we train the whole model for one last epoch
if routine == 'felbo':
print('Warming up one last epoch with all warmed up layers trainable')
if self.verbose: print('Warming up one last epoch with all warmed up layers trainable')
self._warm(model, model_name, loader, optimizer, scheduler)
def _warm(self, model:WDModel, model_name:str, loader:DataLoader, optimizer:Optimizer,
def _warm(self, model:nn.Module, model_name:str, loader:DataLoader, optimizer:Optimizer,
scheduler:LRScheduler, n_epochs:int=1):
r"""
Standard Pytorch training loop
......
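For reference, a 'standard Pytorch training loop' of the kind _warm runs looks roughly as follows (the toy model, loader and loss below are placeholders, not the package's code):

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

model = nn.Linear(4, 1)                                    # placeholder component
loader = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)), batch_size=8)
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for epoch in range(1):                                     # one warm-up epoch
    for X, y in loader:
        optimizer.zero_grad()
        loss = loss_fn(model(X), y)
        loss.backward()
        optimizer.step()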
......@@ -16,7 +16,6 @@ from ._wd_dataset import WideDeepDataset
from ._multiple_optimizer import MultipleOptimizer
from ._multiple_lr_scheduler import MultipleLRScheduler
from ._multiple_transforms import MultipleTransforms
from ._wdmodel_type import WDModel
from ._warmup import WarmUp
from .deep_dense import dense_layer
......@@ -27,7 +26,6 @@ from torch.utils.data import DataLoader
n_cpus = os.cpu_count()
use_cuda = torch.cuda.is_available()
import pdb
class WideDeep(nn.Module):
r""" Main collector class to combine all Wide, DeepDense, DeepText and
......@@ -64,8 +62,8 @@ class WideDeep(nn.Module):
head_dropout: List, Optional
Dropout between the dense layers. e.g: [0.5, 0.5]
head_batchnorm: Boolean, Optional
Whether or not to include batch normalization in the dense layers that
form the texthead
Specifies if batch normalization should be included in the dense layers
that form the texthead
output_dim: Int
Size of the final layer. 1 for regression and binary classification or
'n_class' for multiclass classification
......@@ -240,7 +238,7 @@ class WideDeep(nn.Module):
yourself. See here:
https://discuss.pytorch.org/t/passing-the-weights-to-crossentropyloss-correctly/14731/10
with_focal_loss: Boolean, Optional. Default=False
Whether or not to use the Focal Loss. https://arxiv.org/pdf/1708.02002.pdf
Use the Focal Loss. https://arxiv.org/pdf/1708.02002.pdf
alpha, gamma: Float. Default=0.25, 2
Focal Loss parameters. See: https://arxiv.org/pdf/1708.02002.pdf
verbose: Int
......@@ -402,14 +400,36 @@ class WideDeep(nn.Module):
Number of epochs without improving the target metric before we
stop the fit
warm_up: Boolean, Default=False
Warm up the models individually before starting the joined training
Warm up model components individually before the joined training
warm_epochs: Int, Default=4
Number of warm up epochs
Number of warm up epochs for those model components that will not
be gradually warmed up
warm_max_lr: Float, Default=0.01
Warming up will happen using a slanted triangular learning rate
(https://arxiv.org/pdf/1801.06146.pdf). warm_max_lr indicates the
maximum learning rate that will be used during the cycle. The
minimum (base_lr) learning rate is warm_max_lr/10.
Maximum learning rate during the Triangular Learning rate cycle
for those model components that will not be gradually warmed up
warm_deeptext_gradual: Boolean, Default=False
Boolean indicating if the deeptext component will be warmed
up gradually
warm_deeptext_max_lr: Float, Default=0.01
Maximum learning rate during the Triangular Learning rate cycle
for the deeptext component
warm_deeptext_layers: Optional, List, Default=None
List of nn.Modules that will be warmed up gradually. These have to
be in 'warm-up-order': the layers or blocks close to the output
neuron(s) first
warm_deepimage_gradual: Boolean, Default=False
Boolean indicating if the deepimage component will be warmed
up gradually
warm_deepimage_max_lr: Float, Default=0.01
Maximum learning rate during the Triangular Learning rate cycle
for the deepimage component
warm_deepimage_layers: Optional, List, Default=None
List of nn.Modules that will be warmed up gradually. These have to
be in 'warm-up-order': the layers or blocks close to the output
neuron(s) first
warm_routine: Str, Default='felbo'
Warm up routine. One of 'felbo' or 'howard'. See the WarmUp class
documentation for details
**WideDeep assumes that X_wide, X_deep and target ALWAYS exist, while
X_text and X_img are optional
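A hypothetical call sketch using the warm-up arguments documented above (the WideDeep instance, the data arrays and the layer lists are assumed to already exist, and all other fit arguments are omitted):

model.fit(
    X_wide=X_wide, X_deep=X_deep, X_text=X_text, X_img=X_img, target=target,
    warm_up=True,
    warm_epochs=4,                        # for the components warmed up all at once
    warm_max_lr=0.01,
    warm_deeptext_gradual=True,
    warm_deeptext_max_lr=0.01,
    warm_deeptext_layers=text_layers,     # warm-up order: closest to the output first
    warm_deepimage_gradual=True,
    warm_deepimage_max_lr=0.01,
    warm_deepimage_layers=image_layers,
    warm_routine='howard',
)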
......
import pytest
import numpy as np
import string
import torch
import torch.nn.functional as F
from torch import nn
from sklearn.utils import Bunch
from torch.utils.data import DataLoader, Dataset
from pytorch_widedeep.models import Wide, DeepDense
from pytorch_widedeep.models.deep_image import conv_layer
from pytorch_widedeep.metrics import BinaryAccuracy
from pytorch_widedeep.models._warmup import WarmUp
use_cuda = torch.cuda.is_available()
# Define a series of simple models to quickly test the WarmUp class
class DeepText(nn.Module):
def __init__(self):
super(DeepText, self).__init__()
self.word_embed = nn.Embedding(5, 16, padding_idx=0)
self.rnn = nn.LSTM(16, 8, batch_first=True)
self.linear = nn.Linear(8,1)
def forward(self, X):
embed = self.word_embed(X.long())
o, (h, c) = self.rnn(embed)
return self.linear(h).view(-1,1)
class DeepImage(nn.Module):
def __init__(self):
super(DeepImage, self).__init__()
self.conv_block = nn.Sequential(
conv_layer(3, 64, 3),
conv_layer(64, 128, 1,
maxpool=False,
adaptiveavgpool=True)
)
self.linear = nn.Linear(128, 1)
def forward(self, X):
x = self.conv_block(X)
x = x.view(x.size(0), -1)
return self.linear(x)
# Define a simple WideDeep Dataset
class WDset(Dataset):
def __init__(self, X_wide, X_deep, X_text, X_img, target):
self.X_wide = X_wide
self.X_deep = X_deep
self.X_text = X_text
self.X_img = X_img
self.Y = target
def __getitem__(self, idx:int):
X = Bunch(wide=self.X_wide[idx])
X.deepdense = self.X_deep[idx]
X.deeptext = self.X_text[idx]
X.deepimage = self.X_img[idx]
y = self.Y[idx]
return X, y
def __len__(self):
return len(self.X_deep)
# Remember that the WarmUp class will be instantiated inside the WideDeep and
# will take, among others, the activation_fn and the loss_fn of that class as
# parameters. Therefore, we define equivalent functions to replicate the
# scenario
def activ_fn(inp):
return torch.sigmoid(inp)
def loss_fn(y_pred, y_true):
return F.binary_cross_entropy(y_pred, y_true.view(-1, 1))
# Define the data components:
# target
target = torch.empty(100, 1).random_(0,2)
# wide
X_wide = torch.empty(100, 10).random_(0,2)
# deep
colnames = list(string.ascii_lowercase)[:10]
embed_cols = [np.random.choice(np.arange(5), 100) for _ in range(5)]
cont_cols = [np.random.rand(100) for _ in range(5)]
embed_input = [(u,i,j) for u,i,j in zip(colnames[:5], [5]*5, [16]*5)]
deep_column_idx = {k: v for v, k in enumerate(colnames[:10])}
continuous_cols = colnames[-5:]
X_deep = torch.from_numpy(np.vstack(embed_cols+cont_cols).transpose())
# text
X_text = torch.cat((torch.zeros([100,1]), torch.empty(100, 4).random_(1,4)), dim=1)
# image
X_image = torch.rand(100, 3, 28, 28)
# Define the model components
# wide
wide = Wide(10, 1)
if use_cuda: wide.cuda()
# deep
deepdense = DeepDense(
hidden_layers=[16,8],
dropout=[0.5, 0.2],
deep_column_idx=deep_column_idx,
embed_input=embed_input,
continuous_cols=continuous_cols)
deepdense = nn.Sequential(deepdense,
nn.Linear(8, 1))
if use_cuda: deepdense.cuda()
# text
deeptext = DeepText()
if use_cuda: deeptext.cuda()
# image
deepimage = DeepImage()
if use_cuda: deepimage.cuda()
# Define the loader
wdset = WDset(X_wide, X_deep, X_text, X_image, target)
wdloader = DataLoader(wdset, batch_size=10, shuffle=True)
# Instantiate the WarmUp class
warmer = WarmUp(activ_fn, loss_fn, BinaryAccuracy(), 'binary', False)
# List the layers for the warm_gradual method in 'warm-up order' (the [::-1]
# puts the layer closest to the output first)
text_layers = [c for c in list(deeptext.children())[1:]][::-1]
image_layers = [c for c in list(deepimage.children())][::-1]
###############################################################################
# Simply test that warm_all runs
###############################################################################
@pytest.mark.parametrize(
'model, modelname, loader, n_epochs, max_lr',
[
(wide, 'wide', wdloader, 1, 0.01),
(deepdense, 'deepdense', wdloader, 1, 0.01),
(deeptext, 'deeptext' , wdloader, 1, 0.01),
(deepimage, 'deepimage', wdloader, 1, 0.01)
]
)
def test_warm_all(model, modelname, loader, n_epochs, max_lr):
has_run = True
try: warmer.warm_all(model, modelname, loader, n_epochs, max_lr)
except Exception: has_run = False
assert has_run
###############################################################################
# Simply test that warm_gradual runs
###############################################################################
@pytest.mark.parametrize(
'model, modelname, loader, max_lr, layers, routine',
[
(deeptext, 'deeptext' , wdloader, 0.01, text_layers, 'felbo'),
(deeptext, 'deeptext' , wdloader, 0.01, text_layers, 'howard'),
(deepimage, 'deepimage' , wdloader, 0.01, image_layers, 'felbo'),
(deepimage, 'deepimage' , wdloader, 0.01, image_layers, 'howard'),
]
)
def test_warm_gradual(model, modelname, loader, max_lr, layers, routine):
has_run = True
try: warmer.warm_gradual(model, modelname, loader, max_lr, layers, routine)
except Exception: has_run = False
assert has_run