import numpy as np
import torch

from ..metrics import Metric
from ..wdtypes import *

from tqdm import trange
from torch import nn

use_cuda = torch.cuda.is_available()


class WarmUp(object):
	r"""
	'Warm up' methods to be applied to the individual models before the joint
	training. There are 3 warm up routines available:
	1) Warm up all trainable layers at once
	2) Gradual warm up inspired by the work of Felbo et al., 2017
	3) Gradual warm up inspired by the work of Howard & Ruder, 2018

	The structure of the code in this class is designed to be instantiated within
	the class WideDeep. This is not ideal, but represents a compromise towards
	implementing a 'warm up' functionality for the current overall structure of
	the package without having to re-structure most of the existing code.

	Parameters
	----------
	activation_fn: Any
		any function with the same structure as '_activation_fn' in the main class
		WideDeep at pytorch_widedeep.models.wide_deep
	loss_fn: Any
		any function with the same structure as '_loss_fn' in the main class
		WideDeep at pytorch_widedeep.models.wide_deep
	metric: Metric
		object of class Metric (see Metric in pytorch_widedeep.metrics)
	method: str
		one of 'binary', 'regression' or 'multiclass'
	verbose: bool
		whether or not to print the warm up process to screen
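
	Examples
	--------
	The snippet below is only an illustrative sketch of the intended use from
	within an instance of WideDeep; 'loader' and the attribute names used here
	are assumptions for the example, not a fixed API:

		warmer = WarmUp(self._activation_fn, self._loss_fn, self.metric,
			self.method, self.verbose)
		warmer.warm_all(self.wide, 'wide', loader, n_epochs=1, max_lr=0.01)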
	"""
	def __init__(self, activation_fn:Any, loss_fn:Any, metric:Metric, method:str,
		verbose:bool):
		super(WarmUp, self).__init__()
		self.activation_fn = activation_fn
		self.loss_fn = loss_fn
		self.metric = metric
		self.method = method
		self.verbose = verbose

	def warm_all(self, model:nn.Module, model_name:str, loader:DataLoader, n_epochs:int,
		max_lr:float):
		r"""
		Warm up all trainable layers in a model using a one-cycle learning rate
		with a triangular pattern. This is referred to as Slanted Triangular
		learning rate in Howard & Ruder, 2018 (https://arxiv.org/abs/1801.06146).
		The cycle is described as follows:
		1) The learning rate will gradually increase for 10% of the training steps
			from max_lr/10 to max_lr.
		2) It will then gradually decrease to max_lr/10 for the remaining 90% of
			the steps.
		The optimizer used in the process is AdamW.
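		For example, if the loader has 500 batches and n_epochs=1, the learning
		rate increases during the first 50 steps and decreases during the
		remaining 450 (see '_steps_up_down' below).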

		Parameters
		----------
		model: nn.Module
			nn.Module object containing one of the WideDeep model components (wide,
			deepdense, deeptext or deepimage)
		model_name: str
			string indicating the model name to access the corresponding parameters.
			One of 'wide', 'deepdense', 'deeptext' or 'deepimage'
		loader: DataLoader
			Pytorch DataLoader containing the data used to warm up
		n_epochs: int
			number of epochs used to warm up the model
		max_lr: float
			maximum learning rate value during the triangular cycle.
		"""
		if self.verbose: print('Warming up {} for {} epochs'.format(model_name, n_epochs))
		model.train()

		optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr/10.)
		step_size_up, step_size_down = self._steps_up_down(len(loader), n_epochs)
		scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=max_lr/10.,
		    max_lr=max_lr, step_size_up=step_size_up, step_size_down=step_size_down,
		    cycle_momentum=False)

		self._warm(model, model_name, loader, optimizer, scheduler, n_epochs=n_epochs)

	def warm_gradual(self, model:nn.Module, model_name:str, loader:DataLoader,
		max_lr:float, layers:List[nn.Module], routine:str):
		r"""
		Warm up certain layers within the model following a gradual warm up routine.
		The approaches implemented in this method are inspired by the work of Felbo
		et al., 2017 in their DeepMoji paper (https://arxiv.org/abs/1708.00524) and
		the ULMFiT paper by Howard & Ruder, 2018 (https://arxiv.org/abs/1801.06146).

		A one cycle triangular learning rate is used. In both Felbo's and Howard's
		routines a gradually decreasing learning rate is used as we go deeper into
		the network. The 'closest' layer to the output neuron(s) will use a maximum
		learning rate of 'max_lr'. The learning rate of each deeper layer is then
		divided by an increasing factor of 2.5 (i.e. max_lr/2.5, max_lr/5, ...).

		1) The 'Felbo' routine:
			warm up the first layer in 'layers' for one epoch. Then warm up the next
			layer in 'layers' for one epoch, freezing the already warmed up layer(s).
			Repeat.
		2) The 'Howard' routine:
			warm up the first layer in 'layers' for one epoch. Then warm up the next
			layer in 'layers' for one epoch while keeping the already warmed up
			layer(s) trainable. Repeat.

		Parameters
		----------
		model: nn.Module
			nn.Module object containing one of the WideDeep model components (wide,
			deepdense, deeptext or deepimage)
		model_name: str
			string indicating the model name to access the corresponding parameters.
			One of 'wide', 'deepdense', 'deeptext' or 'deepimage'
		loader: DataLoader
			Pytorch DataLoader containing the data to warm up with.
		max_lr: float
			maximum learning rate value during the triangular cycle for the layer
			closest to the output neuron(s). Deeper layers in 'model' will be trained
			with a gradually descending learning rate (max_lr divided by an
			increasing factor of 2.5)
		layers: List
			List of nn.Module objects containing the layers that will be warmed up.
			This must be in 'warm up order', i.e. the layer closest to the output
			neuron(s) comes first.
		routine: str
			one of 'howard' or 'felbo'
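
		Examples
		--------
		An illustrative sketch only; 'warmer', 'model', 'loader' and the layer
		names are assumptions made for the example:

			text_layers = [model.deeptext.rnn, model.deeptext.embed]  # closest to the output first
			warmer.warm_gradual(model.deeptext, 'deeptext', loader, max_lr=0.01,
				layers=text_layers, routine='felbo')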
		"""
		model.train()
		step_size_up, step_size_down = self._steps_up_down(len(loader))
		original_setup = {}
		for n,p in model.named_parameters(): original_setup[n] = p.requires_grad

		# gradually decreasing learning rates
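		# e.g. with 3 layers and max_lr=0.01 this gives [0.01, 0.004, 0.002]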
		max_lrs = [max_lr] + [max_lr/(2.5*n) for n in range(1, len(layers))]

		# freezing the layers that will be warmed up gradually
		for layer in layers:
			for p in layer.parameters(): p.requires_grad=False

		# Gradual defrosting
		if routine == 'howard': params, max_lr, base_lr = [],[],[]
		for i, (lr, layer) in enumerate(zip(max_lrs, layers)):
			if self.verbose: print('Warming up {}, layer {} of {}'.format(model_name, i+1, len(layers)))
			for p in layer.parameters(): p.requires_grad=True
			if routine == 'felbo':
				params, max_lr, base_lr = layer.parameters(), lr, lr/10.
			elif routine == 'howard':
				params += [{'params': layer.parameters(), 'lr': lr/10.}]
				max_lr += [lr]
				base_lr += [lr/10.]
			optimizer = torch.optim.AdamW(params)
			scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=base_lr,
		        max_lr=max_lr, step_size_up=step_size_up, step_size_down=step_size_down,
		        cycle_momentum=False)
			self._warm(model, model_name, loader, optimizer, scheduler)
			if routine == 'felbo':
				# freezing it again before we warm the next one
				for p in layer.parameters(): p.requires_grad=False

		# back to the original setup
		for n,p in model.named_parameters(): p.requires_grad = original_setup[n]

		# If 'felbo' we train the whole model for one last epoch
		if routine == 'felbo':
			if self.verbose: print('Warming up one last epoch with all warmed up layers trainable')
			self._warm(model, model_name, loader, optimizer, scheduler)

	def _warm(self, model:nn.Module, model_name:str, loader:DataLoader, optimizer:Optimizer,
		scheduler:LRScheduler, n_epochs:int=1):
		r"""
		Standard Pytorch training loop
		"""
		steps = len(loader)
		for epoch in range(n_epochs):
		    running_loss=0.
		    with trange(steps, disable=self.verbose != 1) as t:
		        for batch_idx, (data, target) in zip(t, loader):
		            t.set_description('epoch %i' % (epoch+1))
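		            # select this component's input from the batch dict; move tensors to GPU if available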
		            X = data[model_name].cuda() if use_cuda else data[model_name]
		            y = target.float() if self.method != 'multiclass' else target
		            y = y.cuda() if use_cuda else y

		            optimizer.zero_grad()
		            y_pred = self.activation_fn(model(X))
		            loss   = self.loss_fn(y_pred, y)
		            loss.backward()
		            optimizer.step()
		            scheduler.step()
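		            # the scheduler is stepped per batch so the triangular cycle spans the whole warm up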

		            running_loss += loss.item()
		            avg_loss = running_loss/(batch_idx+1)

		            if self.metric is not None:
		                acc = self.metric(y_pred, y)
		                t.set_postfix(metrics=acc, loss=avg_loss)
		            else:
		                t.set_postfix(loss=np.sqrt(avg_loss))

	def _steps_up_down(self, steps:int, n_epochs:int=1)->Tuple[int,int]:
		r"""
		Calculate the number of steps up and down during the one cycle warm up for a
		given number of epochs

		Parameters
		----------
		steps: int
			steps per epoch
		n_epochs: int, default=1
			number of warm up epochs

		Returns
		-------
		up, down: Tuple[int, int]
			number of steps increasing/decreasing the learning rate during the cycle
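
		Examples
		--------
		For instance, with 100 batches per epoch and two warm up epochs:

		>>> WarmUp(None, None, None, 'binary', False)._steps_up_down(100, n_epochs=2)
		(20, 180)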
		"""
		up = round((steps*n_epochs) * 0.1)
		down = (steps*n_epochs) - up
		return up, down