import numpy as np
import torch
import torch.nn.functional as F
from tqdm import trange
from torch import nn

from pytorch_widedeep.metrics import Metric, MultipleMetrics
from pytorch_widedeep.wdtypes import *  # noqa: F403

use_cuda = torch.cuda.is_available()


class FineTune:
    r"""
    Fine-tune methods to be applied to the individual model components.

    Note that they can also be used to "fine-tune" those components before
    the joint training.

    There are 3 fine-tune/warm-up routines available:

    1) Fine-tune all trainable layers at once

    2) Gradual fine-tuning inspired by the work of Felbo et al., 2017

    3) Gradual fine-tuning inspired by the work of Howard & Ruder, 2018

    This class is designed to be instantiated from within the class
    WideDeep. This is not ideal, but represents a compromise: it adds
    fine-tuning functionality to the current overall structure of the
    package without having to re-structure most of the existing code. This
    will change in future releases.

    Parameters
    ----------
    loss_fn: Any
       any function with the same structure as 'loss_fn' in the class ``Trainer``
    metric: ``Metric`` or ``MultipleMetrics``
       object of class ``Metric`` or ``MultipleMetrics`` (see
       pytorch_widedeep.metrics)
    method: str
       one of 'binary', 'regression' or 'multiclass'
    verbose: int
       verbosity level. Set to 1 to show progress during fine-tuning
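
    Examples
    --------
    A minimal sketch of using the class directly (inside the library it is
    driven internally). The toy component, data and the 'deeptabular' key
    below are illustrative assumptions, not part of this module::

        import torch
        import torch.nn.functional as F
        from torch import nn
        from torch.utils.data import DataLoader

        from pytorch_widedeep.metrics import Accuracy, MultipleMetrics

        # batches must be (inputs_dict, target) tuples keyed by the component name
        X = torch.randn(320, 8)
        y = (X.sum(dim=1) > 0).float()
        data = [({"deeptabular": xi}, yi) for xi, yi in zip(X, y)]
        loader = DataLoader(data, batch_size=32)

        component = nn.Linear(8, 1)
        if torch.cuda.is_available():
            component = component.cuda()

        finetuner = FineTune(
            loss_fn=F.binary_cross_entropy_with_logits,
            metric=MultipleMetrics([Accuracy()]),
            method="binary",
            verbose=1,
        )
        finetuner.finetune_all(
            component, "deeptabular", loader, n_epochs=1, max_lr=0.01
        )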
    """

    def __init__(
        self,
        loss_fn: Any,
        metric: Union[Metric, MultipleMetrics],
        method: str,
        verbose: int,
    ):
        self.loss_fn = loss_fn
        self.metric = metric
        self.method = method
        self.verbose = verbose

    def finetune_all(
        self,
        model: nn.Module,
        model_name: str,
        loader: DataLoader,
        n_epochs: int,
        max_lr: float,
    ):
        r"""Fine-tune/warm-up all trainable layers in a model using a one cyclic
        learning rate with a triangular pattern. This is refereed as Slanted
        Triangular learing rate in Jeremy Howard & Sebastian Ruder 2018
J
jrzaurin 已提交
69
        (https://arxiv.org/abs/1801.06146). The cycle is described as follows:
70 71

        1) The learning rate will gradually increase for 10% of the training steps
J
jrzaurin 已提交
72
            from max_lr/10 to max_lr.
73 74

        2) It will then gradually decrease to max_lr/10 for the remaining 90% of the
J
jrzaurin 已提交
75
            steps.
76

J
jrzaurin 已提交
77 78 79 80
        The optimizer used in the process is AdamW

        Parameters:
        ----------
81 82 83
        model: `Module``
            ``Module`` object containing one the WideDeep model components (wide,
            deeptabular, deeptext or deepimage)
84
        model_name: str
J
jrzaurin 已提交
85
            string indicating the model name to access the corresponding parameters.
86 87
            One of 'wide', 'deeptabular', 'deeptext' or 'deepimage'
        loader: ``DataLoader``
88
            Pytorch DataLoader containing the data used to fine-tune
89
        n_epochs: int
90
            number of epochs used to fine-tune the model
91
        max_lr: float
J
jrzaurin 已提交
92 93 94
            maximum learning rate value during the triangular cycle.
        """
        if self.verbose:
            print("Training {} for {} epochs".format(model_name, n_epochs))
        model.train()

        optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr / 10.0)  # type: ignore
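        # one triangular cycle over all the fine-tune steps: the lr ramps from
        # max_lr / 10 up to max_lr for ~10% of the steps and back down to max_lr / 10
        # for the remaining ~90% (see _steps_up_down)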
        step_size_up, step_size_down = self._steps_up_down(len(loader), n_epochs)
        scheduler = torch.optim.lr_scheduler.CyclicLR(
            optimizer,
            base_lr=max_lr / 10.0,
            max_lr=max_lr,
            step_size_up=step_size_up,
            step_size_down=step_size_down,
            cycle_momentum=False,
        )

        self._finetune(
            model, model_name, loader, optimizer, scheduler, n_epochs=n_epochs
        )

    def finetune_gradual(  # noqa: C901
        self,
        model: nn.Module,
        model_name: str,
        loader: DataLoader,
        last_layer_max_lr: float,
        layers: List[nn.Module],
        routine: str,
    ):
        r"""Fine-tune/warm-up certain layers within the model following a
        gradual fine-tune routine. The approaches implemented in this method are
        based on fine-tuning routines described in the the work of Felbo et
        al., 2017 in their DeepEmoji paper (https://arxiv.org/abs/1708.00524)
        and Howard & Sebastian Ruder 2018 ULMFit paper
127
        (https://arxiv.org/abs/1801.06146).
J
jrzaurin 已提交
128

129 130 131 132 133
        A one cycle triangular learning rate is used. In both Felbo's and
        Howard's routines a gradually decreasing learning rate is used as we
        go deeper into the network. The 'closest' layer to the output
        neuron(s) will use a maximum learning rate of 'last_layer_max_lr'. The
        learning rate will then decrease by a factor of 2.5 per layer
J
jrzaurin 已提交
134

135 136 137 138 139
        1) The 'Felbo' routine: train the first layer in 'layers' for one
           epoch. Then train the next layer in 'layers' for one epoch freezing
           the already trained up layer(s). Repeat untill all individual layers
           are trained. Then, train one last epoch with all trained/fine-tuned
           layers trainable
140

141 142 143
        2) The 'Howard' routine: fine-tune the first layer in 'layers' for one
           epoch. Then traine the next layer in the model for one epoch while
           keeping the already trained up layer(s) trainable. Repeat.
J
jrzaurin 已提交
144 145 146

        Parameters:
        ----------
147 148 149
        model: ``Module``
           ``Module`` object containing one the WideDeep model components (wide,
           deeptabular, deeptext or deepimage)
150
        model_name: str
J
jrzaurin 已提交
151
           string indicating the model name to access the corresponding parameters.
152 153
           One of 'wide', 'deeptabular', 'deeptext' or 'deepimage'
        loader: ``DataLoader``
154
           Pytorch DataLoader containing the data to fine-tune with.
155
        last_layer_max_lr: float
J
jrzaurin 已提交
156 157 158 159
           maximum learning rate value during the triangular cycle for the layer
           closest to the output neuron(s). Deeper layers in 'model' will be trained
           with a gradually descending learning rate. The descending factor is fixed
           and is 2.5
160
        layers: list
161 162
           List of ``Module`` objects containing the layers that will be fine-tuned.
           This must be in *'FINE-TUNE ORDER'*.
J
jrzaurin 已提交
163 164 165 166
        routine: str
           one of 'howard' or 'felbo'
        """
        model.train()

        step_size_up, step_size_down = self._steps_up_down(len(loader))

        original_setup = {}
        for n, p in model.named_parameters():
            original_setup[n] = p.requires_grad
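        # maximum lr per layer, in fine-tune order: 'last_layer_max_lr' for the layer
        # closest to the output, then last_layer_max_lr / (2.5 * n) for the n-th layer
        # below it, e.g. 0.01 -> [0.01, 0.004, 0.002, ...]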
        layers_max_lr = [last_layer_max_lr] + [
            last_layer_max_lr / (2.5 * n) for n in range(1, len(layers))
        ]

        for layer in layers:
            for p in layer.parameters():
                p.requires_grad = False

        if routine == "howard":
            params: List = []
            max_lr: List = []
            base_lr: List = []
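        # 'howard' accumulates parameter groups across layers, so already unfrozen
        # layers keep training with their own (smaller) lrs; 'felbo' optimises one
        # layer at a time and re-freezes it afterwards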

        for i, (lr, layer) in enumerate(zip(layers_max_lr, layers)):
            if self.verbose:
                print(
                    "Training {}, layer {} of {}".format(model_name, i + 1, len(layers))
                )
            for p in layer.parameters():
                p.requires_grad = True
            if routine == "felbo":
                params, max_lr, base_lr = layer.parameters(), lr, lr / 10.0  # type: ignore
            elif routine == "howard":
                params += [{"params": layer.parameters(), "lr": lr / 10.0}]
                max_lr += [lr]
                base_lr += [lr / 10.0]
            optimizer = torch.optim.AdamW(params)
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer,
                base_lr=base_lr,  # type: ignore[arg-type]
                max_lr=max_lr,  # type: ignore
                step_size_up=step_size_up,
                step_size_down=step_size_down,
                cycle_momentum=False,
            )
            self._finetune(model, model_name, loader, optimizer, scheduler)
            if routine == "felbo":
                for p in layer.parameters():
                    p.requires_grad = False

        if routine == "felbo":
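            # Felbo routine: one final epoch with every fine-tuned layer unfrozen,
            # using the same discriminative (per-layer) learning rates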
            if self.verbose:
                print("Training one last epoch...")
            for layer in layers:
                for p in layer.parameters():
                    p.requires_grad = True
            params, max_lr, base_lr = [], [], []
            for lr, layer in zip(layers_max_lr, layers):
                params += [{"params": layer.parameters(), "lr": lr / 10.0}]
                max_lr += [lr]
                base_lr += [lr / 10.0]
            optimizer = torch.optim.AdamW(params)
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer,
                base_lr=base_lr,  # type: ignore
                max_lr=max_lr,  # type: ignore
                step_size_up=step_size_up,
                step_size_down=step_size_down,
                cycle_momentum=False,
            )
            self._finetune(model, model_name, loader, optimizer, scheduler)

        for n, p in model.named_parameters():
            p.requires_grad = original_setup[n]

    def _finetune(
        self,
        model: nn.Module,
        model_name: str,
        loader: DataLoader,
        optimizer: Optimizer,
        scheduler: LRScheduler,
        n_epochs: int = 1,
    ):
        r"""
        Standard Pytorch training loop
        """
        steps = len(loader)
        for epoch in range(n_epochs):
            running_loss = 0.0
            with trange(steps, disable=self.verbose != 1) as t:
                for batch_idx, (data, target) in zip(t, loader):
                    t.set_description("epoch %i" % (epoch + 1))
                    X = data[model_name].cuda() if use_cuda else data[model_name]
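                    # regression and binary targets become a float column vector;
                    # multiclass targets are kept as 1D class indices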
                    y = (
                        target.view(-1, 1).float()
                        if self.method != "multiclass"
                        else target
                    )
                    y = y.cuda() if use_cuda else y

                    optimizer.zero_grad()
                    y_pred = model(X)
                    loss = self.loss_fn(y_pred, y)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss += loss.item()
                    avg_loss = running_loss / (batch_idx + 1)

                    if self.metric is not None:
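                        # metrics receive raw outputs for regression and
                        # probabilities (sigmoid/softmax) for binary/multiclass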
                        if self.method == "regression":
                            score = self.metric(y_pred, y)
                        if self.method == "binary":
                            score = self.metric(torch.sigmoid(y_pred), y)
                        if self.method == "multiclass":
                            score = self.metric(F.softmax(y_pred, dim=1), y)
                        t.set_postfix(
                            metrics={k: np.round(v, 4) for k, v in score.items()},
                            loss=avg_loss,
                        )
                    else:
                        t.set_postfix(loss=avg_loss)

    def _steps_up_down(self, steps: int, n_epochs: int = 1) -> Tuple[int, int]:
        r"""
290
        Calculate the number of steps up and down during the one cycle fine-tune for a
J
jrzaurin 已提交
291 292 293 294
        given number of epochs

        Parameters:
        ----------
295
        steps: int
J
jrzaurin 已提交
296
            steps per epoch
297
        n_epochs: int, default=1
298
            number of fine-tune epochs
J
jrzaurin 已提交
299 300 301

        Returns:
        -------
302
        up, down: Tuple, int
J
jrzaurin 已提交
303 304 305 306 307
            number of steps increasing/decreasing the learning rate during the cycle
        """
        up = round((steps * n_epochs) * 0.1)
        down = (steps * n_epochs) - up
        return up, down