import numpy as np
import torch
import torch.nn.functional as F
from tqdm import trange
from torch import nn

from pytorch_widedeep.metrics import Metric, MultipleMetrics
from pytorch_widedeep.wdtypes import *  # noqa: F403

use_cuda = torch.cuda.is_available()


class FineTune:
    def __init__(
        self,
        loss_fn: Any,
        metric: Union[Metric, MultipleMetrics],
        method: str,
        verbose: int,
    ):
        r"""
        Fine-tune methods to be applied to the individual model components.

        Note that they can also be used to "fine-tune" those components before
        the joint training.

        There are 3 fine-tune/warm-up routines available:

        1) Fine-tune all trainable layers at once

        2) Gradual fine-tuning inspired by the work of Felbo et al., 2017

        3) Gradual fine-tuning inspired by the work of Howard & Ruder, 2018

        The structure of the code in this class is designed to be instantiated
        within the class WideDeep. This is not ideal, but represents a
        compromise towards implementing a fine-tuning functionality for the
        current overall structure of the package without having to
        re-structure most of the existing code. This will change in future
        releases.

        Parameters
        ----------
        loss_fn: Any
           any function with the same structure as 'loss_fn' in the class ``Trainer``
        metric: ``Metric`` or ``MultipleMetrics``
           object of class ``Metric`` or ``MultipleMetrics`` (see ``pytorch_widedeep.metrics``)
        method: str
           one of 'binary', 'regression' or 'multiclass'
        verbose: int
           verbosity level
        """
        self.loss_fn = loss_fn
        self.metric = metric
        self.method = method
        self.verbose = verbose

    def finetune_all(
        self,
        model: nn.Module,
        model_name: str,
        loader: DataLoader,
        n_epochs: int,
        max_lr: float,
    ):
        r"""Fine-tune/warm-up all trainable layers in a model using a one cycle
        learning rate with a triangular pattern. This is referred to as Slanted
        Triangular learning rate in Jeremy Howard & Sebastian Ruder, 2018
        (https://arxiv.org/abs/1801.06146). The cycle is described as follows:

        1) The learning rate will gradually increase for 10% of the training steps
            from max_lr/10 to max_lr.

        2) It will then gradually decrease to max_lr/10 for the remaining 90% of the
            steps.

        The optimizer used in the process is ``AdamW``.

        Parameters
        ----------
        model: ``Module``
            ``Module`` object containing one of the WideDeep model components (wide,
            deeptabular, deeptext or deepimage)
        model_name: str
            string indicating the model name to access the corresponding parameters.
            One of 'wide', 'deeptabular', 'deeptext' or 'deepimage'
        loader: ``DataLoader``
            PyTorch DataLoader containing the data used to fine-tune
        n_epochs: int
            number of epochs used to fine-tune the model
        max_lr: float
            maximum learning rate value during the triangular cycle.
        """
        if self.verbose:
            print("Training {} for {} epochs".format(model_name, n_epochs))
        model.train()

        optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr / 10.0)  # type: ignore
        step_size_up, step_size_down = self._steps_up_down(len(loader), n_epochs)
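        # CyclicLR ramps the learning rate from max_lr / 10 up to max_lr over
        # ~10% of the steps and back down over the remaining ~90%
        # (see _steps_up_down), i.e. the slanted triangular schedule above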
        scheduler = torch.optim.lr_scheduler.CyclicLR(
            optimizer,
            base_lr=max_lr / 10.0,
            max_lr=max_lr,
            step_size_up=step_size_up,
            step_size_down=step_size_down,
            cycle_momentum=False,
        )

        self._finetune(
            model, model_name, loader, optimizer, scheduler, n_epochs=n_epochs
        )

    def finetune_gradual(  # noqa: C901
        self,
        model: nn.Module,
        model_name: str,
        loader: DataLoader,
        last_layer_max_lr: float,
        layers: List[nn.Module],
        routine: str,
    ):
        r"""Fine-tune/warm-up certain layers within the model following a
        gradual fine-tune routine. The approaches implemented in this method
        are based on the fine-tuning routines described in the work of Felbo
        et al., 2017 in their DeepMoji paper (https://arxiv.org/abs/1708.00524)
        and in Howard & Ruder's 2018 ULMFiT paper
        (https://arxiv.org/abs/1801.06146).

        A one cycle triangular learning rate is used. In both Felbo's and
        Howard's routines a gradually decreasing learning rate is used as we
        go deeper into the network. The 'closest' layer to the output
        neuron(s) will use a maximum learning rate of 'last_layer_max_lr'. The
        maximum learning rate then decreases for deeper layers: the n-th layer
        below it uses 'last_layer_max_lr' divided by 2.5 * n.

        1) The 'Felbo' routine: train the first layer in 'layers' for one
           epoch. Then train the next layer in 'layers' for one epoch, freezing
           the previously trained layer(s). Repeat until all individual layers
           are trained. Then train one last epoch with all trained/fine-tuned
           layers trainable

        2) The 'Howard' routine: fine-tune the first layer in 'layers' for one
           epoch. Then train the next layer in the model for one epoch while
           keeping the previously trained layer(s) trainable. Repeat.

        Parameters
        ----------
        model: ``Module``
           ``Module`` object containing one of the WideDeep model components (wide,
           deeptabular, deeptext or deepimage)
        model_name: str
           string indicating the model name to access the corresponding parameters.
           One of 'wide', 'deeptabular', 'deeptext' or 'deepimage'
        loader: ``DataLoader``
           PyTorch DataLoader containing the data to fine-tune with.
        last_layer_max_lr: float
           maximum learning rate value during the triangular cycle for the layer
           closest to the output neuron(s). Deeper layers in 'model' will be trained
           with a gradually descending maximum learning rate ('last_layer_max_lr'
           divided by 2.5 * n for the n-th deeper layer)
        layers: list
           List of ``Module`` objects containing the layers that will be fine-tuned.
           This must be in *'FINE-TUNE ORDER'*.
        routine: str
           one of 'howard' or 'felbo'
        """
        model.train()

        step_size_up, step_size_down = self._steps_up_down(len(loader))

        original_setup = {}
        for n, p in model.named_parameters():
            original_setup[n] = p.requires_grad
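        # Per-layer maximum learning rates: the layer closest to the output
        # uses last_layer_max_lr and each deeper layer gets a smaller value
        # (last_layer_max_lr divided by 2.5 * n)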
        layers_max_lr = [last_layer_max_lr] + [
            last_layer_max_lr / (2.5 * n) for n in range(1, len(layers))
        ]

        for layer in layers:
            for p in layer.parameters():
                p.requires_grad = False

        if routine == "howard":
            params: List = []
            max_lr: List = []
            base_lr: List = []

        for i, (lr, layer) in enumerate(zip(layers_max_lr, layers)):
            if self.verbose:
                print(
                    "Training {}, layer {} of {}".format(model_name, i + 1, len(layers))
                )
            for p in layer.parameters():
                p.requires_grad = True
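            # 'felbo' optimises only the layer that has just been unfrozen;
            # 'howard' accumulates parameter groups so all layers unfrozen so
            # far keep training, each with its own learning rate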
            if routine == "felbo":
                params, max_lr, base_lr = layer.parameters(), lr, lr / 10.0  # type: ignore
            elif routine == "howard":
                params += [{"params": layer.parameters(), "lr": lr / 10.0}]
                max_lr += [lr]
                base_lr += [lr / 10.0]
            optimizer = torch.optim.AdamW(params)
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer,
                base_lr=base_lr,  # type: ignore[arg-type]
                max_lr=max_lr,  # type: ignore
                step_size_up=step_size_up,
                step_size_down=step_size_down,
                cycle_momentum=False,
            )
            self._finetune(model, model_name, loader, optimizer, scheduler)
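            # 'felbo' re-freezes the layer after its epoch; all layers are
            # unfrozen again for one final epoch below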
            if routine == "felbo":
                for p in layer.parameters():
                    p.requires_grad = False

        if routine == "felbo":
            if self.verbose:
                print("Training one last epoch...")
            for layer in layers:
                for p in layer.parameters():
                    p.requires_grad = True
            params, max_lr, base_lr = [], [], []
            for lr, layer in zip(layers_max_lr, layers):
                params += [{"params": layer.parameters(), "lr": lr / 10.0}]
                max_lr += [lr]
                base_lr += [lr / 10.0]
            optimizer = torch.optim.AdamW(params)
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer,
                base_lr=base_lr,  # type: ignore
                max_lr=max_lr,  # type: ignore
                step_size_up=step_size_up,
                step_size_down=step_size_down,
                cycle_momentum=False,
            )
            self._finetune(model, model_name, loader, optimizer, scheduler)

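        # Restore the requires_grad flags recorded at the start of the routine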
        for n, p in model.named_parameters():
            p.requires_grad = original_setup[n]

    def _finetune(
        self,
        model: nn.Module,
        model_name: str,
        loader: DataLoader,
        optimizer: Optimizer,
        scheduler: LRScheduler,
        n_epochs: int = 1,
    ):
        r"""
        Standard Pytorch training loop
        """
        steps = len(loader)
        for epoch in range(n_epochs):
            running_loss = 0.0
            with trange(steps, disable=self.verbose != 1) as t:
                for batch_idx, (data, target) in zip(t, loader):
                    t.set_description("epoch %i" % (epoch + 1))
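                    # each batch is a (dict of component inputs, target) pair;
                    # pick the input tensor for the component being fine-tuned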
                    X = data[model_name].cuda() if use_cuda else data[model_name]
                    y = (
                        target.view(-1, 1).float()
                        if self.method != "multiclass"
                        else target
                    )
                    y = y.cuda() if use_cuda else y

                    optimizer.zero_grad()
                    y_pred = model(X)
                    loss = self.loss_fn(y_pred, y)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss += loss.item()
                    avg_loss = running_loss / (batch_idx + 1)

                    if self.metric is not None:
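                        # apply the activation matching the problem type before
                        # computing the metric: raw output for regression,
                        # sigmoid for binary, softmax for multiclass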
                        if self.method == "regression":
                            score = self.metric(y_pred, y)
                        if self.method == "binary":
                            score = self.metric(torch.sigmoid(y_pred), y)
                        if self.method == "multiclass":
                            score = self.metric(F.softmax(y_pred, dim=1), y)
                        t.set_postfix(
                            metrics={k: np.round(v, 4) for k, v in score.items()},
                            loss=avg_loss,
                        )
                    else:
                        t.set_postfix(loss=avg_loss)

    def _steps_up_down(self, steps: int, n_epochs: int = 1) -> Tuple[int, int]:
        r"""
289
        Calculate the number of steps up and down during the one cycle fine-tune for a
J
jrzaurin 已提交
290 291 292 293
        given number of epochs

        Parameters:
        ----------
294
        steps: int
J
jrzaurin 已提交
295
            steps per epoch
296
        n_epochs: int, default=1
297
            number of fine-tune epochs
J
jrzaurin 已提交
298 299 300

        Returns:
        -------
301
        up, down: Tuple, int
J
jrzaurin 已提交
302 303 304 305 306
            number of steps increasing/decreasing the learning rate during the cycle
        """
        up = round((steps * n_epochs) * 0.1)
        down = (steps * n_epochs) - up
        return up, down
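

# ---------------------------------------------------------------------------
# Minimal usage sketch. Within the library this class is driven internally by
# the Trainer/WideDeep machinery; the toy 'deeptabular' component, the
# dict-style batches and the hyperparameter values below are illustrative
# assumptions only, chosen to match what `_finetune` expects (batches of the
# form ({model_name: X}, target)).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader as TorchDataLoader, Dataset

    class ToyTabularDataset(Dataset):
        """Yields ({'deeptabular': features}, target) pairs."""

        def __init__(self, n: int = 256, n_feats: int = 8):
            self.X = torch.randn(n, n_feats)
            self.y = torch.randint(0, 2, (n,)).float()

        def __len__(self):
            return len(self.X)

        def __getitem__(self, idx):
            return {"deeptabular": self.X[idx]}, self.y[idx]

    toy_model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))
    if use_cuda:
        toy_model = toy_model.cuda()
    toy_loader = TorchDataLoader(ToyTabularDataset(), batch_size=32, shuffle=True)

    finetuner = FineTune(
        loss_fn=F.binary_cross_entropy_with_logits,
        metric=None,  # metrics are optional inside _finetune
        method="binary",
        verbose=1,
    )

    # routine 1): fine-tune all trainable layers at once with a slanted
    # triangular learning rate
    finetuner.finetune_all(toy_model, "deeptabular", toy_loader, n_epochs=1, max_lr=0.01)

    # routine 3): Howard & Ruder style gradual fine-tuning. 'layers' must be
    # given in 'FINE-TUNE ORDER', i.e. the layer closest to the output first
    finetuner.finetune_gradual(
        toy_model,
        "deeptabular",
        toy_loader,
        last_layer_max_lr=0.01,
        layers=[toy_model[2], toy_model[0]],
        routine="howard",
    )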