import numpy as np
import torch
import torch.nn.functional as F
from tqdm import trange
from torch import nn

from pytorch_widedeep.metrics import Metric, MultipleMetrics
from pytorch_widedeep.wdtypes import (
    Any,
    List,
    Tuple,
    Union,
    Literal,
    Optimizer,
    DataLoader,
    LRScheduler,
)
from pytorch_widedeep.models._base_wd_model_component import (
    BaseWDModelComponent,
)

use_cuda = torch.cuda.is_available()

WDModel = Union[nn.Module, BaseWDModelComponent]


class FineTune:
    r"""Fine-tune methods to be applied to the individual model components.

    Note that they can also be used to "warm-up" those components before
    the joint training.

    There are 3 fine-tune/warm-up routines available:

    1) Fine-tune all trainable layers at once

    2) Gradual fine-tuning inspired by the work of Felbo et al., 2017

    3) Gradual fine-tuning inspired by the work of Howard & Ruder, 2018

    This class is designed to be instantiated within the class WideDeep.
    This is not ideal, but it is a compromise that adds fine-tuning
    functionality to the current overall structure of the package without
    having to re-structure most of the existing code. This will change in
    future releases.

    Parameters
    ----------
    loss_fn: Any
       any function with the same structure as 'loss_fn' in the class ``Trainer``
    metric: ``Metric`` or ``MultipleMetrics``
       object of class Metric (see Metric in pytorch_widedeep.metrics)
    method: str
       one of 'binary', 'regression' or 'multiclass'
    verbose: int
       verbosity level. If 1, a progress bar is shown while fine-tuning
    """

    def __init__(
        self,
        loss_fn: Any,
        metric: Union[Metric, MultipleMetrics],
        method: Literal["binary", "regression", "multiclass"],
        verbose: int,
    ):
        self.loss_fn = loss_fn
        self.metric = metric
        self.method = method
        self.verbose = verbose
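
    # A minimal, illustrative usage sketch (kept as comments, not executed).
    # In practice this class is instantiated internally by the package, as
    # noted in the class docstring. The objects below (``wide_component``,
    # ``train_loader``) are hypothetical placeholders, and the choice of loss
    # and metric is only an assumption:
    #
    #     from pytorch_widedeep.metrics import Accuracy
    #
    #     finetuner = FineTune(
    #         loss_fn=F.binary_cross_entropy_with_logits,
    #         metric=MultipleMetrics([Accuracy()]),
    #         method="binary",
    #         verbose=1,
    #     )
    #     finetuner.finetune_all(
    #         model=wide_component,  # e.g. the 'wide' component of a WideDeep model
    #         model_name="wide",
    #         loader=train_loader,
    #         n_epochs=1,
    #         max_lr=0.01,
    #     )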

    def finetune_all(
        self,
        model: WDModel,
        model_name: str,
        loader: DataLoader,
        n_epochs: int,
        max_lr: float,
    ):
        r"""Fine-tune/warm-up all trainable layers in a model using a one-cycle
        learning rate with a triangular pattern. This is referred to as Slanted
        Triangular learning rate in Jeremy Howard & Sebastian Ruder, 2018
        (https://arxiv.org/abs/1801.06146). The cycle is described as follows:

        1) The learning rate will gradually increase for 10% of the training steps
            from max_lr/10 to max_lr.

        2) It will then gradually decrease to max_lr/10 for the remaining 90% of the
            steps.

        The optimizer used in the process is AdamW.

        Parameters
        ----------
        model: ``Module``
            ``Module`` object containing one of the WideDeep model components (wide,
            deeptabular, deeptext or deepimage)
        model_name: str
            string indicating the model name to access the corresponding parameters.
            One of 'wide', 'deeptabular', 'deeptext' or 'deepimage'
        loader: ``DataLoader``
            Pytorch DataLoader containing the data used to fine-tune the model
        n_epochs: int
            number of epochs used to fine-tune the model
        max_lr: float
            maximum learning rate value during the triangular cycle.
        """
        if self.verbose:
            print("Training {} for {} epochs".format(model_name, n_epochs))
        model.train()

        optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr / 10.0)  # type: ignore
        step_size_up, step_size_down = self._steps_up_down(len(loader), n_epochs)
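        # Worked illustration (hypothetical numbers): for a loader of 100
        # batches and n_epochs=1, `_steps_up_down` returns (10, 90), so with
        # max_lr=0.01 the scheduler below ramps the learning rate from 0.001
        # (max_lr / 10) up to 0.01 over the first 10 steps and back down to
        # 0.001 over the remaining 90 steps.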
        scheduler = torch.optim.lr_scheduler.CyclicLR(
            optimizer,
            base_lr=max_lr / 10.0,
            max_lr=max_lr,
            step_size_up=step_size_up,
            step_size_down=step_size_down,
            cycle_momentum=False,
        )

        self._finetune(
            model, model_name, loader, optimizer, scheduler, n_epochs=n_epochs
        )

    def finetune_gradual(  # noqa: C901
        self,
        model: WDModel,
        model_name: str,
        loader: DataLoader,
        last_layer_max_lr: float,
        layers: List[nn.Module],
        routine: str,
    ):
        r"""Fine-tune/warm-up certain layers within the model following a
        gradual fine-tune routine. The approaches implemented in this method
        are based on the fine-tuning routines described in the work of Felbo
        et al., 2017 in their DeepMoji paper (https://arxiv.org/abs/1708.00524)
        and in the Howard & Ruder, 2018 ULMFiT paper
        (https://arxiv.org/abs/1801.06146).

        A one cycle triangular learning rate is used. In both Felbo's and
        Howard's routines a gradually decreasing learning rate is used as we
        go deeper into the network. The 'closest' layer to the output
        neuron(s) will use a maximum learning rate of 'last_layer_max_lr'. The
        learning rate will then decrease by a factor of 2.5 per layer.

        1) The 'Felbo' routine: train the first layer in 'layers' for one
           epoch. Then train the next layer in 'layers' for one epoch, freezing
           the previously trained layer(s). Repeat until all individual layers
           are trained. Then train one last epoch with all trained/fine-tuned
           layers trainable.

        2) The 'Howard' routine: fine-tune the first layer in 'layers' for one
           epoch. Then train the next layer in the model for one epoch while
           keeping the previously trained layer(s) trainable. Repeat.

        Parameters
        ----------
        model: ``Module``
           ``Module`` object containing one of the WideDeep model components (wide,
           deeptabular, deeptext or deepimage)
        model_name: str
           string indicating the model name to access the corresponding parameters.
           One of 'wide', 'deeptabular', 'deeptext' or 'deepimage'
        loader: ``DataLoader``
           Pytorch DataLoader containing the data to fine-tune with.
        last_layer_max_lr: float
           maximum learning rate value during the triangular cycle for the layer
           closest to the output neuron(s). Deeper layers in 'model' will be trained
           with a gradually descending learning rate. The descending factor is fixed
           and is 2.5
        layers: list
           List of ``Module`` objects containing the layers that will be fine-tuned.
           This must be in *'FINE-TUNE ORDER'*.
        routine: str
           one of 'howard' or 'felbo'
        """
        model.train()
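        # Illustrative call (hypothetical objects): for a text component whose
        # layers, ordered from the one closest to the output to the deepest,
        # are [head, rnn, embedding], a caller might run:
        #
        #     finetuner.finetune_gradual(
        #         model=text_component,
        #         model_name="deeptext",
        #         loader=train_loader,
        #         last_layer_max_lr=0.01,
        #         layers=[head, rnn, embedding],
        #         routine="howard",
        #     )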

        step_size_up, step_size_down = self._steps_up_down(len(loader))

        original_setup = {}
        for n, p in model.named_parameters():
            original_setup[n] = p.requires_grad
        layers_max_lr = [last_layer_max_lr] + [
            last_layer_max_lr / (2.5 * n) for n in range(1, len(layers))
        ]
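        # Worked illustration (hypothetical numbers): with three layers and
        # last_layer_max_lr=0.01 this yields [0.01, 0.004, 0.002], i.e. the
        # layer closest to the output keeps the full max lr while deeper
        # layers get 0.01 / 2.5 and 0.01 / 5.0 respectively.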

        for layer in layers:
            for p in layer.parameters():
                p.requires_grad = False

        if routine == "howard":
            params: List = []
            max_lr: List = []
            base_lr: List = []
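        # In the 'howard' routine these lists accumulate one parameter group
        # per layer, so every previously unfrozen layer keeps training with
        # its own (smaller) learning rate. In the 'felbo' routine they are
        # overwritten at every iteration, so only the current layer is trained
        # before being frozen again.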

        for i, (lr, layer) in enumerate(zip(layers_max_lr, layers)):
            if self.verbose:
                print(
                    "Training {}, layer {} of {}".format(model_name, i + 1, len(layers))
                )
            for p in layer.parameters():
                p.requires_grad = True
            if routine == "felbo":
                params, max_lr, base_lr = layer.parameters(), lr, lr / 10.0  # type: ignore
            elif routine == "howard":
                params += [{"params": layer.parameters(), "lr": lr / 10.0}]
                max_lr += [lr]
                base_lr += [lr / 10.0]
            optimizer = torch.optim.AdamW(params)
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer,
                base_lr=base_lr,  # type: ignore[arg-type]
                max_lr=max_lr,  # type: ignore
                step_size_up=step_size_up,
                step_size_down=step_size_down,
                cycle_momentum=False,
            )
            self._finetune(model, model_name, loader, optimizer, scheduler)
            if routine == "felbo":
                for p in layer.parameters():
                    p.requires_grad = False

        if routine == "felbo":
            if self.verbose:
                print("Training one last epoch...")
            for layer in layers:
                for p in layer.parameters():
                    p.requires_grad = True
            params, max_lr, base_lr = [], [], []
            for lr, layer in zip(layers_max_lr, layers):
                params += [{"params": layer.parameters(), "lr": lr / 10.0}]
                max_lr += [lr]
                base_lr += [lr / 10.0]
            optimizer = torch.optim.AdamW(params)
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer,
                base_lr=base_lr,  # type: ignore
                max_lr=max_lr,  # type: ignore
                step_size_up=step_size_up,
                step_size_down=step_size_down,
                cycle_momentum=False,
            )
            self._finetune(model, model_name, loader, optimizer, scheduler)

        for n, p in model.named_parameters():
            p.requires_grad = original_setup[n]

    def _finetune(
        self,
        model: WDModel,
        model_name: str,
        loader: DataLoader,
        optimizer: Optimizer,
        scheduler: LRScheduler,
        n_epochs: int = 1,
    ):
        r"""
        Standard Pytorch training loop
        """
        steps = len(loader)
        for epoch in range(n_epochs):
            running_loss = 0.0
            with trange(steps, disable=self.verbose != 1) as t:
                for batch_idx, (data, target, lds_weightt) in zip(t, loader):
                    t.set_description("epoch %i" % (epoch + 1))
                    X = data[model_name].cuda() if use_cuda else data[model_name]
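                    # For 'multiclass' and 'qregression' the target is passed
                    # to the loss as-is; otherwise it is reshaped into a float
                    # column vector.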
                    y = (
                        target.view(-1, 1).float()
                        if self.method not in ["multiclass", "qregression"]
                        else target
                    )
                    y = y.cuda() if use_cuda else y

                    optimizer.zero_grad()
                    y_pred = model(X)
                    loss = self.loss_fn(y_pred, y)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    running_loss += loss.item()
                    avg_loss = running_loss / (batch_idx + 1)

                    if self.metric is not None:
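                        # Metrics are computed on activated outputs: raw
                        # predictions for 'regression' and 'qregression',
                        # sigmoid probabilities for 'binary' and softmax
                        # probabilities for 'multiclass'.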
                        if self.method == "regression":
                            score = self.metric(y_pred, y)
                        if self.method == "binary":
                            score = self.metric(torch.sigmoid(y_pred), y)
                        if self.method == "qregression":
                            score = self.metric(y_pred, y)
                        if self.method == "multiclass":
                            score = self.metric(F.softmax(y_pred, dim=1), y)
                        t.set_postfix(
                            metrics={k: np.round(v, 4) for k, v in score.items()},
                            loss=avg_loss,
                        )
                    else:
                        t.set_postfix(loss=avg_loss)

    def _steps_up_down(self, steps: int, n_epochs: int = 1) -> Tuple[int, int]:
        r"""
        Calculate the number of steps up and down during the one cycle fine-tune for a
        given number of epochs

        Parameters
        ----------
        steps: int
            steps per epoch
        n_epochs: int, default=1
            number of fine-tune epochs

        Returns
        -------
        up, down: Tuple[int, int]
            number of steps increasing/decreasing the learning rate during the cycle
        """
        # up = round((steps * n_epochs) * 0.1)
        up = max([round((steps * n_epochs) * 0.1), 1])
        down = (steps * n_epochs) - up
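        # Worked example: for steps=100 and n_epochs=1 this gives
        # up = max(round(10.0), 1) = 10 and down = 100 - 10 = 90.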
        return up, down