Commit fc4aed48 authored by J jrzaurin

Added __getstate__ and __setstate__ methods to the EarlyStopping and ModelCheckpoint...

Added __getstate__ and __setstate__ methods to the EarlyStopping and ModelCheckpoint callbacks. Added the option of using GRUs in the deeptext component, and of predicting from either the final hidden state or the RNN output. Fixed a small bug in the text processor. Improved the save method in the Trainer.
Parent 8727ce66
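As a quick illustration of the first change (not part of the diff): with ``__getstate__``/``__setstate__`` defined, the two callbacks can be pickled even after a Trainer has attached ``trainer``/``model`` references to them. A minimal sketch, assuming the usual ``pytorch_widedeep.callbacks`` import path:

```python
import pickle

from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint

early_stopping = EarlyStopping(patience=5)
model_checkpoint = ModelCheckpoint(filepath="model_weights/wd_out", save_best_only=True)

# __getstate__ drops the (non-picklable) 'trainer' and 'model' references,
# so a round trip through pickle keeps only the callback's own configuration
restored = pickle.loads(pickle.dumps(early_stopping))
assert restored.patience == 5
```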
......@@ -84,14 +84,16 @@ if __name__ == "__main__":
wide_sch = torch.optim.lr_scheduler.StepLR(wide_opt, step_size=2)
deep_sch = torch.optim.lr_scheduler.StepLR(deep_opt, step_size=3)
model_checkpoint = ModelCheckpoint(
filepath="model_weights/wd_out",
save_best_only=True,
max_save=1,
)
early_stopping = EarlyStopping(patience=5)
optimizers = {"wide": wide_opt, "deeptabular": deep_opt}
schedulers = {"wide": wide_sch, "deeptabular": deep_sch}
initializers = {"wide": KaimingNormal, "deeptabular": XavierNormal}
callbacks = [
LRHistory(n_epochs=10),
EarlyStopping(patience=5),
ModelCheckpoint(filepath="model_weights/wd_out"),
]
callbacks = [early_stopping, model_checkpoint, LRHistory(n_epochs=10)]
metrics = [Accuracy, Precision]
trainer = Trainer(
......@@ -108,16 +110,9 @@ if __name__ == "__main__":
X_wide=X_wide,
X_tab=X_tab,
target=target,
n_epochs=10,
n_epochs=2,
batch_size=64,
val_split=0.2,
)
# # to save/load the model
# trainer.save_model("model_weights/model.t")
# # ... days after
# model = Trainer.load_model("model_weights/model.t")
# # or via state dictionaries
# trainer.save_model_state_dict("model_weights/model_dict.t")
# # ... days after, with an instantiated class of Trainer
# trainer.load_model_state_dict("model_weights/model_dict.t")
trainer.save("widedeep")
......@@ -466,6 +466,14 @@ class ModelCheckpoint(Callback):
self.old_files = self.old_files[1:]
self.old_files.append(filepath)
def __getstate__(self):
d = self.__dict__
self_dict = {k: d[k] for k in d if k not in ["trainer", "model"]}
return self_dict
def __setstate__(self, state):
self.__dict__ = state
class EarlyStopping(Callback):
def __init__(
......@@ -608,3 +616,11 @@ class EarlyStopping(Callback):
RuntimeWarning,
)
return monitor_value
def __getstate__(self):
d = self.__dict__
self_dict = {k: d[k] for k in d if k not in ["trainer", "model"]}
return self_dict
def __setstate__(self, state):
self.__dict__ = state
......@@ -12,10 +12,12 @@ class DeepText(nn.Module):
def __init__(
self,
vocab_size: int,
rnn_type: str = "lstm",
hidden_dim: int = 64,
n_layers: int = 3,
rnn_dropout: float = 0.1,
bidirectional: bool = False,
use_hidden_state: bool = True,
padding_idx: int = 1,
embed_dim: Optional[int] = None,
embed_matrix: Optional[np.ndarray] = None,
......@@ -37,6 +39,8 @@ class DeepText(nn.Module):
----------
vocab_size: int
number of words in the vocabulary
rnn_type: str
String indicating the type of RNN to use. One of "lstm" or "rnn"
hidden_dim: int, default = 64
Hidden dim of the RNN
n_layers: int, default = 3
......@@ -46,6 +50,9 @@ class DeepText(nn.Module):
the last layer
bidirectional: bool, default = False
indicates whether the stacked RNNs are bidirectional
use_hidden_state: bool, default = True
Boolean indicating whether to use the final hidden state (True) or the
RNN output (False) as the predicting features
padding_idx: int, default = 1
index of the padding token in the padded-tokenised sequences. I
use the ``fastai`` tokenizer where the token index 0 is reserved
......@@ -112,11 +119,18 @@ class DeepText(nn.Module):
UserWarning,
)
if rnn_type.lower() not in ["lstm", "gru"]:
raise ValueError(
f"'rnn_type' must be 'lstm' or 'gru', got {rnn_type} instead"
)
self.vocab_size = vocab_size
self.rnn_type = rnn_type
self.hidden_dim = hidden_dim
self.n_layers = n_layers
self.rnn_dropout = rnn_dropout
self.bidirectional = bidirectional
self.use_hidden_state = use_hidden_state
self.padding_idx = padding_idx
self.embed_dim = embed_dim
self.embed_trainable = embed_trainable
......@@ -152,14 +166,18 @@ class DeepText(nn.Module):
)
# stack of RNNs (LSTMs or GRUs)
self.rnn = nn.LSTM(
embed_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
dropout=rnn_dropout,
batch_first=True,
)
rnn_params = {
"input_size": embed_dim,
"hidden_size": hidden_dim,
"num_layers": n_layers,
"bidirectional": bidirectional,
"dropout": rnn_dropout,
"batch_first": True,
}
if self.rnn_type.lower() == "lstm":
self.rnn: Union[nn.LSTM, nn.GRU] = nn.LSTM(**rnn_params)
elif self.rnn_type.lower() == "gru":
self.rnn = nn.GRU(**rnn_params)
# the output_dim attribute will be used as input_dim when "merging" the models
self.output_dim = hidden_dim * 2 if bidirectional else hidden_dim
......@@ -186,13 +204,23 @@ class DeepText(nn.Module):
classifier/regressor with an optional `'Fully Connected head'`
"""
embed = self.word_embed(X.long())
o, (h, c) = self.rnn(embed)
if self.rnn_type.lower() == "lstm":
o, (h, c) = self.rnn(embed)
elif self.rnn_type.lower() == "gru":
o, h = self.rnn(embed)
o = o.permute(1, 0, 2)
if self.bidirectional:
last_h = torch.cat((h[-2], h[-1]), dim=1)
rnn_out = (
torch.cat((h[-2], h[-1]), dim=1) if self.use_hidden_state else o[-1]
)
else:
last_h = h[-1]
rnn_out = h[-1] if self.use_hidden_state else o[-1]
if self.head_hidden_dims is not None:
out = self.texthead(last_h)
return out
head_out = self.texthead(rnn_out)
return head_out
else:
return last_h
return rnn_out
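To show the new ``rnn_type`` and ``use_hidden_state`` options together, a small sketch (vocabulary size, dimensions and the random input are made up for illustration):

```python
import torch

from pytorch_widedeep.models import DeepText

# GRU-based deeptext component that predicts from the last RNN output
# rather than from the final hidden state
deeptext = DeepText(
    vocab_size=100,
    rnn_type="gru",
    hidden_dim=32,
    n_layers=2,
    bidirectional=True,
    use_hidden_state=False,
    embed_dim=16,
)

X_text = torch.randint(0, 100, (8, 20))  # (batch_size, seq_len) of token ids
out = deeptext(X_text)
print(out.shape)  # torch.Size([8, 64]) -> hidden_dim * 2 because bidirectional
```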
......@@ -135,6 +135,13 @@ class TabPreprocessor(BasePreprocessor):
raise ValueError(tabtransformer_error_message)
if self.for_tabtransformer and isinstance(self.embed_cols[0], tuple): # type: ignore[index]
raise ValueError(tabtransformer_error_message)
if self.for_tabtransformer and self.scale:
warnings.warn(
"Both 'for_tabtransformer' and 'scale' are set to True. "
"This implies that the continuous columns will be "
"standarized and then passed through a LayerNorm layer",
UserWarning,
)
def fit(self, df: pd.DataFrame) -> BasePreprocessor:
"""Fits the Preprocessor and creates required attributes"""
......
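The new warning documents that, with both flags on, the continuous columns are standardized by the preprocessor and then passed through a LayerNorm layer in the model. A hedged sketch of the combination that should trigger it, assuming ``embed_cols``, ``continuous_cols``, ``scale`` and ``for_tabtransformer`` are constructor arguments (only the corresponding ``self.`` attributes appear in the diff):

```python
import pandas as pd

from pytorch_widedeep.preprocessing import TabPreprocessor

df = pd.DataFrame({"color": ["r", "g", "b"], "age": [25, 40, 61]})

tab_preprocessor = TabPreprocessor(
    embed_cols=["color"],     # plain column names (tuples are rejected for the TabTransformer)
    continuous_cols=["age"],
    scale=True,               # standardize the continuous columns
    for_tabtransformer=True,  # together with scale=True this emits the new UserWarning
)
X_tab = tab_preprocessor.fit_transform(df)
```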
......@@ -21,6 +21,8 @@ class TextPreprocessor(BasePreprocessor):
max_vocab: int = 30000,
min_freq: int = 5,
maxlen: int = 80,
pad_first: bool = True,
pad_idx: int = 1,
word_vectors_path: Optional[str] = None,
verbose: int = 1,
):
......@@ -36,6 +38,11 @@ class TextPreprocessor(BasePreprocessor):
Minimum frequency for a token to be part of the vocabulary
maxlen: int, default=80
Maximum length of the tokenized sequences
pad_first: bool, default = True
Indicates whether the padding index will be added at the beginning (True)
or at the end (False) of the sequences
pad_idx: int, default = 1
padding index. The fastai tokenizer reserves index 0 for the 'unknown' token.
word_vectors_path: str, Optional
Path to the pretrained word vectors
verbose: int, default 1
......@@ -71,6 +78,8 @@ class TextPreprocessor(BasePreprocessor):
self.max_vocab = max_vocab
self.min_freq = min_freq
self.maxlen = maxlen
self.pad_first = pad_first
self.pad_idx = pad_idx
self.word_vectors_path = word_vectors_path
self.verbose = verbose
......@@ -83,6 +92,10 @@ class TextPreprocessor(BasePreprocessor):
)
if self.verbose:
print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
if self.word_vectors_path is not None:
self.embedding_matrix = build_embeddings_matrix(
self.vocab, self.word_vectors_path, self.min_freq
)
return self
def transform(self, df: pd.DataFrame) -> np.ndarray:
......@@ -91,11 +104,17 @@ class TextPreprocessor(BasePreprocessor):
texts = df[self.text_col].tolist()
self.tokens = get_texts(texts)
sequences = [self.vocab.numericalize(t) for t in self.tokens]
padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
if self.word_vectors_path is not None:
self.embedding_matrix = build_embeddings_matrix(
self.vocab, self.word_vectors_path, self.min_freq
)
padded_seq = np.array(
[
pad_sequences(
s,
maxlen=self.maxlen,
pad_first=self.pad_first,
pad_idx=self.pad_idx,
)
for s in sequences
]
)
return padded_seq
def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
......
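A short sketch of the preprocessor with the new padding options, assuming ``text_col`` is the first constructor argument (the diff only shows the signature from ``max_vocab`` onwards):

```python
import pandas as pd

from pytorch_widedeep.preprocessing import TextPreprocessor

df = pd.DataFrame(
    {"review": ["the food was great", "terrible service, will not come back"]}
)

text_preprocessor = TextPreprocessor(
    text_col="review",
    max_vocab=100,
    min_freq=1,
    maxlen=10,
    pad_first=True,  # put the padding tokens at the beginning of each sequence
    pad_idx=1,       # index 0 is kept by the fastai tokenizer for 'unknown'
)
X_text = text_preprocessor.fit_transform(df)
print(X_text.shape)  # (2, 10)
```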
import os
import json
from pathlib import Path
import numpy as np
import torch
......@@ -275,10 +276,13 @@ class Trainer:
self.verbose = verbose
self.seed = seed
self.early_stop = False
self.objective = objective
self.method = _ObjectiveToMethod.get(objective)
# initialize early_stop. If EarlyStopping Callback is used it will
# take care of it
self.early_stop = False
self.loss_fn = self._set_loss_fn(
objective, class_weight, custom_loss_function, alpha, gamma
)
......@@ -817,10 +821,10 @@ class Trainer:
path: str,
save_state_dict: bool = False,
model_filename: str = "wd_model.pt",
feat_imp_filename: str = "feature_importance.json",
):
"""Saves the model and the feature_importance attribute (if the
``deeptabular`` component is a Tabnet model) to disk
"""Saves the model, training and evaluation history, and the
feature_importance attribute (if the ``deeptabular`` component is a
Tabnet model) to disk
The ``Trainer`` class is built so that it 'just' trains a model. With
that in mind, all the torch related parameters (such as optimizers,
......@@ -847,23 +851,31 @@ class Trainer:
model's state dictionary
model_filename: str, Optional, default = "wd_model.pt"
filename where the model weights will be stored
feat_imp_filename: str, Optional, default = "feature_importance.json"
filename where the feature importances will be stored
"""
# TO DO: ask for advice on the saving strategy
if not os.path.exists(path):
os.makedirs(path)
save_dir = Path(path)
history_dir = save_dir / "history"
history_dir.mkdir(exist_ok=True, parents=True)
# the trainer is run with the History Callback by default
with open(history_dir / "train_eval_history.json", "w") as teh:
json.dump(self.history, teh) # type: ignore[attr-defined]
has_lr_history = any(
[clbk.__class__.__name__ == "LRHistory" for clbk in self.callbacks]
)
if has_lr_history:
with open(history_dir / "lr_history.json", "w") as lrh:
json.dump(self.lr_history, lrh) # type: ignore[attr-defined]
model_path = "/".join([path, model_filename])
model_path = save_dir / model_filename
if save_state_dict:
torch.save(self.model.state_dict(), model_path)
else:
torch.save(self.model, model_path)
if self.model.is_tabnet:
feature_importance_fname = "/".join([path, feat_imp_filename])
with open(feature_importance_fname, "w") as fi:
with open(save_dir / "feature_importance.json", "w") as fi:
json.dump(self.feature_importance, fi)
def _restore_best_weights(self):
......@@ -894,7 +906,8 @@ class Trainer:
if self.verbose:
print(
"Model weights after training corresponds to the those of the "
"final epoch which might not be the best performing weights"
"final epoch which might not be the best performing weights. Use"
"the 'ModelCheckpoint' Callback to restore the best epoch weights."
)
def _finetune(
......