Commit bf10a695 authored by: J jrzaurin

Added a few types, refined the docs and added tests for tabnet and the corresponding functionalities
Parent dd2b2141
......@@ -86,7 +86,7 @@ class GLU_Block(nn.Module):
output_dim: int,
n_glu: int = 2,
first: bool = False,
shared_layers: List = None,
shared_layers: nn.ModuleList = None,
ghost_bn: bool = True,
virtual_batch_size: int = 128,
momentum: float = 0.02,
......@@ -142,9 +142,9 @@ class FeatTransformer(nn.Module):
self,
input_dim: int,
output_dim: int,
shared_layers: List,
shared_layers: nn.ModuleList,
n_glu_step_dependent: int,
ghost_bn: bool = True,
ghost_bn=True,
virtual_batch_size=128,
momentum=0.02,
):
......@@ -176,10 +176,10 @@ class FeatTransformer(nn.Module):
class AttentiveTransformer(nn.Module):
def __init__(
self,
input_dim,
output_dim,
mask_type="sparsemax",
ghost_bn: bool = True,
input_dim: int,
output_dim: int,
mask_type: str = "sparsemax",
ghost_bn=True,
virtual_batch_size=128,
momentum=0.02,
):
......@@ -200,7 +200,7 @@ class AttentiveTransformer(nn.Module):
self.mask = sparsemax.Entmax15(dim=-1)
else:
raise NotImplementedError(
"Please choose either sparsemax" + "or entmax as masktype"
"Please choose either 'sparsemax' or 'entmax' as masktype"
)
def forward(self, priors, processed_feat):
......@@ -212,25 +212,25 @@ class AttentiveTransformer(nn.Module):
class TabNetEncoder(nn.Module):
def __init__(
self,
input_dim,
step_dim=8,
attn_dim=8,
n_steps=3,
n_glu_step_dependent=2,
n_glu_shared=2,
ghost_bn=True,
virtual_batch_size=128,
momentum=0.02,
gamma=1.3,
epsilon=1e-15,
mask_type="sparsemax",
input_dim: int,
n_steps: int = 3,
step_dim: int = 8,
attn_dim: int = 8,
n_glu_step_dependent: int = 2,
n_glu_shared: int = 2,
ghost_bn: bool = True,
virtual_batch_size: int = 128,
momentum: float = 0.02,
gamma: float = 1.3,
epsilon: float = 1e-15,
mask_type: str = "sparsemax",
):
super(TabNetEncoder, self).__init__()
self.input_dim = input_dim
self.n_steps = n_steps
self.step_dim = step_dim
self.attn_dim = attn_dim
self.n_steps = n_steps
self.gamma = gamma
self.epsilon = epsilon
......@@ -242,7 +242,7 @@ class TabNetEncoder(nn.Module):
"momentum": momentum,
}
shared_layers = torch.nn.ModuleList()
shared_layers = nn.ModuleList()
for i in range(n_glu_shared):
if i == 0:
shared_layers.append(
......@@ -377,7 +377,7 @@ class EmbeddingsAndContinuous(nn.Module):
else:
cont_out_dim = 0
self.output_dim = emb_out_dim + cont_out_dim
self.output_dim: int = emb_out_dim + cont_out_dim # type: ignore[assignment]
def forward(self, X):
embed = [
......@@ -403,18 +403,95 @@ class TabNet(nn.Module):
embed_dropout: float = 0.0,
continuous_cols: Optional[List[str]] = None,
batchnorm_cont: bool = False,
step_dim=8,
attn_dim=8,
n_steps=3,
n_glu_step_dependent=2,
n_glu_shared=2,
ghost_bn=True,
virtual_batch_size=128,
momentum=0.02,
gamma=1.3,
epsilon=1e-15,
mask_type="sparsemax",
n_steps: int = 3,
step_dim: int = 8,
attn_dim: int = 8,
n_glu_step_dependent: int = 2,
n_glu_shared: int = 2,
ghost_bn: bool = True,
virtual_batch_size: int = 128,
momentum: float = 0.02,
gamma: float = 1.3,
epsilon: float = 1e-15,
mask_type: str = "sparsemax",
):
r"""TabNet model (https://arxiv.org/abs/1908.07442) model that can be used
as the deeptabular component of a Wide & Deep model.
The implementation in this library is fully based on the one at
https://github.com/dreamquark-ai/tabnet, simply adapted so that it can
work within the ``WideDeep`` framework. Therefore, **all credit to the
dreamquark-ai team**.
Parameters
----------
column_idx: Dict
Dictionary where the keys are the columns and the values their
corresponding index
embed_input: List
List of Tuples with the column name, number of unique values and
embedding dimension. e.g. [(education, 11, 32), ...]
embed_dropout: float, default = 0.
embeddings dropout
continuous_cols: List, Optional, default = None
List with the name of the numeric (aka continuous) columns
batchnorm_cont: bool, default = False
Boolean indicating whether or not to apply batch normalization to the
continuous input
n_steps: int, default = 3
number of decision steps
step_dim: int, default = 8
Step's output dimension. This is the output dimension that
``WideDeep`` will collect and connect to the output neuron(s). For
a better understanding of the function of this and the upcoming
parameters, please see the `paper
<https://arxiv.org/abs/1908.07442>`_.
attn_dim: int, default = 8
Attention dimension
n_glu_step_dependent: int, default = 2
number of GLU Blocks [FC -> BN -> GLU] that are step dependent
n_glu_shared: int, default = 2
number of GLU Blocks [FC -> BN -> GLU] that will be shared
across decision steps
ghost_bn: bool, default = True
Boolean indicating if `Ghost Batch Normalization
<https://arxiv.org/abs/1705.08741>`_ will be used.
virtual_batch_size: int, default = 128
Batch size when using Ghost Batch Normalization
momentum: float, default = 0.02
Ghost Batch Normalization's momentum
gamma: float, default = 1.3
Relaxation parameter in the paper. When gamma = 1, a feature is
enforced to be used only at one decision step and as gamma
increases, more flexibility is provided to use a feature at
multiple decision steps
epsilon: float, default = 1e-15
Float to avoid log(0). Always keep low
mask_type: str, default = "sparsemax"
Mask function to use. Either "sparsemax" or "entmax"
Attributes
----------
embed_and_cont: ``nn.ModuleDict``
``ModuleDict`` with the embedding layers and the (optional) normalization
of the continuous columns
TabNetEncoder: ``nn.Module``
``Module`` containing the TabNetEncoder. See the `paper
<https://arxiv.org/abs/1908.07442>`_.
output_dim: int
The output dimension of the model. This is a required attribute
necessary to build the ``WideDeep`` class
Example
--------
>>> import torch
>>> from pytorch_widedeep.models import TabNet
>>> X_tab = torch.cat((torch.empty(5, 4).random_(4), torch.rand(5, 1)), axis=1)
>>> colnames = ['a', 'b', 'c', 'd', 'e']
>>> embed_input = [(u,i,j) for u,i,j in zip(colnames[:4], [4]*4, [8]*4)]
>>> column_idx = {k:v for v,k in enumerate(colnames)}
>>> model = TabNet(column_idx=column_idx, embed_input=embed_input, continuous_cols = ['e'])
"""
super(TabNet, self).__init__()
self.column_idx = column_idx
......@@ -422,9 +499,9 @@ class TabNet(nn.Module):
self.embed_dropout = embed_dropout
self.continuous_cols = continuous_cols
self.batchnorm_cont = batchnorm_cont
self.n_steps = n_steps
self.step_dim = step_dim
self.attn_dim = attn_dim
self.n_steps = n_steps
self.n_glu_step_dependent = n_glu_step_dependent
self.n_glu_shared = n_glu_shared
self.ghost_bn = ghost_bn
......@@ -467,6 +544,16 @@ class TabNet(nn.Module):
class TabNetPredLayer(nn.Module):
def __init__(self, inp, out):
r"""This class is a 'hack' required because TabNet is a very particular
model within ``WideDeep``.
TabNet's forward method within ``WideDeep`` outputs two tensors, one
with the last layer's activations and the other with the sparse
regularization factor. Since the output needs to be collected by
``WideDeep`` to then sequentially build the output layer (connection to
the output neuron(s)), I need to code a custom TabNetPredLayer that
accepts two inputs. This will be used by the ``WideDeep`` class.
"""
super(TabNetPredLayer, self).__init__()
self.pred_layer = nn.Linear(inp, out, bias=False)
initialize_non_glu(self.pred_layer, inp, out)
......
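As a reading aid (illustrative only, not part of the commit), a minimal sketch that re-runs the TabNet docstring example above and unpacks the two tensors that the forward pass returns, as described in the TabNetPredLayer docstring; the printed shape assumes the default step_dim = 8:

import torch
from pytorch_widedeep.models import TabNet

# same toy data as in the docstring example: 4 categorical columns with 4 levels
# each, plus 1 continuous column
X_tab = torch.cat((torch.empty(5, 4).random_(4), torch.rand(5, 1)), dim=1)
colnames = ["a", "b", "c", "d", "e"]
embed_input = [(u, i, j) for u, i, j in zip(colnames[:4], [4] * 4, [8] * 4)]
column_idx = {k: v for v, k in enumerate(colnames)}
model = TabNet(column_idx=column_idx, embed_input=embed_input, continuous_cols=["e"])

# the forward pass returns two tensors: the steps' output, of shape
# (batch_size, step_dim), and the sparse regularization term
out, sparse_loss = model(X_tab)
print(out.shape)  # torch.Size([5, 8]) with the default step_dim = 8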
......@@ -30,7 +30,8 @@ class WideDeep(nn.Module):
pred_dim: int = 1,
):
r"""Main collector class that combines all ``wide``, ``deeptabular``
(which can be a number of architectures), ``deeptext`` and ``deepimage`` models.
(which can be a number of architectures), ``deeptext`` and
``deepimage`` models.
There are two options to combine these models that correspond to the
two main architectures (there is a higher number of
......@@ -51,9 +52,9 @@ class WideDeep(nn.Module):
:class:`pytorch_widedeep.models.wide.Wide`
deeptabular: ``nn.Module``, Optional, default = None
currently ``pytorch-widedeep`` implements three possible
currently ``pytorch-widedeep`` implements four possible
architectures for the `deeptabular` component. These are:
``TabMlp``, ``TabResnet`` and ``TabTransformer``.
``TabMlp``, ``TabResnet``, ``TabNet`` and ``TabTransformer``.
1. ``TabMlp`` is simply an embedding layer encoding the categorical
features that are then concatenated and passed through a series of
......@@ -65,14 +66,18 @@ class WideDeep(nn.Module):
ResNet blocks formed by dense layers.
See ``pytorch_widedeep.models.deep_dense_resnet.TabResnet``
3. ``TabNet`` is detailed in `TabNet: Attentive Interpretable Tabular
Learning <https://arxiv.org/abs/1908.07442>`_. See
``pytorch_widedeep.models.tabnet.tab_net.TabNet``
4. ``TabTransformer`` is detailed in `TabTransformer: Tabular Data
Modeling Using Contextual Embeddings
<https://arxiv.org/pdf/2012.06678.pdf>`_. See
<https://arxiv.org/abs/2012.06678>`_. See
``pytorch_widedeep.models.tab_transformer.TabTransformer``
I recommend using on of these as ``deeptabular``. However, a
custom model as long as is consistent with the required
architecture. See
I recommend using one of these as ``deeptabular``. However, it is
possible to use a custom model as long as it is consistent with the
required architecture. See
:class:`pytorch_widedeep.models.deep_dense.TabTransformer`.
deeptext: ``nn.Module``, Optional, default = None
......@@ -323,10 +328,12 @@ class WideDeep(nn.Module):
warnings.warn(
"'WideDeep' is a model comprised by multiple components and the 'deeptabular'"
" component is 'TabNet'. We recommend using 'TabNet' in isolation."
" This is because 'TabNet' uses sparse regularization which partially losses"
" The reasons are: i)'TabNet' uses sparse regularization which partially losses"
" its purpose when used in combination with other components."
" If you still want to use a multiple component model with 'TabNet',"
" consider setting 'lambda_sparse' to 0 during training",
" consider setting 'lambda_sparse' to 0 during training. ii) The feature"
" importances will be computed only for TabNet but the model will comprise multiple"
" components. Therefore, such importances will partially lose their 'meaning'.",
UserWarning,
)
if deeptext is not None and not hasattr(deeptext, "output_dim"):
......
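To illustrate the warning above, a hedged sketch of the multi-component case it refers to (illustrative, not part of the commit; it assumes the sparse regularization weight is exposed as a ``lambda_sparse`` argument of the Trainer, which is the setting the warning suggests turning off):

import numpy as np
from pytorch_widedeep.models import Wide, TabNet, WideDeep
from pytorch_widedeep.training import Trainer

colnames = ["a", "b", "c", "d", "e"]
embed_input = [(c, 4, 8) for c in colnames[:4]]
column_idx = {k: v for v, k in enumerate(colnames)}

# toy data: a wide input, 4 categorical + 1 continuous tabular columns, binary target
X_wide = np.random.randint(0, 10, (32, 5))
X_tab = np.hstack([np.random.randint(0, 4, (32, 4)), np.random.rand(32, 1)])
target = np.random.randint(0, 2, 32)

wide = Wide(10, 1)
tabnet = TabNet(column_idx=column_idx, embed_input=embed_input, continuous_cols=["e"])
model = WideDeep(wide=wide, deeptabular=tabnet)  # emits the UserWarning above

# lambda_sparse=0.0 disables the sparse regularization, as the warning advises
# (assumes the Trainer accepts a lambda_sparse argument)
trainer = Trainer(model, objective="binary", lambda_sparse=0.0, verbose=0)
trainer.fit(X_wide=X_wide, X_tab=X_tab, target=target, batch_size=16)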
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError
from pytorch_widedeep.preprocessing.base_preprocessor import (
BasePreprocessor,
check_is_fitted,
)
df = pd.DataFrame({"col1": ["a", "b", "c", "d", "e"], "col2": [1, 2, 3, 4, 5]})
class DummyPreprocessor(BasePreprocessor):
def __init__(self):
super().__init__()
def fit(self, df):
self.att1 = 1
self.att2 = 2
return df
def transform(self, df):
check_is_fitted(self, attributes=["att1", "att2"], all_or_any="any")
return df
def fit_transform(self, df):
return self.fit(df).transform(df)
class IncompletePreprocessor(BasePreprocessor):
def __init__(self):
super().__init__()
def fit(self, df):
return df
def transform(self, df):
return df
###############################################################################
#  test check_is_fitted with "any"
###############################################################################
def test_check_is_fitted():
dummy_preprocessor = DummyPreprocessor()
with pytest.raises(NotFittedError):
dummy_preprocessor.transform(df)
###############################################################################
#  test base_preprocessor raising NotImplemented error
###############################################################################
def test_base_non_implemented_error():
with pytest.raises(NotImplementedError):
incomplete_preprocessor = IncompletePreprocessor() # noqa: F841
incomplete_preprocessor.fit_transform(df)
......@@ -5,6 +5,7 @@ from sklearn.exceptions import NotFittedError
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.utils.deeptabular_utils import LabelEncoder
from pytorch_widedeep.preprocessing.tab_preprocessor import embed_sz_rule
def create_test_dataset(input_type, input_type_2=None):
......@@ -225,3 +226,27 @@ def test_notfittederror():
)
with pytest.raises(NotFittedError):
processor.transform(df)
###############################################################################
# Test embeddings fastai's rule of thumb
###############################################################################
def test_embed_sz_rule_of_thumb():
embed_cols = ["col1", "col2"]
df = pd.DataFrame(
{
"col1": np.random.randint(10, size=100),
"col2": np.random.randint(20, size=100),
}
)
n_cats = {c: df[c].nunique() for c in ["col1", "col2"]}
embed_szs = {c: embed_sz_rule(nc) for c, nc in n_cats.items()}
tab_preprocessor = TabPreprocessor(embed_cols=embed_cols)
tdf = tab_preprocessor.fit_transform(df) # noqa: F841
out = [
tab_preprocessor.embed_dim[col] == embed_szs[col] for col in embed_szs.keys()
]
assert all(out)
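For context, the rule of thumb being tested is fastai's embedding-size heuristic. A common formulation is sketched below; the exact constants used by embed_sz_rule are not shown in this diff, so this is an assumption for illustration only:

# assumed fastai-style embedding size heuristic (not necessarily the library's exact code)
def embed_sz_rule_sketch(n_cat: int) -> int:
    return min(600, round(1.6 * n_cat ** 0.56))

print(embed_sz_rule_sketch(10))  # -> 6
print(embed_sz_rule_sketch(20))  # -> 9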
import string
import numpy as np
import torch
import pytest
from pytorch_widedeep.wdtypes import WideDeep
from pytorch_widedeep.models.tabnet.tab_net import TabNet # noqa: F403
from pytorch_widedeep.models.tabnet.tabnet_utils import create_explain_matrix
# I am going to test this model extensively due to the number of components
n_embed = 5
#  this is the number of embed_cols and cont_cols. So total num of cols =
#  n_cols * 2
n_cols = 2
batch_size = 10
colnames = list(string.ascii_lowercase)[: (n_cols * 2)]
embed_cols = [np.random.choice(np.arange(n_embed), batch_size) for _ in range(n_cols)]
cont_cols = [np.random.rand(batch_size) for _ in range(n_cols)]
X_tab = torch.from_numpy(np.vstack(embed_cols + cont_cols).transpose())
X_tab_emb = X_tab[:, :n_cols]
X_tab_cont = X_tab[:, n_cols:]
###############################################################################
# Test functioning using the defaults
###############################################################################
embed_input = [(u, i, 1) for u, i in zip(colnames[:2], [n_embed] * 2)]
model1 = TabNet(
column_idx={k: v for v, k in enumerate(colnames)},
embed_input=embed_input,
continuous_cols=colnames[n_cols:],
)
def test_embeddings_have_padding():
res = []
for k, v in model1.embed_and_cont.embed_layers.items():
res.append(v.weight.size(0) == n_embed + 1)
res.append(not torch.all(v.weight[0].bool()))
assert all(res)
def test_tabnet_output():
out1, out2 = model1(X_tab)
assert out1.size(0) == 10 and out1.size(1) == model1.step_dim
###############################################################################
# Test functioning with different types of masks
###############################################################################
@pytest.mark.parametrize(
"mask_type",
[
"sparsemax",
"entmax",
],
)
def test_mask_type(mask_type):
model = TabNet(
column_idx={k: v for v, k in enumerate(colnames)},
embed_input=embed_input,
continuous_cols=colnames[n_cols:],
mask_type=mask_type,
)
out1, out2 = model(X_tab)
assert out1.size(0) == 10 and out1.size(1) == model.step_dim
###############################################################################
# Test functioning with/without ghost BN
###############################################################################
@pytest.mark.parametrize(
"ghost_bn",
[
True,
False,
],
)
def test_ghost_bn(ghost_bn):
model = TabNet(
column_idx={k: v for v, k in enumerate(colnames)},
embed_input=embed_input,
continuous_cols=colnames[n_cols:],
ghost_bn=ghost_bn,
)
out1, out2 = model(X_tab)
assert out1.size(0) == 10 and out1.size(1) == model.step_dim
###############################################################################
# Test forward_mask method
###############################################################################
def test_forward_masks():
out1, out2 = model1.forward_masks(X_tab)
bsz, nfeat = X_tab.shape[0], X_tab.shape[1]
out = []
out.append(out1.shape[0] == bsz)
out.append(out1.shape[1] == nfeat)
for step in range(model1.n_steps):
out.append(out2[step].size(0) == bsz)
out.append(out2[step].size(1) == nfeat)
assert all(out)
###############################################################################
# Test create_explain_matrix
###############################################################################
def test_create_explain_matrix():
embed_input = [(u, i, 2) for u, i in zip(colnames[:2], [n_embed] * 2)]
continuous_cols = colnames[2:]
embed_cols = colnames[:2]
column_idx = {k: v for v, k in enumerate(colnames)}
tabnet = TabNet(
column_idx=column_idx,
embed_input=embed_input,
continuous_cols=continuous_cols,
)
wdmodel = WideDeep(deeptabular=tabnet)
expl_mtx = create_explain_matrix(wdmodel)
checks = []
checks.append(expl_mtx.sum() == tabnet.embed_and_cont_dim)
checks.append(all(expl_mtx.sum(1) == 1))
for col, idx in column_idx.items():
if col in embed_cols:
checks.append(expl_mtx[:, idx].sum() == 2.0)
elif col in continuous_cols:
checks.append(expl_mtx[:, idx].sum() == 1.0)
assert all(checks)
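As a reading aid for the checks above, a small hand-built sketch of what such a reducing matrix looks like (an illustration of the properties the test asserts, not the library's create_explain_matrix implementation): for two embedded columns with embedding dimension 2 and two continuous columns, it maps the 6 post-embedding features back to the 4 original columns.

import numpy as np

# rows: post-embedding features; columns: original input columns ('a', 'b', 'c', 'd')
reduce_mtx = np.array(
    [
        [1.0, 0.0, 0.0, 0.0],  # 1st embedding dim of column 'a'
        [1.0, 0.0, 0.0, 0.0],  # 2nd embedding dim of column 'a'
        [0.0, 1.0, 0.0, 0.0],  # 1st embedding dim of column 'b'
        [0.0, 1.0, 0.0, 0.0],  # 2nd embedding dim of column 'b'
        [0.0, 0.0, 1.0, 0.0],  # continuous column 'c'
        [0.0, 0.0, 0.0, 1.0],  # continuous column 'd'
    ]
)
assert reduce_mtx.sum() == 6          # total post-embedding dimension (2*2 + 2)
assert all(reduce_mtx.sum(1) == 1)    # each feature maps to exactly one original column
assert reduce_mtx[:, 0].sum() == 2.0  # an embedded column accounts for its embedding dim
assert reduce_mtx[:, 2].sum() == 1.0  # a continuous column accounts for 1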
......@@ -3,14 +3,20 @@ from copy import deepcopy
import pytest
from torch import nn
from pytorch_widedeep.models import Wide, TabMlp, DeepText, WideDeep, DeepImage
from pytorch_widedeep.models import (
Wide,
TabMlp,
TabNet,
DeepText,
WideDeep,
DeepImage,
)
embed_input = [(u, i, j) for u, i, j in zip(["a", "b", "c"][:4], [4] * 3, [8] * 3)]
column_idx = {k: v for v, k in enumerate(["a", "b", "c"])}
wide = Wide(10, 1)
deepdense = TabMlp(
mlp_hidden_dims=[16, 8], column_idx=column_idx, embed_input=embed_input
)
tabmlp = TabMlp(mlp_hidden_dims=[16, 8], column_idx=column_idx, embed_input=embed_input)
tabnet = TabNet(column_idx=column_idx, embed_input=embed_input)
deeptext = DeepText(vocab_size=100, embed_dim=8)
deepimage = DeepImage(pretrained=False)
......@@ -29,16 +35,16 @@ deepimage = DeepImage(pretrained=False)
)
def test_history_callback(deepcomponent, component_name):
if deepcomponent is None:
deepcomponent = deepcopy(deepdense)
deepcomponent = deepcopy(tabmlp)
deepcomponent.__dict__.pop("output_dim")
with pytest.raises(AttributeError):
if component_name == "dense":
model = WideDeep(wide, deeptabular=deepcomponent)
elif component_name == "text":
model = WideDeep(wide, deeptabular=deepdense, deeptext=deepcomponent)
model = WideDeep(wide, deeptabular=tabmlp, deeptext=deepcomponent)
elif component_name == "image":
model = WideDeep( # noqa: F841
wide, deeptabular=deepdense, deepimage=deepcomponent
wide, deeptabular=tabmlp, deepimage=deepcomponent
)
......@@ -52,7 +58,7 @@ def test_deephead_and_head_layers_dim():
with pytest.raises(ValueError):
model = WideDeep( # noqa: F841
wide=wide,
deeptabular=deepdense,
deeptabular=tabmlp,
head_hidden_dims=[16, 8],
deephead=deephead,
)
......@@ -66,7 +72,7 @@ def test_deephead_and_head_layers_dim():
def test_no_deephead_and_head_layers_dim():
out = []
model = WideDeep(
wide=wide, deeptabular=deepdense, head_hidden_dims=[8, 4]
wide=wide, deeptabular=tabmlp, head_hidden_dims=[8, 4]
) # noqa: F841
for n, p in model.named_parameters():
if n == "deephead.head_layer_0.0.weight":
......@@ -74,3 +80,13 @@ def test_no_deephead_and_head_layers_dim():
if n == "deephead.head_layer_1.0.weight":
out.append(p.size(0) == 4 and p.size(1) == 8)
assert all(out)
###############################################################################
#  test tabnet warning
###############################################################################
def test_tabnet_warning():
with pytest.warns(UserWarning):
model = WideDeep(wide=wide, deeptabular=tabnet) # noqa: F841
import string
import warnings
import numpy as np
import pytest
from torch import nn
from pytorch_widedeep.models import Wide, TabMlp, WideDeep, TabTransformer
from pytorch_widedeep.models import (
Wide,
TabMlp,
TabNet,
WideDeep,
TabTransformer,
)
from pytorch_widedeep.metrics import R2Score
from pytorch_widedeep.training import Trainer
......@@ -138,6 +145,51 @@ def test_fit_objectives_tab_transformer(
assert preds.shape[0] == 32, probs.shape[1] == probs_dim
##############################################################################
# Repeat 1st set of tests with TabNet
##############################################################################
@pytest.mark.parametrize(
"X_wide, X_tab, target, objective, X_wide_test, X_tab_test, X_test, pred_dim, probs_dim",
[
(X_wide, X_tab, target_regres, "regression", X_wide, X_tab, None, 1, None),
(X_wide, X_tab, target_binary, "binary", X_wide, X_tab, None, 1, 2),
(X_wide, X_tab, target_multic, "multiclass", X_wide, X_tab, None, 3, 3),
(X_wide, X_tab, target_regres, "regression", None, None, X_test, 1, None),
(X_wide, X_tab, target_binary, "binary", None, None, X_test, 1, 2),
(X_wide, X_tab, target_multic, "multiclass", None, None, X_test, 3, 3),
],
)
def test_fit_objectives_tabnet(
X_wide,
X_tab,
target,
objective,
X_wide_test,
X_tab_test,
X_test,
pred_dim,
probs_dim,
):
warnings.filterwarnings("ignore")
wide = Wide(np.unique(X_wide).shape[0], pred_dim)
tabnet = TabNet(
column_idx={k: v for v, k in enumerate(colnames)},
embed_input=embed_input,
continuous_cols=colnames[5:],
)
model = WideDeep(wide=wide, deeptabular=tabnet, pred_dim=pred_dim)
trainer = Trainer(model, objective=objective, verbose=0)
trainer.fit(X_wide=X_wide, X_tab=X_tab, target=target, batch_size=16)
preds = trainer.predict(X_wide=X_wide, X_tab=X_tab, X_test=X_test)
if objective == "binary":
pass
else:
probs = trainer.predict_proba(X_wide=X_wide, X_tab=X_tab, X_test=X_test)
assert preds.shape[0] == 32, probs.shape[1] == probs_dim
##############################################################################
# Test fit with R2 for regression
##############################################################################
......
......@@ -10,6 +10,7 @@ from sklearn.model_selection import train_test_split
from pytorch_widedeep.models import (
Wide,
TabMlp,
TabNet,
DeepText,
WideDeep,
DeepImage,
......@@ -78,6 +79,12 @@ tabtransformer = TabTransformer(
embed_input=embed_input_tt,
continuous_cols=colnames[5:],
)
tabnet = TabNet(
column_idx={k: v for v, k in enumerate(colnames)},
embed_input=embed_input,
continuous_cols=colnames[5:],
ghost_bn=False,
)
deeptext = DeepText(vocab_size=vocab_size, embed_dim=32, padding_idx=0)
deepimage = DeepImage(pretrained=True)
......@@ -268,3 +275,31 @@ def test_save_and_load_dict():
shutil.rmtree("tests/test_model_functioning/model_dir/")
assert torch.allclose(wide_weights, n_wide_weights)
###############################################################################
#  test explain matrices and feature importance for TabNet
###############################################################################
def test_explain_mtx_and_feat_imp():
model = WideDeep(deeptabular=tabnet)
trainer = Trainer(model, objective="binary", verbose=0)
trainer.fit(
X_tab=X_tab,
target=target,
batch_size=16,
)
checks = []
checks.append(len(trainer.feature_importance) == len(tabnet.column_idx))
expl_mtx, step_masks = trainer.explain(X_tab[:6], save_step_masks=True)
checks.append(expl_mtx.shape[0] == 6)
checks.append(expl_mtx.shape[1] == 10)
for i in range(tabnet.n_steps):
checks.append(step_masks[i].shape[0] == 6)
checks.append(step_masks[i].shape[1] == 10)
assert all(checks)