Unverified commit 8e110a93, authored by J Javier, committed by GitHub

Merge pull request #42 from jrzaurin/imbalanced_dataloader

Added option for custom dataloader and the possibility of using metrics from torchmetrics
......@@ -17,7 +17,7 @@ matrix:
# Stop the build if there are Python syntax errors or undefined names
- flake8 . --count --select=E901,E999,F821,F822,F823 --ignore=E266 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901 --statistics
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,F811,W503,C901 --statistics
after_success: skip
install:
- pip install --upgrade pip
......
......@@ -3,6 +3,6 @@ isort --quiet . pytorch_widedeep tests examples setup.py
# Black code style
black . pytorch_widedeep tests examples setup.py
# flake8 standards
flake8 . --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901
flake8 . --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901,F811
# mypy
mypy pytorch_widedeep --ignore-missing-imports --no-strict-optional
\ No newline at end of file
......@@ -13,6 +13,9 @@ Here are the 4 callbacks available in ``pytorch-widedeep``: ``History``,
.. autoclass:: pytorch_widedeep.callbacks.LRShedulerCallback
:members:
.. autoclass:: pytorch_widedeep.callbacks.MetricCallback
:members:
.. autoclass:: pytorch_widedeep.callbacks.LRHistory
:members:
......
Dataloaders
===========
.. note:: This module is intended to contain custom dataloaders that the user
    might want to implement. At the moment, ``pytorch-widedeep`` offers one
    custom dataloader, ``DataLoaderImbalanced``.
.. autoclass:: pytorch_widedeep.dataloaders.DataLoaderImbalanced
:members:
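As a usage sketch (assuming the data has been preprocessed and a ``Trainer``
built, as in the Examples folder), the class itself, not an instance, is
passed to the ``fit`` method together with any extra arguments such as
``oversample_mul``:

.. code-block:: python

    from pytorch_widedeep.dataloaders import DataLoaderImbalanced

    trainer.fit(
        X_train={"X_tab": X_tab_train, "target": y_train},
        batch_size=32,
        custom_dataloader=DataLoaderImbalanced,
        oversample_mul=5,
    )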
......@@ -14,3 +14,4 @@ them to address different problems
* `FineTune routines <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/06_FineTune_and_WarmUp_Model_Components.ipynb>`__
* `Custom Components <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/07_Custom_Components.ipynb>`__
* `Save and Load Model and Artifacts <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/08_save_and_load_model_and_artifacts.ipynb>`__
* `Using Custom DataLoaders and Torchmetrics <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/09_Custom_DataLoader_Imbalanced_dataset.ipynb>`__
......@@ -20,6 +20,7 @@ Documentation
Model Components <model_components>
Metrics <metrics>
Losses <losses>
Dataloaders <dataloaders>
Callbacks <callbacks>
The Trainer <trainer>
Examples <examples>
......
......@@ -40,4 +40,5 @@ Dependencies
* torch
* torchvision
* einops
* wrapt
\ No newline at end of file
* wrapt
* torchmetrics
\ No newline at end of file
......@@ -7,6 +7,26 @@ Metrics
ground truth is expected to be a 1D tensor with the corresponding classes.
See Examples below
We have added the possibility of using the metrics available in the
`torchmetrics <https://torchmetrics.readthedocs.io/en/latest/>`_ library.
Note that this library is still in its early versions and therefore this
option should be used with caution. To use ``torchmetrics``, simply import
the metrics and use them like any of the ``pytorch-widedeep`` metrics
described below:
.. code-block:: python

    from torchmetrics import Accuracy, Precision

    accuracy = Accuracy(average=None, num_classes=2)
    precision = Precision(average='micro', num_classes=2)

    trainer = Trainer(model, objective="binary", metrics=[accuracy, precision])
A functioning example for ``pytorch-widedeep`` using ``torchmetrics`` can be
found in the `Examples folder <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples>`_.
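Under the hood, binary ``torchmetrics`` objects are updated with the rounded
predictions, while multiclass ones are updated with the argmax over the class
dimension. Below is a simplified sketch of the dispatch logic in
``pytorch_widedeep.metrics.MultipleMetrics`` (the helper name
``update_torch_metric`` is ours, purely for illustration):

.. code-block:: python

    import torch
    from torchmetrics import Metric as TorchMetric

    def update_torch_metric(metric: TorchMetric, y_pred, y_true):
        # binary case: round probabilities to hard 0/1 labels
        if metric.num_classes == 2:
            metric.update(torch.round(y_pred).int(), y_true.int())
        # multiclass case: take the class with the highest score
        elif metric.num_classes > 2:
            metric.update(torch.max(y_pred, dim=1).indices, y_true.int())
        return metric.compute().detach().cpu().numpy()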
.. autoclass:: pytorch_widedeep.metrics.Accuracy
:members:
:undoc-members:
......
......@@ -16,4 +16,5 @@ tqdm
torch
torchvision
einops
wrapt
\ No newline at end of file
wrapt
torchmetrics
\ No newline at end of file
Training wide and deep models for tabular data
==============================================
===============================================
`...` or just deep learning models for tabular data.
......
......@@ -841,9 +841,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
This diff is collapsed.
......@@ -11,6 +11,8 @@ from pytorch_widedeep.models import ( # noqa: F401
TabResnet,
)
from pytorch_widedeep.metrics import Accuracy, Precision
# from torchmetrics import Accuracy as accuracy_score
from pytorch_widedeep.callbacks import (
LRHistory,
EarlyStopping,
......@@ -94,6 +96,7 @@ if __name__ == "__main__":
schedulers = {"wide": wide_sch, "deeptabular": deep_sch}
initializers = {"wide": KaimingNormal, "deeptabular": XavierNormal}
callbacks = [early_stopping, model_checkpoint, LRHistory(n_epochs=10)]
# metrics = [Accuracy, accuracy_score(num_classes=2), Precision]
metrics = [Accuracy, Precision]
trainer = Trainer(
......
......@@ -4,7 +4,7 @@ import pandas as pd
import pytorch_widedeep as wd
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import F1Score, Accuracy
from pytorch_widedeep.metrics import F1Score, Precision
from pytorch_widedeep.preprocessing import TabPreprocessor, WidePreprocessor
use_cuda = torch.cuda.is_available()
......@@ -53,7 +53,10 @@ if __name__ == "__main__":
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)
trainer = wd.Trainer(
model, objective="multiclass", metrics=[Accuracy, F1Score], optimizers=optimizer
model,
objective="multiclass",
metrics=[Precision(average=False), F1Score],
optimizers=optimizer,
)
trainer.fit(
......
import time
import datetime
import warnings
import numpy as np
import pandas as pd
from torch.optim import SGD, lr_scheduler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import TabMlp, WideDeep
from pytorch_widedeep.metrics import Recall, F1Score, Accuracy, Precision
from pytorch_widedeep.dataloaders import DataLoaderImbalanced
from pytorch_widedeep.initializers import XavierNormal
from pytorch_widedeep.preprocessing import TabPreprocessor
warnings.filterwarnings("ignore", category=DeprecationWarning)
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
header_list = ["EXAMPLE_ID", "BLOCK_ID", "target"] + [str(i) for i in range(4, 78)]
df = pd.read_csv("data/kddcup04/bio_train.dat", sep="\t", names=header_list)
df.head()
# drop columns we won't need in this example
df.drop(columns=["EXAMPLE_ID", "BLOCK_ID"], inplace=True)
df_train, df_valid = train_test_split(
df, test_size=0.2, stratify=df["target"], random_state=1
)
df_valid, df_test = train_test_split(
df_valid, test_size=0.5, stratify=df_valid["target"], random_state=1
)
continuous_cols = df.drop(columns=["target"]).columns.values.tolist()
# deeptabular
tab_preprocessor = TabPreprocessor(continuous_cols=continuous_cols, scale=True)
X_tab_train = tab_preprocessor.fit_transform(df_train)
X_tab_valid = tab_preprocessor.transform(df_valid)
X_tab_test = tab_preprocessor.transform(df_test)
# target
y_train = df_train["target"].values
y_valid = df_valid["target"].values
y_test = df_test["target"].values
# Define the model
input_layer = len(tab_preprocessor.continuous_cols)
output_layer = 1
hidden_layers = np.linspace(
input_layer * 2, output_layer, 5, endpoint=False, dtype=int
).tolist()
deeptabular = TabMlp(
mlp_hidden_dims=hidden_layers,
column_idx=tab_preprocessor.column_idx,
continuous_cols=tab_preprocessor.continuous_cols,
)
model = WideDeep(deeptabular=deeptabular)
model
# Metrics from pytorch-widedeep
accuracy = Accuracy(top_k=2)
precision = Precision(average=False)
recall = Recall(average=True)
f1 = F1Score(average=False)
# Optimizers
deep_opt = SGD(model.deeptabular.parameters(), lr=0.1)
# LR Scheduler
deep_sch = lr_scheduler.StepLR(deep_opt, step_size=3)
trainer = Trainer(
model,
objective="binary",
lr_schedulers={"deeptabular": deep_sch},
initializers={"deeptabular": XavierNormal},
optimizers={"deeptabular": deep_opt},
metrics=[accuracy, precision], # , recall, f1],
verbose=1,
)
start = time.time()
trainer.fit(
X_train={"X_tab": X_tab_train, "target": y_train},
X_val={"X_tab": X_tab_valid, "target": y_valid},
n_epochs=1,
batch_size=32,
custom_dataloader=DataLoaderImbalanced,
oversample_mul=5,
)
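# Note: with DataLoaderImbalanced every epoch draws
# minor_class_count * num_classes * oversample_mul weighted samples (with
# replacement), so oversample_mul=5 above yields roughly five times the size
# of a fully balanced, undersampled epoch.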
print(
"Training time[s]: {}".format(
datetime.timedelta(seconds=round(time.time() - start))
)
)
pd.DataFrame(trainer.history)
df_pred = trainer.predict(X_tab=X_tab_test)
print(classification_report(df_test["target"].to_list(), df_pred))
print("Actual predicted values:\n{}".format(np.unique(df_pred, return_counts=True)))
......@@ -11,6 +11,7 @@ import numpy as np
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_widedeep.metrics import MultipleMetrics
from pytorch_widedeep.wdtypes import * # noqa: F403
......@@ -84,12 +85,6 @@ class CallbackContainer(object):
for callback in self.callbacks:
callback.on_train_begin(logs)
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
logs = logs or {}
for callback in self.callbacks:
callback.on_eval_begin(logs)
def on_train_end(self, logs: Optional[Dict] = None):
logs = logs or {}
# logs['final_loss'] = self.model.history.epoch_losses[-1],
......@@ -99,6 +94,7 @@ class CallbackContainer(object):
callback.on_train_end(logs)
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
logs = logs or {}
for callback in self.callbacks:
callback.on_eval_begin(logs)
......@@ -138,14 +134,11 @@ class Callback(object):
def on_train_begin(self, logs: Optional[Dict] = None):
pass
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
pass
def on_train_end(self, logs: Optional[Dict] = None):
pass
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
pass
......@@ -164,6 +157,8 @@ class History(Callback):
):
logs = logs or {}
for k, v in logs.items():
if isinstance(v, np.ndarray):
v = v.tolist()
self.trainer.history.setdefault(k, []).append(v)
......@@ -222,6 +217,23 @@ class LRShedulerCallback(Callback):
return model_name in self.trainer.lr_scheduler._schedulers
class MetricCallback(Callback):
r"""Callback that resets the metrics (if any metric is used)
This callback runs by default within :obj:`Trainer` and therefore should not
be passed to the :obj:`Trainer`. It is included here just for completeness.
"""
def __init__(self, container: MultipleMetrics):
self.container = container
def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None):
self.container.reset()
def on_eval_begin(self, logs: Optional[Dict] = None):
self.container.reset()
class LRHistory(Callback):
r"""Saves the learning rates during training to a ``lr_history`` attribute.
......
import numpy as np
from torch.utils.data import DataLoader, WeightedRandomSampler
from pytorch_widedeep.wdtypes import * # noqa: F403
from pytorch_widedeep.training._wd_dataset import WideDeepDataset
def get_class_weights(dataset: WideDeepDataset) -> Tuple[np.ndarray, int, int]:
"""Helper function to get weights of classes in the imbalanced dataset.
Parameters
----------
dataset: ``WideDeepDataset``
dataset containing target classes in dataset.Y
Returns
-------
weights: array
numpy array with weights
minor_class_count: int
count of samples in the smallest class for undersampling
num_classes: int
number of classes
"""
weights = 1 / np.unique(dataset.Y, return_counts=True)[1]
minor_class_count = min(np.unique(dataset.Y, return_counts=True)[1])
num_classes = len(np.unique(dataset.Y))
return weights, minor_class_count, num_classes
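# A worked toy example (hypothetical values, purely for illustration): if
# dataset.Y were [0, 0, 0, 1], the class counts would be [3, 1], so
# weights == [1/3, 1.0], minor_class_count == 1 and num_classes == 2.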
class DataLoaderDefault(DataLoader):
def __init__(self, dataset, batch_size, num_workers, **kwargs):
# num_workers must be passed by keyword: DataLoader's third positional
# argument is `shuffle`, not `num_workers`
super().__init__(dataset, batch_size, num_workers=num_workers)
class DataLoaderImbalanced(DataLoader):
r"""Class to load and shuffle batches with adjusted weights for imbalanced
datasets. If the classes do not begin from 0, remapping is necessary. See
`here <https://towardsdatascience.com/pytorch-tabular-multiclass-classification-9f8211a123ab>`_
Parameters
----------
dataset: ``WideDeepDataset``
see ``pytorch_widedeep.training._wd_dataset``
batch_size: int
size of batch
num_workers: int
number of workers
"""
def __init__(
self, dataset: WideDeepDataset, batch_size: int, num_workers: int, **kwargs
):
if "oversample_mul" in kwargs:
oversample_mul = kwargs["oversample_mul"]
else:
oversample_mul = 1
weights, minor_cls_cnt, num_clss = get_class_weights(dataset)
num_samples = int(minor_cls_cnt * num_clss * oversample_mul)
samples_weight = list(np.array([weights[i] for i in dataset.Y]))
sampler = WeightedRandomSampler(samples_weight, num_samples, replacement=True)
super().__init__(dataset, batch_size, num_workers=num_workers, sampler=sampler)
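# A minimal usage sketch (assuming `train_set` is an existing
# WideDeepDataset; in practice the Trainer builds this loader for you when
# the class is passed via the `custom_dataloader` argument of `fit`):
#
#     loader = DataLoaderImbalanced(
#         dataset=train_set, batch_size=32, num_workers=1, oversample_mul=2
#     )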
import numpy as np
import torch
from pytorch_widedeep.callbacks import Callback
from torchmetrics import Metric as TorchMetric
from .wdtypes import * # noqa: F403
......@@ -35,21 +35,19 @@ class MultipleMetrics(object):
def __call__(self, y_pred: Tensor, y_true: Tensor) -> Dict:
logs = {}
for metric in self._metrics:
logs[self.prefix + metric._name] = metric(y_pred, y_true)
if isinstance(metric, Metric):
logs[self.prefix + metric._name] = metric(y_pred, y_true)
if isinstance(metric, TorchMetric):
if metric.num_classes == 2:
metric.update(torch.round(y_pred).int(), y_true.int())
if metric.num_classes > 2: # type: ignore[operator]
metric.update(torch.max(y_pred, dim=1).indices, y_true.int()) # type: ignore[attr-defined]
logs[self.prefix + type(metric).__name__] = (
metric.compute().detach().cpu().numpy()
)
return logs
class MetricCallback(Callback):
def __init__(self, container: MultipleMetrics):
self.container = container
def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None):
self.container.reset()
def on_eval_begin(self, logs: Optional[Dict] = None):
self.container.reset()
class Accuracy(Metric):
r"""Class to calculate the accuracy for both binary and categorical problems
......@@ -69,13 +67,13 @@ class Accuracy(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> acc(y_pred, y_true)
0.5
array(0.5)
>>>
>>> acc = Accuracy(top_k=2)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> acc(y_pred, y_true)
0.6666666666666666
array(0.66666667)
"""
def __init__(self, top_k: int = 1):
......@@ -93,7 +91,7 @@ class Accuracy(Metric):
self.correct_count = 0
self.total_count = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
num_classes = y_pred.size(1)
if num_classes == 1:
......@@ -106,7 +104,7 @@ class Accuracy(Metric):
self.correct_count += y_pred.eq(y_true).sum().item() # type: ignore[assignment]
self.total_count += len(y_pred)
accuracy = float(self.correct_count) / float(self.total_count)
return accuracy
return np.array(accuracy)
class Precision(Metric):
......@@ -128,13 +126,13 @@ class Precision(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> prec(y_pred, y_true)
0.5
array(0.5)
>>>
>>> prec = Precision(average=True)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> prec(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, average: bool = True):
......@@ -153,7 +151,7 @@ class Precision(Metric):
self.true_positives = 0
self.all_positives = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
num_class = y_pred.size(1)
if num_class == 1:
......@@ -170,9 +168,9 @@ class Precision(Metric):
precision = self.true_positives / (self.all_positives + self.eps)
if self.average:
return precision.mean().item() # type:ignore
return np.array(precision.mean().item()) # type:ignore
else:
return precision
return precision.detach().cpu().numpy() # type: ignore[attr-defined]
class Recall(Metric):
......@@ -194,13 +192,13 @@ class Recall(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> rec(y_pred, y_true)
0.5
array(0.5)
>>>
>>> rec = Recall(average=True)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> rec(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, average: bool = True):
......@@ -219,7 +217,7 @@ class Recall(Metric):
self.true_positives = 0
self.actual_positives = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
num_class = y_pred.size(1)
if num_class == 1:
......@@ -236,9 +234,9 @@ class Recall(Metric):
recall = self.true_positives / (self.actual_positives + self.eps)
if self.average:
return recall.mean().item() # type:ignore
return np.array(recall.mean().item()) # type:ignore
else:
return recall
return recall.detach().cpu().numpy() # type: ignore[attr-defined]
class FBetaScore(Metric):
......@@ -264,13 +262,13 @@ class FBetaScore(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> fbeta(y_pred, y_true)
0.5
array(0.5)
>>>
>>> fbeta = FBetaScore(beta=2)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> fbeta(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, beta: int, average: bool = True):
......@@ -290,7 +288,7 @@ class FBetaScore(Metric):
self.precision.reset()
self.recall.reset()
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
prec = self.precision(y_pred, y_true)
rec = self.recall(y_pred, y_true)
......@@ -299,7 +297,7 @@ class FBetaScore(Metric):
fbeta = ((1 + beta2) * prec * rec) / (beta2 * prec + rec + self.eps)
if self.average:
return fbeta.mean().item() # type: ignore[attr-defined]
return np.array(fbeta.mean().item()) # type: ignore[attr-defined]
else:
return fbeta
......@@ -323,13 +321,13 @@ class F1Score(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> f1(y_pred, y_true)
0.5
array(0.5)
>>>
>>> f1 = F1Score()
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> f1(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, average: bool = True):
......@@ -345,7 +343,7 @@ class F1Score(Metric):
"""
self.f1.reset()
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
return self.f1(y_pred, y_true)
......@@ -369,7 +367,7 @@ class R2Score(Metric):
>>> y_true = torch.tensor([3, -0.5, 2, 7]).view(-1, 1)
>>> y_pred = torch.tensor([2.5, 0.0, 2, 8]).view(-1, 1)
>>> r2(y_pred, y_true)
0.9486081370449679
array(0.94860814)
"""
def __init__(self):
......@@ -389,7 +387,7 @@ class R2Score(Metric):
self.num_examples = 0
self.y_true_sum = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
self.numerator += ((y_pred - y_true) ** 2).sum().item()
......@@ -397,4 +395,4 @@ class R2Score(Metric):
self.y_true_sum += y_true.sum().item()
y_true_avg = self.y_true_sum / self.num_examples
self.denominator += ((y_true - y_true_avg) ** 2).sum().item()
return 1 - (self.numerator / self.denominator)
return np.array((1 - (self.numerator / self.denominator)))
......@@ -8,17 +8,20 @@ import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
from scipy.sparse import csc_matrix
from torchmetrics import Metric as TorchMetric
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_widedeep.metrics import Metric, MetricCallback, MultipleMetrics
from pytorch_widedeep.metrics import Metric, MultipleMetrics
from pytorch_widedeep.wdtypes import * # noqa: F403
from pytorch_widedeep.callbacks import (
History,
Callback,
MetricCallback,
CallbackContainer,
LRShedulerCallback,
)
from pytorch_widedeep.dataloaders import DataLoaderDefault
from pytorch_widedeep.initializers import Initializer, MultipleInitializer
from pytorch_widedeep.training._finetune import FineTune
from pytorch_widedeep.training._wd_dataset import WideDeepDataset
......@@ -82,7 +85,7 @@ class Trainer:
function. See for example
:class:`pytorch_widedeep.losses.FocalLoss` for the required
structure of the object or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo.
.. note:: If ``custom_loss_function`` is not None, ``objective`` must be
......@@ -125,16 +128,22 @@ class Trainer:
callbacks are used by default. This can also be a custom callback as
long as it is an object of type ``Callback``. See
:obj:`pytorch_widedeep.callbacks.Callback` or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
metrics: List, optional, default=None
List of objects of type :obj:`Metric`. Metrics available are:
``Accuracy``, ``Precision``, ``Recall``, ``FBetaScore``,
``F1Score`` and ``R2Score``. This can also be a custom metric as
long as it is an object of type :obj:`Metric`. See
:obj:`pytorch_widedeep.metrics.Metric` or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
folder in the repo
- List of objects of type :obj:`Metric`. Metrics available are:
``Accuracy``, ``Precision``, ``Recall``, ``FBetaScore``,
``F1Score`` and ``R2Score``. This can also be a custom metric as
long as it is an object of type :obj:`Metric`. See
:obj:`pytorch_widedeep.metrics.Metric` or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
- List of objects of type :obj:`torchmetrics.Metric`. This can be any
metric from the torchmetrics library, e.g. its `classification metrics
<https://torchmetrics.readthedocs.io/en/latest/references/modules.html#classification-metrics>`_.
This can also be a custom metric, as long as it is an object of type
:obj:`torchmetrics.Metric`. See the `torchmetrics documentation
<https://torchmetrics.readthedocs.io/en/latest/>`_.
class_weight: float, List or Tuple, optional, default=None
- float indicating the weight of the minority class in binary classification
problems (e.g. 9.)
......@@ -227,7 +236,7 @@ class Trainer:
initializers: Optional[Union[Initializer, Dict[str, Initializer]]] = None,
transforms: Optional[List[Transforms]] = None,
callbacks: Optional[List[Callback]] = None,
metrics: Optional[List[Metric]] = None,
metrics: Optional[Union[List[Metric], List[TorchMetric]]] = None,
class_weight: Optional[Union[float, List[float], Tuple[float]]] = None,
lambda_sparse: float = 1e-3,
alpha: float = 0.25,
......@@ -315,6 +324,7 @@ class Trainer:
n_epochs: int = 1,
validation_freq: int = 1,
batch_size: int = 32,
custom_dataloader: Union[DataLoader, None] = None,
finetune: bool = False,
finetune_epochs: int = 5,
finetune_max_lr: float = 0.01,
......@@ -329,6 +339,7 @@ class Trainer:
finetune_deepimage_layers: Optional[List[nn.Module]] = None,
finetune_routine: str = "howard",
stop_after_finetuning: bool = False,
**kwargs,
):
r"""Fit method.
......@@ -368,6 +379,10 @@ class Trainer:
epochs validation frequency
batch_size: int, default=32
batch size
custom_dataloader: ``DataLoader``, Optional, default=None
object of class ``torch.utils.data.DataLoader``. Available
predefined dataloaders are in ``pytorch_widedeep.dataloaders``. If
``None``, a standard torch ``DataLoader`` is used.
finetune: bool, default=False
param alias: ``warmup``
......@@ -399,7 +414,7 @@ class Trainer:
For details on how these routines work, please see the Examples
section in this documentation and the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo.
finetune_epochs: int, default=4
param alias: ``warmup_epochs``
......@@ -477,7 +492,7 @@ class Trainer:
--------
For a series of comprehensive examples please, see the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
For completion, here we include some `"fabricated"` examples, i.e.
......@@ -524,9 +539,25 @@ class Trainer:
val_split,
target,
)
train_loader = DataLoader(
dataset=train_set, batch_size=batch_size, num_workers=n_cpus
)
if isinstance(custom_dataloader, type):
if issubclass(custom_dataloader, DataLoader):
train_loader = custom_dataloader(
dataset=train_set,
batch_size=batch_size,
num_workers=n_cpus,
**kwargs,
)
else:
raise NotImplementedError(
"Custom DataLoader must be a subclass of "
"torch.utils.data.DataLoader, please see the "
"pytorch documentation or examples in "
"pytorch_widedeep.dataloaders"
)
else:
train_loader = DataLoaderDefault(
dataset=train_set, batch_size=batch_size, num_workers=n_cpus
)
train_steps = len(train_loader)
if eval_set is not None:
eval_loader = DataLoader(
......@@ -740,7 +771,7 @@ class Trainer:
--------
For a series of comprehensive examples please, see the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
For completeness, here we include a `"fabricated"` example, i.e.
......@@ -827,7 +858,7 @@ class Trainer:
save_state_dict: bool = False,
model_filename: str = "wd_model.pt",
):
"""Saves the model, training and evaluation history, and the
r"""Saves the model, training and evaluation history, and the
``feature_importance`` attribute (if the ``deeptabular`` component is a
Tabnet model) to disk
......
......@@ -156,7 +156,9 @@ def print_loss_and_metric(pb: tqdm, loss: float, score: Dict):
"""
if score is not None:
pb.set_postfix(
metrics={k: np.round(v, 4) for k, v in score.items()},
metrics={
k: np.round(v.astype(float), 4).tolist() for k, v in score.items()
},
loss=loss,
)
else:
......
......@@ -64,6 +64,7 @@ setup_kwargs = {
"torchvision",
"einops",
"wrapt",
"torchmetrics",
],
"extras_require": extras,
"python_requires": ">=3.6.0",
......
......@@ -115,6 +115,26 @@ def test_muticlass_metrics(sklearn_metric, widedeep_metric):
)
###############################################################################
# Test multiclass metrics without average
###############################################################################
@pytest.mark.parametrize(
"sklearn_metric, widedeep_metric",
[
(precision_score, Precision(average=False)),
(recall_score, Recall(average=False)),
(f1_score, F1Score(average=False)),
(f2_score_multi, FBetaScore(beta=2, average=False)),
],
)
def test_muticlass_metrics_without_average(sklearn_metric, widedeep_metric):
skm = sklearn_metric(
y_true_multi_np, y_pred_muli_np.argmax(axis=1), average="macro"
)
wdm = widedeep_metric(y_pred_multi_pt, y_true_multi_pt)
assert np.isclose(skm, np.mean(wdm)) and wdm.shape[0] == 3
###############################################################################
# Test the reset method
###############################################################################
......
import numpy as np
import torch
import pytest
from torchmetrics import F1, FBeta, Recall, Accuracy, Precision
from sklearn.metrics import (
f1_score,
fbeta_score,
recall_score,
accuracy_score,
precision_score,
)
from pytorch_widedeep.metrics import MultipleMetrics
def f2_score_bin(y_true, y_pred):
return fbeta_score(y_true, y_pred, beta=2)
y_true_bin_np = np.array([1, 0, 0, 0, 1, 1, 0]).reshape((-1, 1))
y_pred_bin_np = np.array([0.6, 0.3, 0.2, 0.8, 0.4, 0.9, 0.6]).reshape((-1, 1))
y_true_bin_pt = torch.from_numpy(y_true_bin_np)
y_pred_bin_pt = torch.from_numpy(y_pred_bin_np)
###############################################################################
# Test binary metrics
###############################################################################
@pytest.mark.parametrize(
"metric_name, sklearn_metric, torch_metric",
[
("Accuracy", accuracy_score, Accuracy(num_classes=2)),
("Precision", precision_score, Precision(num_classes=2, average="none")),
("Recall", recall_score, Recall(num_classes=2, average="none")),
("F1", f1_score, F1(num_classes=2, average="none")),
("FBeta", f2_score_bin, FBeta(beta=2, num_classes=2, average="none")),
],
)
def test_binary_metrics(metric_name, sklearn_metric, torch_metric):
sk_res = sklearn_metric(y_true_bin_np, y_pred_bin_np.round())
wd_metric = MultipleMetrics(metrics=[torch_metric])
wd_logs = wd_metric(y_pred_bin_pt, y_true_bin_pt)
wd_res = wd_logs[metric_name]
if wd_res.size != 1:
wd_res = wd_res[1]
assert np.isclose(sk_res, wd_res)
###############################################################################
# Test multiclass metrics
###############################################################################
y_true_multi_np = np.array([1, 0, 2, 1, 1, 2, 2, 0, 0, 0])
y_pred_muli_np = np.array(
[
[0.2, 0.6, 0.2],
[0.4, 0.5, 0.1],
[0.1, 0.1, 0.8],
[0.1, 0.6, 0.3],
[0.1, 0.8, 0.1],
[0.1, 0.6, 0.6],
[0.2, 0.6, 0.8],
[0.6, 0.1, 0.3],
[0.7, 0.2, 0.1],
[0.1, 0.7, 0.2],
]
)
y_true_multi_pt = torch.from_numpy(y_true_multi_np)
y_pred_multi_pt = torch.from_numpy(y_pred_muli_np)
def f2_score_multi(y_true, y_pred, average):
return fbeta_score(y_true, y_pred, average=average, beta=2)
@pytest.mark.parametrize(
"metric_name, sklearn_metric, torch_metric",
[
("Accuracy", accuracy_score, Accuracy(num_classes=3, average="micro")),
("Precision", precision_score, Precision(num_classes=3, average="macro")),
("Recall", recall_score, Recall(num_classes=3, average="macro")),
("F1", f1_score, F1(num_classes=3, average="macro")),
("FBeta", f2_score_multi, FBeta(beta=3, num_classes=3, average="macro")),
],
)
def test_muticlass_metrics(metric_name, sklearn_metric, torch_metric):
if metric_name == "Accuracy":
sk_res = sklearn_metric(y_true_multi_np, y_pred_muli_np.argmax(axis=1))
else:
sk_res = sklearn_metric(
y_true_multi_np, y_pred_muli_np.argmax(axis=1), average="macro"
)
wd_metric = MultipleMetrics(metrics=[torch_metric])
wd_logs = wd_metric(y_pred_multi_pt, y_true_multi_pt)
wd_res = wd_logs[metric_name]
assert np.isclose(sk_res, wd_res, atol=0.01)
......@@ -14,6 +14,7 @@ from pytorch_widedeep.models import (
)
from pytorch_widedeep.metrics import R2Score
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.dataloaders import DataLoaderImbalanced
# Wide array
X_wide = np.random.choice(50, (32, 10))
......@@ -30,6 +31,7 @@ X_tab = np.vstack(embed_cols + cont_cols).transpose()
# Target
target_regres = np.random.random(32)
target_binary = np.random.choice(2, 32)
target_binary_imbalanced = np.random.choice(2, 32, p=[0.75, 0.25])
target_multic = np.random.choice(3, 32)
# Test dictionary
......@@ -234,3 +236,30 @@ def test_aliases():
and trainer.__wd_aliases_used["objective"] == "loss"
and trainer.__wd_aliases_used["finetune"] == "warmup"
)
##############################################################################
# Test custom dataloader
##############################################################################
def test_custom_dataloader():
wide = Wide(np.unique(X_wide).shape[0], 1)
deeptabular = TabMlp(
mlp_hidden_dims=[32, 16],
mlp_dropout=[0.5, 0.5],
column_idx=column_idx,
embed_input=embed_input,
continuous_cols=colnames[-5:],
)
model = WideDeep(wide=wide, deeptabular=deeptabular)
trainer = Trainer(model, loss="binary", verbose=0)
trainer.fit(
X_wide=X_wide,
X_tab=X_tab,
target=target_binary_imbalanced,
batch_size=16,
custom_dataloader=DataLoaderImbalanced,
)
# simply checking that runs with DataLoaderImbalanced
assert "train_loss" in trainer.history.keys()