Unverified commit 8e110a93, authored by J Javier, committed by GitHub

Merge pull request #42 from jrzaurin/imbalanced_dataloader

Added option for custom dataloader and the possibility of using metrics from torchmetrics
......@@ -17,7 +17,7 @@ matrix:
# Stop the build if there are Python syntax errors or undefined names
- flake8 . --count --select=E901,E999,F821,F822,F823 --ignore=E266 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901 --statistics
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,F811,W503,C901 --statistics
after_success: skip
install:
- pip install --upgrade pip
......
......@@ -3,6 +3,6 @@ isort --quiet . pytorch_widedeep tests examples setup.py
# Black code style
black . pytorch_widedeep tests examples setup.py
# flake8 standards
flake8 . --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901
flake8 . --max-complexity=10 --max-line-length=127 --ignore=E203,E266,E501,E722,F401,F403,F405,W503,C901,F811
# mypy
mypy pytorch_widedeep --ignore-missing-imports --no-strict-optional
\ No newline at end of file
......@@ -13,6 +13,9 @@ Here are the 4 callbacks available in ``pytorch-widedeep``: ``History``,
.. autoclass:: pytorch_widedeep.callbacks.LRShedulerCallback
:members:
.. autoclass:: pytorch_widedeep.callbacks.MetricCallback
:members:
.. autoclass:: pytorch_widedeep.callbacks.LRHistory
:members:
......
Dataloaders
===========
.. note:: This module is intended to contain custom dataloaders that the user
    might want to implement. At the moment, ``pytorch-widedeep`` offers one
    custom dataloader, ``DataLoaderImbalanced``.
.. autoclass:: pytorch_widedeep.dataloaders.DataLoaderImbalanced
:members:
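As a usage sketch (assuming the data has been preprocessed and a ``Trainer``
built, as in the Examples folder), the class itself, not an instance, is
passed to the ``fit`` method together with any extra arguments such as
``oversample_mul``:

.. code-block:: python

    from pytorch_widedeep.dataloaders import DataLoaderImbalanced

    trainer.fit(
        X_train={"X_tab": X_tab_train, "target": y_train},
        batch_size=32,
        custom_dataloader=DataLoaderImbalanced,
        oversample_mul=5,
    )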
......@@ -14,3 +14,4 @@ them to address different problems
* `FineTune routines <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/06_FineTune_and_WarmUp_Model_Components.ipynb>`__
* `Custom Components <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/07_Custom_Components.ipynb>`__
* `Save and Load Model and Artifacts <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/08_save_and_load_model_and_artifacts.ipynb>`__
* `Using Custom DataLoaders and Torchmetrics <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples/09_Custom_DataLoader_Imbalanced_dataset.ipynb>`__
......@@ -20,6 +20,7 @@ Documentation
Model Components <model_components>
Metrics <metrics>
Losses <losses>
Dataloaders <dataloaders>
Callbacks <callbacks>
The Trainer <trainer>
Examples <examples>
......
......@@ -40,4 +40,5 @@ Dependencies
* torch
* torchvision
* einops
* wrapt
\ No newline at end of file
* wrapt
* torchmetrics
\ No newline at end of file
......@@ -7,6 +7,26 @@ Metrics
ground truth is expected to be a 1D tensor with the corresponding classes.
See Examples below
We have added the possibility of using the metrics available in the
`torchmetrics <https://torchmetrics.readthedocs.io/en/latest/>`_ library.
Note that this library is still in its early versions and therefore this
option should be used with caution. To use ``torchmetrics``, simply import
the metrics and use them like any of the ``pytorch-widedeep`` metrics
described below:
.. code-block:: python

    from torchmetrics import Accuracy, Precision

    accuracy = Accuracy(average=None, num_classes=2)
    precision = Precision(average='micro', num_classes=2)

    trainer = Trainer(model, objective="binary", metrics=[accuracy, precision])
A functioning example for ``pytorch-widedeep`` using ``torchmetrics`` can be
found in the `Examples folder <https://github.com/jrzaurin/pytorch-widedeep/blob/master/examples>`_.
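Under the hood, binary ``torchmetrics`` objects are updated with the rounded
predictions, while multiclass ones are updated with the argmax over the class
dimension. Below is a simplified sketch of the dispatch logic in
``pytorch_widedeep.metrics.MultipleMetrics`` (the helper name
``update_torch_metric`` is ours, purely for illustration):

.. code-block:: python

    import torch
    from torchmetrics import Metric as TorchMetric

    def update_torch_metric(metric: TorchMetric, y_pred, y_true):
        # binary case: round probabilities to hard 0/1 labels
        if metric.num_classes == 2:
            metric.update(torch.round(y_pred).int(), y_true.int())
        # multiclass case: take the class with the highest score
        elif metric.num_classes > 2:
            metric.update(torch.max(y_pred, dim=1).indices, y_true.int())
        return metric.compute().detach().cpu().numpy()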
.. autoclass:: pytorch_widedeep.metrics.Accuracy
:members:
:undoc-members:
......
......@@ -16,4 +16,5 @@ tqdm
torch
torchvision
einops
wrapt
\ No newline at end of file
wrapt
torchmetrics
\ No newline at end of file
Training wide and deep models for tabular data
==============================================
===============================================
`...` or just deep learning models for tabular data.
......
......@@ -841,9 +841,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
This diff is collapsed.
......@@ -11,6 +11,8 @@ from pytorch_widedeep.models import ( # noqa: F401
TabResnet,
)
from pytorch_widedeep.metrics import Accuracy, Precision
# from torchmetrics import Accuracy as accuracy_score
from pytorch_widedeep.callbacks import (
LRHistory,
EarlyStopping,
......@@ -94,6 +96,7 @@ if __name__ == "__main__":
schedulers = {"wide": wide_sch, "deeptabular": deep_sch}
initializers = {"wide": KaimingNormal, "deeptabular": XavierNormal}
callbacks = [early_stopping, model_checkpoint, LRHistory(n_epochs=10)]
# metrics = [Accuracy, accuracy_score(num_classes=2), Precision]
metrics = [Accuracy, Precision]
trainer = Trainer(
......
......@@ -4,7 +4,7 @@ import pandas as pd
import pytorch_widedeep as wd
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import F1Score, Accuracy
from pytorch_widedeep.metrics import F1Score, Precision
from pytorch_widedeep.preprocessing import TabPreprocessor, WidePreprocessor
use_cuda = torch.cuda.is_available()
......@@ -53,7 +53,10 @@ if __name__ == "__main__":
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)
trainer = wd.Trainer(
model, objective="multiclass", metrics=[Accuracy, F1Score], optimizers=optimizer
model,
objective="multiclass",
metrics=[Precision(average=False), F1Score],
optimizers=optimizer,
)
trainer.fit(
......
import time
import datetime
import warnings
import numpy as np
import pandas as pd
from torch.optim import SGD, lr_scheduler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from pytorch_widedeep import Trainer
from pytorch_widedeep.models import TabMlp, WideDeep
from pytorch_widedeep.metrics import Recall, F1Score, Accuracy, Precision
from pytorch_widedeep.dataloaders import DataLoaderImbalanced
from pytorch_widedeep.initializers import XavierNormal
from pytorch_widedeep.preprocessing import TabPreprocessor
warnings.filterwarnings("ignore", category=DeprecationWarning)
# increase displayed columns in jupyter notebook
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)
header_list = ["EXAMPLE_ID", "BLOCK_ID", "target"] + [str(i) for i in range(4, 78)]
df = pd.read_csv("data/kddcup04/bio_train.dat", sep="\t", names=header_list)
df.head()
# drop columns we won't need in this example
df.drop(columns=["EXAMPLE_ID", "BLOCK_ID"], inplace=True)
df_train, df_valid = train_test_split(
df, test_size=0.2, stratify=df["target"], random_state=1
)
df_valid, df_test = train_test_split(
df_valid, test_size=0.5, stratify=df_valid["target"], random_state=1
)
continuous_cols = df.drop(columns=["target"]).columns.values.tolist()
# deeptabular
tab_preprocessor = TabPreprocessor(continuous_cols=continuous_cols, scale=True)
X_tab_train = tab_preprocessor.fit_transform(df_train)
X_tab_valid = tab_preprocessor.transform(df_valid)
X_tab_test = tab_preprocessor.transform(df_test)
# target
y_train = df_train["target"].values
y_valid = df_valid["target"].values
y_test = df_test["target"].values
# Define the model
input_layer = len(tab_preprocessor.continuous_cols)
output_layer = 1
hidden_layers = np.linspace(
input_layer * 2, output_layer, 5, endpoint=False, dtype=int
).tolist()
deeptabular = TabMlp(
mlp_hidden_dims=hidden_layers,
column_idx=tab_preprocessor.column_idx,
continuous_cols=tab_preprocessor.continuous_cols,
)
model = WideDeep(deeptabular=deeptabular)
model
# Metrics from pytorch-widedeep
accuracy = Accuracy(top_k=2)
precision = Precision(average=False)
recall = Recall(average=True)
f1 = F1Score(average=False)
# Optimizers
deep_opt = SGD(model.deeptabular.parameters(), lr=0.1)
# LR Scheduler
deep_sch = lr_scheduler.StepLR(deep_opt, step_size=3)
trainer = Trainer(
model,
objective="binary",
lr_schedulers={"deeptabular": deep_sch},
initializers={"deeptabular": XavierNormal},
optimizers={"deeptabular": deep_opt},
metrics=[accuracy, precision], # , recall, f1],
verbose=1,
)
start = time.time()
trainer.fit(
X_train={"X_tab": X_tab_train, "target": y_train},
X_val={"X_tab": X_tab_valid, "target": y_valid},
n_epochs=1,
batch_size=32,
custom_dataloader=DataLoaderImbalanced,
oversample_mul=5,
)
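# Note: with DataLoaderImbalanced every epoch draws
# minor_class_count * num_classes * oversample_mul weighted samples (with
# replacement), so oversample_mul=5 above yields roughly five times the size
# of a fully balanced, undersampled epoch.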
print(
"Training time[s]: {}".format(
datetime.timedelta(seconds=round(time.time() - start))
)
)
pd.DataFrame(trainer.history)
df_pred = trainer.predict(X_tab=X_tab_test)
print(classification_report(df_test["target"].to_list(), df_pred))
print("Actual predicted values:\n{}".format(np.unique(df_pred, return_counts=True)))
......@@ -11,6 +11,7 @@ import numpy as np
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_widedeep.metrics import MultipleMetrics
from pytorch_widedeep.wdtypes import * # noqa: F403
......@@ -84,12 +85,6 @@ class CallbackContainer(object):
for callback in self.callbacks:
callback.on_train_begin(logs)
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
logs = logs or {}
for callback in self.callbacks:
callback.on_eval_begin(logs)
def on_train_end(self, logs: Optional[Dict] = None):
logs = logs or {}
# logs['final_loss'] = self.model.history.epoch_losses[-1],
......@@ -99,6 +94,7 @@ class CallbackContainer(object):
callback.on_train_end(logs)
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
logs = logs or {}
for callback in self.callbacks:
callback.on_eval_begin(logs)
......@@ -138,14 +134,11 @@ class Callback(object):
def on_train_begin(self, logs: Optional[Dict] = None):
pass
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
pass
def on_train_end(self, logs: Optional[Dict] = None):
pass
def on_eval_begin(self, logs: Optional[Dict] = None):
# at the moment only used to reset metrics before eval
pass
......@@ -164,6 +157,8 @@ class History(Callback):
):
logs = logs or {}
for k, v in logs.items():
if isinstance(v, np.ndarray):
v = v.tolist()
self.trainer.history.setdefault(k, []).append(v)
......@@ -222,6 +217,23 @@ class LRShedulerCallback(Callback):
return model_name in self.trainer.lr_scheduler._schedulers
class MetricCallback(Callback):
r"""Callback that resets the metrics (if any metric is used)
This callback runs by default within :obj:`Trainer` and therefore should not
be passed to the :obj:`Trainer`. It is included here just for completeness.
"""
def __init__(self, container: MultipleMetrics):
self.container = container
def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None):
self.container.reset()
def on_eval_begin(self, logs: Optional[Dict] = None):
self.container.reset()
class LRHistory(Callback):
r"""Saves the learning rates during training to a ``lr_history`` attribute.
......
import numpy as np
from torch.utils.data import DataLoader, WeightedRandomSampler
from pytorch_widedeep.wdtypes import * # noqa: F403
from pytorch_widedeep.training._wd_dataset import WideDeepDataset
def get_class_weights(dataset: WideDeepDataset) -> Tuple[np.ndarray, int, int]:
"""Helper function to get weights of classes in the imbalanced dataset.
Parameters
----------
dataset: ``WideDeepDataset``
dataset containing target classes in dataset.Y
Returns
-------
weights: array
numpy array with weights
minor_class_count: int
count of samples in the smallest class for undersampling
num_classes: int
number of classes
"""
weights = 1 / np.unique(dataset.Y, return_counts=True)[1]
minor_class_count = min(np.unique(dataset.Y, return_counts=True)[1])
num_classes = len(np.unique(dataset.Y))
return weights, minor_class_count, num_classes
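# A worked toy example (hypothetical values, purely for illustration): if
# dataset.Y were [0, 0, 0, 1], the class counts would be [3, 1], so
# weights == [1/3, 1.0], minor_class_count == 1 and num_classes == 2.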
class DataLoaderDefault(DataLoader):
def __init__(self, dataset, batch_size, num_workers, **kwargs):
# num_workers must be passed by keyword: DataLoader's third positional
# argument is `shuffle`, not `num_workers`
super().__init__(dataset, batch_size, num_workers=num_workers)
class DataLoaderImbalanced(DataLoader):
r"""Class to load and shuffle batches with adjusted weights for imbalanced
datasets. If the classes do not begin from 0, remapping is necessary. See
`here <https://towardsdatascience.com/pytorch-tabular-multiclass-classification-9f8211a123ab>`_
Parameters
----------
dataset: ``WideDeepDataset``
see ``pytorch_widedeep.training._wd_dataset``
batch_size: int
size of batch
num_workers: int
number of workers
"""
def __init__(
self, dataset: WideDeepDataset, batch_size: int, num_workers: int, **kwargs
):
if "oversample_mul" in kwargs:
oversample_mul = kwargs["oversample_mul"]
else:
oversample_mul = 1
weights, minor_cls_cnt, num_clss = get_class_weights(dataset)
num_samples = int(minor_cls_cnt * num_clss * oversample_mul)
samples_weight = list(np.array([weights[i] for i in dataset.Y]))
sampler = WeightedRandomSampler(samples_weight, num_samples, replacement=True)
super().__init__(dataset, batch_size, num_workers=num_workers, sampler=sampler)
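# A minimal usage sketch (assuming `train_set` is an existing
# WideDeepDataset; in practice the Trainer builds this loader for you when
# the class is passed via the `custom_dataloader` argument of `fit`):
#
#     loader = DataLoaderImbalanced(
#         dataset=train_set, batch_size=32, num_workers=1, oversample_mul=2
#     )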
import numpy as np
import torch
from pytorch_widedeep.callbacks import Callback
from torchmetrics import Metric as TorchMetric
from .wdtypes import * # noqa: F403
......@@ -35,21 +35,19 @@ class MultipleMetrics(object):
def __call__(self, y_pred: Tensor, y_true: Tensor) -> Dict:
logs = {}
for metric in self._metrics:
logs[self.prefix + metric._name] = metric(y_pred, y_true)
if isinstance(metric, Metric):
logs[self.prefix + metric._name] = metric(y_pred, y_true)
if isinstance(metric, TorchMetric):
if metric.num_classes == 2:
metric.update(torch.round(y_pred).int(), y_true.int())
if metric.num_classes > 2: # type: ignore[operator]
metric.update(torch.max(y_pred, dim=1).indices, y_true.int()) # type: ignore[attr-defined]
logs[self.prefix + type(metric).__name__] = (
metric.compute().detach().cpu().numpy()
)
return logs
class MetricCallback(Callback):
def __init__(self, container: MultipleMetrics):
self.container = container
def on_epoch_begin(self, epoch: int, logs: Optional[Dict] = None):
self.container.reset()
def on_eval_begin(self, logs: Optional[Dict] = None):
self.container.reset()
class Accuracy(Metric):
r"""Class to calculate the accuracy for both binary and categorical problems
......@@ -69,13 +67,13 @@ class Accuracy(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> acc(y_pred, y_true)
0.5
array(0.5)
>>>
>>> acc = Accuracy(top_k=2)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> acc(y_pred, y_true)
0.6666666666666666
array(0.66666667)
"""
def __init__(self, top_k: int = 1):
......@@ -93,7 +91,7 @@ class Accuracy(Metric):
self.correct_count = 0
self.total_count = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
num_classes = y_pred.size(1)
if num_classes == 1:
......@@ -106,7 +104,7 @@ class Accuracy(Metric):
self.correct_count += y_pred.eq(y_true).sum().item() # type: ignore[assignment]
self.total_count += len(y_pred)
accuracy = float(self.correct_count) / float(self.total_count)
return accuracy
return np.array(accuracy)
class Precision(Metric):
......@@ -128,13 +126,13 @@ class Precision(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> prec(y_pred, y_true)
0.5
array(0.5)
>>>
>>> prec = Precision(average=True)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> prec(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, average: bool = True):
......@@ -153,7 +151,7 @@ class Precision(Metric):
self.true_positives = 0
self.all_positives = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
num_class = y_pred.size(1)
if num_class == 1:
......@@ -170,9 +168,9 @@ class Precision(Metric):
precision = self.true_positives / (self.all_positives + self.eps)
if self.average:
return precision.mean().item() # type:ignore
return np.array(precision.mean().item()) # type:ignore
else:
return precision
return precision.detach().cpu().numpy() # type: ignore[attr-defined]
class Recall(Metric):
......@@ -194,13 +192,13 @@ class Recall(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> rec(y_pred, y_true)
0.5
array(0.5)
>>>
>>> rec = Recall(average=True)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> rec(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, average: bool = True):
......@@ -219,7 +217,7 @@ class Recall(Metric):
self.true_positives = 0
self.actual_positives = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
num_class = y_pred.size(1)
if num_class == 1:
......@@ -236,9 +234,9 @@ class Recall(Metric):
recall = self.true_positives / (self.actual_positives + self.eps)
if self.average:
return recall.mean().item() # type:ignore
return np.array(recall.mean().item()) # type:ignore
else:
return recall
return recall.detach().cpu().numpy() # type: ignore[attr-defined]
class FBetaScore(Metric):
......@@ -264,13 +262,13 @@ class FBetaScore(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> fbeta(y_pred, y_true)
0.5
array(0.5)
>>>
>>> fbeta = FBetaScore(beta=2)
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> fbeta(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, beta: int, average: bool = True):
......@@ -290,7 +288,7 @@ class FBetaScore(Metric):
self.precision.reset()
self.recall.reset()
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
prec = self.precision(y_pred, y_true)
rec = self.recall(y_pred, y_true)
......@@ -299,7 +297,7 @@ class FBetaScore(Metric):
fbeta = ((1 + beta2) * prec * rec) / (beta2 * prec + rec + self.eps)
if self.average:
return fbeta.mean().item() # type: ignore[attr-defined]
return np.array(fbeta.mean().item()) # type: ignore[attr-defined]
else:
return fbeta
......@@ -323,13 +321,13 @@ class F1Score(Metric):
>>> y_true = torch.tensor([0, 1, 0, 1]).view(-1, 1)
>>> y_pred = torch.tensor([[0.3, 0.2, 0.6, 0.7]]).view(-1, 1)
>>> f1(y_pred, y_true)
0.5
array(0.5)
>>>
>>> f1 = F1Score()
>>> y_true = torch.tensor([0, 1, 2])
>>> y_pred = torch.tensor([[0.7, 0.1, 0.2], [0.1, 0.1, 0.8], [0.1, 0.5, 0.4]])
>>> f1(y_pred, y_true)
0.3333333432674408
array(0.33333334)
"""
def __init__(self, average: bool = True):
......@@ -345,7 +343,7 @@ class F1Score(Metric):
"""
self.f1.reset()
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
return self.f1(y_pred, y_true)
......@@ -369,7 +367,7 @@ class R2Score(Metric):
>>> y_true = torch.tensor([3, -0.5, 2, 7]).view(-1, 1)
>>> y_pred = torch.tensor([2.5, 0.0, 2, 8]).view(-1, 1)
>>> r2(y_pred, y_true)
0.9486081370449679
array(0.94860814)
"""
def __init__(self):
......@@ -389,7 +387,7 @@ class R2Score(Metric):
self.num_examples = 0
self.y_true_sum = 0
def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
def __call__(self, y_pred: Tensor, y_true: Tensor) -> np.ndarray:
self.numerator += ((y_pred - y_true) ** 2).sum().item()
......@@ -397,4 +395,4 @@ class R2Score(Metric):
self.y_true_sum += y_true.sum().item()
y_true_avg = self.y_true_sum / self.num_examples
self.denominator += ((y_true - y_true_avg) ** 2).sum().item()
return 1 - (self.numerator / self.denominator)
return np.array((1 - (self.numerator / self.denominator)))
......@@ -8,17 +8,20 @@ import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
from scipy.sparse import csc_matrix
from torchmetrics import Metric as TorchMetric
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_widedeep.metrics import Metric, MetricCallback, MultipleMetrics
from pytorch_widedeep.metrics import Metric, MultipleMetrics
from pytorch_widedeep.wdtypes import * # noqa: F403
from pytorch_widedeep.callbacks import (
History,
Callback,
MetricCallback,
CallbackContainer,
LRShedulerCallback,
)
from pytorch_widedeep.dataloaders import DataLoaderDefault
from pytorch_widedeep.initializers import Initializer, MultipleInitializer
from pytorch_widedeep.training._finetune import FineTune
from pytorch_widedeep.training._wd_dataset import WideDeepDataset
......@@ -82,7 +85,7 @@ class Trainer:
function. See for example
:class:`pytorch_widedeep.losses.FocalLoss` for the required
structure of the object or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo.
.. note:: If ``custom_loss_function`` is not None, ``objective`` must be
......@@ -125,16 +128,22 @@ class Trainer:
callbacks are used by default. This can also be a custom callback as
long as it is an object of type ``Callback``. See
:obj:`pytorch_widedeep.callbacks.Callback` or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
metrics: List, optional, default=None
List of objects of type :obj:`Metric`. Metrics available are:
``Accuracy``, ``Precision``, ``Recall``, ``FBetaScore``,
``F1Score`` and ``R2Score``. This can also be a custom metric as
long as it is an object of type :obj:`Metric`. See
:obj:`pytorch_widedeep.metrics.Metric` or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
folder in the repo
- List of objects of type :obj:`Metric`. Metrics available are:
``Accuracy``, ``Precision``, ``Recall``, ``FBetaScore``,
``F1Score`` and ``R2Score``. This can also be a custom metric as
long as it is an object of type :obj:`Metric`. See
:obj:`pytorch_widedeep.metrics.Metric` or the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
- List of objects of type :obj:`torchmetrics.Metric`. This can be any
metric from the torchmetrics library, e.g. its `classification metrics
<https://torchmetrics.readthedocs.io/en/latest/references/modules.html#classification-metrics>`_.
This can also be a custom metric, as long as it is an object of type
:obj:`torchmetrics.Metric`. See the `torchmetrics documentation
<https://torchmetrics.readthedocs.io/en/latest/>`_.
class_weight: float, List or Tuple, optional, default=None
- float indicating the weight of the minority class in binary classification
problems (e.g. 9.)
......@@ -227,7 +236,7 @@ class Trainer:
initializers: Optional[Union[Initializer, Dict[str, Initializer]]] = None,
transforms: Optional[List[Transforms]] = None,
callbacks: Optional[List[Callback]] = None,
metrics: Optional[List[Metric]] = None,
metrics: Optional[Union[List[Metric], List[TorchMetric]]] = None,
class_weight: Optional[Union[float, List[float], Tuple[float]]] = None,
lambda_sparse: float = 1e-3,
alpha: float = 0.25,
......@@ -315,6 +324,7 @@ class Trainer:
n_epochs: int = 1,
validation_freq: int = 1,
batch_size: int = 32,
custom_dataloader: Union[DataLoader, None] = None,
finetune: bool = False,
finetune_epochs: int = 5,
finetune_max_lr: float = 0.01,
......@@ -329,6 +339,7 @@ class Trainer:
finetune_deepimage_layers: Optional[List[nn.Module]] = None,
finetune_routine: str = "howard",
stop_after_finetuning: bool = False,
**kwargs,
):
r"""Fit method.
......@@ -368,6 +379,10 @@ class Trainer:
epochs validation frequency
batch_size: int, default=32
batch size
custom_dataloader: ``DataLoader``, Optional, default=None
object of class ``torch.utils.data.DataLoader``. Available
predefined dataloaders are in ``pytorch_widedeep.dataloaders``. If
``None``, a standard torch ``DataLoader`` is used.
finetune: bool, default=False
param alias: ``warmup``
......@@ -399,7 +414,7 @@ class Trainer:
For details on how these routines work, please see the Examples
section in this documentation and the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo.
finetune_epochs: int, default=4
param alias: ``warmup_epochs``
......@@ -477,7 +492,7 @@ class Trainer:
--------
For a series of comprehensive examples please, see the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
For completion, here we include some `"fabricated"` examples, i.e.
......@@ -524,9 +539,25 @@ class Trainer:
val_split,
target,
)
train_loader = DataLoader(
dataset=train_set, batch_size=batch_size, num_workers=n_cpus
)
if isinstance(custom_dataloader, type):
if issubclass(custom_dataloader, DataLoader):
train_loader = custom_dataloader(
dataset=train_set,
batch_size=batch_size,
num_workers=n_cpus,
**kwargs,
)
else:
raise NotImplementedError(
"Custom DataLoader must be a subclass of "
"torch.utils.data.DataLoader, please see the "
"pytorch documentation or examples in "
"pytorch_widedeep.dataloaders"
)
else:
train_loader = DataLoaderDefault(
dataset=train_set, batch_size=batch_size, num_workers=n_cpus
)
train_steps = len(train_loader)
if eval_set is not None:
eval_loader = DataLoader(
......@@ -740,7 +771,7 @@ class Trainer:
--------
For a series of comprehensive examples please, see the `Examples
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`_
<https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
folder in the repo
For completeness, here we include a `"fabricated"` example, i.e.
......@@ -827,7 +858,7 @@ class Trainer:
save_state_dict: bool = False,
model_filename: str = "wd_model.pt",
):
"""Saves the model, training and evaluation history, and the
r"""Saves the model, training and evaluation history, and the
``feature_importance`` attribute (if the ``deeptabular`` component is a
Tabnet model) to disk
......
......@@ -156,7 +156,9 @@ def print_loss_and_metric(pb: tqdm, loss: float, score: Dict):
"""
if score is not None:
pb.set_postfix(
metrics={k: np.round(v, 4) for k, v in score.items()},
metrics={
k: np.round(v.astype(float), 4).tolist() for k, v in score.items()
},
loss=loss,
)
else:
......
......@@ -64,6 +64,7 @@ setup_kwargs = {
"torchvision",
"einops",
"wrapt",
"torchmetrics",
],
"extras_require": extras,
"python_requires": ">=3.6.0",
......
......@@ -115,6 +115,26 @@ def test_muticlass_metrics(sklearn_metric, widedeep_metric):
)
###############################################################################
# Test multiclass metrics without average
###############################################################################
@pytest.mark.parametrize(
"sklearn_metric, widedeep_metric",
[
(precision_score, Precision(average=False)),
(recall_score, Recall(average=False)),
(f1_score, F1Score(average=False)),
(f2_score_multi, FBetaScore(beta=2, average=False)),
],
)
def test_muticlass_metrics_without_average(sklearn_metric, widedeep_metric):
skm = sklearn_metric(
y_true_multi_np, y_pred_muli_np.argmax(axis=1), average="macro"
)
wdm = widedeep_metric(y_pred_multi_pt, y_true_multi_pt)
assert np.isclose(skm, np.mean(wdm)) and wdm.shape[0] == 3
###############################################################################
# Test the reset method
###############################################################################
......
import numpy as np
import torch
import pytest
from torchmetrics import F1, FBeta, Recall, Accuracy, Precision
from sklearn.metrics import (
f1_score,
fbeta_score,
recall_score,
accuracy_score,
precision_score,
)
from pytorch_widedeep.metrics import MultipleMetrics
def f2_score_bin(y_true, y_pred):
return fbeta_score(y_true, y_pred, beta=2)
y_true_bin_np = np.array([1, 0, 0, 0, 1, 1, 0]).reshape((-1, 1))
y_pred_bin_np = np.array([0.6, 0.3, 0.2, 0.8, 0.4, 0.9, 0.6]).reshape((-1, 1))
y_true_bin_pt = torch.from_numpy(y_true_bin_np)
y_pred_bin_pt = torch.from_numpy(y_pred_bin_np)
###############################################################################
# Test binary metrics
###############################################################################
@pytest.mark.parametrize(
"metric_name, sklearn_metric, torch_metric",
[
("Accuracy", accuracy_score, Accuracy(num_classes=2)),
("Precision", precision_score, Precision(num_classes=2, average="none")),
("Recall", recall_score, Recall(num_classes=2, average="none")),
("F1", f1_score, F1(num_classes=2, average="none")),
("FBeta", f2_score_bin, FBeta(beta=2, num_classes=2, average="none")),
],
)
def test_binary_metrics(metric_name, sklearn_metric, torch_metric):
sk_res = sklearn_metric(y_true_bin_np, y_pred_bin_np.round())
wd_metric = MultipleMetrics(metrics=[torch_metric])
wd_logs = wd_metric(y_pred_bin_pt, y_true_bin_pt)
wd_res = wd_logs[metric_name]
if wd_res.size != 1:
wd_res = wd_res[1]
assert np.isclose(sk_res, wd_res)
###############################################################################
# Test multiclass metrics
###############################################################################
y_true_multi_np = np.array([1, 0, 2, 1, 1, 2, 2, 0, 0, 0])
y_pred_muli_np = np.array(
[
[0.2, 0.6, 0.2],
[0.4, 0.5, 0.1],
[0.1, 0.1, 0.8],
[0.1, 0.6, 0.3],
[0.1, 0.8, 0.1],
[0.1, 0.6, 0.6],
[0.2, 0.6, 0.8],
[0.6, 0.1, 0.3],
[0.7, 0.2, 0.1],
[0.1, 0.7, 0.2],
]
)
y_true_multi_pt = torch.from_numpy(y_true_multi_np)
y_pred_multi_pt = torch.from_numpy(y_pred_muli_np)
def f2_score_multi(y_true, y_pred, average):
return fbeta_score(y_true, y_pred, average=average, beta=2)
@pytest.mark.parametrize(
"metric_name, sklearn_metric, torch_metric",
[
("Accuracy", accuracy_score, Accuracy(num_classes=3, average="micro")),
("Precision", precision_score, Precision(num_classes=3, average="macro")),
("Recall", recall_score, Recall(num_classes=3, average="macro")),
("F1", f1_score, F1(num_classes=3, average="macro")),
("FBeta", f2_score_multi, FBeta(beta=3, num_classes=3, average="macro")),
],
)
def test_muticlass_metrics(metric_name, sklearn_metric, torch_metric):
if metric_name == "Accuracy":
sk_res = sklearn_metric(y_true_multi_np, y_pred_muli_np.argmax(axis=1))
else:
sk_res = sklearn_metric(
y_true_multi_np, y_pred_muli_np.argmax(axis=1), average="macro"
)
wd_metric = MultipleMetrics(metrics=[torch_metric])
wd_logs = wd_metric(y_pred_multi_pt, y_true_multi_pt)
wd_res = wd_logs[metric_name]
assert np.isclose(sk_res, wd_res, atol=0.01)
......@@ -14,6 +14,7 @@ from pytorch_widedeep.models import (
)
from pytorch_widedeep.metrics import R2Score
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.dataloaders import DataLoaderImbalanced
# Wide array
X_wide = np.random.choice(50, (32, 10))
......@@ -30,6 +31,7 @@ X_tab = np.vstack(embed_cols + cont_cols).transpose()
# Target
target_regres = np.random.random(32)
target_binary = np.random.choice(2, 32)
target_binary_imbalanced = np.random.choice(2, 32, p=[0.75, 0.25])
target_multic = np.random.choice(3, 32)
# Test dictionary
......@@ -234,3 +236,30 @@ def test_aliases():
and trainer.__wd_aliases_used["objective"] == "loss"
and trainer.__wd_aliases_used["finetune"] == "warmup"
)
##############################################################################
# Test custom dataloader
##############################################################################
def test_custom_dataloader():
wide = Wide(np.unique(X_wide).shape[0], 1)
deeptabular = TabMlp(
mlp_hidden_dims=[32, 16],
mlp_dropout=[0.5, 0.5],
column_idx=column_idx,
embed_input=embed_input,
continuous_cols=colnames[-5:],
)
model = WideDeep(wide=wide, deeptabular=deeptabular)
trainer = Trainer(model, loss="binary", verbose=0)
trainer.fit(
X_wide=X_wide,
X_tab=X_tab,
target=target_binary_imbalanced,
batch_size=16,
custom_dataloader=DataLoaderImbalanced,
)
# simply checking that runs with DataLoaderImbalanced
assert "train_loss" in trainer.history.keys()