Commit e2cf20e1 authored by: P Pavol Mulinka

added embedding rules, Monte Carlo (uncertainty) prediction and removed running of the tests on draft requests
Parent c287c870
......@@ -11,6 +11,7 @@ on:
jobs:
  codestyle:
    runs-on: ubuntu-latest
    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
......@@ -32,6 +33,7 @@ jobs:
  test:
    runs-on: ubuntu-latest
    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
    strategy:
      fail-fast: true
      matrix:
......@@ -59,6 +61,7 @@ jobs:
  finish:
    needs: test
    runs-on: ubuntu-latest
    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
......
1.0.11
\ No newline at end of file
1.0.12
\ No newline at end of file
This diff is collapsed.
......@@ -12,10 +12,24 @@ from pytorch_widedeep.preprocessing.base_preprocessor import (
)
def embed_sz_rule(n_cat):
    r"""Rule of thumb to pick embedding size corresponding to ``n_cat``. Taken
    from fastai's Tabular API"""
    return min(600, round(1.6 * n_cat ** 0.56))
def embed_sz_rule(n_cat: int, embedding_rule: str = "fastai_new") -> int:
    r"""Rule of thumb to pick the embedding size corresponding to ``n_cat``. The
    default rule is taken from the current fastai Tabular API. The function also
    includes the rule previously used by fastai and the rule included in Google's
    TensorFlow documentation

    Parameters
    ----------
    n_cat: int
        number of unique categorical values in a feature
    embedding_rule: str, default = fastai_new
        rule of thumb to be used for the embedding vector size
    """
    if embedding_rule == "google":
        return int(round(n_cat ** 0.25))
    elif embedding_rule == "fastai_old":
        return int(min(50, (n_cat // 2) + 1))
    else:
        return int(min(600, round(1.6 * n_cat ** 0.56)))
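For illustration, the three rules give quite different sizes for the same cardinality. A quick sketch using the function above (assuming the module path pytorch_widedeep.preprocessing.tab_preprocessor):

from pytorch_widedeep.preprocessing.tab_preprocessor import embed_sz_rule

# compare the three rules for a feature with 1000 unique categories
for rule in ("fastai_new", "fastai_old", "google"):
    print(rule, embed_sz_rule(1000, embedding_rule=rule))
# fastai_new -> min(600, round(1.6 * 1000 ** 0.56)) = 77
# fastai_old -> min(50, 1000 // 2 + 1)              = 50
# google     -> round(1000 ** 0.25)                 = 6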
class TabPreprocessor(BasePreprocessor):
......@@ -38,8 +52,15 @@ class TabPreprocessor(BasePreprocessor):
        :obj:`pytorch_widedeep.models.transformers._embedding_layers`
    auto_embed_dim: bool, default = True
        Boolean indicating whether the embedding dimensions will be
        automatically defined via a rule of thumb
    embedding_rule: str, default = 'fastai_new'
        choice of embedding rule of thumb:
        'fastai_new':
            :math:`min(600, round(1.6 \times n_{cat}^{0.56}))`
        'fastai_old':
            :math:`min(50, (n_{cat}//{2})+1)`
        'google':
            :math:`round(n_{cat}^{0.25})`
    default_embed_dim: int, default=16
        Dimension for the embeddings used for the ``deeptabular``
        component if the embed_dim is not provided in the ``embed_cols``
......@@ -118,6 +139,7 @@ class TabPreprocessor(BasePreprocessor):
        continuous_cols: List[str] = None,
        scale: bool = True,
        auto_embed_dim: bool = True,
        embedding_rule: str = "fastai_new",
        default_embed_dim: int = 16,
        already_standard: List[str] = None,
        for_transformer: bool = False,
......@@ -131,6 +153,7 @@ class TabPreprocessor(BasePreprocessor):
        self.continuous_cols = continuous_cols
        self.scale = scale
        self.auto_embed_dim = auto_embed_dim
        self.embedding_rule = embedding_rule
        self.default_embed_dim = default_embed_dim
        self.already_standard = already_standard
        self.for_transformer = for_transformer
......@@ -250,7 +273,7 @@ class TabPreprocessor(BasePreprocessor):
            embed_colname = [emb[0] for emb in self.embed_cols]
        elif self.auto_embed_dim:
            n_cats = {col: df[col].nunique() for col in self.embed_cols}
            self.embed_dim = {col: embed_sz_rule(n_cat) for col, n_cat in n_cats.items()}  # type: ignore[misc]
            self.embed_dim = {col: embed_sz_rule(n_cat, self.embedding_rule) for col, n_cat in n_cats.items()}  # type: ignore[misc]
            embed_colname = self.embed_cols  # type: ignore
        else:
            self.embed_dim = {e: self.default_embed_dim for e in self.embed_cols}  # type: ignore
......
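A minimal usage sketch of the new option (the dataframe and column names below are hypothetical):

import pandas as pd
from pytorch_widedeep.preprocessing import TabPreprocessor

# hypothetical dataframe with one categorical and one continuous column
df = pd.DataFrame({"city": ["a", "b", "c", "a"], "age": [23, 45, 31, 60]})

# let the preprocessor pick embedding sizes with the 'google' rule
tab_preprocessor = TabPreprocessor(
    embed_cols=["city"],
    continuous_cols=["age"],
    auto_embed_dim=True,
    embedding_rule="google",
)
X_tab = tab_preprocessor.fit_transform(df)
print(tab_preprocessor.embed_dim)  # {'city': 1}, since round(3 ** 0.25) == 1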
......@@ -27,7 +27,7 @@ from pytorch_widedeep.callbacks import (
from pytorch_widedeep.dataloaders import DataLoaderDefault
from pytorch_widedeep.initializers import Initializer, MultipleInitializer
from pytorch_widedeep.training._finetune import FineTune
from pytorch_widedeep.utils.general_utils import Alias
from pytorch_widedeep.utils.general_utils import Alias, set_default_attr
from pytorch_widedeep.models.tabnet._utils import create_explain_matrix
from pytorch_widedeep.training._wd_dataset import WideDeepDataset
from pytorch_widedeep.training._trainer_utils import (
......@@ -685,8 +685,14 @@ class Trainer:
If a trainer is used to predict after having trained a model, the
``batch_size`` needs to be defined as it will not be defined as
the :obj:`Trainer` is instantiated
uncertainty: bool, default = False
If set to True the model activates the dropout layers and predicts
the each sample N times (uncertainty_granularity times) and returns
{max, min, mean, stdev} value for each sample
uncertainty_granularity: int default = 1000
number of times the model does prediction for each sample if uncertainty
is set to True
"""
        preds_l = self._predict(X_wide, X_tab, X_text, X_img, X_test, batch_size)
        if self.method == "regression":
            return np.vstack(preds_l).squeeze(1)
......@@ -697,6 +703,86 @@ class Trainer:
        preds = np.vstack(preds_l)
        return np.argmax(preds, 1)  # type: ignore[return-value]
    def predict_uncertainty(  # type: ignore[return]
        self,
        X_wide: Optional[np.ndarray] = None,
        X_tab: Optional[np.ndarray] = None,
        X_text: Optional[np.ndarray] = None,
        X_img: Optional[np.ndarray] = None,
        X_test: Optional[Dict[str, np.ndarray]] = None,
        batch_size: int = 256,
        uncertainty_granularity: int = 1000,
    ) -> np.ndarray:
r"""Returns the predicted ucnertainty of the model for the test dataset using a
Monte Carlo method during which dropout layers are activated in the evaluation/prediction
phase and each sample is predicted N times (uncertainty_granularity times). Based on [1].
[1] Gal Y. & Ghahramani Z., 2016, Dropout as a Bayesian Approximation: Representing Model
Uncertainty in Deep Learning, Proceedings of the 33rd International Conference on Machine Learning
        Parameters
        ----------
        X_wide: np.ndarray, Optional. default=None
            Input for the ``wide`` model component.
            See :class:`pytorch_widedeep.preprocessing.WidePreprocessor`
        X_tab: np.ndarray, Optional. default=None
            Input for the ``deeptabular`` model component.
            See :class:`pytorch_widedeep.preprocessing.TabPreprocessor`
        X_text: np.ndarray, Optional. default=None
            Input for the ``deeptext`` model component.
            See :class:`pytorch_widedeep.preprocessing.TextPreprocessor`
        X_img: np.ndarray, Optional. default=None
            Input for the ``deepimage`` model component.
            See :class:`pytorch_widedeep.preprocessing.ImagePreprocessor`
        X_test: Dict, Optional. default=None
            The test dataset can also be passed in a dictionary. Keys are
            `'X_wide'`, `'X_tab'`, `'X_text'`, `'X_img'` and `'target'`. Values
            are the corresponding matrices.
        batch_size: int, default = 256
            If a trainer is used to predict after having trained a model, the
            ``batch_size`` needs to be defined as it will not be defined when
            the :obj:`Trainer` is instantiated
        uncertainty_granularity: int, default = 1000
            number of times the model predicts each sample
        Returns
        -------
        method == regression : np.ndarray
            {max, min, mean, stdev} values for each sample
        method == binary : np.ndarray
            {mean_cls_0_prob, mean_cls_1_prob, predicted_cls} values for each sample
        method == multiclass : np.ndarray
            {mean_cls_0_prob, mean_cls_1_prob, mean_cls_2_prob, ..., predicted_cls}
            values for each sample
        """
        preds_l = self._predict(
            X_wide, X_tab, X_text, X_img, X_test, batch_size,
            uncertainty_granularity, uncertainty=True,
        )
        preds = np.vstack(preds_l)
        # _predict stacks uncertainty_granularity stochastic forward passes on
        # top of each other; recover the number of actual samples
        samples_num = int(preds.shape[0] / uncertainty_granularity)
        if self.method == "regression":
            preds = preds.squeeze(1)
            preds = preds.reshape((uncertainty_granularity, samples_num))
            # summary statistics over the Monte Carlo dimension
            return np.array(
                (preds.max(axis=0), preds.min(axis=0), preds.mean(axis=0), preds.std(axis=0))
            ).T
        if self.method == "binary":
            preds = preds.squeeze(1)
            preds = preds.reshape((uncertainty_granularity, samples_num))
            preds = preds.mean(axis=0)
            probs = np.zeros([preds.shape[0], 3])
            probs[:, 0] = 1 - preds
            probs[:, 1] = preds
            # class predicted from the mean of the Monte Carlo probabilities
            probs[:, 2] = (preds > 0.5).astype("int")
            return probs
        if self.method == "multiclass":
            preds = preds.reshape(uncertainty_granularity, samples_num, preds.shape[1])
            preds = preds.mean(axis=0)
            # append the predicted class as the last column
            preds = np.hstack((preds, np.vstack(np.argmax(preds, 1))))
            return preds
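A usage sketch (assuming a `trainer` already fitted on a binary problem and an `X_tab` matrix produced by a fitted TabPreprocessor):

# 100 stochastic forward passes per sample
probs = trainer.predict_uncertainty(
    X_tab=X_tab,
    batch_size=256,
    uncertainty_granularity=100,
)
# probs.shape == (n_samples, 3):
#   column 0: mean probability of class 0
#   column 1: mean probability of class 1
#   column 2: predicted class (1 if the mean probability of class 1 > 0.5)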
    def predict_proba(  # type: ignore[return]
        self,
        X_wide: Optional[np.ndarray] = None,
......@@ -1112,6 +1198,8 @@ class Trainer:
        X_img: Optional[np.ndarray] = None,
        X_test: Optional[Dict[str, np.ndarray]] = None,
        batch_size: int = 256,
        uncertainty_granularity: int = 1000,
        uncertainty: bool = False,
    ) -> List:
r"""Private method to avoid code repetition in predict and
predict_proba. For parameter information, please, see the .predict()
......@@ -1144,20 +1232,33 @@ class Trainer:
        self.model.eval()
        preds_l = []

        if uncertainty:
            # activate only the dropout layers while the rest of the model
            # stays in eval mode (Monte Carlo dropout)
            for m in self.model.modules():
                if m.__class__.__name__.startswith("Dropout"):
                    m.train()
            prediction_iters = uncertainty_granularity
        else:
            prediction_iters = 1
        with torch.no_grad():
            with trange(test_steps, disable=self.verbose != 1) as t:
                for i, data in zip(t, test_loader):
                    t.set_description("predict")
                    X = {k: v.cuda() for k, v in data.items()} if use_cuda else data
                    preds = (
                        self.model(X) if not self.model.is_tabnet else self.model(X)[0]
                    )
                    if self.method == "binary":
                        preds = torch.sigmoid(preds)
                    if self.method == "multiclass":
                        preds = F.softmax(preds, dim=1)
                    preds = preds.cpu().data.numpy()
                    preds_l.append(preds)
            with trange(uncertainty_granularity, disable=not uncertainty) as t:
                # the outer loop runs once for a standard prediction and
                # uncertainty_granularity times for Monte Carlo dropout
                for i, _ in zip(t, range(prediction_iters)):
                    t.set_description("predict_UncertaintyIter")
                    with trange(
                        test_steps, disable=self.verbose != 1 or uncertainty
                    ) as tt:
                        for j, data in zip(tt, test_loader):
                            tt.set_description("predict")
                            X = {k: v.cuda() for k, v in data.items()} if use_cuda else data
                            preds = (
                                self.model(X) if not self.model.is_tabnet else self.model(X)[0]
                            )
                            if self.method == "binary":
                                preds = torch.sigmoid(preds)
                            if self.method == "multiclass":
                                preds = F.softmax(preds, dim=1)
                            preds = preds.cpu().data.numpy()
                            preds_l.append(preds)
        self.model.train()
        return preds_l
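The Monte Carlo dropout mechanism itself is easy to reproduce outside the Trainer; a minimal, self-contained sketch with a toy model (all names here are illustrative):

import torch
import torch.nn as nn

# toy model with a dropout layer, standing in for any WideDeep component
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Dropout(0.5), nn.Linear(16, 1))

model.eval()  # keep batch norm, etc. in inference mode
for m in model.modules():
    if m.__class__.__name__.startswith("Dropout"):
        m.train()  # re-enable stochastic dropout only

x = torch.randn(4, 8)
with torch.no_grad():
    # repeated forward passes now differ because dropout stays active
    mc_preds = torch.stack([model(x) for _ in range(100)])
print(mc_preds.mean(dim=0), mc_preds.std(dim=0))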
......
__version__ = "1.0.11"
__version__ = "1.0.12"