review the docs for the preprocessing module

61131205 · Javier Rodriguez Zaurin · f10eb4d3 · 61131205 · 61131205 · 61131205
8 changed files
--- a/mkdocs/mkdocs.yml
+++ b/mkdocs/mkdocs.yml
@@ -151,7 +151,8 @@ markdown_extensions:
        baselevel: 1
        permalink: true
        toc_depth: 2
-    - pymdownx.arithmatex
+    - pymdownx.arithmatex:
+        generic: true
    - pymdownx.betterem:
        smart_enable: all
    - pymdownx.caret

--- a/pytorch_widedeep/models/wide_deep.py
+++ b/pytorch_widedeep/models/wide_deep.py
@@ -109,7 +109,7 @@ class WideDeep(nn.Module):
    >>> model = WideDeep(wide=wide, deeptabular=deeptabular, deeptext=deeptext, deepimage=deepimage)


-    :information_source: **NOTE**: 
+    :information_source: **NOTE**:
    It is possible to use custom components to build Wide & Deep models.
    Simply, build them and pass them as the corresponding parameters. Note
    that the custom models MUST return a last layer of activations

--- a/pytorch_widedeep/preprocessing/image_preprocessor.py
+++ b/pytorch_widedeep/preprocessing/image_preprocessor.py
@@ -35,9 +35,9 @@ class ImagePreprocessor(BasePreprocessor):
    Attributes
    ----------
    aap: AspectAwarePreprocessor
-        an instance of :class:`pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor`
+        an instance of `pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor`
    spp: SimplePreprocessor
-        an instance of :class:`pytorch_widedeep.utils.image_utils.SimplePreprocessor`
+        an instance of `pytorch_widedeep.utils.image_utils.SimplePreprocessor`
    normalise_metrics: Dict
        Dict containing the normalisation metrics of the image dataset, i.e.
        mean and std for the R, G and B channels
@@ -57,11 +57,11 @@ class ImagePreprocessor(BasePreprocessor):
    >>> resized_images = img_preprocessor.fit_transform(df_train)
    >>> new_resized_images = img_preprocessor.transform(df_train)

-    :information_source: **NOTE**: 
-    Normalising metrics will only be computed when the
-    ``fit_transform`` method is run. Running ``transform`` only will not
-    change the computed metrics and running ``fit`` only simply
-        instantiates the resizing functions.
+    :information_source: **NOTE**:
+    Normalising metrics will only be computed when the ``fit_transform``
+    method is run. Running ``transform`` only will not change the computed
+    metrics and running ``fit`` only simply instantiates the resizing
+    functions.
    """

    def __init__(
@@ -81,13 +81,21 @@ class ImagePreprocessor(BasePreprocessor):
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
-        r"""Instantiates the Preprocessors
-        :obj:`AspectAwarePreprocessor`` and :obj:`SimplePreprocessor` for image
-        resizing.
+        r"""Instantiates the Preprocessors `AspectAwarePreprocessor` and
+        `SimplePreprocessor` for image resizing.

-        See
-        :class:`pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor`
-        and :class:`pytorch_widedeep.utils.image_utils.SimplePreprocessor`.
+        See `pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor` and
+        `pytorch_widedeep.utils.image_utils.SimplePreprocessor`.
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        ImagePreprocessor
+            `ImagePreprocessor` fitted object

        """
        self.aap = AspectAwarePreprocessor(self.width, self.height)
@@ -96,7 +104,19 @@ class ImagePreprocessor(BasePreprocessor):
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Resizes the images to the input height and width."""
+        """Resizes the images to the input height and width.
+
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe with the `img_col`
+
+        Returns
+        -------
+        np.ndarray
+            Resized images to the input height and width
+        """
        check_is_fitted(self, attributes=["aap"])
        image_list = df[self.img_col].tolist()
        if self.verbose:
@@ -151,7 +171,18 @@ class ImagePreprocessor(BasePreprocessor):
        return np.asarray(resized_imgs)

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Combines ``fit`` and ``transform``"""
+        """Combines `fit` and `transform`
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            Resized images to the input height and width
+        """
        return self.fit(df).transform(df)

    def inverse_transform(self, transformed_image):

--- a/pytorch_widedeep/preprocessing/tab_preprocessor.py
+++ b/pytorch_widedeep/preprocessing/tab_preprocessor.py
@@ -17,7 +17,7 @@ def embed_sz_rule(
    n_cat: int,
    embedding_rule: Literal["google", "fastai_old", "fastai_new"] = "fastai_new",
 ) -> int:
-    r"""Rule of thumb to pick embedding size corresponding to ``n_cat``. Default rule is taken
+    r"""Rule of thumb to pick embedding size corresponding to `n_cat`. Default rule is taken
    from recent fastai's Tabular API. The function also includes previously used rule by fastai
    and rule included in the Google's Tensorflow documentation

@@ -37,56 +37,51 @@ def embed_sz_rule(


 class TabPreprocessor(BasePreprocessor):
-    r"""Preprocessor to prepare the ``deeptabular`` component input dataset
+    r"""Preprocessor to prepare the `deeptabular` component input dataset

    Parameters
    ----------
    cat_embed_cols: List, default = None
        List containing the name of the categorical columns that will be
-        represented by embeddings (e.g.['education', 'relationship', ...]) or
-        a Tuple with the name and the embedding dimension (e.g.:[
-        ('education',32),('relationship',16), ...])
+        represented by embeddings (e.g. _['education', 'relationship', ...]_) or
+        a Tuple with the name and the embedding dimension (e.g.: _[
+        ('education',32), ('relationship',16), ...]_)
    continuous_cols: List, default = None
        List with the name of the continuous cols
    scale: bool, default = True
        Bool indicating whether or not to scale/standarise continuous cols. It
        is important to emphasize that all the DL models for tabular data in
        the library also include the possibility of normalising the input
-        continuous features via a ``BatchNorm`` or a ``LayerNorm``.
-
-        Param alias: ``scale_cont_cols``
-
+        continuous features via a `BatchNorm` or a `LayerNorm`. <br/>
+        Param alias: `scale_cont_cols`.
    already_standard: List, default = None
        List with the name of the continuous cols that do not need to be
        scaled/standarised.
    auto_embed_dim: bool, default = True
        Boolean indicating whether the embedding dimensions will be
-        automatically defined via rule of thumb. See ``embedding_rule``
+        automatically defined via rule of thumb. See `embedding_rule`
        below.
    embedding_rule: str, default = 'fastai_new'
-        If ``auto_embed_dim=True``, this is the choice of embedding rule of
+        If `auto_embed_dim=True`, this is the choice of embedding rule of
        thumb. Choices are:

-        - `'fastai_new'` -- :math:`min(600, round(1.6 \times n_{cat}^{0.56}))`
+        - _fastai_new_: $min(600, round(1.6 \times n_{cat}^{0.56}))$

-        - `'fastai_old'` -- :math:`min(50, (n_{cat}//{2})+1)`
-
-        - `'google'` -- :math:`min(600, round(n_{cat}^{0.24}))`
+        - _fastai_old_: $min(50, (n_{cat}//{2})+1)$

+        - _google_: $min(600, round(n_{cat}^{0.24}))$
    default_embed_dim: int, default=16
        Dimension for the embeddings if the embed_dim is not provided in the
-        ``cat_embed_cols`` parameter and ``auto_embed_dim`` is set to
-        ``False``.
+        `cat_embed_cols` parameter and `auto_embed_dim` is set to
+        `False`.
    with_attention: bool, default = False
        Boolean indicating whether the preprocessed data will be passed to an
-        attention-based model. If ``True``, the param ``cat_embed_cols`` must
+        attention-based model. If `True`, the param `cat_embed_cols` must
        just be a list containing just the categorical column names: e.g.
-        ['education', 'relationship', ...]. This is because they will all be
+        _['education', 'relationship', ...]_. This is because they will all be
        encoded using embeddings of the same dim, which will be specified
-        later when the model is defined.
-
-        Param alias: ``for_transformer``
-
+        later when the model is defined. <br/>
+        Param alias: `for_transformer`
    with_cls_token: bool, default = False
        Boolean indicating if a `'[CLS]'` token will be added to the dataset
        when using attention-based models. The final hidden state
@@ -96,32 +91,32 @@ class TabPreprocessor(BasePreprocessor):
        being passed to the final MLP (if present).
    shared_embed: bool, default = False
        Boolean indicating if the embeddings will be "shared" when using
-        attention-based models. The idea behind ``shared_embed`` is
-        described in the Appendix A in the `TabTransformer paper
-        <https://arxiv.org/abs/2012.06678>`_: `'The goal of having column
-        embedding is to enable the model to distinguish the classes in one
-        column from those in the other columns'`. In other words, the idea is
-        to let the model learn which column is embedded at the time. See:
-        :obj:`pytorch_widedeep.models.transformers._layers.SharedEmbeddings`.
+        attention-based models. The idea behind `shared_embed` is
+        described in the Appendix A in the [TabTransformer paper](https://arxiv.org/abs/2012.06678):
+        _'The goal of having column embedding is to enable the model to
+        distinguish the classes in one column from those in the other
+        columns'_. In other words, the idea is to let the model learn which
+        column is embedded at the time. See: `pytorch_widedeep.models.transformers._layers.SharedEmbeddings`.
+
    verbose: int, default = 1

    Attributes
    ----------
    embed_dim: Dict
        Dictionary where keys are the embed cols and values are the embedding
-        dimensions. If ``with_attention`` is set to ``True`` this attribute
-        is not generated during the ``fit`` process
+        dimensions. If `with_attention` is set to `True` this attribute
+        is not generated during the `fit` process
    label_encoder: LabelEncoder
-        see :class:`pytorch_widedeep.utils.dense_utils.LabelEncder`
+        see `pytorch_widedeep.utils.dense_utils.LabelEncoder`
    cat_embed_input: List
        List of Tuples with the column name, number of individual values for
-        that column and, If ``with_attention`` is set to ``False``, the
-        corresponding embeddings dim, e.g. [('education', 16, 10),
-        ('relationship', 6, 8), ...].
+        that column and, If `with_attention` is set to `False`, the
+        corresponding embeddings dim, e.g. _[('education', 16, 10),
+        ('relationship', 6, 8), ...]_.
    standardize_cols: List
        List of the columns that will be standarized
    scaler: StandardScaler
-        an instance of :class:`sklearn.preprocessing.StandardScaler`
+        an instance of `sklearn.preprocessing.StandardScaler`
    column_idx: Dict
        Dictionary where keys are column names and values are column indexes.
        This is neccesary to slice tensors
@@ -149,10 +144,10 @@ class TabPreprocessor(BasePreprocessor):
        cat_embed_cols: Union[List[str], List[Tuple[str, int]]] = None,
        continuous_cols: List[str] = None,
        scale: bool = True,
+        already_standard: List[str] = None,
        auto_embed_dim: bool = True,
        embedding_rule: Literal["google", "fastai_old", "fastai_new"] = "fastai_new",
        default_embed_dim: int = 16,
-        already_standard: List[str] = None,
        with_attention: bool = False,
        with_cls_token: bool = False,
        shared_embed: bool = False,
@@ -163,10 +158,10 @@ class TabPreprocessor(BasePreprocessor):
        self.cat_embed_cols = cat_embed_cols
        self.continuous_cols = continuous_cols
        self.scale = scale
+        self.already_standard = already_standard
        self.auto_embed_dim = auto_embed_dim
        self.embedding_rule = embedding_rule
        self.default_embed_dim = default_embed_dim
-        self.already_standard = already_standard
        self.with_attention = with_attention
        self.with_cls_token = with_cls_token
        self.shared_embed = shared_embed
@@ -176,7 +171,18 @@ class TabPreprocessor(BasePreprocessor):
        self.is_fitted = False

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
-        """Fits the Preprocessor and creates required attributes"""
+        """Fits the Preprocessor and creates required attributes
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        TabPreprocessor
+            `TabPreprocessor` fitted object
+        """
        if self.cat_embed_cols is not None:
            df_emb = self._prepare_embed(df)
            self.label_encoder = LabelEncoder(
@@ -202,7 +208,18 @@ class TabPreprocessor(BasePreprocessor):
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Returns the processed ``dataframe`` as a np.ndarray"""
+        """Returns the processed `dataframe` as a np.ndarray
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            transformed input dataframe
+        """
        check_is_fitted(self, condition=self.is_fitted)
        if self.cat_embed_cols is not None:
            df_emb = self._prepare_embed(df)
@@ -223,13 +240,18 @@ class TabPreprocessor(BasePreprocessor):
        return df_deep.values

    def inverse_transform(self, encoded: np.ndarray) -> pd.DataFrame:
-        r"""Takes as input the output from the ``transform`` method and it will
+        r"""Takes as input the output from the `transform` method and it will
        return the original values.

        Parameters
        ----------
        encoded: np.ndarray
-            array with the output of the ``transform`` method
+            array with the output of the `transform` method
+
+        Returns
+        -------
+        pd.DataFrame
+            Pandas dataframe with the original values
        """
        decoded = pd.DataFrame(encoded, columns=self.column_idx.keys())
        # embeddings back to original category
@@ -254,7 +276,18 @@ class TabPreprocessor(BasePreprocessor):
        return decoded

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Combines ``fit`` and ``transform``"""
+        """Combines `fit` and `transform`
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            transformed input dataframe
+        """
        return self.fit(df).transform(df)

    def _prepare_embed(self, df: pd.DataFrame) -> pd.DataFrame:

--- a/pytorch_widedeep/preprocessing/text_preprocessor.py
+++ b/pytorch_widedeep/preprocessing/text_preprocessor.py
@@ -40,7 +40,7 @@ class TextPreprocessor(BasePreprocessor):
    Attributes
    ----------
    vocab: Vocab
-        an instance of :class:`pytorch_widedeep.utils.fastai_transforms.Vocab`
+        an instance of `pytorch_widedeep.utils.fastai_transforms.Vocab`
    embedding_matrix: np.ndarray
        Array with the pretrained embeddings
    tokens: List
@@ -85,7 +85,18 @@ class TextPreprocessor(BasePreprocessor):
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
-        """Builds the vocabulary"""
+        """Builds the vocabulary
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        TextPreprocessor
+            `TextPreprocessor` fitted object
+        """
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
        self.vocab = Vocab.create(
@@ -100,7 +111,18 @@ class TextPreprocessor(BasePreprocessor):
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Returns the padded, `numericalised` sequences"""
+        """Returns the padded, _'numericalised'_ sequences
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            Padded, _'numericalised'_ sequences
+        """
        check_is_fitted(self, attributes=["vocab"])
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
@@ -118,11 +140,33 @@ class TextPreprocessor(BasePreprocessor):
        )
        return padded_seq

-    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Combines ``fit`` and ``transform``"""
-        return self.fit(df).transform(df)
-
    def inverse_transform(self, padded_seq: np.ndarray) -> pd.DataFrame:
-        """Returns the original text plus the added 'special' tokens"""
+        """Returns the original text plus the added 'special' tokens
+
+        Parameters
+        ----------
+        padded_seq: np.ndarray
+            array with the output of the `transform` method
+
+        Returns
+        -------
+        pd.DataFrame
+            Pandas dataframe with the original text plus the added 'special' tokens
+        """
        texts = [self.vocab.textify(num) for num in padded_seq]
        return pd.DataFrame({self.text_col: texts})
+
+    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
+        """Combines `fit` and `transform`
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            Padded, _'numericalised'_ sequences
+        """
+        return self.fit(df).transform(df)
--- a/pytorch_widedeep/preprocessing/wide_preprocessor.py
+++ b/pytorch_widedeep/preprocessing/wide_preprocessor.py
@@ -13,18 +13,18 @@ class WidePreprocessor(BasePreprocessor):

    This Preprocessor prepares the data for the wide, linear component.
    This linear model is implemented via an Embedding layer that is
-    connected to the output neuron. ``WidePreprocessor`` numerically
-    encodes all the unique values of all categorical columns ``wide_cols +
-    crossed_cols``. See the Example below.
+    connected to the output neuron. `WidePreprocessor` numerically
+    encodes all the unique values of all categorical columns `wide_cols +
+    crossed_cols`. See the Example below.

    Parameters
    ----------
    wide_cols: List
        List of strings with the name of the columns that will label
-        encoded and passed through the ``wide`` component
+        encoded and passed through the `wide` component
    crossed_cols: List, default = None
        List of Tuples with the name of the columns that will be `'crossed'`
-        and then label encoded. e.g. [('education', 'occupation'), ...]. For
+        and then label encoded. e.g. _[('education', 'occupation'), ...]_. For
        binary features, a cross-product transformation is 1 if and only if
        the constituent features are all 1, and 0 otherwise".

@@ -68,8 +68,19 @@ class WidePreprocessor(BasePreprocessor):
        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols

-    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
-        r"""Fits the Preprocessor and creates required attributes"""
+    def fit(self, df: pd.DataFrame) -> "WidePreprocessor":
+        r"""Fits the Preprocessor and creates required attributes
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        WidePreprocessor
+            `WidePreprocessor` fitted object
+        """
        df_wide = self._prepare_wide(df)
        self.wide_crossed_cols = df_wide.columns.tolist()
        glob_feature_list = self._make_global_feature_list(
@@ -83,7 +94,19 @@ class WidePreprocessor(BasePreprocessor):
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
-        r"""Returns the processed dataframe"""
+        r"""Returns the processed dataframe
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            transformed input dataframe
+        """
        check_is_fitted(self, attributes=["encoding_dict"])
        df_wide = self._prepare_wide(df)
        encoded = np.zeros([len(df_wide), len(self.wide_crossed_cols)])
@@ -96,8 +117,19 @@ class WidePreprocessor(BasePreprocessor):
        return encoded.astype("int64")

    def inverse_transform(self, encoded: np.ndarray) -> pd.DataFrame:
-        r"""Takes as input the output from the ``transform`` method and it will
+        r"""Takes as input the output from the `transform` method and it will
        return the original values.
+
+        Parameters
+        ----------
+        encoded: np.ndarray
+            numpy array with the encoded values that are the output from the
+            `transform` method
+
+        Returns
+        -------
+        pd.DataFrame
+            Pandas dataframe with the original values
        """
        decoded = pd.DataFrame(encoded, columns=self.wide_crossed_cols)
        decoded = decoded.applymap(lambda x: self.inverse_encoding_dict[x])
@@ -107,7 +139,18 @@ class WidePreprocessor(BasePreprocessor):
        return decoded

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
-        """Combines ``fit`` and ``transform``"""
+        """Combines `fit` and `transform`
+
+        Parameters
+        ----------
+        df: pd.DataFrame
+            Input pandas dataframe
+
+        Returns
+        -------
+        np.ndarray
+            transformed input dataframe
+        """
        return self.fit(df).transform(df)

    def _make_global_feature_list(self, df: pd.DataFrame) -> List:

--- a/pytorch_widedeep/tab2vec.py
+++ b/pytorch_widedeep/tab2vec.py
@@ -20,7 +20,7 @@ class Tab2Vec:
    processing applied by the model to the categorical and continuous
    columns.

-    :information_source: **NOTE**: 
+    :information_source: **NOTE**:
    Currently this class is only implemented for the deeptabular
    component or the Bayesian model. Therefore, if the input dataframe has
    a text column or a column with the path to images, these will be

--- a/pytorch_widedeep/training/trainer.py
+++ b/pytorch_widedeep/training/trainer.py
@@ -719,7 +719,7 @@ class Trainer:
        For a series of comprehensive examples please, see the `Examples
        <https://github.com/jrzaurin/pytorch-widedeep/tree/master/examples>`__
        folder in the repo.
-        
+
        For completion, here we include a `"fabricated"` example, i.e.
        assuming we have already trained the model, that we have the
        categorical encodings in a dictionary name ``encoding_dict``, and that