# wide_preprocessor.py
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd

from pytorch_widedeep.preprocessing.base_preprocessor import (
    BasePreprocessor,
    check_is_fitted,
)


class WidePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the wide input dataset

    This Preprocessor prepares the data for the wide, linear component.
    This linear model is implemented via an Embedding layer that is
    connected to the output neuron. `WidePreprocessor` numerically
    encodes all the unique values of all categorical columns `wide_cols +
    crossed_cols`. See the Example below.

    Parameters
    ----------
    wide_cols: List
        List of strings with the name of the columns that will be label
        encoded and passed through the `wide` component
    crossed_cols: List, default = None
        List of Tuples with the name of the columns that will be `'crossed'`
        and then label encoded. e.g. _[('education', 'occupation'), ...]_. For
        binary features, a cross-product transformation is 1 if and only if
        the constituent features are all 1, and 0 otherwise.

    Attributes
    ----------
    wide_crossed_cols: List
        List with the names of all columns that will be label encoded
    encoding_dict: Dict
        Dictionary where the keys are the result of pasting `colname + '_' +
        column value` and the values are the corresponding mapped integer.
        Integers start at 1; 0 is reserved for values not seen during `fit`.
    inverse_encoding_dict: Dict
        Reverse of `encoding_dict` (integer -> `colname + '_' + column
        value`), with the extra entry `0 -> 'unseen'`.
    wide_dim: int
        Dimension of the wide model (i.e. dim of the linear layer)

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import WidePreprocessor
    >>> df = pd.DataFrame({'color': ['r', 'b', 'g'], 'size': ['s', 'n', 'l']})
    >>> wide_cols = ['color']
    >>> crossed_cols = [('color', 'size')]
    >>> wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    >>> X_wide = wide_preprocessor.fit_transform(df)
    >>> X_wide
    array([[1, 4],
           [2, 5],
           [3, 6]])
    >>> wide_preprocessor.encoding_dict
    {'color_r': 1, 'color_b': 2, 'color_g': 3, 'color_size_r-s': 4, 'color_size_b-n': 5, 'color_size_g-l': 6}
    >>> wide_preprocessor.inverse_transform(X_wide)
      color color_size
    0     r        r-s
    1     b        b-n
    2     g        g-l
    """

    def __init__(
        self,
        wide_cols: List[str],
        crossed_cols: Optional[List[Tuple[str, ...]]] = None,
    ):
        super().__init__()

        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols

    def fit(self, df: pd.DataFrame) -> "WidePreprocessor":
        r"""Fits the Preprocessor and creates required attributes

        Parameters
        ----------
        df: pd.DataFrame
            Input pandas dataframe

        Returns
        -------
        WidePreprocessor
            `WidePreprocessor` fitted object
        """
        df_wide = self._prepare_wide(df)
        self.wide_crossed_cols = df_wide.columns.tolist()
        glob_feature_list = self._make_global_feature_list(df_wide)
        # leave 0 for padding/"unseen" categories
        self.encoding_dict = {v: i + 1 for i, v in enumerate(glob_feature_list)}
        self.wide_dim = len(self.encoding_dict)
        self.inverse_encoding_dict = {
            idx: feature for feature, idx in self.encoding_dict.items()
        }
        self.inverse_encoding_dict[0] = "unseen"
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        r"""Label encodes the wide and crossed columns of the input dataframe

        Values that are not present in `encoding_dict` are encoded as 0.

        Parameters
        ----------
        df: pd.DataFrame
            Input pandas dataframe

        Returns
        -------
        np.ndarray
            transformed input dataframe, of dtype int64 and with one column
            per entry in `wide_crossed_cols`
        """
        check_is_fitted(self, attributes=["encoding_dict"])
        df_wide = self._prepare_wide(df)
        encoded = np.zeros(
            (len(df_wide), len(self.wide_crossed_cols)), dtype="int64"
        )
        lookup = self.encoding_dict.get
        for col_i, col in enumerate(self.wide_crossed_cols):
            prefix = col + "_"
            # single lookup per cell; 0 marks a category unseen during fit
            encoded[:, col_i] = [lookup(prefix + str(x), 0) for x in df_wide[col]]
        return encoded

    def inverse_transform(self, encoded: np.ndarray) -> pd.DataFrame:
        r"""Takes as input the output from the `transform` method and it will
        return the original values.

        Entries encoded as 0 are returned as the string `'unseen'`.

        Parameters
        ----------
        encoded: np.ndarray
            numpy array with the encoded values that are the output from the
            `transform` method

        Returns
        -------
        pd.DataFrame
            Pandas dataframe with the original values

        Raises
        ------
        KeyError
            If `encoded` contains a value that is not a key of
            `inverse_encoding_dict`
        """
        check_is_fitted(self, attributes=["inverse_encoding_dict"])
        decoded = pd.DataFrame(encoded, columns=self.wide_crossed_cols)
        for col in decoded.columns:
            prefix = col + "_"
            n_prefix = len(prefix)
            labels = [self.inverse_encoding_dict[code] for code in decoded[col]]
            # Only strip the leading "<col>_" that `fit` prepended. Removing
            # every occurrence would corrupt values that contain it themselves
            decoded[col] = [
                label[n_prefix:] if label.startswith(prefix) else label
                for label in labels
            ]
        return decoded

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines `fit` and `transform`

        Parameters
        ----------
        df: pd.DataFrame
            Input pandas dataframe

        Returns
        -------
        np.ndarray
            transformed input dataframe
        """
        return self.fit(df).transform(df)

    def _make_global_feature_list(self, df: pd.DataFrame) -> List:
        # Concatenates the per-column feature lists, in column order
        return [
            feature
            for column in df.columns
            for feature in self._make_column_feature_list(df[column])
        ]

    def _make_column_feature_list(self, s: pd.Series) -> List:
        # One "<colname>_<value>" string per unique value in the column
        return [s.name + "_" + str(x) for x in s.unique()]

    def _cross_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        # Builds one "<col1>_<col2>..." column per entry in `crossed_cols`,
        # whose values are the constituent values (as str) joined with "-".
        # Works on a copy so the input dataframe is left untouched.
        df_cc = df.copy()
        crossed_colnames = []
        for cols in self.crossed_cols:
            for c in cols:
                df_cc[c] = df_cc[c].astype("str")
            colname = "_".join(cols)
            # zip over the columns instead of a row-wise `apply`
            df_cc[colname] = [
                "-".join(row) for row in zip(*(df_cc[c] for c in cols))
            ]
            crossed_colnames.append(colname)
        return df_cc[crossed_colnames]

    def _prepare_wide(self, df: pd.DataFrame) -> pd.DataFrame:
        # Returns a new dataframe with `wide_cols` followed by the crossed
        # columns (if any)
        if self.crossed_cols is None:
            return df[self.wide_cols].copy()
        df_cc = self._cross_cols(df)
        return pd.concat([df[self.wide_cols], df_cc], axis=1)