import numpy as np
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.utils.deeptabular_utils import LabelEncoder
from pytorch_widedeep.preprocessing.tab_preprocessor import embed_sz_rule


def create_test_dataset(input_type, input_type_2=None):
    """Build a 3-row DataFrame with columns 'col1' and 'col2'.

    Values are drawn at random (with replacement) from ``input_type``;
    'col2' is drawn from ``input_type_2`` when provided, otherwise from
    ``input_type`` as well.
    """
    df = pd.DataFrame()
    col1 = list(np.random.choice(input_type, 3))
    if input_type_2 is not None:
        col2 = list(np.random.choice(input_type_2, 3))
    else:
        # no second pool given: draw 'col2' from the same pool as 'col1'
        col2 = list(np.random.choice(input_type, 3))
    df["col1"], df["col2"] = col1, col2
    return df


some_letters = ["a", "b", "c", "d", "e"]
some_numbers = [1, 2, 3, 4, 5]

df_letters = create_test_dataset(some_letters)
df_numbers = create_test_dataset(some_numbers)


###############################################################################
# Simple test of functionality: testing the LabelEncoder class
###############################################################################
# Fitted encoder / encoded-frame pairs reused by the parametrized
# round-trip test below
le_letters = LabelEncoder(["col1", "col2"])
df_letters_le = le_letters.fit_transform(df_letters)
le_numbers = LabelEncoder(["col1", "col2"])
df_numbers_le = le_numbers.fit_transform(df_numbers)


@pytest.mark.parametrize(
    "input_df, encoder, output_df",
    [(df_letters, le_letters, df_letters_le), (df_numbers, le_numbers, df_numbers_le)],
)
def test_label_encoder(input_df, encoder, output_df):
    """Encoding followed by inverse_transform must round-trip exactly."""
    original_df = encoder.inverse_transform(output_df)
    assert original_df.equals(input_df)


################################################################################
# Test the TabPreprocessor: only categorical columns to be represented with
# embeddings
###############################################################################
cat_embed_cols = [("col1", 5), ("col2", 5)]

preprocessor1 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_letters = preprocessor1.fit_transform(df_letters)

preprocessor2 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_numbers = preprocessor2.fit_transform(df_numbers)

# NOTE(review): module-level mutable list; any test that appends to it
# leaks state across parametrized runs — prefer a function-local list
error_list = []


@pytest.mark.parametrize(
    "input_df, X_deep, preprocessor",
    [(df_letters, X_letters, preprocessor1), (df_numbers, X_numbers, preprocessor2)],
)
def test_prepare_deep_without_continous_columns(input_df, X_deep, preprocessor):
    """Check the embedding setup and that encoding/decoding round-trips.

    Errors are collected locally (not in the module-level ``error_list``)
    so a failure in one parametrized case cannot pollute the next.
    """
    errors = []
    for i, c in enumerate(input_df.columns):
        if (
            # remember we have an "unseen class"
            input_df[c].nunique() != preprocessor.cat_embed_input[i][1]
            or cat_embed_cols[i][1] != preprocessor.cat_embed_input[i][2]
        ):
            errors.append(
                "error: the setup output does not match the intended input"
            )

    # decode the two encoded columns back to the raw categories
    tmp_df = preprocessor.label_encoder.inverse_transform(
        pd.DataFrame({"col1": X_deep[:, 0], "col2": X_deep[:, 1]})
    )

    if not tmp_df.equals(input_df):
        errors.append("error: the decoding does not match the encoding")

    assert not errors, "errors occured:\n{}".format("\n".join(errors))


################################################################################
# Test the TabPreprocessor: only continuous columns
###############################################################################
def test_prepare_deep_without_embedding_columns():
    """With only continuous columns the output must be standard-scaled."""
    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]

    preprocessor3 = TabPreprocessor(continuous_cols=["col1", "col2"])

    X_randint = None
    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        errors.append("Fundamental Error")

    # only inspect the output if fit_transform actually succeeded;
    # otherwise X_randint would be unbound and raise a NameError here
    if X_randint is not None:
        out_booleans = []

        # after scaling, each column should have ~zero mean and unit std
        means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
        for mean, std in zip(means, stds):
            out_booleans.append(np.isclose(mean, 0.0))
            out_booleans.append(np.isclose(std, 1.0))

        if not np.all(out_booleans):
            errors.append("There is something going on with the scaler")

    assert not errors, "errors occured:\n{}".format("\n".join(errors))


################################################################################
# Test TabPreprocessor inverse_transform
###############################################################################

# small mixed-type frame (two categorical, two numeric columns) shared by
# the inverse_transform tests below
df = pd.DataFrame(
    {
        "col1": ["a", "b", "c"],
        "col2": ["c", "d", "e"],
        "col3": [10, 20, 30],
        "col4": [2, 7, 9],
    }
)


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (["col1", "col2"], None, False),
        (None, ["col3", "col4"], True),
        (["col1", "col2"], ["col3", "col4"], False),
        (["col1", "col2"], ["col3", "col4"], True),
        ([("col1", 5), ("col2", 5)], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_inverse_transform(embed_cols, continuous_cols, scale):
    """fit_transform followed by inverse_transform must recover the input."""
    tab_preprocessor = TabPreprocessor(
        cat_embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        verbose=False,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)

    # build the expected frame with explicit None checks rather than a bare
    # try/except, which would also swallow genuine errors (e.g. KeyError)
    if embed_cols is not None:
        if isinstance(embed_cols[0], tuple):
            # (col, embed_dim) tuples: keep only the column names
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    else:
        emb_df = pd.DataFrame()
    if continuous_cols is not None:
        cont_df = df[continuous_cols]
    else:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    # decoding may change dtypes (e.g. scaled ints come back as floats)
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)


################################################################################
# Test TabPreprocessor for the TabTransformer
###############################################################################


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale, with_cls_token",
    [
        (["col1", "col2"], None, False, True),
        (["col1", "col2"], ["col3", "col4"], False, True),
        (["col1", "col2"], ["col3", "col4"], True, True),
        (["col1", "col2"], None, False, False),
        (["col1", "col2"], ["col3", "col4"], False, False),
        (["col1", "col2"], ["col3", "col4"], True, False),
    ],
)
def test_tab_preprocessor_trasformer(
    embed_cols, continuous_cols, scale, with_cls_token
):
    """Transformer-mode preprocessing must also round-trip via inverse_transform."""
    tab_preprocessor = TabPreprocessor(
        cat_embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        for_transformer=True,
        with_cls_token=with_cls_token,
        verbose=False,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)

    # build the expected frame with explicit None checks rather than a bare
    # try/except, which would also swallow genuine errors (e.g. KeyError)
    if embed_cols is not None:
        if isinstance(embed_cols[0], tuple):
            # (col, embed_dim) tuples: keep only the column names
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    else:
        emb_df = pd.DataFrame()
    if continuous_cols is not None:
        cont_df = df[continuous_cols]
    else:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    # decoding may change dtypes (e.g. scaled ints come back as floats)
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (None, ["col3", "col4"], True),
        ([("col1", 5), ("col2", 5)], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_trasformer_raise_error(embed_cols, continuous_cols, scale):
    """Invalid transformer-mode setups must raise ValueError.

    NOTE(review): presumably transformer mode rejects missing categorical
    columns and per-column (col, dim) tuples — confirm against TabPreprocessor.
    """
    with pytest.raises(ValueError):
        tab_preprocessor = TabPreprocessor(  # noqa: F841
            cat_embed_cols=embed_cols,
            continuous_cols=continuous_cols,
            scale=scale,
            for_transformer=True,
        )


@pytest.mark.parametrize(
    "shared_embed",
    [True, False],
)
def test_with_and_without_shared_embeddings(shared_embed):
    """Check label-encoding offsets with and without shared embeddings."""
    tab_preprocessor = TabPreprocessor(
        cat_embed_cols=["col1", "col2"],
        continuous_cols=None,
        for_transformer=True,
        shared_embed=shared_embed,
        verbose=False,
    )

    encoded = tab_preprocessor.fit_transform(df)  # noqa: F841

    # lowest encoded index per column; `added_idx` keeps the size of the
    # last column's encoding dict (0 when embeddings are shared)
    first_index = []
    for k, v in tab_preprocessor.label_encoder.encoding_dict.items():
        first_index.append(min(v.values()))
        added_idx = len(v) if not shared_embed else 0

    if shared_embed:
        # shared embeddings: every column starts at the same index
        res = len(set(first_index)) == 1
    else:
        # otherwise each column's indices start where the previous ended
        res = (
            len(set(first_index)) == 2 and first_index[1] == first_index[0] + added_idx
        )
    assert res


###############################################################################
# Test NotFittedError
###############################################################################


def test_notfittederror():
    """Calling transform() before fit() must raise sklearn's NotFittedError."""
    processor = TabPreprocessor(
        cat_embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"]
    )
    with pytest.raises(NotFittedError):
        processor.transform(df)


###############################################################################
# Test embeddings fastai's rule of thumb
###############################################################################


@pytest.mark.parametrize(
    "rule",
    ["google", "fastai_old", "fastai_new"],
)
def test_embed_sz_rule_of_thumb(rule):
    """The preprocessor must size embeddings per the chosen rule of thumb."""
    embed_cols = ["col1", "col2"]
    df = pd.DataFrame(
        {
            "col1": np.random.randint(10, size=100),
            "col2": np.random.randint(20, size=100),
        }
    )
    # expected sizes computed directly with the rule-of-thumb helper
    n_cats = {c: df[c].nunique() for c in ["col1", "col2"]}
    embed_szs = {c: embed_sz_rule(nc, embedding_rule=rule) for c, nc in n_cats.items()}
    tab_preprocessor = TabPreprocessor(cat_embed_cols=embed_cols, embedding_rule=rule)
    tdf = tab_preprocessor.fit_transform(df)  # noqa: F841
    out = [
        tab_preprocessor.embed_dim[col] == embed_szs[col] for col in embed_szs.keys()
    ]
    assert all(out)


###############################################################################
# Test ValueError for repeated cols
###############################################################################


def test_overlapping_cols_valueerror():
    """A column cannot appear as both embedding and continuous."""
    embed_cols = ["col1", "col2"]
    cont_cols = ["col1", "col2"]

    with pytest.raises(ValueError):
        tab_preprocessor = TabPreprocessor(  # noqa: F841
            cat_embed_cols=embed_cols, continuous_cols=cont_cols
        )