import numpy as np
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.utils.deeptabular_utils import LabelEncoder
from pytorch_widedeep.preprocessing.tab_preprocessor import embed_sz_rule


def create_test_dataset(input_type, input_type_2=None):
    df = pd.DataFrame()
    col1 = list(np.random.choice(input_type, 3))
    if input_type_2 is not None:
        col2 = list(np.random.choice(input_type_2, 3))
    else:
        col2 = list(np.random.choice(input_type, 3))
    df["col1"], df["col2"] = col1, col2
    return df


some_letters = ["a", "b", "c", "d", "e"]
some_numbers = [1, 2, 3, 4, 5]

df_letters = create_test_dataset(some_letters)
df_numbers = create_test_dataset(some_numbers)


###############################################################################
# Simple test of functionality: testing the LabelEncoder class
###############################################################################
le_letters = LabelEncoder(["col1", "col2"])
df_letters_le = le_letters.fit_transform(df_letters)
le_numbers = LabelEncoder(["col1", "col2"])
df_numbers_le = le_numbers.fit_transform(df_numbers)


@pytest.mark.parametrize(
    "input_df, encoder, output_df",
    [(df_letters, le_letters, df_letters_le), (df_numbers, le_numbers, df_numbers_le)],
)
def test_label_encoder(input_df, encoder, output_df):
    original_df = encoder.inverse_transform(output_df)
    assert original_df.equals(input_df)
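

# A minimal sketch of what the round trip above relies on, assuming the fitted
# LabelEncoder keeps a per-column ``encoding_dict`` mapping categories to
# integers (an assumption about the current API; this helper is illustrative
# and not used by the tests): ``inverse_transform`` simply applies the reverse
# mapping column by column.
def _manual_inverse_sketch(encoder, encoded_df):
    decoded = encoded_df.copy()
    for col, mapping in encoder.encoding_dict.items():  # assumed attribute
        inverse_mapping = {v: k for k, v in mapping.items()}
        decoded[col] = decoded[col].map(inverse_mapping)
    return decoded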


################################################################################
# Test the TabPreprocessor: only categorical columns to be represented with
# embeddings
###############################################################################
cat_embed_cols = [("col1", 5), ("col2", 5)]

preprocessor1 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_letters = preprocessor1.fit_transform(df_letters)

preprocessor2 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_numbers = preprocessor2.fit_transform(df_numbers)


@pytest.mark.parametrize(
    "input_df, X_deep, preprocessor",
    [(df_letters, X_letters, preprocessor1), (df_numbers, X_numbers, preprocessor2)],
)
def test_prepare_deep_without_continuous_columns(input_df, X_deep, preprocessor):
    error_list = []
    for i, c in enumerate(input_df.columns):
        if (
            # remember we have an "unseen class"
            input_df[c].nunique() != preprocessor.embeddings_input[i][1]
            or cat_embed_cols[i][1] != preprocessor.embeddings_input[i][2]
        ):
            error_list.append(
                "error: the setup output does not match the intended input"
            )

    tmp_df = preprocessor.label_encoder.inverse_transform(
        pd.DataFrame({"col1": X_deep[:, 0], "col2": X_deep[:, 1]})
    )

    if not tmp_df.equals(input_df):
        error_list.append("error: the decoding does not match the encoding")

    assert not error_list, "errors occurred:\n{}".format("\n".join(error_list))
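

# NOTE (assumption, for readability): the indexing above relies on
# ``preprocessor.embeddings_input`` being a list with one tuple per categorical
# column of the form (column name, number of categories, embedding dim), e.g.
# roughly [("col1", n_col1, 5), ("col2", n_col2, 5)] for the fixtures here.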


################################################################################
# Test the TabPreprocessor: only continuous columns
###############################################################################
def test_prepare_deep_without_embedding_columns():

    errors = []
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]
    preprocessor3 = TabPreprocessor(continuous_cols=["col1", "col2"])

    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception:
        # if fit_transform itself fails there is nothing left to check
        pytest.fail("Fundamental Error: fit_transform failed on continuous-only data")

    out_booleans = []

    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    for mean, std in zip(means, stds):
        out_booleans.append(np.isclose(mean, 0.0))
        out_booleans.append(np.isclose(std, 1.0))

    if not np.all(out_booleans):
        errors.append("There is something going on with the scaler")

    assert not errors, "errors occurred:\n{}".format("\n".join(errors))
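

# NOTE: the mean/std checks above assume that, when continuous columns are
# present, the preprocessor standard-scales them, i.e. x -> (x - mean(x)) / std(x),
# which yields column-wise means of ~0 and standard deviations of ~1.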


################################################################################
# Test TabPreprocessor inverse_transform
###############################################################################

df = pd.DataFrame(
    {
        "col1": ["a", "b", "c"],
        "col2": ["c", "d", "e"],
        "col3": [10, 20, 30],
        "col4": [2, 7, 9],
    }
)


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (["col1", "col2"], None, False),
        (None, ["col3", "col4"], True),
        (["col1", "col2"], ["col3", "col4"], False),
        (["col1", "col2"], ["col3", "col4"], True),
        ([("col1", 5), ("col2", 5)], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_inverse_transform(embed_cols, continuous_cols, scale):
    tab_preprocessor = TabPreprocessor(
        embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        verbose=False,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)
    if embed_cols is not None:
        if isinstance(embed_cols[0], tuple):
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    else:
        emb_df = pd.DataFrame()
    if continuous_cols is not None:
        cont_df = df[continuous_cols]
    else:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)


################################################################################
# Test TabPreprocessor for the TabTransformer
###############################################################################


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (["col1", "col2"], None, False),
        (["col1", "col2"], ["col3", "col4"], False),
        (["col1", "col2"], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_transformer(embed_cols, continuous_cols, scale):
    tab_preprocessor = TabPreprocessor(
        embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        for_tabtransformer=True,
        verbose=False,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)
    if embed_cols is not None:
        if isinstance(embed_cols[0], tuple):
            embed_cols = [c[0] for c in embed_cols]
        emb_df = df[embed_cols]
    else:
        emb_df = pd.DataFrame()
    if continuous_cols is not None:
        cont_df = df[continuous_cols]
    else:
        cont_df = pd.DataFrame()
    org_df = pd.concat([emb_df, cont_df], axis=1)
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (None, ["col3", "col4"], True),
        ([("col1", 5), ("col2", 5)], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_transformer_raise_error(embed_cols, continuous_cols, scale):
    with pytest.raises(ValueError):
        tab_preprocessor = TabPreprocessor(  # noqa: F841
            embed_cols=embed_cols,
            continuous_cols=continuous_cols,
            scale=scale,
            for_tabtransformer=True,
        )


###############################################################################
# Test NotFittedError
###############################################################################


def test_notfittederror():
    processor = TabPreprocessor(
        embed_cols=["col1", "col2"], continuous_cols=["col3", "col4"]
    )
    with pytest.raises(NotFittedError):
        processor.transform(df)


###############################################################################
# Test embeddings fastai's rule of thumb
###############################################################################


def test_embed_sz_rule_of_thumb():

    embed_cols = ["col1", "col2"]
    df = pd.DataFrame(
        {
            "col1": np.random.randint(10, size=100),
            "col2": np.random.randint(20, size=100),
        }
    )
    n_cats = {c: df[c].nunique() for c in ["col1", "col2"]}
    embed_szs = {c: embed_sz_rule(nc) for c, nc in n_cats.items()}
    tab_preprocessor = TabPreprocessor(embed_cols=embed_cols)
    tdf = tab_preprocessor.fit_transform(df)  # noqa: F841
    out = [
        tab_preprocessor.embed_dim[col] == embed_szs[col] for col in embed_szs.keys()
    ]
    assert all(out)
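

# A minimal sketch of the fastai rule of thumb that ``embed_sz_rule`` is
# assumed to follow (an assumption based on fastai's published heuristic,
# not checked against this repo's source): the embedding size grows
# sub-linearly with the number of categories and is capped at 600.
def _fastai_embed_sz_rule_sketch(n_cat: int) -> int:
    return int(min(600, round(1.6 * n_cat ** 0.56)))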