# test_du_deep_tabular.py
import numpy as np
import pandas as pd
import pytest

# Package under test.
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_widedeep.utils.deeptabular_utils import LabelEncoder


def create_test_dataset(input_type, input_type_2=None):
    """Build a 3-row DataFrame with two randomly sampled columns.

    Parameters
    ----------
    input_type:
        Pool of values ``col1`` is drawn from with ``np.random.choice``. It is
        also the pool for ``col2`` when ``input_type_2`` is not given.
    input_type_2:
        Optional separate pool of values for ``col2``.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``col1`` and ``col2``, three rows each.
    """
    df = pd.DataFrame()
    # col1 is drawn before col2 so that, for a given seed, the sequence of
    # random draws stays the same.
    col1 = list(np.random.choice(input_type, 3))
    col2_pool = input_type if input_type_2 is None else input_type_2
    col2 = list(np.random.choice(col2_pool, 3))
    df["col1"], df["col2"] = col1, col2
    return df


# Value pools for the random fixtures: one of strings, one of integers.
some_letters = ["a", "b", "c", "d", "e"]
some_numbers = [1, 2, 3, 4, 5]

# Small two-column frames (col1, col2) shared by the tests in this module.
df_letters = create_test_dataset(some_letters)
df_numbers = create_test_dataset(some_numbers)

###############################################################################
# Simple test of functionality: testing the LabelEncoder class
###############################################################################
# Fit one LabelEncoder per fixture frame on both columns and keep the encoded
# output, so the test below can check that inverse_transform recovers the input.
le_letters = LabelEncoder(["col1", "col2"])
df_letters_le = le_letters.fit_transform(df_letters)
le_numbers = LabelEncoder(["col1", "col2"])
df_numbers_le = le_numbers.fit_transform(df_numbers)


@pytest.mark.parametrize(
    "input_df, encoder, output_df",
    [(df_letters, le_letters, df_letters_le), (df_numbers, le_numbers, df_numbers_le)],
)
def test_label_encoder(input_df, encoder, output_df):
    """Check that decoding an encoded frame gives back the original frame.

    ``output_df`` is the result of ``encoder.fit_transform(input_df)`` computed
    at module level; ``inverse_transform`` must reproduce ``input_df``.
    """
    original_df = encoder.inverse_transform(output_df)
    assert original_df.equals(input_df)


###############################################################################
# Test the TabPreprocessor: only categorical columns to be represented with
# embeddings
###############################################################################
# (column name, value) pairs passed to TabPreprocessor; the second element is
# presumably the embedding dimension -- confirm against TabPreprocessor.
cat_embed_cols = [("col1", 5), ("col2", 5)]

# One fitted preprocessor (and its transformed output) per fixture frame.
preprocessor1 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_letters = preprocessor1.fit_transform(df_letters)

preprocessor2 = TabPreprocessor(cat_embed_cols)  # type: ignore[arg-type]
X_numbers = preprocessor2.fit_transform(df_numbers)

# NOTE(review): module-level, so any test appending to it shares its contents
# with every other test (and parametrized case) in this module.
error_list = []


@pytest.mark.parametrize(
    "input_df, X_deep, preprocessor",
    [(df_letters, X_letters, preprocessor1), (df_numbers, X_numbers, preprocessor2)],
)
def test_prepare_deep_without_continous_columns(input_df, X_deep, preprocessor):
    """Check TabPreprocessor output when only embedding columns are given.

    Two things are verified for each fixture:

    * for every column ``i``, ``preprocessor.embeddings_input[i][1]`` equals
      the number of unique values in that column and
      ``preprocessor.embeddings_input[i][2]`` equals the value requested in
      ``cat_embed_cols``;
    * decoding ``X_deep`` with the preprocessor's label encoder reproduces
      ``input_df``.
    """
    # Local on purpose: a list shared at module level would carry failures
    # from one parametrized case over into the next one.
    error_list = []

    for i, c in enumerate(input_df.columns):
        if (
            # remember we have an "unseen class"
            input_df[c].nunique() != preprocessor.embeddings_input[i][1]
            or cat_embed_cols[i][1] != preprocessor.embeddings_input[i][2]
        ):
            error_list.append(
                "error: the setup output does not match the intended input"
            )

    tmp_df = preprocessor.label_encoder.inverse_transform(
        pd.DataFrame({"col1": X_deep[:, 0], "col2": X_deep[:, 1]})
    )

    if not tmp_df.equals(input_df):
        error_list.append("error: the decoding does not match the encoding")

    assert not error_list, "errors occured:\n{}".format("\n".join(error_list))


###############################################################################
# Test the TabPreprocessor: only continuous columns
###############################################################################
def test_prepare_deep_without_embedding_columns():
    """Check TabPreprocessor on a frame with continuous columns only.

    Fits a ``TabPreprocessor`` built with only ``continuous_cols`` on 100 rows
    of random integers drawn from ``np.arange(100)`` and verifies that every
    column of the transformed array has mean close to 0 and standard
    deviation close to 1.
    """
    df_randint = pd.DataFrame(np.random.choice(np.arange(100), (100, 2)))
    df_randint.columns = ["col1", "col2"]
    preprocessor3 = TabPreprocessor(continuous_cols=["col1", "col2"])

    try:
        X_randint = preprocessor3.fit_transform(df_randint)
    except Exception as err:
        # Without the transformed array the checks below cannot run. Fail now
        # and chain the cause, instead of hitting an unbound X_randint later.
        raise AssertionError("errors occured:\nFundamental Error") from err

    errors = []

    means, stds = np.mean(X_randint, axis=0), np.std(X_randint, axis=0)
    # np.allclose is the all-columns form of the per-column np.isclose check
    # and uses the same default tolerances.
    if not (np.allclose(means, 0.0) and np.allclose(stds, 1.0)):
        errors.append("There is something going on with the scaler")

    assert not errors, "errors occured:\n{}".format("\n".join(errors))


###############################################################################
# Test TabPreprocessor inverse_transform
###############################################################################

# Fixed fixture for the inverse_transform tests: two string columns
# (col1, col2) followed by two integer columns (col3, col4).
df = pd.DataFrame(
    dict(
        col1=["a", "b", "c"],
        col2=["c", "d", "e"],
        col3=[10, 20, 30],
        col4=[2, 7, 9],
    )
)


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (["col1", "col2"], None, False),
        (None, ["col3", "col4"], True),
        (["col1", "col2"], ["col3", "col4"], False),
        (["col1", "col2"], ["col3", "col4"], True),
        ([("col1", 5), ("col2", 5)], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_inverse_transform(embed_cols, continuous_cols, scale):
    """Check that ``inverse_transform`` undoes ``fit_transform``.

    The expected frame is the embedding columns followed by the continuous
    columns of the module-level ``df``; either group may be absent (``None``).
    """
    tab_preprocessor = TabPreprocessor(
        embed_cols=embed_cols, continuous_cols=continuous_cols, scale=scale
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)

    # Explicit None checks rather than a blanket ``except Exception``, so a
    # real lookup error (e.g. a misspelled column) fails the test instead of
    # silently shrinking the expected frame.
    if embed_cols is None:
        emb_df = pd.DataFrame()
    else:
        # Embedding columns may be given as (name, dim) tuples; keep the name.
        embed_names = [c[0] if isinstance(c, tuple) else c for c in embed_cols]
        emb_df = df[embed_names]

    cont_df = pd.DataFrame() if continuous_cols is None else df[continuous_cols]

    org_df = pd.concat([emb_df, cont_df], axis=1)
    # Cast to the original dtypes before comparing with ``equals``.
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)


################################################################################
# Test TabPreprocessor for the TabTransformer
###############################################################################


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (["col1", "col2"], None, False),
        (["col1", "col2"], ["col3", "col4"], False),
        (["col1", "col2"], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_trasformer(embed_cols, continuous_cols, scale):
    """Check the fit/inverse round trip with ``for_tabtransformer=True``.

    The expected frame is the embedding columns followed by the continuous
    columns of the module-level ``df``; either group may be absent (``None``).
    """
    tab_preprocessor = TabPreprocessor(
        embed_cols=embed_cols,
        continuous_cols=continuous_cols,
        scale=scale,
        for_tabtransformer=True,
    )
    encoded = tab_preprocessor.fit_transform(df)
    decoded = tab_preprocessor.inverse_transform(encoded)

    # Explicit None checks rather than a blanket ``except Exception``, so a
    # real lookup error (e.g. a misspelled column) fails the test instead of
    # silently shrinking the expected frame.
    if embed_cols is None:
        emb_df = pd.DataFrame()
    else:
        # Embedding columns may be given as (name, dim) tuples; keep the name.
        embed_names = [c[0] if isinstance(c, tuple) else c for c in embed_cols]
        emb_df = df[embed_names]

    cont_df = pd.DataFrame() if continuous_cols is None else df[continuous_cols]

    org_df = pd.concat([emb_df, cont_df], axis=1)
    # Cast to the original dtypes before comparing with ``equals``.
    decoded = decoded.astype(org_df.dtypes.to_dict())
    assert decoded.equals(org_df)


@pytest.mark.parametrize(
    "embed_cols, continuous_cols, scale",
    [
        (None, ["col3", "col4"], True),
        ([("col1", 5), ("col2", 5)], ["col3", "col4"], True),
    ],
)
def test_tab_preprocessor_trasformer_raise_error(embed_cols, continuous_cols, scale):
    """Check that invalid ``for_tabtransformer=True`` setups raise ValueError.

    The two parametrized cases are: no embedding columns at all, and embedding
    columns given as ``(name, dim)`` tuples.
    """
    init_kwargs = {
        "embed_cols": embed_cols,
        "continuous_cols": continuous_cols,
        "scale": scale,
        "for_tabtransformer": True,
    }
    # Construction alone is expected to raise, so the instance is not kept.
    with pytest.raises(ValueError):
        TabPreprocessor(**init_kwargs)