import numpy as np
import pandas as pd
import warnings
import cv2

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
from tqdm import tqdm

from ..wdtypes import *
from ..utils.fastai_transforms import Vocab
from ..utils.dense_utils import label_encoder
from ..utils.text_utils import get_texts, pad_sequences, build_embeddings_matrix
from ..utils.image_utils import AspectAwarePreprocessor, SimplePreprocessor


class BasePreprocessor(object):
    def fit(self, df: pd.DataFrame):
        raise NotImplementedError("BasePreprocessor must implement this method")

    def transform(self, df: pd.DataFrame):
        raise NotImplementedError("BasePreprocessor must implement this method")

    def fit_transform(self, df: pd.DataFrame):
        raise NotImplementedError("BasePreprocessor must implement this method")


class WidePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the wide input dataset

    Parameters
    ----------
    wide_cols: List
        List with the name of the columns that will be one-hot encoded and
        passed through the Wide model
    crossed_cols: List
        List of Tuples with the name of the columns that will be "crossed"
        and then one-hot encoded. e.g. [('education', 'occupation'), ...]
    already_dummies: List
        List of columns that are already dummies/one-hot encoded

    Attributes
    ----------
    one_hot_enc: sklearn's OneHotEncoder
    wide_crossed_cols: List
        List with the names of all columns that will be one-hot encoded

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
    ... 'native_country','gender']
    >>> crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
    >>> wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    >>> X_wide = wide_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_wide = wide_preprocessor.transform(new_df)
    """

    def __init__(
        self,
        wide_cols: List[str],
        crossed_cols: Optional[List[Tuple[str, str]]] = None,
        already_dummies: Optional[List[str]] = None,
        sparse: bool = False,
    ):
        super(WidePreprocessor, self).__init__()
        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols
        self.already_dummies = already_dummies
        self.one_hot_enc = OneHotEncoder(sparse=sparse)

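    # "Crossing" two (or more) categorical columns means concatenating their
    # values row-wise, so that the interaction between them can then be
    # one-hot encoded as a single feature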
    def _cross_cols(self, df: pd.DataFrame):
        crossed_colnames = []
        for cols in self.crossed_cols:
            cols = list(cols)
            for c in cols:
                df[c] = df[c].astype("str")
            colname = "_".join(cols)
            df[colname] = df[cols].apply(lambda x: "-".join(x), axis=1)
            crossed_colnames.append(colname)
        return df, crossed_colnames

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        df_wide = df.copy()[self.wide_cols]
        if self.crossed_cols is not None:
            df_wide, crossed_colnames = self._cross_cols(df_wide)
            self.wide_crossed_cols = self.wide_cols + crossed_colnames
        else:
            self.wide_crossed_cols = self.wide_cols

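        # columns that are already one-hot encoded are passed through as they
        # are and excluded from the OneHotEncoder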
        if self.already_dummies:
            dummy_cols = [
                c for c in self.wide_crossed_cols if c not in self.already_dummies
            ]
            self.one_hot_enc.fit(df_wide[dummy_cols])
        else:
            self.one_hot_enc.fit(df_wide[self.wide_crossed_cols])
        return self

    def transform(self, df: pd.DataFrame) -> Union[sparse_matrix, np.ndarray]:
        try:
            self.one_hot_enc.categories_
        except AttributeError:
            raise NotFittedError(
                "This WidePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        df_wide = df.copy()[self.wide_cols]
        if self.crossed_cols is not None:
            df_wide, _ = self._cross_cols(df_wide)
        if self.already_dummies:
            X_oh_1 = df_wide[self.already_dummies].values
            dummy_cols = [
                c for c in self.wide_crossed_cols if c not in self.already_dummies
            ]
            X_oh_2 = self.one_hot_enc.transform(df_wide[dummy_cols])
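            # stack the passed-through dummy columns with the newly encoded ones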
            return np.hstack((X_oh_1, X_oh_2))
        else:
            return self.one_hot_enc.transform(df_wide[self.wide_crossed_cols])

    def fit_transform(self, df: pd.DataFrame) -> Union[sparse_matrix, np.ndarray]:
        return self.fit(df).transform(df)


class DeepPreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    embed_cols: List
        List containing the name of the columns that will be represented with
        embeddings, or a Tuple with the name and the embedding dimension. e.g.:
        [('education', 32), ('relationship', 16)]
    continuous_cols: List
        List with the name of the so-called continuous cols
    scale: Bool
        Bool indicating whether or not to scale/standardise continuous cols.
        Should "almost always" be True.
    default_embed_dim: Int, default=8
        Dimension for the embeddings used in the Deep-Dense model
    already_standard: List, Optional
        List with the name of the continuous cols that do not need to be
        standardised.

    Attributes
    ----------
    encoding_dict: Dict
        Dict with the categorical encoding
    embed_cols: List
        List with the columns that will be represented with embeddings
    embed_dim: Dict
        Dict where keys are the embed cols and values are the embed dimensions
    standardize_cols: List
        List of the columns that will be standardised
    deep_column_idx: Dict
        Dict where keys are column names and values are column indexes. This
        will be necessary to slice tensors
    scaler: sklearn's StandardScaler

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10),
    ... ('occupation',10),('native_country',10)]
    >>> continuous_cols = ["age","hours_per_week"]
    >>> deep_preprocessor = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
    >>> X_deep = deep_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_deep = deep_preprocessor.transform(new_df)
    """

    def __init__(
        self,
        embed_cols: Optional[List[Union[str, Tuple[str, int]]]] = None,
        continuous_cols: Optional[List[str]] = None,
        scale: bool = True,
        default_embed_dim: int = 8,
        already_standard: Optional[List[str]] = None,
    ):
        super(DeepPreprocessor, self).__init__()

        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols
        self.already_standard = already_standard
        self.scale = scale
        self.default_embed_dim = default_embed_dim

        assert (self.embed_cols is not None) or (
            self.continuous_cols is not None
        ), "'embed_cols' and 'continuous_cols' are 'None'. Please, define at least one of the two."

    def _prepare_embed(self, df: pd.DataFrame) -> pd.DataFrame:
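        # embed_cols can be a list of column names or a list of
        # (column name, embedding dim) tuples; when no dimension is given,
        # default_embed_dim is used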
        if isinstance(self.embed_cols[0], tuple):
            self.embed_dim = dict(self.embed_cols)  # type: ignore
            embed_colname = [emb[0] for emb in self.embed_cols]
        else:
            self.embed_dim = {e: self.default_embed_dim for e in self.embed_cols}  # type: ignore
            embed_colname = self.embed_cols  # type: ignore
        return df.copy()[embed_colname]

    def _prepare_continuous(self, df: pd.DataFrame) -> pd.DataFrame:
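        # columns listed in already_standard are excluded from standardisation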
        if self.scale:
            if self.already_standard is not None:
                self.standardize_cols = [
                    c for c in self.continuous_cols if c not in self.already_standard
                ]
            else:
                self.standardize_cols = self.continuous_cols
        return df.copy()[self.continuous_cols]

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            _, self.encoding_dict = label_encoder(df_emb, cols=df_emb.columns.tolist())
            self.embeddings_input: List = []
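            # each entry is a tuple: (column name, number of unique values,
            # embedding dimension)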
            for k, v in self.encoding_dict.items():
                self.embeddings_input.append((k, len(v), self.embed_dim[k]))
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                df_std = df_cont[self.standardize_cols]
                self.scaler = StandardScaler().fit(df_std.values)
            else:
                warnings.warn("Continuous columns will not be normalised")
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            df_emb, _ = label_encoder(
                df_emb, cols=df_emb.columns.tolist(), val_to_idx=self.encoding_dict
            )
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                try:
                    self.scaler.mean_
                except AttributeError:
                    raise NotFittedError(
                        "This DeepPreprocessor instance is not fitted yet. "
                        "Call 'fit' with appropriate arguments before using this estimator."
                    )
                df_std = df_cont[self.standardize_cols]
                df_cont[self.standardize_cols] = self.scaler.transform(df_std.values)
        try:
            df_deep = pd.concat([df_emb, df_cont], axis=1)
        except NameError:
            # if either embed_cols or continuous_cols was not passed, only one
            # of df_emb or df_cont exists
            try:
                df_deep = df_emb.copy()
            except NameError:
                df_deep = df_cont.copy()
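        # deep_column_idx maps column names to their index in the resulting
        # array, which is needed later to slice the input tensor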
        self.deep_column_idx = {k: v for v, k in enumerate(df_deep.columns)}
        return df_deep.values

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        return self.fit(df).transform(df)


class TextPreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    text_col: str
        column in the input pd.DataFrame containing the texts
    max_vocab: Int, default=30000
        Maximum number of tokens in the vocabulary
    min_freq: Int, default=5
        Minimum frequency for a token to be part of the vocabulary
    maxlen: Int, default=80
        Maximum length of the tokenized sequences
    word_vectors_path: Optional, str
        Path to the pretrained word vectors
    verbose: Int, Default 1
        Enable verbose output.

    Attributes
    ----------
    vocab: fastai Vocab object. See https://docs.fast.ai/text.transform.html#Vocab
        Vocab object containing the information of the vocabulary
    tokens: List
        List with Lists of str containing the tokenized texts
    embedding_matrix: np.ndarray
        Array with the pretrained embeddings

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> text_preprocessor = TextPreprocessor(text_col=text_col)
    >>> X_text = text_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_text = text_preprocessor.transform(new_df)
    """

    def __init__(
        self,
        text_col: str,
        max_vocab: int = 30000,
        min_freq: int = 5,
        maxlen: int = 80,
        word_vectors_path: Optional[str] = None,
        verbose: int = 1,
    ):
        super(TextPreprocessor, self).__init__()
        self.text_col = text_col
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
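        # build the vocabulary from the tokenized texts, keeping at most
        # max_vocab tokens that appear at least min_freq times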
        self.vocab = Vocab.create(
            tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        try:
            self.vocab
        except AttributeError:
            raise NotFittedError(
                "This TextPreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
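        # pad (or truncate) every numericalized sequence to a fixed length of maxlen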
        padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
        if self.verbose:
            print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
        if self.word_vectors_path is not None:
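            # build the pretrained embeddings matrix for the tokens in the
            # vocabulary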
            self.embedding_matrix = build_embeddings_matrix(
                self.vocab, self.word_vectors_path, self.min_freq
            )
        return padded_seq

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        return self.fit(df).transform(df)


class ImagePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    img_col: str
        name of the column with the image filenames
    img_path: str
        path to the directory where the images are stored
    width: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    height: Int, default=224
        height of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    verbose: Int, Default 1
        Enable verbose output.

    Attributes
    ----------
    aap: Class, AspectAwarePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    spp: Class, SimplePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    normalise_metrics: Dict
        Dict containing the normalisation metrics of the image dataset, i.e.
        mean and std for the R, G and B channels

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> img_path = 'path/to/my_images'
    >>> image_preprocessor = ImagePreprocessor(img_col=img_col, img_path=img_path)
    >>> X_images = image_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_images = image_preprocessor.transform(new_df)
    """

    def __init__(
        self,
        img_col: str,
        img_path: str,
        width: int = 224,
        height: int = 224,
        verbose: int = 1,
    ):
        super(ImagePreprocessor, self).__init__()
        self.img_col = img_col
        self.img_path = img_path
        self.width = width
        self.height = height
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
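        # AspectAwarePreprocessor resizes to (width, height) while respecting
        # the image's aspect ratio; SimplePreprocessor simply resizes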
        self.aap = AspectAwarePreprocessor(self.width, self.height)
        self.spp = SimplePreprocessor(self.width, self.height)
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        try:
            self.aap
        except AttributeError:
            raise NotFittedError(
                "This ImagePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        image_list = df[self.img_col].tolist()
        if self.verbose:
            print("Reading Images from {}".format(self.img_path))
        imgs = [cv2.imread("/".join([self.img_path, img])) for img in image_list]

        # finding images with different height and width
        aspect = [(im.shape[0], im.shape[1]) for im in imgs]
        aspect_r = [a[0] / a[1] for a in aspect]
        diff_idx = [i for i, r in enumerate(aspect_r) if r != 1.0]

        if self.verbose:
            print("Resizing")
        resized_imgs = []
        for i, img in tqdm(enumerate(imgs), total=len(imgs), disable=self.verbose != 1):
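            # non-square images are resized respecting their aspect ratio;
            # square images can simply be resized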
            if i in diff_idx:
                resized_imgs.append(self.aap.preprocess(img))
            else:
                resized_imgs.append(self.spp.preprocess(img))

        if self.verbose:
            print("Computing normalisation metrics")
        mean_R, mean_G, mean_B = [], [], []
        std_R, std_G, std_B = [], [], []
        for rsz_img in resized_imgs:
            (mean_b, mean_g, mean_r), (std_b, std_g, std_r) = cv2.meanStdDev(rsz_img)
            mean_R.append(mean_r)
            mean_G.append(mean_g)
            mean_B.append(mean_b)
            std_R.append(std_r)
            std_G.append(std_g)
            std_B.append(std_b)
        self.normalise_metrics = dict(
            mean={
                "R": np.mean(mean_R) / 255.0,
                "G": np.mean(mean_G) / 255.0,
                "B": np.mean(mean_B) / 255.0,
            },
            std={
                "R": np.mean(std_R) / 255.0,
                "G": np.mean(std_G) / 255.0,
                "B": np.mean(std_B) / 255.0,
            },
        )
        return np.asarray(resized_imgs)

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        return self.fit(df).transform(df)
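

# A minimal end-to-end sketch of how these preprocessors are meant to be
# combined (illustrative only: `df` and the column names below are assumed
# to come from the adult census dataset used in the docstring examples):
#
#     wide_cols = ['education', 'relationship', 'workclass', 'occupation']
#     crossed_cols = [('education', 'occupation')]
#     cat_embed_cols = [('education', 10), ('relationship', 8)]
#     continuous_cols = ['age', 'hours_per_week']
#
#     X_wide = WidePreprocessor(wide_cols, crossed_cols).fit_transform(df)
#     X_deep = DeepPreprocessor(cat_embed_cols, continuous_cols).fit_transform(df)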