_preprocessors.py 21.0 KB
Newer Older
1
import warnings
2
from abc import ABC, abstractmethod
3

4 5 6
import cv2
import numpy as np
import pandas as pd
7
from tqdm import tqdm
8 9
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import OneHotEncoder, StandardScaler
10

11
from ..wdtypes import *
12 13 14 15 16
from ..utils.text_utils import (
    get_texts,
    pad_sequences,
    build_embeddings_matrix,
)
17
from ..utils.dense_utils import LabelEncoder
18
from ..utils.image_utils import SimplePreprocessor, AspectAwarePreprocessor
J
jrzaurin 已提交
19
from ..utils.fastai_transforms import Vocab
20 21


22 23 24 25 26 27 28 29
class BasePreprocessor(ABC):
    """Abstract base class shared by all the preprocessors in this module.

    ``__init__``, ``fit``, ``transform`` and ``fit_transform`` are all
    declared abstract, so a subclass can only be instantiated once it
    overrides the four of them.
    """

    @abstractmethod
    def __init__(self, *args):
        pass

    @abstractmethod
    def fit(self, df: pd.DataFrame):
        """Fit the preprocessor on ``df``. Must be overridden."""
        raise NotImplementedError("Preprocessor must implement this method")

    @abstractmethod
    def transform(self, df: pd.DataFrame):
        """Transform ``df`` with the fitted preprocessor. Must be overridden."""
        raise NotImplementedError("Preprocessor must implement this method")

    @abstractmethod
    def fit_transform(self, df: pd.DataFrame):
        """Fit on ``df`` and transform it in one call. Must be overridden."""
        raise NotImplementedError("Preprocessor must implement this method")
J
jrzaurin 已提交
40 41 42


class WidePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the wide input dataset

    This Preprocessor prepares the data for the wide, linear component. This
    linear model is implemented via an Embedding layer that is connected to
    the output neuron. ``WidePreprocessor`` simply numerically encodes all the
    unique values of all categorical columns ``wide_cols + crossed_cols``. See
    the Example below.

    Parameters
    ----------
    wide_cols: List[str]
        List with the name of the columns that will be label encoded and
        passed through the Wide model
    crossed_cols: List[Tuple[str, str]]
        List of Tuples with the name of the columns that will be `'crossed'`
        and then label encoded. e.g. [('education', 'occupation'), ...]

    Attributes
    ----------
    wide_crossed_cols: :obj:`List`
        List with the names of all columns that will be label encoded
    feature_dict: :obj:`Dict`
        Dictionary where the keys are the result of pasting `colname + '_' +
        column value` and the values are the corresponding mapped integer.

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import WidePreprocessor
    >>> df = pd.DataFrame({'color': ['r', 'b', 'g'], 'size': ['s', 'n', 'l']})
    >>> wide_cols = ['color']
    >>> crossed_cols = [('color', 'size')]
    >>> wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    >>> X_wide = wide_preprocessor.fit_transform(df)
    >>> X_wide
    array([[1, 4],
           [2, 5],
           [3, 6]])
    >>> wide_preprocessor.feature_dict
    {'color_r': 1,
     'color_b': 2,
     'color_g': 3,
     'color_size_r-s': 4,
     'color_size_b-n': 5,
     'color_size_g-l': 6}
    >>> wide_preprocessor.inverse_transform(X_wide)
      color color_size
    0     r        r-s
    1     b        b-n
    2     g        g-l
    """

    def __init__(
        self,
        wide_cols: List[str],
        crossed_cols: Optional[List[Tuple[str, str]]] = None,
    ):
        super(WidePreprocessor, self).__init__()
        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Fits the Preprocessor and creates required attributes"""
        df_wide = self._prepare_wide(df)
        self.wide_crossed_cols = df_wide.columns.tolist()
        vocab = self._make_global_feature_list(df_wide[self.wide_crossed_cols])
        # Index 0 is left free: ``transform`` uses it for values that were
        # not seen here.
        self.feature_dict = {v: i for i, v in enumerate(vocab, start=1)}
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        r"""Returns the processed dataframe as an ``int64`` array.

        Values that are not present in ``feature_dict`` are encoded as 0.

        Raises
        ------
        NotFittedError
            if ``fit`` has not been called before.
        """
        if not hasattr(self, "feature_dict"):
            raise NotFittedError(
                "This WidePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        df_wide = self._prepare_wide(df)
        # ``np.long`` no longer exists in recent NumPy releases, so the
        # target dtype is spelled out explicitly.
        encoded = np.zeros(
            [len(df_wide), len(self.wide_crossed_cols)], dtype=np.int64
        )
        for col_i, col in enumerate(self.wide_crossed_cols):
            prefix = col + "_"
            encoded[:, col_i] = [
                self.feature_dict.get(prefix + str(x), 0) for x in df_wide[col]
            ]
        return encoded

    def inverse_transform(self, encoded: np.ndarray) -> pd.DataFrame:
        r"""Takes as input the output from the ``transform`` method and it will
        return the original values.

        Index 0 (emitted by ``transform`` for values not seen during ``fit``)
        is decoded as the string ``'unseen'``. Any other index missing from
        ``feature_dict`` raises ``KeyError``.

        Parameters
        ----------
        encoded: np.ndarray
            array with the output of the ``transform`` method
        """
        decoded = pd.DataFrame(encoded, columns=self.wide_crossed_cols)
        inverse_dict = {idx: feat for feat, idx in self.feature_dict.items()}

        def _decode(idx, prefix):
            if idx == 0:
                return "unseen"
            feature = inverse_dict[idx]
            # Remove only the leading ``<colname>_``; the value itself may
            # contain the same substring.
            if feature.startswith(prefix):
                return feature[len(prefix):]
            return feature

        for col in decoded.columns:
            prefix = "".join([col, "_"])
            decoded[col] = [_decode(idx, prefix) for idx in decoded[col]]
        return decoded

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``"""
        return self.fit(df).transform(df)

    def _make_global_feature_list(self, df: pd.DataFrame) -> List:
        # Concatenates the per-column feature names, in column order.
        vocab = []
        for column in df.columns:
            vocab.extend(self._make_column_feature_list(df[column]))
        return vocab

    def _make_column_feature_list(self, s: pd.Series) -> List:
        # One ``<colname>_<value>`` entry per unique value of the column.
        return [s.name + "_" + str(x) for x in s.unique()]

    def _cross_cols(self, df: pd.DataFrame):
        # Builds one new column per entry in ``crossed_cols``: the column is
        # named by joining the source names with '_' and holds the source
        # values (cast to str) joined with '-'.
        df_cc = df.copy()
        crossed_colnames = []
        for cols in self.crossed_cols:
            cols = list(cols)
            for c in cols:
                df_cc[c] = df_cc[c].astype("str")
            colname = "_".join(cols)
            df_cc[colname] = df_cc[cols].apply(lambda x: "-".join(x), axis=1)
            crossed_colnames.append(colname)
        return df_cc[crossed_colnames]

    def _prepare_wide(self, df: pd.DataFrame):
        # Returns ``wide_cols`` followed by the crossed columns (if any).
        if self.crossed_cols is None:
            return df[self.wide_cols].copy()
        df_cc = self._cross_cols(df)
        return pd.concat([df[self.wide_cols], df_cc], axis=1)

181

182
class DensePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    embed_cols: List[Union[str, Tuple[str, int]]]
        List containing the name of the columns that will be represented by
        embeddings or a Tuple with the name and the embedding dimension. e.g.:
        [('education',32), ('relationship',16)]. Plain names use
        ``default_embed_dim``.
    continuous_cols: List[str]
        List with the name of the so called continuous cols
    scale: bool
        Bool indicating whether or not to scale/standardise continuous cols.
        Should "almost always" be True.
    default_embed_dim: Int, Default=8
        Dimension for the embeddings used in the Deep-Dense model
    already_standard: List[str], Optional,
        List with the name of the continuous cols that do not need to be
        standardised.

    Attributes
    ----------
    label_encoder: :obj:`LabelEncoder`
        see :class:`pytorch_widedeep.utils.dense_utils.LabelEncoder`
    embed_cols: :obj:`List`
        List with the columns that will be represented by embeddings
    embed_dim: :obj:`Dict`
        Dictionary where keys are the embed cols and values are the embed
        dimensions
    embeddings_input: :obj:`List`
        List of Tuples ``(column name, length of the column's entry in
        label_encoder.encoding_dict, embedding dimension)``
    standardize_cols: :obj:`List`
        List of the columns that will be standardised
    deep_column_idx: :obj:`Dict`
        Dictionary where keys are column names and values are column indexes.
        This will be necessary to slice tensors
    scaler: :obj:`StandardScaler`
        an instance of :class:`sklearn.preprocessing.StandardScaler`

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import DensePreprocessor
    >>> df = pd.DataFrame({'color': ['r', 'b', 'g'], 'size': ['s', 'n', 'l'], 'age': [25, 40, 55]})
    >>> embed_cols = [('color',5), ('size',5)]
    >>> cont_cols = ['age']
    >>> deep_preprocessor = DensePreprocessor(embed_cols=embed_cols, continuous_cols=cont_cols)
    >>> deep_preprocessor.fit_transform(df)
    array([[ 0.        ,  0.        , -1.22474487],
           [ 1.        ,  1.        ,  0.        ],
           [ 2.        ,  2.        ,  1.22474487]])
    >>> deep_preprocessor.embed_dim
    {'color': 5, 'size': 5}
    >>> deep_preprocessor.deep_column_idx
    {'color': 0, 'size': 1, 'age': 2}
    """

    def __init__(
        self,
        embed_cols: List[Union[str, Tuple[str, int]]] = None,
        continuous_cols: List[str] = None,
        scale: bool = True,
        default_embed_dim: int = 8,
        already_standard: Optional[List[str]] = None,
    ):
        super(DensePreprocessor, self).__init__()

        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols
        self.already_standard = already_standard
        self.scale = scale
        self.default_embed_dim = default_embed_dim

        # Kept as an ``assert`` so the exception type seen by existing
        # callers (AssertionError) does not change.
        assert (self.embed_cols is not None) or (
            self.continuous_cols is not None
        ), "'embed_cols' and 'continuous_cols' are 'None'. Please, define at least one of the two."

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Fits the Preprocessor and creates required attributes"""
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            self.label_encoder = LabelEncoder(df_emb.columns.tolist()).fit(df_emb)
            self.embeddings_input: List = [
                (k, len(v), self.embed_dim[k])
                for k, v in self.label_encoder.encoding_dict.items()
            ]
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if not self.scale:
                warnings.warn("Continuous columns will not be normalised")
            elif self.standardize_cols:
                # Skipped when every continuous column is listed in
                # ``already_standard``: there is nothing to fit the scaler on.
                df_std = df_cont[self.standardize_cols]
                self.scaler = StandardScaler().fit(df_std.values)
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Returns the processed ``dataframe`` as a np.ndarray

        Embedding columns (label encoded) come first, followed by the
        continuous columns. ``deep_column_idx`` is refreshed on every call.

        Raises
        ------
        NotFittedError
            if ``fit`` has not been called before.
        """
        not_fitted_msg = (
            "This DensePreprocessor instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before using this estimator."
        )
        frames = []
        if self.embed_cols is not None:
            if not hasattr(self, "label_encoder"):
                raise NotFittedError(not_fitted_msg)
            df_emb = self._prepare_embed(df)
            frames.append(self.label_encoder.transform(df_emb))
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale and self.standardize_cols:
                scaler = getattr(self, "scaler", None)
                if scaler is None or not hasattr(scaler, "mean_"):
                    raise NotFittedError(not_fitted_msg)
                df_std = df_cont[self.standardize_cols]
                df_cont[self.standardize_cols] = scaler.transform(df_std.values)
            frames.append(df_cont)
        # Explicit branching instead of catching the error raised by an
        # unbound local: genuine concat failures are no longer hidden.
        df_deep = pd.concat(frames, axis=1) if len(frames) > 1 else frames[0]
        self.deep_column_idx = {k: v for v, k in enumerate(df_deep.columns)}
        return df_deep.values

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``"""
        return self.fit(df).transform(df)

    def _prepare_embed(self, df: pd.DataFrame) -> pd.DataFrame:
        # Sets ``embed_dim`` and returns a copy of the embedding columns.
        # Each entry is handled on its own so lists mixing plain names and
        # ``(name, dim)`` tuples are supported.
        embed_dim = {}
        for entry in self.embed_cols:
            if isinstance(entry, tuple):
                name, dim = entry
            else:
                name, dim = entry, self.default_embed_dim
            embed_dim[name] = dim
        self.embed_dim = embed_dim
        return df[list(embed_dim)].copy()

    def _prepare_continuous(self, df: pd.DataFrame) -> pd.DataFrame:
        # Sets ``standardize_cols`` (only when ``scale`` is True) and returns
        # a copy of the continuous columns.
        if self.scale:
            if self.already_standard is not None:
                self.standardize_cols = [
                    c for c in self.continuous_cols if c not in self.already_standard
                ]
            else:
                self.standardize_cols = self.continuous_cols
        return df[self.continuous_cols].copy()

324

J
jrzaurin 已提交
325
class TextPreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deeptext input dataset

    Parameters
    ----------
    text_col: str
        column in the input dataframe containing the texts
    max_vocab: int, default=30000
        Maximum number of tokens in the vocabulary
    min_freq: int, default=5
        Minimum frequency for a token to be part of the vocabulary
    maxlen: int, default=80
        Maximum length of the tokenized sequences
    word_vectors_path: str, Optional
        Path to the pretrained word vectors
    verbose: int, default 1
        Enable verbose output.

    Attributes
    ----------
    vocab: :obj:`Vocab`
        an instance of :class:`pytorch_widedeep.utils.fastai_transforms.Vocab`
    tokens: :obj:`List`
        List with Lists of str containing the tokenized texts
    embedding_matrix: :obj:`np.ndarray`
        Array with the pretrained embeddings

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import TextPreprocessor
    >>> df_train = pd.DataFrame({'text_column': ["life was like a box of chocolates",
    ... "You never know what you're gonna get"]})
    >>> text_preprocessor = TextPreprocessor(text_col='text_column', max_vocab=25, min_freq=1, maxlen=10)
    >>> text_preprocessor.fit_transform(df_train)
    The vocabulary contains 24 tokens
    array([[ 1,  1,  1,  1, 10, 11, 12, 13, 14, 15],
           [ 5,  9, 16, 17, 18,  9, 19, 20, 21, 22]], dtype=int32)
    >>> df_te = pd.DataFrame({'text_column': ['you never know what is in the box']})
    >>> text_preprocessor.transform(df_te)
    array([[ 1,  1,  9, 16, 17, 18,  0,  0,  0, 13]], dtype=int32)
    """

    def __init__(
        self,
        text_col: str,
        max_vocab: int = 30000,
        min_freq: int = 5,
        maxlen: int = 80,
        word_vectors_path: Optional[str] = None,
        verbose: int = 1,
    ):
        super(TextPreprocessor, self).__init__()
        self.text_col = text_col
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Builds the vocabulary"""
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
        self.vocab = Vocab.create(
            tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
        if self.verbose:
            print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Returns the padded, `numericalised` sequences

        The tokenized texts are stored in ``tokens``. When
        ``word_vectors_path`` is set, ``embedding_matrix`` is (re)built on
        every call.

        Raises
        ------
        NotFittedError
            if ``fit`` has not been called before.
        """
        if not hasattr(self, "vocab"):
            raise NotFittedError(
                "This TextPreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
        padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
        if self.word_vectors_path is not None:
            self.embedding_matrix = build_embeddings_matrix(
                self.vocab, self.word_vectors_path, self.min_freq
            )
        return padded_seq

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``"""
        return self.fit(df).transform(df)
418 419


J
jrzaurin 已提交
420
class ImagePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepimage input dataset. The Preprocessing
    consists simply on resizing according to their aspect ratio

    Parameters
    ----------
    img_col: str
        name of the column with the images filenames
    img_path: str
        path to the directory where the images are stored
    width: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    height: Int, default=224
        height of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    verbose: Int, default 1
        Enable verbose output.

    Attributes
    ----------
    aap: :obj:`AspectAwarePreprocessor`
        an instance of :class:`pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor`
    spp: :obj:`SimplePreprocessor`
        an instance of :class:`pytorch_widedeep.utils.image_utils.SimplePreprocessor`
    normalise_metrics: :obj:`Dict`
        Dict containing the normalisation metrics of the image dataset, i.e.
        mean and std for the R, G and B channels

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import ImagePreprocessor
    >>> df_train = pd.DataFrame({'images_column': ['galaxy1.png', 'galaxy2.png']})
    >>> df_test = pd.DataFrame({'images_column': ['galaxy3.png']})
    >>> img_preprocessor = ImagePreprocessor(img_col='images_column', img_path='.', verbose=0)
    >>> resized_images = img_preprocessor.fit_transform(df_train)
    >>> new_resized_images = img_preprocessor.transform(df_test)


    .. note:: Normalising metrics will only be computed when the
        ``fit_transform`` method is run. Running ``transform`` only will not
        change the computed metrics and running ``fit`` only simply
        instantiates the resizing functions.

    """

    def __init__(
        self,
        img_col: str,
        img_path: str,
        width: int = 224,
        height: int = 224,
        verbose: int = 1,
    ):
        super(ImagePreprocessor, self).__init__()
        self.img_col = img_col
        self.img_path = img_path
        self.width = width
        self.height = height
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        r"""Simply instantiates the Preprocessors
        :obj:`AspectAwarePreprocessor` and :obj:`SimplePreprocessor` for image
        resizing.

        See
        :class:`pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor`
        and :class:`pytorch_widedeep.utils.image_utils.SimplePreprocessor`.

        """
        self.aap = AspectAwarePreprocessor(self.width, self.height)
        self.spp = SimplePreprocessor(self.width, self.height)
        # The next ``transform`` call will compute the normalisation metrics
        # and then switch this flag off.
        self._compute_normalising_metrics = True
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Resizes the images to the input height and width.

        Raises
        ------
        NotFittedError
            if ``fit`` has not been called before.
        OSError
            if any of the images could not be read by ``cv2.imread``.
        """
        if not hasattr(self, "aap"):
            raise NotFittedError(
                "This ImagePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        image_list = df[self.img_col].tolist()
        if self.verbose:
            print("Reading Images from {}".format(self.img_path))
        imgs = [cv2.imread("/".join([self.img_path, img])) for img in image_list]

        # ``cv2.imread`` does not raise on failure, it returns ``None``. Fail
        # here with the offending file names rather than later with an
        # opaque error on ``None.shape``.
        unreadable = [name for name, im in zip(image_list, imgs) if im is None]
        if unreadable:
            raise OSError(
                "Could not read the following image(s) from {}: {}".format(
                    self.img_path, unreadable
                )
            )

        if self.verbose:
            print("Resizing")
        resized_imgs = []
        for img in tqdm(imgs, total=len(imgs), disable=self.verbose != 1):
            img_height, img_width = img.shape[0], img.shape[1]
            if img_height != img_width:
                resized_imgs.append(self.aap.preprocess(img))
            else:
                # if aspect ratio is 1:1, no need for AspectAwarePreprocessor
                resized_imgs.append(self.spp.preprocess(img))

        if self._compute_normalising_metrics:
            if self.verbose:
                print("Computing normalisation metrics")
            # mean and std deviation will only be computed on the first
            # ``transform`` call that follows ``fit``
            mean_R, mean_G, mean_B = [], [], []
            std_R, std_G, std_B = [], [], []
            for rsz_img in resized_imgs:
                # unpacked in B, G, R order
                (mean_b, mean_g, mean_r), (std_b, std_g, std_r) = cv2.meanStdDev(
                    rsz_img
                )
                mean_R.append(mean_r)
                mean_G.append(mean_g)
                mean_B.append(mean_b)
                std_R.append(std_r)
                std_G.append(std_g)
                std_B.append(std_b)
            self.normalise_metrics = dict(
                mean={
                    "R": np.mean(mean_R) / 255.0,
                    "G": np.mean(mean_G) / 255.0,
                    "B": np.mean(mean_B) / 255.0,
                },
                std={
                    "R": np.mean(std_R) / 255.0,
                    "G": np.mean(std_G) / 255.0,
                    "B": np.mean(std_B) / 255.0,
                },
            )
            self._compute_normalising_metrics = False
        return np.asarray(resized_imgs)

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``"""
        return self.fit(df).transform(df)