_preprocessors.py 19.9 KB
Newer Older
1
import warnings
2
from abc import ABC, abstractmethod
3

4 5 6
import cv2
import numpy as np
import pandas as pd
7
from tqdm import tqdm
8 9
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import OneHotEncoder, StandardScaler
10

11
from ..wdtypes import *
12 13 14 15 16
from ..utils.text_utils import (
    get_texts,
    pad_sequences,
    build_embeddings_matrix,
)
17
from ..utils.dense_utils import LabelEncoder
18
from ..utils.image_utils import SimplePreprocessor, AspectAwarePreprocessor
J
jrzaurin 已提交
19
from ..utils.fastai_transforms import Vocab
20 21


22 23 24 25 26 27 28 29
class BasePreprocessor(ABC):
    """Base Abstract Class of All Preprocessors.

    Every method below is abstract, so a concrete preprocessor has to
    override ``__init__``, ``fit``, ``transform`` and ``fit_transform``
    before it can be instantiated.
    """

    @abstractmethod
    def __init__(self, *args):
        pass

    @abstractmethod
    def fit(self, df: pd.DataFrame):
        """Learn from ``df`` whatever state the preprocessor needs."""
        raise NotImplementedError("Preprocessor must implement this method")

    @abstractmethod
    def transform(self, df: pd.DataFrame):
        """Apply the preprocessing to ``df``."""
        raise NotImplementedError("Preprocessor must implement this method")

    @abstractmethod
    def fit_transform(self, df: pd.DataFrame):
        """Fit on ``df`` and return the transformed ``df``."""
        raise NotImplementedError("Preprocessor must implement this method")
J
jrzaurin 已提交
40 41 42


class WidePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the wide input dataset

    Parameters
    ----------
    wide_cols: List[str]
        List with the name of the columns that will be one-hot encoded and
        pass through the Wide model
    crossed_cols: List[Tuple[str, str]]
        List of Tuples with the name of the columns that will be "crossed"
        and then one-hot encoded. e.g. [('education', 'occupation'), ...]
    already_dummies: List[str]
        List of columns that are already dummies/one-hot encoded, and
        therefore do not need to be processed
    sparse: bool, default=False
        Forwarded to ``sklearn``'s ``OneHotEncoder``. If ``True`` the encoder
        (and therefore ``transform``) returns a sparse matrix
    handle_unknown: str, default="ignore"
        Forwarded unchanged to ``sklearn``'s ``OneHotEncoder``

    Attributes
    ----------
    one_hot_enc:
        an instance of ``sklearn``'s ``OneHotEncoder``
    wide_crossed_cols: `List`
        List with the names of all columns that will be one-hot encoded

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import WidePreprocessor
    >>> df = pd.DataFrame({'color': ['r', 'b', 'g'], 'size': ['s', 'n', 'l']})
    >>> wide_cols = ['color']
    >>> crossed_cols = [('color', 'size')]
    >>> wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    >>> wide_preprocessor.fit_transform(df)
    array([[0., 0., 1., 0., 0., 1.],
           [1., 0., 0., 1., 0., 0.],
           [0., 1., 0., 0., 1., 0.]])
    """

    def __init__(
        self,
        wide_cols: List[str],
        crossed_cols=None,
        already_dummies: Optional[List[str]] = None,
        sparse=False,
        handle_unknown="ignore",
    ):
        super(WidePreprocessor, self).__init__()
        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols
        self.already_dummies = already_dummies
        # scikit-learn renamed ``sparse`` to ``sparse_output`` and later
        # removed the old keyword. Try the new name first and fall back to
        # the old one so both old and new releases keep working.
        try:
            self.one_hot_enc = OneHotEncoder(
                sparse_output=sparse, handle_unknown=handle_unknown
            )
        except TypeError:
            self.one_hot_enc = OneHotEncoder(
                sparse=sparse, handle_unknown=handle_unknown
            )

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Fits the Preprocessor and creates required attributes

        Stores the wide + crossed column names in ``wide_crossed_cols`` and
        fits the one-hot encoder on every such column that is not listed in
        ``already_dummies``. Returns ``self``.
        """
        df_wide = self._prepare_wide(df)
        self.wide_crossed_cols = df_wide.columns.tolist()
        self.one_hot_enc.fit(df_wide[self._cols_to_encode()])
        return self

    def transform(self, df: pd.DataFrame) -> Union[sparse_matrix, np.ndarray]:
        """Returns the processed ``dataframe`` as an array or sparse matrix

        Raises ``NotFittedError`` if the one-hot encoder has not been fitted.
        When ``already_dummies`` is set, those columns are placed first,
        untouched, followed by the one-hot encoded columns.
        """
        if not hasattr(self.one_hot_enc, "categories_"):
            raise NotFittedError(
                "This WidePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        df_wide = self._prepare_wide(df)
        if not self.already_dummies:
            return self.one_hot_enc.transform(df_wide[self.wide_crossed_cols])

        X_oh_1 = df_wide[self.already_dummies].values
        X_oh_2 = self.one_hot_enc.transform(df_wide[self._cols_to_encode()])

        # Local import: scipy is only needed for the sparse branch.
        from scipy.sparse import csr_matrix, hstack as sparse_hstack, issparse

        if issparse(X_oh_2):
            # np.hstack cannot combine a dense array with a sparse matrix,
            # so stack in sparse space and keep the output sparse.
            return sparse_hstack((csr_matrix(X_oh_1), X_oh_2), format="csr")
        return np.hstack((X_oh_1, X_oh_2))

    def fit_transform(self, df: pd.DataFrame) -> Union[sparse_matrix, np.ndarray]:
        """Combines ``fit`` and ``transform``
        """
        return self.fit(df).transform(df)

    def _cols_to_encode(self) -> List[str]:
        # Columns handed to the one-hot encoder: all wide/crossed columns
        # except those the user declared as already one-hot encoded.
        if self.already_dummies:
            return [
                c for c in self.wide_crossed_cols if c not in self.already_dummies
            ]
        return self.wide_crossed_cols

    def _cross_cols(self, df: pd.DataFrame):
        # Builds one new column per entry of ``crossed_cols``. The column is
        # named by joining the source column names with "_" and its values
        # are the source values, cast to str, joined with "-".
        crossed = pd.DataFrame(index=df.index)
        for cols in self.crossed_cols:
            cols = list(cols)
            as_str = df[cols].astype("str")
            crossed["_".join(cols)] = as_str.apply(lambda x: "-".join(x), axis=1)
        return crossed

    def _prepare_wide(self, df: pd.DataFrame):
        # Returns a new frame holding the wide columns, followed by the
        # crossed columns when ``crossed_cols`` was provided.
        if self.crossed_cols is not None:
            df_cc = self._cross_cols(df)
            return pd.concat([df[self.wide_cols], df_cc], axis=1)
        return df[self.wide_cols].copy()

151

J
jrzaurin 已提交
152
class DeepPreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    embed_cols: List[Union[str, Tuple[str, int]]]
        List containing the name of the columns that will be represented with
        embeddings or a Tuple with the name and the embedding dimension. e.g.:
        [('education',32), ('relationship',16)]
    continuous_cols: List[str]
        List with the name of the so called continuous cols
    scale: bool
        Bool indicating whether or not to scale/Standarise continuous cols.
        Should "almost always" be True.
    default_embed_dim: Int, Default=8
        Dimension for the embeddings used in the Deep-Dense model
    already_standard: List[str], Optional,
        List with the name of the continuous cols that do not need to be
        Standarised.

    Attributes
    ----------
    label_encoder: `LabelEncoder`
        Instance of :class:`pytorch_widedeep.utils.dense_utils.LabelEncoder`
    embed_cols: `List`
        List with the columns that will be represented with embeddings
    embed_dim: `Dict`
        Dict where keys are the embed cols and values are the embed dimensions
    embeddings_input: `List`
        List of Tuples, one per embed col, with the column name, the number
        of entries in that column's encoding dictionary and the embedding
        dimension
    standardize_cols: `List`
        List of the columns that will be standarized
    deep_column_idx: `Dict`
        Dict where keys are column names and values are column indexes. This
        will be necessary to slice tensors
    scaler:
        an instance of ``sklearn``'s ``StandardScaler``

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import DeepPreprocessor
    >>> df = pd.DataFrame({'color': ['r', 'b', 'g'], 'size': ['s', 'n', 'l'], 'age': [25, 40, 55]})
    >>> embed_cols = [('color',5), ('size',5)]
    >>> cont_cols = ['age']
    >>> deep_preprocessor = DeepPreprocessor(embed_cols=embed_cols, continuous_cols=cont_cols)
    >>> deep_preprocessor.fit_transform(df)
    array([[ 0.        ,  0.        , -1.22474487],
           [ 1.        ,  1.        ,  0.        ],
           [ 2.        ,  2.        ,  1.22474487]])
    >>> deep_preprocessor.embed_dim
    {'color': 5, 'size': 5}
    >>> deep_preprocessor.deep_column_idx
    {'color': 0, 'size': 1, 'age': 2}
    """

    def __init__(
        self,
        embed_cols: Optional[List[Union[str, Tuple[str, int]]]] = None,
        continuous_cols: Optional[List[str]] = None,
        scale: bool = True,
        default_embed_dim: int = 8,
        already_standard: Optional[List[str]] = None,
    ):
        super(DeepPreprocessor, self).__init__()

        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols
        self.already_standard = already_standard
        self.scale = scale
        self.default_embed_dim = default_embed_dim

        # Kept as an assert so callers that expect AssertionError on a bad
        # configuration keep working.
        assert (self.embed_cols is not None) or (
            self.continuous_cols is not None
        ), "'embed_cols' and 'continuous_cols' are 'None'. Please, define at least one of the two."

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Fits the Preprocessor and creates required attributes

        Fits the label encoder on the embedding columns (if any) and the
        standard scaler on the columns to standardise (if any and ``scale``
        is ``True``). Emits a warning when continuous columns are present
        but ``scale`` is ``False``. Returns ``self``.
        """
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            self.label_encoder = LabelEncoder(df_emb.columns.tolist()).fit(df_emb)
            self.embeddings_input: List = [
                (k, len(v), self.embed_dim[k])
                for k, v in self.label_encoder.encoding_dict.items()
            ]
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                df_std = df_cont[self.standardize_cols]
                self.scaler = StandardScaler().fit(df_std.values)
            else:
                warnings.warn("Continuous columns will not be normalised")
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Returns the processed ``dataframe`` as a np.ndarray

        Label-encoded embedding columns come first, followed by the
        continuous columns. Also (re)sets ``deep_column_idx``.

        Raises ``NotFittedError`` if a component needed for the configured
        columns has not been fitted.
        """
        not_fitted_msg = (
            "This DeepPreprocessor instance is not fitted yet. "
            "Call 'fit' with appropriate arguments before using this estimator."
        )
        # Collect the frames that actually exist instead of relying on a
        # NameError to discover which ones were built.
        frames = []
        if self.embed_cols is not None:
            if not hasattr(self, "label_encoder"):
                raise NotFittedError(not_fitted_msg)
            df_emb = self._prepare_embed(df)
            frames.append(self.label_encoder.transform(df_emb))
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                if not hasattr(getattr(self, "scaler", None), "mean_"):
                    raise NotFittedError(not_fitted_msg)
                df_std = df_cont[self.standardize_cols]
                df_cont[self.standardize_cols] = self.scaler.transform(df_std.values)
            frames.append(df_cont)
        if not frames:
            # Only reachable if both column lists were unset after
            # construction, or asserts are disabled.
            raise ValueError(
                "'embed_cols' and 'continuous_cols' are 'None'. Please, define at least one of the two."
            )
        if len(frames) > 1:
            df_deep = pd.concat(frames, axis=1)
        else:
            df_deep = frames[0].copy()
        self.deep_column_idx = {k: v for v, k in enumerate(df_deep.columns)}
        return df_deep.values

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``
        """
        return self.fit(df).transform(df)

    def _prepare_embed(self, df: pd.DataFrame) -> pd.DataFrame:
        # Sets ``embed_dim`` (column name -> embedding dimension) and returns
        # a copy of the embedding columns. The first element of
        # ``embed_cols`` decides whether the list holds (name, dim) tuples
        # or plain names.
        if isinstance(self.embed_cols[0], tuple):
            self.embed_dim = dict(self.embed_cols)  # type: ignore
            embed_colname = [emb[0] for emb in self.embed_cols]
        else:
            self.embed_dim = {e: self.default_embed_dim for e in self.embed_cols}  # type: ignore
            embed_colname = self.embed_cols  # type: ignore
        return df[embed_colname].copy()

    def _prepare_continuous(self, df: pd.DataFrame) -> pd.DataFrame:
        # When scaling is enabled, sets ``standardize_cols`` to the
        # continuous columns not listed in ``already_standard``. Always
        # returns a copy of the continuous columns.
        if self.scale:
            if self.already_standard is not None:
                self.standardize_cols = [
                    c for c in self.continuous_cols if c not in self.already_standard
                ]
            else:
                self.standardize_cols = self.continuous_cols
        return df[self.continuous_cols].copy()

296

J
jrzaurin 已提交
297
class TextPreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deeptext input dataset

    Parameters
    ----------
    text_col: str
        column in the input pd.DataFrame containing the texts
    max_vocab: int, default=30000
        Maximum number of token in the vocabulary
    min_freq: int, default=5
        Minimum frequency for a token to be part of the vocabulary
    maxlen: int, default=80
        Maximum length of the tokenized sequences
    word_vectors_path: str, Optional
        Path to the pretrained word vectors
    verbose: int, default 1
        Enable verbose output.

    Attributes
    ----------
    vocab: `Vocab`
        instance of ``Vocab``. See :class:`pytorch_widedeep.utils.fastai_transforms.Vocab`
    tokens: `List`
        List with Lists of str containing the tokenized texts
    embedding_matrix: `np.ndarray`
        Array with the pretrained embeddings

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import TextPreprocessor
    >>> df_train = pd.DataFrame({'text_column': ["life was like a box of chocolates",
    ... "You never know what you're gonna get"]})
    >>> text_preprocessor = TextPreprocessor(text_col='text_column', max_vocab=25, min_freq=1, maxlen=10)
    >>> text_preprocessor.fit_transform(df_train)
    The vocabulary contains 24 tokens
    array([[ 1,  1,  1,  1, 10, 11, 12, 13, 14, 15],
           [ 5,  9, 16, 17, 18,  9, 19, 20, 21, 22]], dtype=int32)
    >>> df_te = pd.DataFrame({'text_column': ['you never know what is in the box']})
    >>> text_preprocessor.transform(df_te)
    array([[ 1,  1,  9, 16, 17, 18,  0,  0,  0, 13]], dtype=int32)
    """

    def __init__(
        self,
        text_col: str,
        max_vocab: int = 30000,
        min_freq: int = 5,
        maxlen: int = 80,
        word_vectors_path: Optional[str] = None,
        verbose: int = 1,
    ):
        super(TextPreprocessor, self).__init__()
        self.text_col = text_col
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Builds the vocabulary

        Tokenizes the texts in ``text_col`` and creates ``vocab`` from them.
        Returns ``self``.
        """
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
        self.vocab = Vocab.create(
            tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
        # A new vocabulary invalidates any embedding matrix built for the
        # previous one, so force a rebuild on the next ``transform``.
        self._embeddings_built_from = None
        if self.verbose:
            print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Returns the padded, `numericalised` sequences

        Stores the tokenized texts in ``tokens``. If ``word_vectors_path``
        is set, ``embedding_matrix`` is built on the first call after
        ``fit`` (or after the path changes) and reused afterwards.

        Raises ``NotFittedError`` if ``fit`` has not been called.
        """
        if not hasattr(self, "vocab"):
            raise NotFittedError(
                "This TextPreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
        padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])

        # The matrix is built from the vocabulary and the vectors path, not
        # from ``df``, so there is no need to rebuild it on every call.
        path = self.word_vectors_path
        if path is not None and getattr(self, "_embeddings_built_from", None) != path:
            self.embedding_matrix = build_embeddings_matrix(
                self.vocab, path, self.min_freq
            )
            self._embeddings_built_from = path
        return padded_seq

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``
        """
        return self.fit(df).transform(df)
393 394


J
jrzaurin 已提交
395
class ImagePreprocessor(BasePreprocessor):
    r"""Preprocessor to prepare the deepimage input dataset. The Preprocessing
    consists simply on resizing according to their aspect ratio

    Parameters
    ----------
    img_col: str
        name of the column with the images filenames
    img_path: str
        path to the directory where the images are stored
    width: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    height: Int, default=224
        height of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    verbose: Int, default 1
        Enable verbose output.

    Attributes
    ----------
    aap: Class, AspectAwarePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    spp: Class, SimplePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    normalise_metrics: Dict
        Dict containing the normalisation metrics of the image dataset, i.e.
        mean and std for the R, G and B channels

    Examples
    --------
    >>> import pandas as pd
    >>> from pytorch_widedeep.preprocessing import ImagePreprocessor
    >>> df_train = pd.DataFrame({'images_column': ['galaxy1.png', 'galaxy2.png']})
    >>> df_test = pd.DataFrame({'images_column': ['galaxy3.png']})
    >>> img_preprocessor = ImagePreprocessor(img_col='images_column', img_path='.', verbose=0)
    >>> resized_images = img_preprocessor.fit_transform(df_train)
    >>> new_resized_images = img_preprocessor.transform(df_test)


    .. note:: Normalising metrics are computed by the first ``transform``
        call that follows a ``fit`` (i.e. when ``fit_transform`` is run).
        Further ``transform`` calls will not change the computed metrics,
        and running ``fit`` only simply instantiates the resizing functions.

    """

    def __init__(
        self,
        img_col: str,
        img_path: str,
        width: int = 224,
        height: int = 224,
        verbose: int = 1,
    ):
        super(ImagePreprocessor, self).__init__()
        self.img_col = img_col
        self.img_path = img_path
        self.width = width
        self.height = height
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Simply instantiates the Preprocessors ``AspectAwarePreprocessor`` and
        ``SimplePreprocessor`` for image resizing. See
        :class:`pytorch_widedeep.utils.image_utils.AspectAwarePreprocessor`
        and :class:`pytorch_widedeep.utils.image_utils.SimplePreprocessor`.

        ``df`` is not read here. The call also flags that the normalising
        metrics must be computed by the next ``transform``. Returns ``self``.
        """
        self.aap = AspectAwarePreprocessor(self.width, self.height)
        self.spp = SimplePreprocessor(self.width, self.height)
        self._compute_normalising_metrics = True
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Resizes the images to the input height and width.

        Reads every file named in ``img_col`` from ``img_path``, resizes it
        and returns the resized images as one array. On the first call after
        ``fit`` it also sets ``normalise_metrics``.

        Raises ``NotFittedError`` if ``fit`` has not been called and
        ``OSError`` if an image cannot be read.
        """
        if not hasattr(self, "aap"):
            raise NotFittedError(
                "This ImagePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        image_list = df[self.img_col].tolist()
        if self.verbose:
            print("Reading Images from {}".format(self.img_path))
        imgs = []
        for fname in image_list:
            fpath = "/".join([self.img_path, fname])
            img = cv2.imread(fpath)
            if img is None:
                # cv2.imread signals a missing/unreadable file by returning
                # None; fail here with the offending path rather than later
                # with an AttributeError on ``.shape``.
                raise OSError("Could not read image '{}'".format(fpath))
            imgs.append(img)

        if self.verbose:
            print("Resizing")
        resized_imgs = []
        for img in tqdm(imgs, total=len(imgs), disable=self.verbose != 1):
            if img.shape[0] != img.shape[1]:
                # height and width differ: resize respecting the aspect ratio
                resized_imgs.append(self.aap.preprocess(img))
            else:
                # if aspect ratio is 1:1, no need for AspectAwarePreprocessor
                resized_imgs.append(self.spp.preprocess(img))

        if self._compute_normalising_metrics:
            if self.verbose:
                print("Computing normalisation metrics")
            # mean and std deviation will only be computed when the fit method
            # is called
            self.normalise_metrics = self._normalising_metrics(resized_imgs)
            self._compute_normalising_metrics = False
        return np.asarray(resized_imgs)

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``
        """
        return self.fit(df).transform(df)

    @staticmethod
    def _normalising_metrics(resized_imgs) -> dict:
        # Averages the per-image channel means and standard deviations over
        # all images and rescales them by 255. The three values returned by
        # cv2.meanStdDev are treated as B, G, R, in that order.
        mean_R, mean_G, mean_B = [], [], []
        std_R, std_G, std_B = [], [], []
        for rsz_img in resized_imgs:
            (mean_b, mean_g, mean_r), (std_b, std_g, std_r) = cv2.meanStdDev(
                rsz_img
            )
            mean_R.append(mean_r)
            mean_G.append(mean_g)
            mean_B.append(mean_b)
            std_R.append(std_r)
            std_G.append(std_g)
            std_B.append(std_b)
        return dict(
            mean={
                "R": np.mean(mean_R) / 255.0,
                "G": np.mean(mean_G) / 255.0,
                "B": np.mean(mean_B) / 255.0,
            },
            std={
                "R": np.mean(std_R) / 255.0,
                "G": np.mean(std_G) / 255.0,
                "B": np.mean(std_B) / 255.0,
            },
        )