import numpy as np
import pandas as pd
import warnings
import cv2

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
from scipy.sparse import csc_matrix
from tqdm import tqdm

from ..wdtypes import *
from ..utils.dense_utils import *
from ..utils.text_utils import *
from ..utils.fastai_transforms import *
from ..utils.image_utils import *


class WidePreprocessor(object):
    r"""Preprocessor to prepare the wide input dataset

    Parameters
    ----------
    wide_cols: List
        List with the name of the columns that will be one-hot encoded and
        pass through the Wide model
    crossed_cols: List
        List of Tuples with the name of the columns that will be "crossed"
        and then one-hot encoded. e.g. (['education', 'occupation'], ...)
    already_dummies: List
        List of columns that are already dummies/one-hot encoded

    Attributes
    ----------
    one_hot_enc: sklearn's OneHotEncoder
    wide_crossed_cols: List
        List with the names of all columns that will be one-hot encoded

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
    ... 'native_country','gender']
    >>> crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
    >>> wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    >>> X_wide = wide_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_wide = wide_preprocessor.transform(new_df)
    """
    def __init__(self, wide_cols:List[str],
        crossed_cols:Optional[List[Tuple[str,str]]]=None,
        already_dummies:Optional[List[str]]=None, sparse:bool=False):
        super(WidePreprocessor, self).__init__()
        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols
        self.already_dummies = already_dummies
        self.one_hot_enc = OneHotEncoder(sparse=sparse)

    def _cross_cols(self, df:pd.DataFrame):
        crossed_colnames = []
        for cols in self.crossed_cols:
            cols = list(cols)
            for c in cols: df[c] = df[c].astype('str')
            colname = '_'.join(cols)
            df[colname] = df[cols].apply(lambda x: '-'.join(x), axis=1)
            crossed_colnames.append(colname)
        return df, crossed_colnames
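
    # Illustration of what _cross_cols returns, using hypothetical two-row
    # data (column names and values below are purely illustrative):
    #
    #   df = pd.DataFrame({'education': ['phd', 'bsc'],
    #                      'occupation': ['tech', 'sales']})
    #   with crossed_cols=[('education', 'occupation')] the method adds a
    #   column 'education_occupation' = ['phd-tech', 'bsc-sales'] and returns
    #   (df, ['education_occupation']).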

    def fit(self, df:pd.DataFrame):
        df_wide = df.copy()[self.wide_cols]
        if self.crossed_cols is not None:
            df_wide, crossed_colnames = self._cross_cols(df_wide)
            self.wide_crossed_cols = self.wide_cols + crossed_colnames
        else:
            self.wide_crossed_cols = self.wide_cols

        if self.already_dummies:
            dummy_cols = [c for c in self.wide_crossed_cols if c not in self.already_dummies]
            self.one_hot_enc.fit(df_wide[dummy_cols])
        else:
            self.one_hot_enc.fit(df_wide[self.wide_crossed_cols])
        return self

    def transform(self, df:pd.DataFrame) -> Union[sparse_matrix, np.ndarray]:
        try:
            self.one_hot_enc.categories_
        except AttributeError:
            raise NotFittedError("This WidePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator.")
        df_wide = df.copy()[self.wide_cols]
        if self.crossed_cols is not None:
            df_wide, _ = self._cross_cols(df_wide)
        if self.already_dummies:
            X_oh_1 = df_wide[self.already_dummies].values
            dummy_cols = [c for c in self.wide_crossed_cols if c not in self.already_dummies]
            X_oh_2 = self.one_hot_enc.transform(df_wide[dummy_cols])
            return np.hstack((X_oh_1, X_oh_2))
        else:
            return self.one_hot_enc.transform(df_wide[self.wide_crossed_cols])

    def fit_transform(self, df:pd.DataFrame)->Union[sparse_matrix, np.ndarray]:
        return self.fit(df).transform(df)
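
# A minimal, self-contained usage sketch for WidePreprocessor. The toy
# DataFrame below is hypothetical (column names and values are illustrative):
#
#   df = pd.DataFrame({'education': ['phd', 'bsc', 'phd'],
#                      'occupation': ['tech', 'sales', 'tech']})
#   wide_preprocessor = WidePreprocessor(wide_cols=['education', 'occupation'],
#                                        crossed_cols=[('education', 'occupation')])
#   X_wide = wide_preprocessor.fit_transform(df)
#   # X_wide has one column per category: 2 ('education') + 2 ('occupation')
#   # + 2 ('education_occupation' crosses) = 6 columns for these toy data.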


class DeepPreprocessor(object):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    embed_cols: List
        List containing the name of the columns that will be represented with
        embeddings or a Tuple with the name and the embedding dimension. e.g.:
         [('education',32), ('relationship',16)
    continuous_cols: List
        List with the name of the so called continuous cols
    scale: Bool
        Bool indicating whether or not to scale/Standarise continuous cols.
        Should "almost always" be True.
    default_embed_dim: Int, default=8
        Dimension for the embeddings used in the Deep-Dense model
    already_standard: List, Optional,
        List with the name of the continuous cols that do not need to be
        Standarised.

    Attributes
    ----------
    encoding_dict: Dict
        Dict with the categorical encoding
    embed_cols: List
        List with the columns that will be represented with embeddings
    embed_dim: Dict
        Dict where keys are the embed cols and values are the embed dimensions
    standardize_cols: List
        List of the columns that will be standarized
    deep_column_idx: Dict
        Dict where keys are column names and values are column indexes. This
        will be neccesary to slice tensors
    scaler: sklearn's StandardScaler

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10),
    ... ('occupation',10),('native_country',10)]
    >>> continuous_cols = ["age","hours_per_week"]
    >>> deep_preprocessor = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
    >>> X_deep = deep_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_deep = deep_preprocessor.transform(new_df)
    """
    def __init__(self,
        embed_cols:Optional[List[Union[str,Tuple[str,int]]]]=None,
        continuous_cols:Optional[List[str]]=None,
        scale:bool=True,
        default_embed_dim:int=8,
        already_standard:Optional[List[str]]=None):
        super(DeepPreprocessor, self).__init__()

        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols
        self.already_standard = already_standard
        self.scale = scale
        self.default_embed_dim = default_embed_dim

        assert (self.embed_cols is not None) or (self.continuous_cols is not None), \
            "'embed_cols' and 'continuous_cols' are 'None'. Please, define at least one of the two."

    def _prepare_embed(self, df:pd.DataFrame)->pd.DataFrame:
        if isinstance(self.embed_cols[0], tuple):
            self.embed_dim = dict(self.embed_cols) # type: ignore
            embed_colname = [emb[0] for emb in self.embed_cols]
        else:
            self.embed_dim = {e:self.default_embed_dim for e in self.embed_cols} # type: ignore
            embed_colname = self.embed_cols # type: ignore
        return df.copy()[embed_colname]

    def _prepare_continuous(self, df:pd.DataFrame)->pd.DataFrame:
        if self.scale:
            if self.already_standard is not None:
                self.standardize_cols = [c for c in self.continuous_cols if c not in self.already_standard]
            else: self.standardize_cols = self.continuous_cols
        return df.copy()[self.continuous_cols]

    def fit(self, df:pd.DataFrame):
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            _, self.encoding_dict = label_encoder(df_emb, cols=df_emb.columns.tolist())
            self.embeddings_input: List = []
            for k,v in self.encoding_dict.items():
                self.embeddings_input.append((k, len(v), self.embed_dim[k]))
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                df_std = df_cont[self.standardize_cols]
                self.scaler = StandardScaler().fit(df_std.values)
            else:
                warnings.warn('Continuous columns will not be normalised')
        return self

    def transform(self, df:pd.DataFrame)->np.ndarray:
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            df_emb, _ = label_encoder(df_emb, cols=df_emb.columns.tolist(),
                val_to_idx=self.encoding_dict)
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                try:
                    self.scaler.mean_
                except AttributeError:
                    raise NotFittedError("This DeepPreprocessor instance is not fitted yet. "
                        "Call 'fit' with appropriate arguments before using this estimator.")
                df_std = df_cont[self.standardize_cols]
                df_cont[self.standardize_cols] = self.scaler.transform(df_std.values)
        if self.embed_cols is not None and self.continuous_cols is not None:
            df_deep = pd.concat([df_emb, df_cont], axis=1)
        elif self.embed_cols is not None:
            df_deep = df_emb.copy()
        else:
            df_deep = df_cont.copy()
        self.deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
        return df_deep.values

    def fit_transform(self, df:pd.DataFrame)->np.ndarray:
        return self.fit(df).transform(df)
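
# A minimal usage sketch for DeepPreprocessor on hypothetical toy data
# (column names are illustrative; the exact attribute values shown assume
# label_encoder from ..utils.dense_utils maps each unique category to an
# integer index):
#
#   df = pd.DataFrame({'education': ['phd', 'bsc', 'phd'], 'age': [27, 32, 45]})
#   deep_preprocessor = DeepPreprocessor(embed_cols=[('education', 8)],
#                                        continuous_cols=['age'])
#   X_deep = deep_preprocessor.fit_transform(df)
#   deep_preprocessor.embeddings_input  # [('education', 2, 8)]: col, n_unique, dim
#   deep_preprocessor.deep_column_idx   # {'education': 0, 'age': 1}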


class TextPreprocessor(object):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    max_vocab: Int, default=30000
        Maximum number of token in the vocabulary
    min_freq: Int, default=5
        Minimum frequency for a token to be part of the vocabulary
    maxlen: Int, default=80
        Maximum length of the tokenized sequences
    word_vectors_path: Optional, str
        Path to the pretrained word vectors
    verbose: Int, Default 1
        Enable verbose output.

    Attributes
    ----------
    text_col: str
        column in the input pd.DataFrame containing the texts
    vocab: fastai Vocab object. See https://docs.fast.ai/text.transform.html#Vocab
        Vocab object containing the information of the vocabulary
    tokens: List
        List with Lists of str containing the tokenized texts
    embedding_matrix: np.ndarray
        Array with the pretrained embeddings

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> text_preprocessor = TextPreprocessor()
    >>> X_text = text_preprocessor.fit_transform(df, text_col)

    from there on

    From there on, for new data (loaded as a dataframe)
    >>> new_X_text = text_preprocessor.transform(new_df)
    """
    def __init__(self, max_vocab:int=30000, min_freq:int=5,
        maxlen:int=80, word_vectors_path:Optional[str]=None,
        verbose:int=1):
        super(TextPreprocessor, self).__init__()
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df:pd.DataFrame, text_col:str):
        self.text_col = text_col
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
        self.vocab = Vocab.create(tokens, max_vocab=self.max_vocab, min_freq=self.min_freq)
        return self

    def transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
        try:
            self.vocab
        except AttributeError:
            raise NotFittedError("This TextPreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator.")
        self.text_col = text_col
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
        padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
        if self.verbose:
            print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
        if self.word_vectors_path is not None:
            self.embedding_matrix = build_embeddings_matrix(self.vocab, self.word_vectors_path,
                self.min_freq)
        return padded_seq

    def fit_transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
        return self.fit(df, text_col).transform(df, text_col)
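
# A minimal usage sketch for TextPreprocessor, assuming a hypothetical
# DataFrame with a free-text column (names and texts are illustrative):
#
#   df = pd.DataFrame({'review': ['it was really good', 'truly awful stuff']})
#   text_preprocessor = TextPreprocessor(max_vocab=100, min_freq=1, maxlen=10)
#   X_text = text_preprocessor.fit_transform(df, 'review')
#   # X_text is an np.ndarray of shape (2, 10): each text is numericalised
#   # with the fastai Vocab and padded/truncated to maxlen tokens.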


class ImagePreprocessor(object):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    width: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    height: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    verbose: Int, Default 1
        Enable verbose output.

    Attributes
    ----------
    aap: Class, AspectAwarePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    spp: Class, SimplePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    img_col: str
        name of the column with the images filenames
    normalise_metrics: Dict
        Dict containing the normalisation metrics of the image dataset, i.e.
        mean and std for the R, G and B channels

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> image_preprocessor = ImagePreprocessor()
    >>> img_path = 'path/to/my_images'
    >>> X_images = image_preprocessor.fit_transform(df, img_col, img_path)

    from there on

    From there on, for new data (loaded as a dataframe)
    >>> next_X_images = image_preprocessor.transform(new_df)
    """
    def __init__(self, width:int=224, height:int=224, verbose:int=1):
        super(ImagePreprocessor, self).__init__()
        self.width = width
        self.height = height
        self.verbose = verbose

    def fit(self):
        self.aap = AspectAwarePreprocessor(self.width, self.height)
        self.spp = SimplePreprocessor(self.width, self.height)
        return self

    def transform(self, df:pd.DataFrame, img_col:str, img_path:str)->np.ndarray:
        try:
            self.aap
        except AttributeError:
            raise NotFittedError("This ImagePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator.")
        self.img_col = img_col
        image_list = df[self.img_col].tolist()
        if self.verbose: print('Reading Images from {}'.format(img_path))
        imgs = [cv2.imread("/".join([img_path,img])) for img in image_list]

        # finding images with different height and width
        aspect = [(im.shape[0], im.shape[1]) for im in imgs]
        aspect_r = [a[0]/a[1] for a in aspect]
        diff_idx = [i for i,r in enumerate(aspect_r) if r!=1.]

        if self.verbose: print('Resizing')
        resized_imgs = []
        for i,img in tqdm(enumerate(imgs), total=len(imgs), disable=self.verbose != 1):
            if i in diff_idx:
                resized_imgs.append(self.aap.preprocess(img))
            else:
                resized_imgs.append(self.spp.preprocess(img))

        if self.verbose: print('Computing normalisation metrics')
        mean_R, mean_G, mean_B = [], [], []
        std_R, std_G, std_B = [], [], []
        for rsz_img in resized_imgs:
            (mean_b, mean_g, mean_r), (std_b, std_g, std_r) = cv2.meanStdDev(rsz_img)
            mean_R.append(mean_r)
            mean_G.append(mean_g)
            mean_B.append(mean_b)
            std_R.append(std_r)
            std_G.append(std_g)
            std_B.append(std_b)
        self.normalise_metrics = dict(
            mean = {"R": np.mean(mean_R)/255., "G": np.mean(mean_G)/255., "B": np.mean(mean_B)/255.},
            std = {"R": np.mean(std_R)/255., "G": np.mean(std_G)/255., "B": np.mean(std_B)/255.}
            )
        return np.asarray(resized_imgs)

    def fit_transform(self, df:pd.DataFrame, img_col:str, img_path:str)->np.ndarray:
        return self.fit().transform(df, img_col, img_path)
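
# A minimal usage sketch for ImagePreprocessor, assuming a hypothetical
# directory of images and a DataFrame column holding their filenames:
#
#   df = pd.DataFrame({'img_name': ['img_1.jpg', 'img_2.jpg']})
#   image_preprocessor = ImagePreprocessor(width=224, height=224)
#   X_images = image_preprocessor.fit_transform(df, 'img_name', 'path/to/my_images')
#   # X_images is an np.ndarray of shape (2, 224, 224, 3). Per-channel mean and
#   # std (scaled to [0, 1]) are stored in image_preprocessor.normalise_metrics.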