import numpy as np
import pandas as pd
import warnings
import cv2

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
from scipy.sparse import csc_matrix
from tqdm import tqdm

from ..wdtypes import *
from ..utils.dense_utils import *
from ..utils.text_utils import *
from ..utils.fastai_transforms import *
from ..utils.image_utils import *


class WidePreprocessor(object):
    r"""Preprocessor to prepare the wide input dataset

    Parameters
    ----------
    wide_cols: List
        List with the name of the columns that will be one-hot encoded and
        pass through the Wide model
    crossed_cols: List
        List of Tuples with the name of the columns that will be "crossed"
        and then one-hot encoded. e.g. (['education', 'occupation'], ...)
    already_dummies: List
        List of columns that are already dummies/one-hot encoded

    Attributes
    ----------
    one_hot_enc: sklearn's OneHotEncoder
    wide_crossed_cols: List
        List with the names of all columns that will be one-hot encoded

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> wide_cols = ['age_buckets', 'education', 'relationship','workclass','occupation',
    ... 'native_country','gender']
    >>> crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]
    >>> wide_preprocessor = WidePreprocessor(wide_cols=wide_cols, crossed_cols=crossed_cols)
    >>> X_wide = wide_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_wide = wide_preprocessor.transform(new_df)
    """
    def __init__(self, wide_cols:List[str],
        crossed_cols:Optional[List[Tuple[str,str]]]=None,
        already_dummies:Optional[List[str]]=None, sparse:bool=False):
        super(WidePreprocessor, self).__init__()
        self.wide_cols = wide_cols
        self.crossed_cols = crossed_cols
        self.already_dummies = already_dummies
        self.one_hot_enc = OneHotEncoder(sparse=sparse)

    def _cross_cols(self, df:pd.DataFrame):
        crossed_colnames = []
        for cols in self.crossed_cols:
            cols = list(cols)
            for c in cols: df[c] = df[c].astype('str')
            colname = '_'.join(cols)
            df[colname] = df[cols].apply(lambda x: '-'.join(x), axis=1)
            crossed_colnames.append(colname)
        return df, crossed_colnames
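
    # Illustration of what _cross_cols returns, using hypothetical two-row
    # data (column names and values below are purely illustrative):
    #
    #   df = pd.DataFrame({'education': ['phd', 'bsc'],
    #                      'occupation': ['tech', 'sales']})
    #   with crossed_cols=[('education', 'occupation')] the method adds a
    #   column 'education_occupation' = ['phd-tech', 'bsc-sales'] and returns
    #   (df, ['education_occupation']).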

    def fit(self, df:pd.DataFrame):
        df_wide = df.copy()[self.wide_cols]
        if self.crossed_cols is not None:
            df_wide, crossed_colnames = self._cross_cols(df_wide)
            self.wide_crossed_cols = self.wide_cols + crossed_colnames
        else:
            self.wide_crossed_cols = self.wide_cols

        if self.already_dummies:
            dummy_cols = [c for c in self.wide_crossed_cols if c not in self.already_dummies]
            self.one_hot_enc.fit(df_wide[dummy_cols])
        else:
            self.one_hot_enc.fit(df_wide[self.wide_crossed_cols])
        return self

    def transform(self, df:pd.DataFrame) -> Union[sparse_matrix, np.ndarray]:
        try:
            self.one_hot_enc.categories_
        except AttributeError:
            raise NotFittedError("This WidePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator.")
        df_wide = df.copy()[self.wide_cols]
        if self.crossed_cols is not None:
            df_wide, _ = self._cross_cols(df_wide)
        if self.already_dummies:
            X_oh_1 = df_wide[self.already_dummies].values
            dummy_cols = [c for c in self.wide_crossed_cols if c not in self.already_dummies]
            X_oh_2 = self.one_hot_enc.transform(df_wide[dummy_cols])
            return np.hstack((X_oh_1, X_oh_2))
        else:
            return self.one_hot_enc.transform(df_wide[self.wide_crossed_cols])

    def fit_transform(self, df:pd.DataFrame)->Union[sparse_matrix, np.ndarray]:
        return self.fit(df).transform(df)
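
# A minimal, self-contained usage sketch for WidePreprocessor. The toy
# DataFrame below is hypothetical (column names and values are illustrative):
#
#   df = pd.DataFrame({'education': ['phd', 'bsc', 'phd'],
#                      'occupation': ['tech', 'sales', 'tech']})
#   wide_preprocessor = WidePreprocessor(wide_cols=['education', 'occupation'],
#                                        crossed_cols=[('education', 'occupation')])
#   X_wide = wide_preprocessor.fit_transform(df)
#   # X_wide has one column per category: 2 ('education') + 2 ('occupation')
#   # + 2 ('education_occupation' crosses) = 6 columns for these toy data.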


class DeepPreprocessor(object):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    embed_cols: List
        List containing the name of the columns that will be represented with
        embeddings or a Tuple with the name and the embedding dimension. e.g.:
         [('education',32), ('relationship',16)
    continuous_cols: List
        List with the name of the so called continuous cols
    scale: Bool
        Bool indicating whether or not to scale/Standarise continuous cols.
        Should "almost always" be True.
    default_embed_dim: Int, default=8
        Dimension for the embeddings used in the Deep-Dense model
    already_standard: List, Optional,
        List with the name of the continuous cols that do not need to be
        Standarised.

    Attributes
    ----------
    encoding_dict: Dict
        Dict with the categorical encoding
    embed_cols: List
        List with the columns that will be represented with embeddings
    embed_dim: Dict
        Dict where keys are the embed cols and values are the embed dimensions
    standardize_cols: List
        List of the columns that will be standarized
    deep_column_idx: Dict
        Dict where keys are column names and values are column indexes. This
        will be neccesary to slice tensors
    scaler: sklearn's StandardScaler

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> cat_embed_cols = [('education',10), ('relationship',8), ('workclass',10),
    ... ('occupation',10),('native_country',10)]
    >>> continuous_cols = ["age","hours_per_week"]
    >>> deep_preprocessor = DeepPreprocessor(embed_cols=cat_embed_cols, continuous_cols=continuous_cols)
    >>> X_deep = deep_preprocessor.fit_transform(df)

    From there on, for new data (loaded as a dataframe)
    >>> new_X_deep = deep_preprocessor.transform(new_df)
    """
    def __init__(self,
        embed_cols:Optional[List[Union[str,Tuple[str,int]]]]=None,
        continuous_cols:Optional[List[str]]=None,
        scale:bool=True,
        default_embed_dim:int=8,
        already_standard:Optional[List[str]]=None):
        super(DeepPreprocessor, self).__init__()

        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols
        self.already_standard = already_standard
        self.scale = scale
        self.default_embed_dim = default_embed_dim

        assert (self.embed_cols is not None) or (self.continuous_cols is not None), \
            "'embed_cols' and 'continuous_cols' are 'None'. Please, define at least one of the two."

    def _prepare_embed(self, df:pd.DataFrame)->pd.DataFrame:
        if isinstance(self.embed_cols[0], tuple):
            self.embed_dim = dict(self.embed_cols) # type: ignore
            embed_colname = [emb[0] for emb in self.embed_cols]
        else:
            self.embed_dim = {e:self.default_embed_dim for e in self.embed_cols} # type: ignore
            embed_colname = self.embed_cols # type: ignore
        return df.copy()[embed_colname]

    def _prepare_continuous(self, df:pd.DataFrame)->pd.DataFrame:
        if self.scale:
            if self.already_standard is not None:
                self.standardize_cols = [c for c in self.continuous_cols if c not in self.already_standard]
            else: self.standardize_cols = self.continuous_cols
        return df.copy()[self.continuous_cols]

    def fit(self, df:pd.DataFrame):
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            _, self.encoding_dict = label_encoder(df_emb, cols=df_emb.columns.tolist())
            self.embeddings_input: List = []
            for k,v in self.encoding_dict.items():
                self.embeddings_input.append((k, len(v), self.embed_dim[k]))
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                df_std = df_cont[self.standardize_cols]
                self.scaler = StandardScaler().fit(df_std.values)
            else:
                warnings.warn('Continuous columns will not be normalised')
        return self

    def transform(self, df:pd.DataFrame)->np.ndarray:
        if self.embed_cols is not None:
            df_emb = self._prepare_embed(df)
            df_emb, _ = label_encoder(df_emb, cols=df_emb.columns.tolist(),
                val_to_idx=self.encoding_dict)
        if self.continuous_cols is not None:
            df_cont = self._prepare_continuous(df)
            if self.scale:
                try:
                    self.scaler.mean_
                except AttributeError:
                    raise NotFittedError("This DeepPreprocessor instance is not fitted yet. "
                        "Call 'fit' with appropriate arguments before using this estimator.")
                df_std = df_cont[self.standardize_cols]
                df_cont[self.standardize_cols] = self.scaler.transform(df_std.values)
        if self.embed_cols is not None and self.continuous_cols is not None:
            df_deep = pd.concat([df_emb, df_cont], axis=1)
        elif self.embed_cols is not None:
            df_deep = df_emb.copy()
        else:
            df_deep = df_cont.copy()
        self.deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
        return df_deep.values

    def fit_transform(self, df:pd.DataFrame)->np.ndarray:
        return self.fit(df).transform(df)
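
# A minimal usage sketch for DeepPreprocessor on hypothetical toy data
# (column names are illustrative; the exact attribute values shown assume
# label_encoder from ..utils.dense_utils maps each unique category to an
# integer index):
#
#   df = pd.DataFrame({'education': ['phd', 'bsc', 'phd'], 'age': [27, 32, 45]})
#   deep_preprocessor = DeepPreprocessor(embed_cols=[('education', 8)],
#                                        continuous_cols=['age'])
#   X_deep = deep_preprocessor.fit_transform(df)
#   deep_preprocessor.embeddings_input  # [('education', 2, 8)]: col, n_unique, dim
#   deep_preprocessor.deep_column_idx   # {'education': 0, 'age': 1}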


class TextPreprocessor(object):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    max_vocab: Int, default=30000
        Maximum number of token in the vocabulary
    min_freq: Int, default=5
        Minimum frequency for a token to be part of the vocabulary
    maxlen: Int, default=80
        Maximum length of the tokenized sequences
    word_vectors_path: Optional, str
        Path to the pretrained word vectors
    verbose: Int, Default 1
        Enable verbose output.

    Attributes
    ----------
    text_col: str
        column in the input pd.DataFrame containing the texts
    vocab: fastai Vocab object. See https://docs.fast.ai/text.transform.html#Vocab
        Vocab object containing the information of the vocabulary
    tokens: List
        List with Lists of str containing the tokenized texts
    embedding_matrix: np.ndarray
        Array with the pretrained embeddings

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> text_preprocessor = TextPreprocessor()
    >>> X_text = text_preprocessor.fit_transform(df, text_col)

    from there on

    From there on, for new data (loaded as a dataframe)
    >>> new_X_text = text_preprocessor.transform(new_df)
    """
    def __init__(self, max_vocab:int=30000, min_freq:int=5,
        maxlen:int=80, word_vectors_path:Optional[str]=None,
        verbose:int=1):
        super(TextPreprocessor, self).__init__()
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df:pd.DataFrame, text_col:str):
        self.text_col = text_col
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
        self.vocab = Vocab.create(tokens, max_vocab=self.max_vocab, min_freq=self.min_freq)
        return self

    def transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
        try:
            self.vocab
        except AttributeError:
            raise NotFittedError("This TextPreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator.")
        self.text_col = text_col
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
        padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
        if self.verbose:
            print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
        if self.word_vectors_path is not None:
            self.embedding_matrix = build_embeddings_matrix(self.vocab, self.word_vectors_path,
                self.min_freq)
        return padded_seq

    def fit_transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
        return self.fit(df, text_col).transform(df, text_col)
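
# A minimal usage sketch for TextPreprocessor, assuming a hypothetical
# DataFrame with a free-text column (names and texts are illustrative):
#
#   df = pd.DataFrame({'review': ['it was really good', 'truly awful stuff']})
#   text_preprocessor = TextPreprocessor(max_vocab=100, min_freq=1, maxlen=10)
#   X_text = text_preprocessor.fit_transform(df, 'review')
#   # X_text is an np.ndarray of shape (2, 10): each text is numericalised
#   # with the fastai Vocab and padded/truncated to maxlen tokens.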


class ImagePreprocessor(object):
    r"""Preprocessor to prepare the deepdense input dataset

    Parameters
    ----------
    width: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    height: Int, default=224
        width of the resulting processed image. 224 because the default
        architecture used by WideDeep is ResNet
    verbose: Int, Default 1
        Enable verbose output.

    Attributes
    ----------
    aap: Class, AspectAwarePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    spp: Class, SimplePreprocessor()
        Preprocessing tool taken from Adrian Rosebrock's book "Deep Learning
        for Computer Vision".
    img_col: str
        name of the column with the images filenames
    normalise_metrics: Dict
        Dict containing the normalisation metrics of the image dataset, i.e.
        mean and std for the R, G and B channels

    Example
    --------
    Assuming we have a dataset loaded in memory as a pd.DataFrame

    >>> image_preprocessor = ImagePreprocessor()
    >>> img_path = 'path/to/my_images'
    >>> X_images = image_preprocessor.fit_transform(df, img_col, img_path)

    from there on

    From there on, for new data (loaded as a dataframe)
    >>> next_X_images = image_preprocessor.transform(new_df)
    """
    def __init__(self, width:int=224, height:int=224, verbose:int=1):
        super(ImagePreprocessor, self).__init__()
        self.width = width
        self.height = height
        self.verbose = verbose

    def fit(self):
        self.aap = AspectAwarePreprocessor(self.width, self.height)
        self.spp = SimplePreprocessor(self.width, self.height)
        return self

    def transform(self, df:pd.DataFrame, img_col:str, img_path:str)->np.ndarray:
        try:
            self.aap
        except AttributeError:
            raise NotFittedError("This ImagePreprocessor instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator.")
        self.img_col = img_col
        image_list = df[self.img_col].tolist()
        if self.verbose: print('Reading Images from {}'.format(img_path))
        imgs = [cv2.imread("/".join([img_path,img])) for img in image_list]

        # finding images with different height and width
        aspect = [(im.shape[0], im.shape[1]) for im in imgs]
        aspect_r = [a[0]/a[1] for a in aspect]
        diff_idx = [i for i,r in enumerate(aspect_r) if r!=1.]

        if self.verbose: print('Resizing')
        resized_imgs = []
        for i,img in tqdm(enumerate(imgs), total=len(imgs), disable=self.verbose != 1):
            if i in diff_idx:
                resized_imgs.append(self.aap.preprocess(img))
            else:
                resized_imgs.append(self.spp.preprocess(img))

        if self.verbose: print('Computing normalisation metrics')
        mean_R, mean_G, mean_B = [], [], []
        std_R, std_G, std_B = [], [], []
        for rsz_img in resized_imgs:
            (mean_b, mean_g, mean_r), (std_b, std_g, std_r) = cv2.meanStdDev(rsz_img)
            mean_R.append(mean_r)
            mean_G.append(mean_g)
            mean_B.append(mean_b)
            std_R.append(std_r)
            std_G.append(std_g)
            std_B.append(std_b)
        self.normalise_metrics = dict(
            mean = {"R": np.mean(mean_R)/255., "G": np.mean(mean_G)/255., "B": np.mean(mean_B)/255.},
            std = {"R": np.mean(std_R)/255., "G": np.mean(std_G)/255., "B": np.mean(std_B)/255.}
            )
        return np.asarray(resized_imgs)

    def fit_transform(self, df:pd.DataFrame, img_col:str, img_path:str)->np.ndarray:
        return self.fit().transform(df, img_col, img_path)
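
# A minimal usage sketch for ImagePreprocessor, assuming a hypothetical
# directory of images and a DataFrame column holding their filenames:
#
#   df = pd.DataFrame({'img_name': ['img_1.jpg', 'img_2.jpg']})
#   image_preprocessor = ImagePreprocessor(width=224, height=224)
#   X_images = image_preprocessor.fit_transform(df, 'img_name', 'path/to/my_images')
#   # X_images is an np.ndarray of shape (2, 224, 224, 3). Per-channel mean and
#   # std (scaled to [0, 1]) are stored in image_preprocessor.normalise_metrics.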