import numpy as np
import pandas as pd

from pytorch_widedeep.wdtypes import *  # noqa: F403
from pytorch_widedeep.utils.text_utils import (
    get_texts,
    pad_sequences,
    build_embeddings_matrix,
)
from pytorch_widedeep.utils.fastai_transforms import Vocab
from pytorch_widedeep.preprocessing.base_preprocessor import (
    BasePreprocessor,
    check_is_fitted,
)


class TextPreprocessor(BasePreprocessor):
    def __init__(
        self,
        text_col: str,
        max_vocab: int = 30000,
        min_freq: int = 5,
        maxlen: int = 80,
        pad_first: bool = True,
        pad_idx: int = 1,
        word_vectors_path: Optional[str] = None,
        verbose: int = 1,
    ):
        r"""Preprocessor to prepare the ``deeptext`` input dataset

        Parameters
        ----------
        text_col: str
            column in the input dataframe containing the texts
        max_vocab: int, default=30000
            Maximum number of tokens in the vocabulary
        min_freq: int, default=5
            Minimum frequency for a token to be part of the vocabulary
        maxlen: int, default=80
            Maximum length of the tokenized sequences
        pad_first: bool, default=True
            Indicates whether the padding index will be added at the beginning or the
            end of the sequences
        pad_idx: int, default=1
            Padding index. Fastai's ``Tokenizer`` reserves 0 for the 'unknown' token.
        word_vectors_path: str, Optional
            Path to the pretrained word vectors
        verbose: int, default=1
            Enable verbose output.

        Attributes
        ----------
        vocab: Vocab
            an instance of :class:`pytorch_widedeep.utils.fastai_transforms.Vocab`
        tokens: List
            List of lists of str with the tokenized texts
        embedding_matrix: np.ndarray
            Array with the pretrained embeddings

        Examples
        --------
        >>> import pandas as pd
        >>> from pytorch_widedeep.preprocessing import TextPreprocessor
        >>> df_train = pd.DataFrame({'text_column': ["life is like a box of chocolates",
        ... "You never know what you're gonna get"]})
        >>> text_preprocessor = TextPreprocessor(text_col='text_column', max_vocab=25, min_freq=1, maxlen=10)
        >>> text_preprocessor.fit_transform(df_train)
        The vocabulary contains 24 tokens
        array([[ 1,  1,  1,  1, 10, 11, 12, 13, 14, 15],
               [ 5,  9, 16, 17, 18,  9, 19, 20, 21, 22]], dtype=int32)
        >>> df_te = pd.DataFrame({'text_column': ['you never know what is in the box']})
        >>> text_preprocessor.transform(df_te)
        array([[ 1,  1,  9, 16, 17, 18, 11,  0,  0, 13]], dtype=int32)
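
        If pretrained word vectors are available, their path can be passed at
        instantiation and ``fit`` will also build the ``embedding_matrix``
        attribute. A minimal sketch (the path below is hypothetical)::

            text_preprocessor = TextPreprocessor(
                text_col='text_column',
                word_vectors_path='path/to/glove.6B.100d.txt',
            )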
        """
        super(TextPreprocessor, self).__init__()

        self.text_col = text_col
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.pad_first = pad_first
        self.pad_idx = pad_idx
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df: pd.DataFrame) -> BasePreprocessor:
        """Builds the vocabulary"""
        texts = df[self.text_col].tolist()
        tokens = get_texts(texts)
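        # build the vocabulary, keeping at most max_vocab tokens and only
        # those appearing at least min_freq times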
        self.vocab = Vocab.create(
            tokens, max_vocab=self.max_vocab, min_freq=self.min_freq
        )
        if self.verbose:
            print("The vocabulary contains {} tokens".format(len(self.vocab.stoi)))
        if self.word_vectors_path is not None:
            self.embedding_matrix = build_embeddings_matrix(
                self.vocab, self.word_vectors_path, self.min_freq
            )
        return self

    def transform(self, df: pd.DataFrame) -> np.ndarray:
        """Returns the padded, `numericalised` sequences"""
        check_is_fitted(self, attributes=["vocab"])
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
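        # numericalise the tokenized texts; tokens unseen during fit are
        # mapped to the 'unknown' index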
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
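        # truncate or pad each sequence to maxlen, padding at the beginning
        # or at the end depending on pad_first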
        padded_seq = np.array(
            [
                pad_sequences(
                    s,
                    maxlen=self.maxlen,
                    pad_first=self.pad_first,
                    pad_idx=self.pad_idx,
                )
                for s in sequences
            ]
        )
        return padded_seq

    def fit_transform(self, df: pd.DataFrame) -> np.ndarray:
        """Combines ``fit`` and ``transform``"""
        return self.fit(df).transform(df)

    def inverse_transform(self, padded_seq: np.ndarray) -> pd.DataFrame:
        """Returns the original text plus the added 'special' tokens"""
        texts = [self.vocab.textify(num) for num in padded_seq]
        return pd.DataFrame({self.text_col: texts})
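

if __name__ == "__main__":
    # Illustrative usage sketch (not part of the library's API): fit a
    # vocabulary on a toy dataframe, numericalise and pad the texts, and map
    # the padded sequences back to (tokenized) text.
    df = pd.DataFrame(
        {
            "text_column": [
                "life is like a box of chocolates",
                "You never know what you're gonna get",
            ]
        }
    )
    text_preprocessor = TextPreprocessor(
        text_col="text_column", max_vocab=25, min_freq=1, maxlen=10
    )
    padded_seq = text_preprocessor.fit_transform(df)
    print(padded_seq)
    print(text_preprocessor.inverse_transform(padded_seq))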