import os
from typing import List, Optional

import numpy as np
from gensim.utils import tokenize

from pytorch_widedeep.utils.fastai_transforms import Vocab, Tokenizer

__all__ = ["simple_preprocess", "get_texts", "pad_sequences", "build_embeddings_matrix"]


def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    This is `Gensim`'s `simple_preprocess` with a `lower` param to
    indicate whether or not to lower case all the tokens in the doc

    For more information see: `Gensim` [utils module](https://radimrehurek.com/gensim/utils.html)

    Parameters
    ----------
    doc: str
        Input document.
    lower: bool, default = False
        Lower case tokens in the input doc
    deacc: bool, default = False
        Remove accent marks from tokens using `Gensim`'s `deaccent`
    min_len: int, default = 2
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len: int, default = 15
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Examples
    --------
    >>> from pytorch_widedeep.utils import simple_preprocess
    >>> simple_preprocess('Machine learning is great')
    ['Machine', 'learning', 'is', 'great']

    Returns
    -------
    List[str]
        List with the processed tokens
    """
    tokens = [
        token
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
    return tokens


def get_texts(
    texts: List[str],
    already_processed: Optional[bool] = False,
    n_cpus: Optional[int] = None,
) -> List[List[str]]:
    r"""Tokenization using `Fastai`'s `Tokenizer` because it does a
    series of very convenient things during the tokenization process

    See `pytorch_widedeep.utils.fastai_transforms.Tokenizer`

    Parameters
    ----------
    texts: List
        List of str with the texts (or documents). One str per document
    already_processed: bool, Optional, default = False
        Boolean indicating if the text is already processed and we simply
        want to tokenize it
    n_cpus: int, Optional, default = None
        Number of CPUs to use during the tokenization process

    Examples
    --------
    >>> from pytorch_widedeep.utils import get_texts
    >>> texts = ['Machine learning is great', 'but building stuff is even better']
    >>> get_texts(texts)
    [['xxmaj', 'machine', 'learning', 'is', 'great'], ['but', 'building', 'stuff', 'is', 'even', 'better']]

    Returns
    -------
    List[List[str]]
        List of lists, one list per document containing its corresponding tokens

    :information_source: **NOTE**:
    `get_texts` uses `pytorch_widedeep.utils.fastai_transforms.Tokenizer`.
    Such tokenizer uses a series of convenient processing steps, including
    the addition of some special tokens, such as `TK_MAJ` (`xxmaj`), used to
    indicate the next word begins with a capital in the original text. For more
    details on special tokens please see the [fastai docs](https://docs.fast.ai/text.core.html#Tokenizing)
    """

    num_cpus = n_cpus if n_cpus is not None else os.cpu_count()

    if not already_processed:
        processed_texts = [" ".join(simple_preprocess(t)) for t in texts]
    else:
        processed_texts = texts
    tok = Tokenizer(n_cpus=num_cpus).process_all(processed_texts)
    return tok


def pad_sequences(
    seq: List[int], maxlen: int, pad_first: bool = True, pad_idx: int = 1
) -> np.ndarray:
    r"""
    Given a tokenized and `numericalised` sequence it will return the padded
    sequence according to the input parameters.

    Parameters
    ----------
    seq: List
        List of int with the `numericalised` tokens
    maxlen: int
        Maximum length of the padded sequences
    pad_first: bool, default = True
        Indicates whether the padding index will be added at the beginning or the
        end of the sequences
    pad_idx: int, default = 1
        Padding index. Fastai's `Tokenizer` leaves 0 for the 'unknown' token.

    Examples
    --------
    >>> from pytorch_widedeep.utils import pad_sequences
    >>> seq = [1,2,3]
    >>> pad_sequences(seq, maxlen=5, pad_idx=0)
    array([0, 0, 1, 2, 3], dtype=int32)

    Returns
    -------
    np.ndarray
        numpy array with the padded sequences
    """
    if len(seq) == 0:
        return np.zeros(maxlen, dtype="int32") + pad_idx
    elif len(seq) >= maxlen:
        res = np.array(seq[-maxlen:]).astype("int32")
        return res
    else:
        res = np.zeros(maxlen, dtype="int32") + pad_idx
        if pad_first:
            res[-len(seq) :] = seq
        else:
            res[: len(seq)] = seq
        return res


def build_embeddings_matrix(
    vocab: Vocab, word_vectors_path: str, min_freq: int, verbose: int = 1
) -> np.ndarray:  # pragma: no cover
    r"""Build the embedding matrix using pretrained word vectors.

    Returns pretrained word embeddings. If a word in our vocabulary is not
    among the pretrained embeddings it will be assigned the mean pretrained
    word-embeddings vector.

    Parameters
    ----------
    vocab: Vocab
        See `pytorch_widedeep.utils.fastai_transforms.Vocab`
    word_vectors_path: str
        Path to the pretrained word embeddings
    min_freq: int
        Minimum frequency required for a word to be in the vocabulary
    verbose: int, default=1
        Level of verbosity. Set to 0 for no verbosity

    Returns
    -------
    np.ndarray
        Pretrained word embeddings
    """
    if not os.path.isfile(word_vectors_path):
        raise FileNotFoundError("{} not found".format(word_vectors_path))
    if verbose:
        print("Indexing word vectors...")

    embeddings_index = {}
    with open(word_vectors_path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    if verbose:
        print("Loaded {} word vectors".format(len(embeddings_index)))
        print("Preparing embeddings matrix...")

    mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)  # type: ignore[arg-type]
    embedding_dim = len(list(embeddings_index.values())[0])
    num_words = len(vocab.itos)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    found_words = 0
    for i, word in enumerate(vocab.itos):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            found_words += 1
        else:
            embedding_matrix[i] = mean_word_vector

    if verbose:
        print(
            "{} words in the vocabulary had {} vectors and appear more than {} times".format(
                found_words, word_vectors_path, min_freq
            )
        )

    return embedding_matrix.astype("float32")
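

# Illustrative usage sketch: it shows how the helpers above are typically
# chained, i.e. tokenize the documents, build a vocabulary, pad the
# numericalised sequences and, optionally, build the pretrained embeddings
# matrix. It assumes the fastai-style ``Vocab.create`` / ``numericalize`` API
# imported above, and "glove.6B.100d.txt" is a placeholder path, not a file
# shipped with the package.
if __name__ == "__main__":  # pragma: no cover
    docs = ["Machine learning is great", "but building stuff is even better"]

    tokens = get_texts(docs)
    vocab = Vocab.create(tokens, max_vocab=1000, min_freq=1)

    # numericalise each document and pad it to a fixed length of 10 tokens
    padded = np.array(
        [pad_sequences(vocab.numericalize(t), maxlen=10) for t in tokens]
    )
    print(padded.shape)  # (2, 10)

    # placeholder path: point it to a real pretrained word-vectors file
    word_vectors_path = "glove.6B.100d.txt"
    if os.path.isfile(word_vectors_path):
        embedding_matrix = build_embeddings_matrix(
            vocab, word_vectors_path, min_freq=1
        )
        print(embedding_matrix.shape)  # (len(vocab.itos), embedding_dim)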