# text_utils.py
import os

import numpy as np
from gensim.utils import tokenize

from ..wdtypes import *
from .fastai_transforms import Vocab, Tokenizer

__all__ = [
    "simple_preprocess",
    "get_texts",
    "pad_sequences",
    "build_embeddings_matrix",
]


def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    ``Gensim``'s ``simple_preprocess`` adding a ``lower`` param to indicate whether or not to
    lower case all the tokens in the doc

    For more information see: ``Gensim`` `utils module <https://radimrehurek.com/gensim/utils.html>`_.

    Parameters
    ----------
    doc: str
        Input document.
    lower: bool, Default = False
        Lower case tokens in the input doc
    deacc: bool, Default = False
        Remove accent marks from tokens using ``Gensim``'s ``deaccent()``
    min_len: int, Default = 2
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len: int, Default = 15
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Returns
    -------
    tokens: List
        List of tokens for a given doc
    """
    tokens = [
        token
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
    return tokens
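

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# The exact tokens depend on gensim's ``tokenize``, but with the code above a
# call such as:
#
#   simple_preprocess("Machine Learning is FUN!", lower=True)
#
# should return something like ['machine', 'learning', 'is', 'fun'] (tokens
# shorter than ``min_len`` or longer than ``max_len`` are dropped).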


def get_texts(texts: List[str]) -> List[List[str]]:
    r"""Tokenization using ``Fastai``'s ``Tokenizer`` because it does a series
    of very convenient things during the tokenization process

    See :class:`pytorch_widedeep.utils.fastai_utils.Tokenizer`

    Parameters
    ----------
    texts: List
        List of ``str`` with the texts (or documents). One ``str`` per document

    Returns
    -------
    tok: List
        List containing the tokens per text or document
    """
    processed_texts = [" ".join(simple_preprocess(t)) for t in texts]
    tok = Tokenizer().process_all(processed_texts)
    return tok
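

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# Fastai's ``Tokenizer`` may add its own special tokens (e.g. 'xxmaj' in front
# of capitalised words), so the exact output depends on its rules:
#
#   docs = ["This is the first document.", "And this is the second one."]
#   tokens = get_texts(docs)  # -> one list of string tokens per document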


def pad_sequences(
    seq: List[int], maxlen: int, pad_first: bool = True, pad_idx: int = 1
) -> np.ndarray:
    r"""
    Given a tokenized and `numericalised` sequence it will return the padded
    sequence according to the input parameters

    Parameters
    ----------
    seq: List
        List of ``int`` with the `numericalised` tokens
    maxlen: int
        Maximum length of the padded sequence
    pad_first: bool, Default = True
        Indicates whether the padding index will be added at the beginning or the
        end of the sequence
    pad_idx: int, Default = 1
        padding index. ``Fastai``'s ``Tokenizer`` leaves 0 for the 'unknown' token.

    Returns
    -------
    res: np.ndarray
        The padded sequence
    """
    if len(seq) >= maxlen:
        res = np.array(seq[-maxlen:]).astype("int32")
        return res
    else:
        res = np.zeros(maxlen, dtype="int32") + pad_idx
        if pad_first:
            res[-len(seq) :] = seq
        else:
            res[: len(seq)] = seq
        return res
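

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# With the defaults the padding index (1) is prepended, and sequences longer
# than ``maxlen`` keep their last ``maxlen`` tokens:
#
#   pad_sequences([5, 7, 9], maxlen=5)  # -> array([1, 1, 5, 7, 9], dtype=int32)
#   pad_sequences([5, 7, 9], maxlen=2)  # -> array([7, 9], dtype=int32)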


def build_embeddings_matrix(
    vocab: Vocab, word_vectors_path: str, min_freq: int, verbose: int = 1
) -> np.ndarray:
    r"""
    Build the embedding matrix using pretrained word vectors

    Parameters
    ----------
    vocab: Instance of Fastai's ``Vocab``
        see :class:`pytorch_widedeep.utils.fastai_utils.Vocab`
    word_vectors_path: str
        path to the pretrained word embeddings
    min_freq: int
        minimum frequency required for a word to be in the vocabulary
    verbose: int, Default = 1
        ``int`` indicating verbosity. Set to 0 for no verbosity

    Returns
    -------
    embedding_matrix: np.ndarray
        pretrained word embeddings. If a word in our vocabulary is not among the
        pretrained embeddings it will be assigned the mean pretrained
        word-embeddings vector
    """
    if not os.path.isfile(word_vectors_path):
        raise FileNotFoundError("{} not found".format(word_vectors_path))
    if verbose:
        print("Indexing word vectors...")

    embeddings_index = {}
    with open(word_vectors_path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    if verbose:
        print("Loaded {} word vectors".format(len(embeddings_index)))
        print("Preparing embeddings matrix...")

    mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)
    embedding_dim = len(list(embeddings_index.values())[0])
    num_words = len(vocab.itos)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    found_words = 0
    for i, word in enumerate(vocab.itos):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            found_words += 1
        else:
            embedding_matrix[i] = mean_word_vector

    if verbose:
        print(
            "{} words in the vocabulary had {} vectors and appear more than {} times".format(
                found_words, word_vectors_path, min_freq
            )
        )

    return embedding_matrix.astype("float32")
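

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# The path below is hypothetical and ``vocab`` is assumed to be a previously
# fitted Fastai ``Vocab`` instance:
#
#   embedding_matrix = build_embeddings_matrix(
#       vocab, word_vectors_path="glove.6B.100d.txt", min_freq=5
#   )
#   embedding_matrix.shape  # -> (len(vocab.itos), 100) for 100-dim vectors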