text_utils.py
import numpy as np
import os

from gensim.utils import tokenize

from ..wdtypes import *
from .fastai_transforms import Tokenizer, Vocab


__all__ = ["simple_preprocess", "get_texts", "pad_sequences", "build_embeddings_matrix"]


def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    Gensim's simple_preprocess adding a 'lower' param to indicate wether or not to
    lower case all the token in the texts

    For more informations see: https://radimrehurek.com/gensim/utils.html
    """
    tokens = [
        token
        for token in tokenize(doc, lower=False, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
    return tokens
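
# Illustrative usage only; the example sentence below is made up, not taken from
# the library's docs. With lower=True tokens are lowercased, and tokens shorter
# than min_len, longer than max_len or starting with '_' are dropped:
#
#   simple_preprocess("Wide _and_ Deep models are FUN!", lower=True)
#   # -> ['wide', 'deep', 'models', 'are', 'fun']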


def get_texts(texts: List[str]) -> List[List[str]]:
    r"""
    Uses fastai's Tokenizer because it does a series of very convenients things
    during the tokenization process

    See here: https://docs.fast.ai/text.transform.html#Tokenizer
    """
    processed_textx = [" ".join(simple_preprocess(t)) for t in texts]
    tok = Tokenizer().process_all(processed_textx)
    return tok
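
# Illustrative usage only; the example texts are made up. Note that fastai's
# Tokenizer adds its own special tokens (e.g. 'xxmaj' in front of capitalised
# words), so the output is not just a whitespace split:
#
#   get_texts(["This is the first document", "And this is the second one"])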


def pad_sequences(
    seq: List[int], maxlen: int, pad_first: bool = True, pad_idx: int = 1
) -> List[int]:
    r"""
    Given a tokenized and 'numericalised' sequence, return a padded sequence
    according to the input parameters maxlen, pad_first and pad_idx

    Parameters
    ----------
    seq: List
        List of int tokens
    maxlen: Int
        Maximum length of the padded sequence
    pad_first: Boolean. Default=True
        Indicates whether the padding index will be added at the beginning or the
        end of the sequence
    pad_idx: Int. Default=1
        padding index. Fastai's Tokenizer leaves 0 for the 'unknown' token.

    Returns
    -------
    res: List
        Padded sequence
    """
    if len(seq) >= maxlen:
        res = np.array(seq[-maxlen:]).astype("int32")
        return res
    else:
        res = np.zeros(maxlen, dtype="int32") + pad_idx
        if pad_first:
            res[-len(seq) :] = seq
        else:
            res[: len(seq)] = seq
        return res
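
# Illustrative behaviour; the token ids below are made up:
#
#   pad_sequences([5, 7, 9], maxlen=5)                   # -> array([1, 1, 5, 7, 9], dtype=int32)
#   pad_sequences([5, 7, 9], maxlen=5, pad_first=False)  # -> array([5, 7, 9, 1, 1], dtype=int32)
#   pad_sequences([5, 7, 9, 11, 13], maxlen=3)           # -> array([ 9, 11, 13], dtype=int32)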


def build_embeddings_matrix(
    vocab: Vocab, word_vectors_path: str, min_freq: int, verbose: int = 1
) -> np.ndarray:
    r"""
    Build the embedding matrix using pretrained word vectors

    Parameters
    ----------
    vocab: Fastai's Vocab object
        see: https://docs.fast.ai/text.transform.html#Vocab
    word_vectors_path:str
        path to the pretrained word embeddings
    min_freq: Int
        minimum frequency required for a word to be in the vocabulary
    verbose: Int. Default=1

    Returns
    -------
    embedding_matrix: np.ndarray
        pretrained word embeddings. If a word in our vocabulary is not among the
        pretrained embeddings it will be assigned the mean pretrained
        word-embeddings vector
    """
    if not os.path.isfile(word_vectors_path):
        raise FileNotFoundError("{} not found".format(word_vectors_path))
    if verbose:
        print("Indexing word vectors...")

    embeddings_index = {}
    with open(word_vectors_path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    if verbose:
        print("Loaded {} word vectors".format(len(embeddings_index)))
        print("Preparing embeddings matrix...")

    mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)
    embedding_dim = len(list(embeddings_index.values())[0])
    num_words = len(vocab.itos)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    found_words = 0
    for i, word in enumerate(vocab.itos):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            found_words += 1
        else:
            embedding_matrix[i] = mean_word_vector

    if verbose:
        print(
            "{} words in the vocabulary had pretrained vectors in {} and appear more than {} times".format(
                found_words, word_vectors_path, min_freq
            )
        )

    return embedding_matrix
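
# Illustrative end-to-end sketch; the corpus, the word-vectors file name and the
# use of Vocab.create to build the vocabulary are assumptions, not part of this
# module:
#
#   texts = ["The first document", "And a second, longer document"]
#   tokens = get_texts(texts)
#   vocab = Vocab.create(tokens, max_vocab=30000, min_freq=3)
#   sequences = [vocab.numericalize(t) for t in tokens]
#   padded = np.vstack([pad_sequences(s, maxlen=10) for s in sequences])
#   embedding_matrix = build_embeddings_matrix(vocab, "glove.6B.100d.txt", min_freq=3)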