# text_utils.py
import os

import numpy as np
from gensim.utils import tokenize

from ..wdtypes import *
from .fastai_transforms import Vocab, Tokenizer

__all__ = [
    "simple_preprocess",
    "get_texts",
    "pad_sequences",
    "build_embeddings_matrix",
]


def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    ``Gensim``'s ``simple_preprocess`` adding a ``lower`` param to indicate whether or not to
    lower case all the tokens in the doc

    For more information see: ``Gensim`` `utils module <https://radimrehurek.com/gensim/utils.html>`_.

    Parameters
    ----------
    doc: str
        Input document.
    lower: bool, Default = False
        Lower case tokens in the input doc
    deacc: bool, Default = False
        Remove accent marks from tokens using ``Gensim``'s ``deaccent()``
    min_len: int, Default = 2
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len: int, Default = 15
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Returns
    -------
    tokens: List
        List of tokens for a given doc
    """
    tokens = [
        token
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
    return tokens
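

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# The exact tokens depend on gensim's ``tokenize``, but with the code above a
# call such as:
#
#   simple_preprocess("Machine Learning is FUN!", lower=True)
#
# should return something like ['machine', 'learning', 'is', 'fun'] (tokens
# shorter than ``min_len`` or longer than ``max_len`` are dropped).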


def get_texts(texts: List[str]) -> List[List[str]]:
    r"""Tokenization using ``Fastai``'s ``Tokenizer`` because it does a series
    of very convenient things during the tokenization process

    See :class:`pytorch_widedeep.utils.fastai_utils.Tokenizer`

    Parameters
    ----------
    texts: List
        List of ``str`` with the texts (or documents). One ``str`` per document

    Returns
    -------
    tok: List
        List containing the tokens per text or document
    """
    processed_texts = [" ".join(simple_preprocess(t)) for t in texts]
    tok = Tokenizer().process_all(processed_texts)
    return tok
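

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# Fastai's ``Tokenizer`` may add its own special tokens (e.g. 'xxmaj' in front
# of capitalised words), so the exact output depends on its rules:
#
#   docs = ["This is the first document.", "And this is the second one."]
#   tokens = get_texts(docs)  # -> one list of string tokens per document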


def pad_sequences(
    seq: List[int], maxlen: int, pad_first: bool = True, pad_idx: int = 1
) -> np.ndarray:
    r"""
    Given a tokenized and `numericalised` sequence it will return the padded
    sequence according to the input parameters

    Parameters
    ----------
    seq: List
        List of ``int`` with the `numericalised` tokens
    maxlen: int
        Maximum length of the padded sequence
    pad_first: bool, Default = True
        Indicates whether the padding index will be added at the beginning or the
        end of the sequence
    pad_idx: int, Default = 1
        padding index. ``Fastai``'s ``Tokenizer`` leaves 0 for the 'unknown' token.

    Returns
    -------
    res: np.ndarray
        The padded sequence
    """
    if len(seq) >= maxlen:
        res = np.array(seq[-maxlen:]).astype("int32")
        return res
    else:
        res = np.zeros(maxlen, dtype="int32") + pad_idx
        if pad_first:
            res[-len(seq) :] = seq
        else:
            res[: len(seq)] = seq
        return res
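

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# With the defaults the padding index (1) is prepended, and sequences longer
# than ``maxlen`` keep their last ``maxlen`` tokens:
#
#   pad_sequences([5, 7, 9], maxlen=5)  # -> array([1, 1, 5, 7, 9], dtype=int32)
#   pad_sequences([5, 7, 9], maxlen=2)  # -> array([7, 9], dtype=int32)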


def build_embeddings_matrix(
    vocab: Vocab, word_vectors_path: str, min_freq: int, verbose: int = 1
) -> np.ndarray:
    r"""
    Build the embedding matrix using pretrained word vectors

    Parameters
    ----------
    vocab: Instance of Fastai's ``Vocab``
        see :class:`pytorch_widedeep.utils.fastai_utils.Vocab`
    word_vectors_path: str
        path to the pretrained word embeddings
    min_freq: int
        minimum frequency required for a word to be in the vocabulary
    verbose: int, Default = 1
        ``int`` indicating verbosity. Set to 0 for no verbosity

    Returns
    -------
    embedding_matrix: np.ndarray
        pretrained word embeddings. If a word in our vocabulary is not among the
        pretrained embeddings it will be assigned the mean pretrained
        word-embeddings vector
    """
    if not os.path.isfile(word_vectors_path):
        raise FileNotFoundError("{} not found".format(word_vectors_path))
    if verbose:
        print("Indexing word vectors...")

    embeddings_index = {}
    with open(word_vectors_path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    if verbose:
        print("Loaded {} word vectors".format(len(embeddings_index)))
        print("Preparing embeddings matrix...")

    mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)
    embedding_dim = len(list(embeddings_index.values())[0])
    num_words = len(vocab.itos)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    found_words = 0
    for i, word in enumerate(vocab.itos):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            found_words += 1
        else:
            embedding_matrix[i] = mean_word_vector

    if verbose:
        print(
            "{} words in the vocabulary had {} vectors and appear more than {} times".format(
                found_words, word_vectors_path, min_freq
            )
        )

    return embedding_matrix.astype("float32")
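

# Usage sketch (illustrative only; kept as a comment so nothing runs on import).
# The path below is hypothetical and ``vocab`` is assumed to be a previously
# fitted Fastai ``Vocab`` instance:
#
#   embedding_matrix = build_embeddings_matrix(
#       vocab, word_vectors_path="glove.6B.100d.txt", min_freq=5
#   )
#   embedding_matrix.shape  # -> (len(vocab.itos), 100) for 100-dim vectors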