import numpy as np
import pandas as pd

from pathlib import PosixPath
from typing import List, Optional

from gensim.utils import tokenize
from fastai.text import Tokenizer
from fastai.text.transform import Vocab

from ..wdtypes import *


def prepare_text(
    df: pd.DataFrame,
    text_col: str,
    max_vocab: int,
    min_freq: int,
    maxlen: int,
    word_vectors_path: Optional[str] = None,
    verbose: int = 1,
):
    """Tokenize the texts in df[text_col], build a fastai Vocab, numericalize
    and pad the sequences and, if word_vectors_path is passed, build the
    corresponding matrix of pretrained word embeddings."""
    texts = df[text_col].tolist()
    tokens = get_texts(texts)
    vocab = Vocab.create(tokens, max_vocab=max_vocab, min_freq=min_freq)
    sequences = [vocab.numericalize(t) for t in tokens]
    padded_seq = np.array([pad_sequences(s, maxlen=maxlen) for s in sequences])
    if verbose:
        print("The vocabulary contains {} words".format(len(vocab.stoi)))
    if word_vectors_path is not None:
        embedding_matrix = build_embeddings_matrix(vocab, word_vectors_path)
    else:
        embedding_matrix = None
    return padded_seq, embedding_matrix, vocab


def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    """Gensim-style preprocessing: tokenize doc and keep tokens whose length
    lies within [min_len, max_len]. Unlike gensim's simple_preprocess,
    lowercasing is optional and off by default."""
    tokens = [
        token
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
    return tokens


def get_texts(texts: List[str]) -> List[List[str]]:
    """Preprocess each text and tokenize it with fastai's Tokenizer."""
    processed_texts = [" ".join(simple_preprocess(t)) for t in texts]
    tok = Tokenizer().process_all(processed_texts)
    return tok


def pad_sequences(
    seq: List[int], maxlen: int = 190, pad_first: bool = True, pad_idx: int = 1
) -> np.ndarray:
    """Keep the last maxlen elements of seq, or pad seq with pad_idx up to
    maxlen (at the beginning if pad_first, at the end otherwise)."""
    if len(seq) >= maxlen:
        return np.array(seq[-maxlen:]).astype("int32")
    res = np.zeros(maxlen, dtype="int32") + pad_idx
    if pad_first:
        res[-len(seq):] = seq
    else:
        res[: len(seq)] = seq
    return res


def build_embeddings_matrix(
    vocab: Vocab, word_vectors_path: PosixPath, verbose: int = 1
) -> np.ndarray:
    """Build a (num_words, embedding_dim) matrix of pretrained embeddings for
    vocab from a text file of word vectors (one word followed by its vector
    per line, as in the GloVe format). Words in the vocabulary without a
    pretrained vector are assigned the mean of all pretrained vectors."""
    if verbose:
        print("Indexing word vectors...")
    embeddings_index = {}
    with open(str(word_vectors_path)) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    if verbose:
        print("Loaded {} word vectors".format(len(embeddings_index)))
        print("Preparing embeddings matrix...")
    mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)
    embedding_dim = len(list(embeddings_index.values())[0])
    num_words = len(vocab.itos)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    found_words = 0
    for i, word in enumerate(vocab.itos):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            found_words += 1
        else:
            embedding_matrix[i] = mean_word_vector
    if verbose:
        print(
            "{} words in the vocabulary had pretrained vectors and appear "
            "more than the min frequency".format(found_words)
        )
    return embedding_matrix
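

# Minimal usage sketch for the full pipeline. The DataFrame below is a toy
# example and "glove.6B.100d.txt" is a hypothetical path that only illustrates
# the expected GloVe-style input; neither is part of this module:
#
#     df = pd.DataFrame({"description": ["the cat sat on the mat",
#                                        "dogs bark loudly at night"]})
#     padded_seq, embedding_matrix, vocab = prepare_text(
#         df, text_col="description", max_vocab=30000, min_freq=1, maxlen=10,
#         word_vectors_path="glove.6B.100d.txt",
#     )
#     # padded_seq.shape -> (2, 10)
#     # embedding_matrix.shape -> (len(vocab.itos), 100) for 100-d vectors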
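#
# pad_sequences semantics, for reference: sequences longer than maxlen keep
# their *last* maxlen tokens; shorter ones are left-padded with pad_idx by
# default (pad_idx=1 matches fastai v1's xxpad token index):
#
#     pad_sequences([5, 6, 7], maxlen=5)                   # -> [1, 1, 5, 6, 7]
#     pad_sequences([5, 6, 7], maxlen=5, pad_first=False)  # -> [5, 6, 7, 1, 1]
#     pad_sequences([1, 2, 3, 4, 5, 6], maxlen=4)          # -> [3, 4, 5, 6]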