import numpy as np
import pandas as pd
import html
import os
import re
from pathlib import PosixPath
from typing import List
from gensim.utils import tokenize
from sklearn.utils.validation import check_is_fitted
from .fastai_transforms import Tokenizer, Vocab
from .base_util import DataProcessor
from ..wdtypes import *
def simple_preprocess(doc:str, lower:bool=False, deacc:bool=False, min_len:int=2,
    max_len:int=15) -> List[str]:
    """Tokenize ``doc`` with gensim's ``tokenize``, keeping only tokens whose
    length is within ``[min_len, max_len]`` and that do not start with ``_``.

    Parameters
    ----------
    doc: str
        document to tokenize
    lower: bool
        lowercase the tokens
    deacc: bool
        remove accent marks (passed through to ``gensim.utils.tokenize``)
    min_len: int
        minimum token length kept
    max_len: int
        maximum token length kept

    Returns
    -------
    List[str]
        filtered list of tokens
    """
    # Fix: the original hardcoded lower=False, silently ignoring the
    # caller-supplied ``lower`` argument.
    tokens = [
        token for token in tokenize(doc, lower=lower, deacc=deacc, errors='ignore')
        if min_len <= len(token) <= max_len and not token.startswith('_')
    ]
    return tokens
def get_texts(texts:List[str]) -> List[List[str]]:
    """Pre-process then tokenize a list of raw documents.

    Each document is first run through :func:`simple_preprocess` and the
    surviving tokens re-joined with spaces; the resulting strings are then
    tokenized with the project's ``Tokenizer``.
    """
    cleaned = [' '.join(simple_preprocess(doc)) for doc in texts]
    return Tokenizer().process_all(cleaned)
def pad_sequences(seq:List[int], maxlen:int=190, pad_first:bool=True, pad_idx:int=1) -> np.ndarray:
    """Pad or truncate a sequence of token indices to exactly ``maxlen``.

    Sequences longer than ``maxlen`` keep their **last** ``maxlen`` elements.
    Shorter sequences are padded with ``pad_idx`` at the front
    (``pad_first=True``) or at the back.

    Parameters
    ----------
    seq: List[int]
        sequence of token indices
    maxlen: int
        target length
    pad_first: bool
        pad at the beginning if True, at the end otherwise
    pad_idx: int
        index used for padding

    Returns
    -------
    np.ndarray
        1-D int32 array of length ``maxlen``
    """
    # Fix: return annotation corrected (a 1-D ndarray, not List[List[int]]).
    if len(seq) >= maxlen:
        return np.array(seq[-maxlen:]).astype('int32')
    res = np.zeros(maxlen, dtype='int32') + pad_idx
    # Fix: an empty seq previously raised a broadcast ValueError
    # (res[-0:] = [] assigns an empty list into the full array).
    if len(seq) == 0:
        return res
    if pad_first:
        res[-len(seq):] = seq
    else:
        # Fix: dropped the stray trailing colon in res[:len(seq):]
        res[:len(seq)] = seq
    return res
def build_embeddings_matrix(vocab:Vocab, word_vectors_path:str, verbose:int=1) -> np.ndarray:
if not os.path.isfile(word_vectors_path):
raise FileNotFoundError("{} not found".format(word_vectors_path))
if verbose: print('Indexing word vectors...')
embeddings_index = {}
f = open(word_vectors_path)
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
if verbose:
print('Loaded {} word vectors'.format(len(embeddings_index)))
print('Preparing embeddings matrix...')
mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)
embedding_dim = len(list(embeddings_index.values())[0])
num_words = len(vocab.itos)
embedding_matrix = np.zeros((num_words, embedding_dim))
found_words=0
for i,word in enumerate(vocab.itos):
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
found_words+=1
else:
embedding_matrix[i] = mean_word_vector
if verbose:
print('{} words in the vocabulary had {} vectors and appear more than the min frequency'.format(found_words, word_vectors_path))
return embedding_matrix
class TextProcessor(DataProcessor):
    """Fit/transform processor that turns a text column of a DataFrame into
    padded sequences of word indices, optionally building a pretrained
    embedding matrix aligned with the learned vocabulary.

    Parameters
    ----------
    max_vocab: int
        maximum vocabulary size
    min_freq: int
        minimum token frequency to enter the vocabulary
    maxlen: int
        length sequences are padded/truncated to
    word_vectors_path: Optional[str]
        path to a pretrained word-vectors file; if given, ``transform`` also
        sets ``self.embedding_matrix``
    verbose: int
        print progress information if truthy
    """
    def __init__(self, max_vocab:int=30000, min_freq:int=5,
        maxlen:int=80, word_vectors_path:Optional[str]=None,
        verbose:int=1):
        super(TextProcessor, self).__init__()
        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.maxlen = maxlen
        self.word_vectors_path = word_vectors_path
        self.verbose = verbose

    def fit(self, df:pd.DataFrame, text_col:str)->DataProcessor:
        """Tokenize ``df[text_col]`` and build ``self.vocab``."""
        # Fix: the original `text_col = text_col` was a no-op local
        # assignment; store it on the instance as `transform` does.
        self.text_col = text_col
        texts = df[text_col].tolist()
        tokens = get_texts(texts)
        self.vocab = Vocab.create(tokens, max_vocab=self.max_vocab, min_freq=self.min_freq)
        return self

    def transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
        """Numericalize and pad ``df[text_col]`` using the fitted vocab.

        Returns an int array of shape ``(n_rows, self.maxlen)``. Requires a
        prior call to ``fit`` (enforced via ``check_is_fitted``).
        """
        check_is_fitted(self, 'vocab')
        self.text_col = text_col
        texts = df[self.text_col].tolist()
        self.tokens = get_texts(texts)
        sequences = [self.vocab.numericalize(t) for t in self.tokens]
        padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
        if self.verbose:
            print("The vocabulary contains {} words".format(len(self.vocab.stoi)))
        if self.word_vectors_path is not None:
            # Fix: propagate this processor's verbosity instead of always
            # using build_embeddings_matrix's default (verbose=1).
            self.embedding_matrix = build_embeddings_matrix(
                self.vocab, self.word_vectors_path, verbose=self.verbose)
        return padded_seq

    def fit_transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
        """Equivalent to ``fit(df, text_col)`` followed by ``transform``."""
        return self.fit(df, text_col).transform(df, text_col)