import html
import os
import re

from pathlib import PosixPath
from typing import List

import numpy as np
import pandas as pd
from gensim.utils import tokenize
from sklearn.utils.validation import check_is_fitted

from .fastai_transforms import Tokenizer, Vocab
from .base_util import DataProcessor
from ..wdtypes import *


def simple_preprocess(doc:str, lower:bool=False, deacc:bool=False, min_len:int=2,
	max_len:int=15) -> List[str]:
	"""Tokenize *doc* with gensim and keep only reasonable-looking tokens.

	Parameters
	----------
	doc: str
		Raw text to tokenize.
	lower: bool, default False
		Lowercase the tokens. (Previously this flag was accepted but
		ignored; it is now forwarded to ``tokenize``.)
	deacc: bool, default False
		Remove accent marks from tokens.
	min_len, max_len: int
		Inclusive length bounds a token must satisfy to be kept.

	Returns
	-------
	List[str]
		Tokens within the length bounds that do not start with ``'_'``
		(gensim uses a leading underscore for special markers).
	"""
	# BUG FIX: was tokenize(..., lower=False, ...), silently ignoring the
	# `lower` argument.
	tokens = [
		token for token in tokenize(doc, lower=lower, deacc=deacc, errors='ignore')
		if min_len <= len(token) <= max_len and not token.startswith('_')
	]
	return tokens


def get_texts(texts:List[str]) -> List[List[str]]:
	"""Clean each document with ``simple_preprocess`` and run the fastai
	``Tokenizer`` over the cleaned corpus, returning one token list per
	input text."""
	cleaned_docs = [' '.join(simple_preprocess(doc)) for doc in texts]
	return Tokenizer().process_all(cleaned_docs)


def pad_sequences(seq:List[int], maxlen:int=190, pad_first:bool=True, pad_idx:int=1) -> np.ndarray:
	"""Pad or truncate a sequence of token ids to exactly ``maxlen``.

	Parameters
	----------
	seq: List[int]
		Sequence of token ids.
	maxlen: int, default 190
		Target length of the returned array.
	pad_first: bool, default True
		If True, padding goes at the front (sequence is right-aligned);
		otherwise padding goes at the back.
	pad_idx: int, default 1
		Token id used for padding.

	Returns
	-------
	np.ndarray
		1-D ``int32`` array of length ``maxlen``.
		(Annotation fixed: the function returns an ndarray, not
		``List[List[int]]``.)
	"""
	if len(seq) >= maxlen:
		# Keep the *last* maxlen tokens (the tail of the document).
		return np.array(seq[-maxlen:]).astype('int32')
	res = np.zeros(maxlen, dtype='int32') + pad_idx
	# BUG FIX: an empty seq used to raise a broadcast ValueError via
	# ``res[-0:] = []``; an all-padding row is the sensible result.
	if not len(seq):
		return res
	if pad_first:
		res[-len(seq):] = seq
	else:
		res[:len(seq)] = seq  # stray trailing colon removed
	return res


def build_embeddings_matrix(vocab:'Vocab', word_vectors_path:str, verbose:int=1) -> np.ndarray:
	"""Build an embeddings matrix aligned with ``vocab.itos`` from a
	GloVe-style word-vectors text file (one ``word v1 v2 ...`` per line).

	Words without a pretrained vector are assigned the mean of all
	pretrained vectors.

	Parameters
	----------
	vocab: Vocab
		Fitted vocabulary; only its ``itos`` (index-to-string) list is read.
	word_vectors_path: str
		Path to the word-vectors file.
	verbose: int, default 1
		If truthy, print progress information.

	Returns
	-------
	np.ndarray
		Matrix of shape ``(len(vocab.itos), embedding_dim)``.

	Raises
	------
	FileNotFoundError
		If ``word_vectors_path`` does not exist.
	"""
	if not os.path.isfile(word_vectors_path):
		raise FileNotFoundError("{} not found".format(word_vectors_path))
	if verbose: print('Indexing word vectors...')

	embeddings_index = {}
	# FIX: use a context manager so the handle is closed even if parsing
	# a line raises (previously a bare open()/close() pair).
	with open(word_vectors_path) as f:
		for line in f:
			values = line.split()
			word = values[0]
			coefs = np.asarray(values[1:], dtype='float32')
			embeddings_index[word] = coefs

	if verbose:
		print('Loaded {} word vectors'.format(len(embeddings_index)))
		print('Preparing embeddings matrix...')

	# Fallback vector for out-of-pretrained-vocabulary words.
	mean_word_vector = np.mean(list(embeddings_index.values()), axis=0)
	embedding_dim = len(list(embeddings_index.values())[0])
	num_words = len(vocab.itos)
	embedding_matrix = np.zeros((num_words, embedding_dim))
	found_words = 0
	for i, word in enumerate(vocab.itos):
		embedding_vector = embeddings_index.get(word)
		if embedding_vector is not None:
			embedding_matrix[i] = embedding_vector
			found_words += 1
		else:
			embedding_matrix[i] = mean_word_vector

	if verbose:
		# FIX: the old message interpolated `word_vectors_path` where a
		# count belonged, producing a garbled sentence.
		print('{} of the {} words in the vocabulary had pretrained vectors'.format(
			found_words, num_words))

	return embedding_matrix


class TextProcessor(DataProcessor):
	"""Turn a dataframe text column into padded sequences of token ids.

	``fit`` builds the vocabulary; ``transform`` numericalizes and pads
	each document to ``maxlen``. If ``word_vectors_path`` is given,
	``transform`` also builds ``self.embedding_matrix`` from the
	pretrained vectors.
	"""
	def __init__(self, max_vocab:int=30000, min_freq:int=5,
		maxlen:int=80, word_vectors_path:Optional[str]=None,
		verbose:int=1):
		# max_vocab: cap on vocabulary size
		# min_freq: minimum token frequency to enter the vocabulary
		# maxlen: padded/truncated sequence length
		# word_vectors_path: optional path to pretrained word vectors
		# verbose: if truthy, print progress information
		super().__init__()
		self.max_vocab = max_vocab
		self.min_freq = min_freq
		self.maxlen = maxlen
		self.word_vectors_path = word_vectors_path
		self.verbose = verbose

	def fit(self, df:pd.DataFrame, text_col:str)->DataProcessor:
		"""Build ``self.vocab`` from ``df[text_col]`` and return ``self``."""
		# BUG FIX: was the no-op ``text_col = text_col``; store it on the
		# instance as ``transform`` does.
		self.text_col = text_col
		texts = df[text_col].tolist()
		tokens = get_texts(texts)
		self.vocab = Vocab.create(tokens, max_vocab=self.max_vocab, min_freq=self.min_freq)
		return self

	def transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
		"""Numericalize and pad ``df[text_col]``; requires a prior ``fit``.

		Returns an array of shape ``(n_docs, maxlen)``. As a side effect,
		stores the tokenized docs in ``self.tokens`` and, when
		``word_vectors_path`` was given, ``self.embedding_matrix``.
		"""
		check_is_fitted(self, 'vocab')
		self.text_col = text_col
		texts = df[self.text_col].tolist()
		self.tokens = get_texts(texts)
		sequences = [self.vocab.numericalize(t) for t in self.tokens]
		padded_seq = np.array([pad_sequences(s, maxlen=self.maxlen) for s in sequences])
		if self.verbose:
			print("The vocabulary contains {} words".format(len(self.vocab.stoi)))
		if self.word_vectors_path is not None:
			self.embedding_matrix = build_embeddings_matrix(self.vocab, self.word_vectors_path)
		return padded_seq

	def fit_transform(self, df:pd.DataFrame, text_col:str)->np.ndarray:
		"""Equivalent to ``fit(df, text_col)`` followed by ``transform``."""
		return self.fit(df, text_col).transform(df, text_col)