"""Contains the text featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import os


class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: basestring
    """

    def __init__(self, vocab_filepath):
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert text string to a list of token indices in char-level. Note
        that the token indexing order follows the given vocabulary file.

        :param text: Text to process.
        :type text: basestring
        :return: List of char-level token indices.
        :rtype: list
        :raises KeyError: If the text contains a character not present in
                          the vocabulary.
        """
        tokens = self._char_tokenize(text)
        return [self._vocab_dict[token] for token in tokens]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Character tokenizer: split text into a list of single characters,
        after stripping leading/trailing whitespace."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file.

        Each line of the vocabulary file holds exactly one token; the
        0-based line number becomes that token's index.

        :param vocab_filepath: Path to the vocabulary file.
        :type vocab_filepath: basestring
        :return: Tuple of (token-to-index dict, token list).
        :rtype: tuple
        """
        # Read as UTF-8 explicitly so vocabularies containing non-ASCII
        # tokens load correctly regardless of the platform's locale.
        with io.open(vocab_filepath, 'r', encoding='utf-8') as vocab_file:
            # rstrip('\n') instead of line[:-1]: does not chop a real
            # character off the last token when the file has no trailing
            # newline (universal-newline mode already folds '\r\n' to '\n').
            vocab_list = [line.rstrip('\n') for line in vocab_file]
        vocab_dict = {token: idx for idx, token in enumerate(vocab_list)}
        return vocab_dict, vocab_list