tokenizer.py
import os

import numpy as np
from paddlehub.common.logger import logger


class Tokenizer(object):
    """Base tokenizer class.
    """

    def __init__(self):
        pass

    def tokenize(self, text):
        raise NotImplementedError


class SimpleTokenizer(Tokenizer):
    """Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
       be used in topic model demo, but not in real business application scenarios.

       Notes: This tokenizer can only recognize the words in the corresponding vocab file.
    """

    def __init__(self, vocab_path):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__load_vocab(vocab_path)

    def tokenize(self, text):
        """Tokenize the input string `text`, and return the tokenize result.
        """
        text_len = len(text)
        result = []
        i = 0
        while i < text_len:
            word = found_word = ""
            # Handle a consecutive run of English letters as a single candidate word.
            if self.__is_eng_char(text[i]):
                for j in range(i, text_len + 1):
                    if j < text_len and self.__is_eng_char(text[j]):
                        word += self.__tolower(text[j])
                    else:
                        # End of the English letter run: keep it only if the whole
                        # lowercased run is in the vocab.
                        if word in self.__vocab:
                            result.append(word)
                        i = j - 1
                        break
            else:
                # Forward maximum matching: extend the candidate word up to the
                # longest word length in the vocab and keep the longest match found.
                for j in range(i, min(i + self.__max_word_len, text_len)):
                    word += text[j]
                    if word in self.__vocab:
                        found_word = word
                if len(found_word) > 0:
                    result.append(found_word)
                    i += len(found_word) - 1
            i += 1
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r') as fin:
            vocab_size = 0
            for line in fin.readlines():
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def __is_eng_char(self, c):
        """Check whether char c is an English character.
        """
        return 'A' <= c <= 'Z' or 'a' <= c <= 'z'

    def __tolower(self, c):
        """Return the lowercase character of the corresponding character, or return
           the original character if there is no corresponding lowercase character.
        """
        return c.lower()
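
# Illustrative sketch (assumption, not part of the original file): with a vocab
# file whose tab-separated lines contain words such as "topic" and "model" in
# the second field, SimpleTokenizer matches the longest vocab word at each
# position:
#
#   tokenizer = SimpleTokenizer("vocab.txt")   # hypothetical vocab path
#   tokenizer.tokenize("Topic Model")          # -> ["topic", "model"], assuming both words are in the vocab
#
# English letters are lowercased before lookup, and characters that cannot be
# matched against the vocab are skipped silently.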


class LACTokenizer(Tokenizer):
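    """Tokenizer backed by a LAC (lexical analysis) module. Word segmentation is
       delegated to the given `lac` module, and the resulting words are lowercased
       and kept only if they appear in the vocab file.
    """
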
    def __init__(self, vocab_path, lac):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__lac = lac
        self.__load_vocab(vocab_path)

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
                """
        with open(vocab_path, 'r') as fin:
            vocab_size = 0
            for line in fin.readlines():
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def tokenize(self, text):
        results = self.__lac.lexical_analysis(
            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
        # Lowercase each word and keep only those present in the vocab.
        words = results[0]["word"]
        result = []
        for word in words:
            word = word.lower()
            if word in self.__vocab:
                result.append(word)
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab
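

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module). It
# assumes a vocab file at "vocab.txt" and that the LAC module can be loaded via
# PaddleHub as hub.Module(name="lac"); adjust both to your environment. Note
# that LACTokenizer.tokenize calls lexical_analysis with use_gpu=True, so a GPU
# build of PaddlePaddle is required as written.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import paddlehub as hub

    simple_tokenizer = SimpleTokenizer("vocab.txt")  # hypothetical vocab path
    print(simple_tokenizer.tokenize("百度的NLP技术"))

    lac = hub.Module(name="lac")
    lac_tokenizer = LACTokenizer("vocab.txt", lac)
    print(lac_tokenizer.tokenize("百度的NLP技术"))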