tokenizer.py
import os

import numpy as np
from paddlehub.common.logger import logger


class Tokenizer(object):
    """Base tokenizer class.
    """

    def __init__(self):
        pass

    def tokenize(self, text):
        raise NotImplementedError


class SimpleTokenizer(Tokenizer):
    """Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
       be used in topic model demo, but not in real business application scenarios.

       Notes: This tokenizer can only recognize the words in the corresponding vocab file.
    """

    def __init__(self, vocab_path):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__load_vocab(vocab_path)

    def tokenize(self, text):
        """Tokenize the input string `text`, and return the tokenize result.
        """
        text_len = len(text)
        result = []
        i = 0
        while i < text_len:
            word = found_word = ""
            # Handle a consecutive run of English letters as a single candidate word.
            if self.__is_eng_char(text[i]):
                for j in range(i, text_len + 1):
                    if j < text_len and self.__is_eng_char(text[j]):
                        word += self.__tolower(text[j])
                    else:
                        # End of the English letter run: keep it only if the whole
                        # lowercased run is in the vocab.
                        if word in self.__vocab:
                            result.append(word)
                        i = j - 1
                        break
            else:
                # Forward maximum matching: extend the candidate word up to the
                # longest word length in the vocab and keep the longest match found.
                for j in range(i, min(i + self.__max_word_len, text_len)):
                    word += text[j]
                    if word in self.__vocab:
                        found_word = word
                if len(found_word) > 0:
                    result.append(found_word)
                    i += len(found_word) - 1
            i += 1
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r') as fin:
            vocab_size = 0
            for line in fin.readlines():
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def __is_eng_char(self, c):
        """Check whether char c is an English character.
        """
        return 'A' <= c <= 'Z' or 'a' <= c <= 'z'

    def __tolower(self, c):
        """Return the lowercase character of the corresponding character, or return
           the original character if there is no corresponding lowercase character.
        """
        return c.lower()
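
# Illustrative sketch (assumption, not part of the original file): with a vocab
# file whose tab-separated lines contain words such as "topic" and "model" in
# the second field, SimpleTokenizer matches the longest vocab word at each
# position:
#
#   tokenizer = SimpleTokenizer("vocab.txt")   # hypothetical vocab path
#   tokenizer.tokenize("Topic Model")          # -> ["topic", "model"], assuming both words are in the vocab
#
# English letters are lowercased before lookup, and characters that cannot be
# matched against the vocab are skipped silently.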


class LACTokenizer(Tokenizer):
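    """Tokenizer backed by a LAC (lexical analysis) module. Word segmentation is
       delegated to the given `lac` module, and the resulting words are lowercased
       and kept only if they appear in the vocab file.
    """
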
    def __init__(self, vocab_path, lac):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__lac = lac
        self.__load_vocab(vocab_path)

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
                """
        with open(vocab_path, 'r') as fin:
            vocab_size = 0
            for line in fin.readlines():
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def tokenize(self, text):
        results = self.__lac.lexical_analysis(
            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
        # Lowercase each word and keep only those present in the vocab.
        words = results[0]["word"]
        result = []
        for word in words:
            word = word.lower()
            if word in self.__vocab:
                result.append(word)
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab
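

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module). It
# assumes a vocab file at "vocab.txt" and that the LAC module can be loaded via
# PaddleHub as hub.Module(name="lac"); adjust both to your environment. Note
# that LACTokenizer.tokenize calls lexical_analysis with use_gpu=True, so a GPU
# build of PaddlePaddle is required as written.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import paddlehub as hub

    simple_tokenizer = SimpleTokenizer("vocab.txt")  # hypothetical vocab path
    print(simple_tokenizer.tokenize("百度的NLP技术"))

    lac = hub.Module(name="lac")
    lac_tokenizer = LACTokenizer("vocab.txt", lac)
    print(lac_tokenizer.tokenize("百度的NLP技术"))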