import os

import numpy as np
from paddlehub.common.logger import logger


class Tokenizer(object):
    """Base tokenizer class.
    """

    def __init__(self):
        pass

    def tokenize(self, text):
        raise NotImplementedError


class SimpleTokenizer(Tokenizer):
    """Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
       be used in topic model demo, but not in real business application scenarios.

       Notes: This tokenizer can only recognize the words in the corresponding vocab file.
    """

    def __init__(self, vocab_path):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__load_vocab(vocab_path)

    def tokenize(self, text):
        """Tokenize the input string `text`, and return the tokenize result.
        """
        text_len = len(text)
        result = []
        i = 0
        while i < text_len:
            word = found_word = ""
            # Deal with English characters.
            if self.__is_eng_char(text[i]):
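                # Consume the longest run of ASCII letters, lower-casing each
                # character; the whole run is emitted only if it is in the vocab.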
                for j in range(i, text_len + 1):
                    if j < text_len and self.__is_eng_char(text[j]):
                        word += self.__tolower(text[j])
                    else:
                        # Forward matching by character granularity.
                        if word in self.__vocab:
                            result.append(word)
                        i = j - 1
                        break
            else:
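                # Forward maximum matching: grow the candidate up to
                # __max_word_len characters and keep the longest match in the vocab.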
                for j in range(i, min(i + self.__max_word_len, text_len)):
                    word += text[j]
                    if word in self.__vocab:
                        found_word = word
                if len(found_word) > 0:
                    result.append(found_word)
                    i += len(found_word) - 1
            i += 1
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r', encoding='utf-8') as fin:
            vocab_size = 0
            for line in fin:
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def __is_eng_char(self, c):
        """Check whether char c is an English character.
        """
        return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')

    def __tolower(self, c):
        """Return the lowercase character of the corresponding character, or return
           the original character if there is no corresponding lowercase character.
        """
        return c.lower()


class LACTokenizer(Tokenizer):
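    """Tokenizer that segments text with a PaddleHub LAC lexical analysis module
       and keeps only the words found in the corresponding vocab file.
    """
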
    def __init__(self, vocab_path, lac):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__lac = lac
        self.__load_vocab(vocab_path)

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
                """
        with open(vocab_path, 'r', encoding='utf-8') as fin:
            vocab_size = 0
            for line in fin:
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def tokenize(self, text):
        results = self.__lac.lexical_analysis(
            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
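        # lexical_analysis returns one result dict per input text; the "word"
        # field holds the segmented tokens.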
        # Lower-case English words and keep only the words that are in the vocab.
        words = results[0]["word"]
        result = []
        for word in words:
            word = word.lower()
            if word in self.__vocab:
                result.append(word)
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab
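

# A minimal usage sketch (not part of the original module). It assumes a
# tab-separated vocab file named "vocab.txt" exists in the working directory,
# with the word in the second column of each line.
if __name__ == "__main__":
    tokenizer = SimpleTokenizer("vocab.txt")
    print(tokenizer.tokenize("hello world"))
    print(tokenizer.contains("hello"))

    # LACTokenizer additionally needs a loaded PaddleHub LAC module, e.g.:
    #   import paddlehub as hub
    #   lac = hub.Module(name="lac")
    #   lac_tokenizer = LACTokenizer("vocab.txt", lac)
    #   print(lac_tokenizer.tokenize("今天天气不错"))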