class Tokenizer(object):
    """Base tokenizer class.
    """

    def __init__(self):
        pass

    def tokenize(self, text):
        raise NotImplementedError


class SimpleTokenizer(Tokenizer):
    """Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
       be used in topic model demo, but not in real business application scenarios.

       Notes: This tokenizer can only recognize the words in the corresponding vocab file.
    """

    def __init__(self, vocab_path):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__load_vocab(vocab_path)

    def tokenize(self, text):
        """Tokenize the input string `text`, and return the tokenize result.
        """
        text_len = len(text)
        result = []
        i = 0
        while i < text_len:
            word = found_word = ""
            # Deal with English characters.
            if self.__is_eng_char(text[i]):
                for j in range(i, text_len + 1):
                    if j < text_len and self.__is_eng_char(text[j]):
                        word += self.__tolower(text[j])
                    else:
                        # The English run has ended; keep it only if the
                        # whole run is in the vocab.
                        if word in self.__vocab:
                            result.append(word)
                        i = j - 1
                        break
            else:
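                # Forward maximum matching: extend the candidate one character
                # at a time, up to __max_word_len, and keep the longest string
                # that appears in the vocab.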
                for j in range(i, min(i + self.__max_word_len, text_len)):
                    word += text[j]
                    if word in self.__vocab:
                        found_word = word
                if len(found_word) > 0:
                    result.append(found_word)
                    i += len(found_word) - 1
            i += 1
        return result
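
    # Illustrative FMM behavior (hypothetical vocab): with a vocab containing
    # {"南京", "南京市", "长江", "长江大桥"}, tokenize("南京市长江大桥") keeps
    # the longest match at each position and returns ["南京市", "长江大桥"];
    # characters that match nothing in the vocab are silently skipped.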

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r') as fin:
            for line in fin:
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
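
    # Expected vocab file layout (an assumption inferred from the loader):
    # one entry per tab-separated line, with the word in the second field,
    # e.g. a hypothetical line "12<TAB>自然语言". Only fields[1] is consumed.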

    def __is_eng_char(self, c):
        """Check whether char c is an English character.
        """
        return 'A' <= c <= 'Z' or 'a' <= c <= 'z'

    def __tolower(self, c):
        """Return the lowercase form of the character, or the original
           character if it has no lowercase form.
        """
        return c.lower()


class LACTokenizer(Tokenizer):
    """Tokenizer that delegates word segmentation to a PaddleHub LAC
       (Lexical Analysis of Chinese) module, then filters the result
       against the vocab file.
    """

    def __init__(self, vocab_path, lac):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__lac = lac
        self.__load_vocab(vocab_path)

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r') as fin:
            for line in fin:
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)

    def tokenize(self, text):
        results = self.__lac.lexical_analysis(
            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
        # Lowercase each word and keep only the words present in the vocab.
        words = results[0]["word"]
        result = []
        for word in words:
            word = word.lower()
            if word in self.__vocab:
                result.append(word)
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab
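

if __name__ == "__main__":
    # Minimal usage sketch. "vocab.txt" is a hypothetical path to a vocab
    # file in the tab-separated format read by __load_vocab, and the "lac"
    # module name assumes PaddleHub's published LAC module is installed;
    # adjust both to your environment.
    import paddlehub as hub

    simple_tokenizer = SimpleTokenizer("vocab.txt")
    print(simple_tokenizer.tokenize("hello world"))

    lac_module = hub.Module(name="lac")
    lac_tokenizer = LACTokenizer("vocab.txt", lac_module)
    print(lac_tokenizer.tokenize("今天天气不错"))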