diff --git a/paddlehub/reader/tokenization.py b/paddlehub/reader/tokenization.py
index e3a55451a43aace5465b1f59249c1c5bfbb87c72..fab4121ff4a147dde007c4f19e468cf9f9917b0c 100644
--- a/paddlehub/reader/tokenization.py
+++ b/paddlehub/reader/tokenization.py
@@ -194,16 +194,15 @@ class WSSPTokenizer(object):
         text = text.split(' ')
         if self.lower:
             text = [s.lower() for s in text]
-        in_vocab = []
-        for word in text:
-            if word in self.vocab:
-                in_vocab.append(word)
-            else:
-                in_vocab.append(unk_token)
-        text = ' '.join(in_vocab)
+        text = ' '.join(text)
         tokens = self.sp_model.EncodeAsPieces(text)
-        print(tokens)
-        return tokens
+        in_vocab_tokens = []
+        for token in tokens:
+            if token in self.vocab:
+                in_vocab_tokens.append(token)
+            else:
+                in_vocab_tokens.append(unk_token)
+        return in_vocab_tokens
 
     def convert_tokens_to_ids(self, tokens):
         return convert_by_vocab(self.vocab, tokens)
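
This change moves the out-of-vocabulary check to after SentencePiece encoding and drops a stray debug `print`. Previously, whitespace-split words were matched against `self.vocab` before encoding; since a SentencePiece vocabulary holds subword pieces (with the `▁` word-boundary marker) rather than whole words, ordinary words could be replaced by `unk_token` before the model ever saw them. Below is a minimal before/after sketch; the `vocab` contents and the `encode_as_pieces` stub are hypothetical stand-ins for `self.vocab` and `self.sp_model.EncodeAsPieces`:

```python
# Minimal sketch of the behavior before and after this change.
# Assumption: `vocab` and `encode_as_pieces` are hypothetical stand-ins
# for self.vocab and self.sp_model.EncodeAsPieces; a real run would load
# a trained SentencePiece model instead.

UNK = '[UNK]'

# SentencePiece vocabularies store subword *pieces* (note the U+2581
# word-boundary marker), not whitespace-delimited words.
vocab = {'▁hello', '▁wor', 'ld'}

def encode_as_pieces(text):
    # Hypothetical stand-in for sp_model.EncodeAsPieces.
    table = {'hello': ['▁hello'], 'world': ['▁wor', 'ld']}
    pieces = []
    for word in text.split(' '):
        pieces.extend(table.get(word, ['▁', UNK]))
    return pieces

def tokenize_old(text):
    # Old order: words are checked against the piece vocab *before*
    # encoding, so in this sketch plain words never match and collapse
    # to UNK before SentencePiece runs.
    words = [w if w in vocab else UNK for w in text.split(' ')]
    return encode_as_pieces(' '.join(words))

def tokenize_new(text):
    # New order: encode first, then map out-of-vocab pieces to UNK.
    return [p if p in vocab else UNK for p in encode_as_pieces(text)]

print(tokenize_old('hello world'))  # ['▁', '[UNK]', '▁', '[UNK]']
print(tokenize_new('hello world'))  # ['▁hello', '▁wor', 'ld']
```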