From eb72d3865aff4a46a1f26e100be4665f3245e12e Mon Sep 17 00:00:00 2001
From: kinghuin
Date: Wed, 20 Nov 2019 15:37:41 +0800
Subject: [PATCH] debug tiny reader

---
 paddlehub/reader/nlp_reader.py   | 2 +-
 paddlehub/reader/tokenization.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index e3788543..b89039bb 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -49,7 +49,7 @@ class BaseReader(object):
                  in_tokens=False):
         self.max_seq_len = max_seq_len
         if sp_model_path and word_dict_path:
-            self.tzokenizer = tokenization.WSSPTokenizer(
+            self.tokenizer = tokenization.WSSPTokenizer(
                 vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
         else:
             self.tokenizer = tokenization.FullTokenizer(
diff --git a/paddlehub/reader/tokenization.py b/paddlehub/reader/tokenization.py
index 9b541a06..d936c251 100644
--- a/paddlehub/reader/tokenization.py
+++ b/paddlehub/reader/tokenization.py
@@ -186,11 +186,11 @@ class WSSPTokenizer(object):
         return words
 
     def tokenize(self, text):
-        sen = text.decode('utf8')
+        text = convert_to_unicode(text)
         if self.ws:
-            sen = [s for s in self.cut(sen) if s != ' ']
+            sen = [s for s in self.cut(text) if s != ' ']
         else:
-            sen = sen.split(' ')
+            sen = text.split(' ')
         if self.lower:
             sen = [s.lower() for s in sen]
         sen = ' '.join(sen)
--
GitLab
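
Note on the nlp_reader.py hunk: on the sentencepiece branch, the constructor
result was assigned to a misspelled attribute (`self.tzokenizer`), so
`self.tokenizer` was never set and any later call through it raised
AttributeError; the hunk restores the correct name. A minimal, hypothetical
repro of that failure mode follows; the class and method names are
illustrative only, not PaddleHub's actual API:

    class Reader(object):
        def __init__(self, use_sp_model):
            if use_sp_model:
                # Typo: the attribute is written but never read back.
                self.tzokenizer = "sentencepiece tokenizer"
            else:
                self.tokenizer = "full tokenizer"

        def convert_example(self, text):
            # On the sentencepiece path this raises AttributeError,
            # because the typo above left self.tokenizer unset.
            return self.tokenizer

    Reader(use_sp_model=False).convert_example("ok")  # works
    try:
        Reader(use_sp_model=True).convert_example("boom")
    except AttributeError as exc:
        print(exc)  # 'Reader' object has no attribute 'tokenizer'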
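
Note on the tokenization.py hunk: under Python 3, `tokenize` receives `str`
objects, and `str` has no `.decode` method, so the removed
`text.decode('utf8')` raised AttributeError; `convert_to_unicode` accepts
both `str` and `bytes`. A sketch of such a helper, assuming the BERT-derived
implementation that tokenization.py modules in this family typically carry
(the exact body in PaddleHub may differ):

    import six

    def convert_to_unicode(text):
        """Return `text` as unicode text, decoding utf-8 where it is bytes."""
        if six.PY3:
            if isinstance(text, str):
                return text
            if isinstance(text, bytes):
                return text.decode("utf-8", "ignore")
            raise ValueError("Unsupported string type: %s" % type(text))
        # Python 2: str is the bytes type, unicode is the text type.
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):  # noqa: F821  (Python 2 name)
            return text
        raise ValueError("Unsupported string type: %s" % type(text))

    print(convert_to_unicode("hello"))   # 'hello'
    print(convert_to_unicode(b"hello"))  # 'hello'
    try:
        "hello".decode("utf8")           # the removed code path
    except AttributeError as exc:
        print(exc)  # 'str' object has no attribute 'decode'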