未验证 提交 7f83b995 编写于 作者: S SiMing Dai 提交者: GitHub

fix gbk encode (#821)

上级 f8d70245
......@@ -141,3 +141,7 @@ paddlehub >= 1.8.0
* 1.0.1
修复因为return的bug导致的NoneType错误
+* 1.0.2
+修复由于Windows`gbk`编码导致的问题
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -14,7 +14,7 @@ from lda_news.vocab import Vocab, WordCount
@moduleinfo(
name="lda_news",
-version="1.0.1",
+version="1.0.2",
summary=
"This is a PaddleHub Module for LDA topic model in news dataset, where we can calculate doc distance, calculate the similarity between query and document, etc",
author="DesmonDay",
......
......@@ -64,7 +64,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -97,7 +97,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading LDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
......@@ -141,3 +141,7 @@ paddlehub >= 1.8.0
* 1.0.1
修复因为return的bug导致的NoneType错误
+* 1.0.2
+修复由于Windows`gbk`编码导致的问题
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -14,7 +14,7 @@ from lda_novel.vocab import Vocab, WordCount
@moduleinfo(
name="lda_novel",
-version="1.0.1",
+version="1.0.2",
summary=
"This is a PaddleHub Module for LDA topic model in novel dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
......
......@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading LDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
......@@ -137,3 +137,7 @@ paddlehub >= 1.8.0
* 1.0.1
修复因为return的bug导致的NoneType错误
+* 1.0.2
+修复由于Windows`gbk`编码导致的问题
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -14,7 +14,7 @@ from lda_webpage.vocab import Vocab, WordCount
@moduleinfo(
name="lda_webpage",
-version="1.0.1",
+version="1.0.2",
summary=
"This is a PaddleHub Module for LDA topic model in webpage dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
......
......@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading LDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
......@@ -93,7 +93,7 @@ class TopicModel(object):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
-with open(word_dict_path, 'r') as f:
+with open(word_dict_path, 'r', encoding='utf-8') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
......
......@@ -66,7 +66,7 @@ class SimpleTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......@@ -99,7 +99,7 @@ class LACTokenizer(Tokenizer):
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
-with open(vocab_path, 'r') as fin:
+with open(vocab_path, 'r', encoding='utf-8') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
......
......@@ -14,7 +14,7 @@ def load_prototxt(config_file, config):
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
-with open(config_file, 'r') as f:
+with open(config_file, 'r', encoding='utf-8') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
......
......@@ -22,7 +22,7 @@ class Vocab(object):
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
-with open(vocab_file, 'r') as fin:
+with open(vocab_file, 'r', encoding='utf-8') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册