提交 b5a4bb1b 编写于 作者: W wuzewu

Fix encoding bug

上级 d34e9473
......@@ -22,6 +22,7 @@ import sys
import requests
from paddlehub.common.logger import logger
from paddlehub.common.utils import sys_stdin_encoding
from paddlehub.common import stats
from paddlehub.commands.base_command import BaseCommand
from paddlehub.commands import show
......@@ -63,7 +64,7 @@ def main():
argv = []
for item in sys.argv:
if six.PY2:
argv.append(item.decode(sys.stdin.encoding).decode("utf8"))
argv.append(item.decode(sys_stdin_encoding()).decode("utf8"))
else:
argv.append(item)
command.execute(argv[1:])
......@@ -73,7 +74,7 @@ if __name__ == "__main__":
argv = []
for item in sys.argv:
if six.PY2:
argv.append(item.decode(sys.stdin.encoding).decode("utf8"))
argv.append(item.decode(sys_stdin_encoding()).decode("utf8"))
else:
argv.append(item)
command.execute(argv[1:])
......@@ -17,6 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import time
import multiprocessing
......@@ -231,3 +232,29 @@ def get_running_device_info(config):
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
return place, dev_count
def get_platform_default_encoding():
if platform.platform().lower().startswith("windows"):
return "gbk"
return "utf8"
def sys_stdin_encoding():
encoding = sys.stdin.encoding
if encoding is None:
encoding = sys.getdefaultencoding()
if encoding is None:
encoding = get_platform_default_encoding()
return encoding
def sys_stdout_encoding():
encoding = sys.stdout.encoding
if encoding is None:
encoding = sys.getdefaultencoding()
if encoding is None:
encoding = get_platform_default_encoding()
return encoding
......@@ -21,6 +21,8 @@ import codecs
import sys
import yaml
from paddlehub.common.utils import sys_stdin_encoding
class CSVFileParser(object):
def __init__(self):
......@@ -30,7 +32,7 @@ class CSVFileParser(object):
pass
def parse(self, csv_file):
with codecs.open(csv_file, "r", sys.stdin.encoding) as file:
with codecs.open(csv_file, "r", sys_stdin_encoding()) as file:
content = file.read()
content = content.split('\n')
self.title = content[0].split(',')
......@@ -57,7 +59,7 @@ class YAMLFileParser(object):
pass
def parse(self, yaml_file):
with codecs.open(yaml_file, "r", sys.stdin.encoding) as file:
with codecs.open(yaml_file, "r", sys_stdin_encoding()) as file:
content = file.read()
return yaml.load(content, Loader=yaml.BaseLoader)
......@@ -70,7 +72,7 @@ class TextFileParser(object):
pass
def parse(self, txt_file):
with codecs.open(txt_file, "r", sys.stdin.encoding) as file:
with codecs.open(txt_file, "r", sys_stdin_encoding()) as file:
contents = []
for line in file:
line = line.strip()
......
......@@ -29,6 +29,7 @@ import paddle
from paddlehub.reader import tokenization
from paddlehub.common.logger import logger
from paddlehub.common.utils import sys_stdout_encoding
from paddlehub.dataset.dataset import InputExample
from .batching import pad_batch_data
import paddlehub as hub
......@@ -527,7 +528,7 @@ class LACClassifyReader(object):
]
if len(processed) == 0:
if six.PY2:
text = text.encode(sys.stdout.encoding)
text = text.encode(sys_stdout_encoding())
logger.warning(
"The words in text %s can't be found in the vocabulary." %
(text))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册