From 1106077fa3ae6159905e75282bc5996fa89fcbc5 Mon Sep 17 00:00:00 2001 From: tianxin Date: Tue, 24 Sep 2019 14:59:06 +0800 Subject: [PATCH] set encoding=utf8 for open (#3395) --- PaddleNLP/preprocess/ernie/task_reader.py | 2 +- PaddleNLP/preprocess/ernie/tokenization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PaddleNLP/preprocess/ernie/task_reader.py b/PaddleNLP/preprocess/ernie/task_reader.py index 1ff5c10f..1073b67f 100644 --- a/PaddleNLP/preprocess/ernie/task_reader.py +++ b/PaddleNLP/preprocess/ernie/task_reader.py @@ -66,7 +66,7 @@ class BaseReader(object): def _read_tsv(self, input_file, quotechar=None): """Reads a tab separated value file.""" - with open(input_file, "r") as f: + with open(input_file, "r", encoding="utf8") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) headers = next(reader) Example = namedtuple('Example', headers) diff --git a/PaddleNLP/preprocess/ernie/tokenization.py b/PaddleNLP/preprocess/ernie/tokenization.py index f906b537..abdb6d11 100644 --- a/PaddleNLP/preprocess/ernie/tokenization.py +++ b/PaddleNLP/preprocess/ernie/tokenization.py @@ -69,7 +69,7 @@ def printable_text(text): def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() - fin = open(vocab_file) + fin = open(vocab_file, encoding="utf8") for num, line in enumerate(fin): items = convert_to_unicode(line.strip()).split("\t") if len(items) > 2: -- GitLab