From dc5abb6f0ce34e9ce549d2baa563c6cba3c55894 Mon Sep 17 00:00:00 2001
From: Yibing Liu
Date: Wed, 16 Oct 2019 12:23:17 +0800
Subject: [PATCH] Fix windows encoding problem (#3595) (#3597)

---
 .../BERT/reader/cls.py                          |  3 ++-
 .../BERT/reader/pretraining.py                  |  3 ++-
 .../BERT/reader/squad.py                        | 15 ++++++++-------
 .../BERT/run_classifier.py                      |  4 ++++
 .../BERT/run_squad.py                           |  4 ++++
 .../BERT/tokenization.py                        |  3 ++-
 .../language_representations_kit/BERT/train.py  |  4 ++++
 7 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/PaddleNLP/language_representations_kit/BERT/reader/cls.py b/PaddleNLP/language_representations_kit/BERT/reader/cls.py
index 7448526f..60bd5505 100644
--- a/PaddleNLP/language_representations_kit/BERT/reader/cls.py
+++ b/PaddleNLP/language_representations_kit/BERT/reader/cls.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import io
 import os
 import types
 import csv
@@ -100,7 +101,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with io.open(input_file, "r", encoding="utf8") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
diff --git a/PaddleNLP/language_representations_kit/BERT/reader/pretraining.py b/PaddleNLP/language_representations_kit/BERT/reader/pretraining.py
index f43400eb..fe65e64a 100644
--- a/PaddleNLP/language_representations_kit/BERT/reader/pretraining.py
+++ b/PaddleNLP/language_representations_kit/BERT/reader/pretraining.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 from __future__ import division
 
+import io
 import os
 import numpy as np
 import types
@@ -125,7 +126,7 @@
     def load_vocab(self, vocab_file):
         """Loads a vocabulary file into a dictionary."""
         vocab = collections.OrderedDict()
-        fin = open(vocab_file)
+        fin = io.open(vocab_file, encoding="utf8")
         for num, line in enumerate(fin):
             items = self.convert_to_unicode(line.strip()).split("\t")
             if len(items) > 2:
diff --git a/PaddleNLP/language_representations_kit/BERT/reader/squad.py b/PaddleNLP/language_representations_kit/BERT/reader/squad.py
index 90c3496c..fb9ec657 100644
--- a/PaddleNLP/language_representations_kit/BERT/reader/squad.py
+++ b/PaddleNLP/language_representations_kit/BERT/reader/squad.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """Run BERT on SQuAD 1.1 and SQuAD 2.0."""
 
+import io
 import six
 import math
 import json
@@ -95,7 +96,7 @@ class InputFeatures(object):
 
 def read_squad_examples(input_file, is_training, version_2_with_negative=False):
     """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r") as reader:
+    with io.open(input_file, "r", encoding="utf8") as reader:
         input_data = json.load(reader)["data"]
 
     def is_whitespace(c):
@@ -763,15 +764,15 @@
 
         all_nbest_json[example.qas_id] = nbest_json
 
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+    with io.open(output_prediction_file, "w", encoding="utf8") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + u"\n")
 
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+    with io.open(output_nbest_file, "w", encoding="utf8") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + u"\n")
 
     if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+        with io.open(output_null_log_odds_file, "w", encoding="utf8") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + u"\n")
 
 
 def get_final_text(pred_text, orig_text, do_lower_case, verbose):
diff --git a/PaddleNLP/language_representations_kit/BERT/run_classifier.py b/PaddleNLP/language_representations_kit/BERT/run_classifier.py
index 851ec77b..81a8becc 100644
--- a/PaddleNLP/language_representations_kit/BERT/run_classifier.py
+++ b/PaddleNLP/language_representations_kit/BERT/run_classifier.py
@@ -17,6 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+reload(sys)
+sys.setdefaultencoding('utf8')
+
 import os
 import time
 import argparse
diff --git a/PaddleNLP/language_representations_kit/BERT/run_squad.py b/PaddleNLP/language_representations_kit/BERT/run_squad.py
index d465eff9..8b906986 100644
--- a/PaddleNLP/language_representations_kit/BERT/run_squad.py
+++ b/PaddleNLP/language_representations_kit/BERT/run_squad.py
@@ -17,6 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+reload(sys)
+sys.setdefaultencoding('utf8')
+
 import argparse
 import collections
 import multiprocessing
diff --git a/PaddleNLP/language_representations_kit/BERT/tokenization.py b/PaddleNLP/language_representations_kit/BERT/tokenization.py
index f906b537..08570f30 100644
--- a/PaddleNLP/language_representations_kit/BERT/tokenization.py
+++ b/PaddleNLP/language_representations_kit/BERT/tokenization.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections
 import unicodedata
 import six
+import io
 
 
 def convert_to_unicode(text):
@@ -69,7 +70,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = open(vocab_file)
+    fin = io.open(vocab_file, encoding="utf8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
diff --git a/PaddleNLP/language_representations_kit/BERT/train.py b/PaddleNLP/language_representations_kit/BERT/train.py
index c99b3db2..43642866 100644
--- a/PaddleNLP/language_representations_kit/BERT/train.py
+++ b/PaddleNLP/language_representations_kit/BERT/train.py
@@ -17,6 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+reload(sys)
+sys.setdefaultencoding('utf8')
+
 import os
 import time
 import sys
--
GitLab
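
The whole patch applies one pattern: never rely on the platform default encoding (on a Chinese-locale Windows machine that default is a legacy code page such as cp936/GBK), and instead open every text file through io.open with an explicit encoding="utf8". The reload(sys) / sys.setdefaultencoding('utf8') lines added to run_classifier.py, run_squad.py and train.py are a Python 2-only workaround; reload is not a builtin and sys.setdefaultencoding does not exist in Python 3. Below is a minimal, self-contained sketch of the io.open pattern only, not code from the repository; the file names demo_vocab.txt and demo_predictions.json are illustrative.

# -*- coding: utf-8 -*-
# Sketch of the encoding-safe I/O pattern applied by the patch above.
# Behaves the same on Python 2 and Python 3, regardless of the OS locale.
import collections
import io
import json


def load_vocab(vocab_file):
    """Load a tab-separated vocabulary file, decoding it explicitly as UTF-8."""
    vocab = collections.OrderedDict()
    # io.open accepts an encoding argument on both Python 2 and 3, so the read
    # does not depend on the Windows code page the process happens to run under.
    with io.open(vocab_file, "r", encoding="utf8") as fin:
        for num, line in enumerate(fin):
            token = line.strip().split("\t")[0]
            vocab[token] = num
    return vocab


def write_json(path, obj):
    """Dump a dict as JSON through a UTF-8 text stream."""
    with io.open(path, "w", encoding="utf8") as writer:
        # The u"" literal keeps the argument unicode on Python 2; text-mode io
        # streams there accept only unicode, not byte strings.
        writer.write(json.dumps(obj, indent=4) + u"\n")


if __name__ == "__main__":
    # Tiny round trip: write a vocab file, read it back, then dump some JSON.
    with io.open("demo_vocab.txt", "w", encoding="utf8") as f:
        f.write(u"[PAD]\t0\n[UNK]\t1\n")
    print(load_vocab("demo_vocab.txt"))
    write_json("demo_predictions.json", {"answer": u"unicode-safe output"})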