From b0d11b3b429c72375ba39b8c10fe36e770c83957 Mon Sep 17 00:00:00 2001
From: xixiaoyao <emis_go@163.com>
Date: Thu, 17 Oct 2019 20:05:00 +0800
Subject: [PATCH] fix bugs: open text files as UTF-8 via io.open and normalize coding/license headers

---
 mtl_run.py                             |  2 +-
 paradigm/answer_matching.py            |  2 +-
 paradigm/mask_language_model.py        |  1 +
 paradigm/reading_comprehension.py      |  2 +-
 reader/answer_matching_reader.py       |  3 ++-
 reader/joint_reader.py                 | 16 +++++++++++++++-
 reader/mask_language_model_reader.py   |  3 ++-
 reader/reading_comprehension_reader.py | 12 +++++++-----
 utils/batching.py                      |  1 +
 utils/configure.py                     |  8 +++++---
 utils/fp16.py                          |  1 +
 utils/init.py                          |  1 +
 utils/tokenization.py                  |  4 ++--
 13 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/mtl_run.py b/mtl_run.py
index 7f457b9..8bba8bc 100644
--- a/mtl_run.py
+++ b/mtl_run.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# encoding=utf8
+# -*- coding: utf-8 -*-
 
 import os
 import sys
diff --git a/paradigm/answer_matching.py b/paradigm/answer_matching.py
index d1bd202..2d53815 100644
--- a/paradigm/answer_matching.py
+++ b/paradigm/answer_matching.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# encoding=utf8
+# -*- coding: utf-8 -*-
 
 import paddle.fluid as fluid
 
diff --git a/paradigm/mask_language_model.py b/paradigm/mask_language_model.py
index fa13d75..3f8e21e 100644
--- a/paradigm/mask_language_model.py
+++ b/paradigm/mask_language_model.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 
 import paddle.fluid as fluid
 from backbone.utils.transformer import pre_process_layer
diff --git a/paradigm/reading_comprehension.py b/paradigm/reading_comprehension.py
index 5f38842..79e0543 100644
--- a/paradigm/reading_comprehension.py
+++ b/paradigm/reading_comprehension.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# encoding=utf8
+# -*- coding: utf-8 -*-
 
 import paddle.fluid as fluid
 import collections
diff --git a/reader/answer_matching_reader.py b/reader/answer_matching_reader.py
index 1764d0a..e072837 100644
--- a/reader/answer_matching_reader.py
+++ b/reader/answer_matching_reader.py
@@ -17,6 +17,7 @@ import types
 import csv
 import numpy as np
 from utils import tokenization
+import io
 from utils.batching import prepare_batch_data
 
 
@@ -115,7 +116,7 @@ class BaseProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with io.open(input_file, "r", encoding="utf8") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
diff --git a/reader/joint_reader.py b/reader/joint_reader.py
index aced579..151847b 100644
--- a/reader/joint_reader.py
+++ b/reader/joint_reader.py
@@ -1,4 +1,18 @@
-#encoding=utf8
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# -*- coding: utf-8 -*-
+
 import os
 import sys
 import random
diff --git a/reader/mask_language_model_reader.py b/reader/mask_language_model_reader.py
index e4accba..134648d 100644
--- a/reader/mask_language_model_reader.py
+++ b/reader/mask_language_model_reader.py
@@ -22,6 +22,7 @@ import gzip
 import logging
 import re
 import six
+import io
 import collections
 from utils import tokenization
 from utils.batching import prepare_batch_data
@@ -126,7 +127,7 @@ class DataProcessor(object):
     def load_vocab(self, vocab_file):
         """Loads a vocabulary file into a dictionary."""
         vocab = collections.OrderedDict()
-        fin = open(vocab_file)
+        fin = io.open(vocab_file, encoding='utf8')
         for num, line in enumerate(fin):
             items = self.convert_to_unicode(line.strip()).split("\t")
             if len(items) > 2:
diff --git a/reader/reading_comprehension_reader.py b/reader/reading_comprehension_reader.py
index c3eaa72..afba885 100644
--- a/reader/reading_comprehension_reader.py
+++ b/reader/reading_comprehension_reader.py
@@ -14,9 +14,11 @@
 """Run MRQA"""
 
 import six
+import io
 import math
 import json
 import random
+import io
 import collections
 import numpy as np
 from utils import tokenization
@@ -401,14 +403,14 @@ class DataProcessor(object):
 
             all_nbest_json[example.qas_id] = nbest_json
 
-        with open(output_prediction_file, "w") as writer:
+        with io.open(output_prediction_file, "w", encoding="utf8") as writer:
             writer.write(json.dumps(all_predictions, indent=4) + "\n")
 
-        with open(output_nbest_file, "w") as writer:
+        with io.open(output_nbest_file, "w", encoding="utf8") as writer:
             writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
 
         if with_negative:
-            with open(output_null_log_odds_file, "w") as writer:
+            with io.open(output_null_log_odds_file, "w", encoding="utf8") as writer:
                 writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
 
 
@@ -486,7 +488,7 @@ def read_mrqa_examples(input_file, is_training, with_negative=False):
     """Read a MRQA json file into a list of MRQAExample."""
     phase = 'training' if is_training else 'testing'
     print("loading mrqa {} data...".format(phase))
-    with open(input_file, "r") as reader:
+    with io.open(input_file, "r", encoding="utf8") as reader:
         input_data = json.load(reader)["data"]
 
     def is_whitespace(c):
@@ -736,7 +738,7 @@ def estimate_runtime_examples(data_path, sample_rate, tokenizer, \
     assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0"
 
     print("loading data with json parser...")
-    with open(data_path, "r") as reader:
+    with io.open(data_path, "r", encoding="utf8") as reader:
         data = json.load(reader)["data"]
 
     num_raw_examples = 0
diff --git a/utils/batching.py b/utils/batching.py
index 13803cf..569a55c 100644
--- a/utils/batching.py
+++ b/utils/batching.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 """Mask, padding and batching."""
 from __future__ import absolute_import
 from __future__ import division
diff --git a/utils/configure.py b/utils/configure.py
index 2d63b0b..67de31f 100644
--- a/utils/configure.py
+++ b/utils/configure.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 
 from __future__ import absolute_import
 from __future__ import division
@@ -18,6 +19,7 @@ from __future__ import print_function
 
 import os
 import sys
+import io
 import argparse
 import json
 import yaml
@@ -38,7 +40,7 @@ class JsonConfig(object):
 
     def _parse(self, config_path):
         try:
-            with open(config_path) as json_file:
+            with io.open(config_path, encoding="utf8") as json_file:
                 config_dict = json.load(json_file)
                 assert isinstance(config_dict, dict), "Object in {} is NOT a dict.".format(config_path)
         except:
@@ -216,7 +218,7 @@ class PDConfig(object):
             raise Warning("the json file %s does not exist." % file_path)
             return
 
-        with open(file_path, "r") as fin:
+        with io.open(file_path, "r", encoding="utf8") as fin:
             self.json_config = json.loads(fin.read())
             fin.close()
 
@@ -241,7 +243,7 @@ class PDConfig(object):
                 raise Warning("the yaml file %s does not exist." % file_path)
                 return
 
-            with open(file_path, "r") as fin: 
+            with io.open(file_path, "r", encoding="utf8") as fin: 
                 self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
                 fin.close()
 
diff --git a/utils/fp16.py b/utils/fp16.py
index e153c2b..28e0be4 100644
--- a/utils/fp16.py
+++ b/utils/fp16.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 
 from __future__ import print_function
 import paddle
diff --git a/utils/init.py b/utils/init.py
index abdad1f..0e8c47e 100644
--- a/utils/init.py
+++ b/utils/init.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 
 from __future__ import print_function
 
diff --git a/utils/tokenization.py b/utils/tokenization.py
index 3a52ecf..8001f95 100644
--- a/utils/tokenization.py
+++ b/utils/tokenization.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 import collections
 import unicodedata
 import six
-
+import io
 
 def convert_to_unicode(text):
     """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
@@ -68,7 +68,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = open(vocab_file)
+    fin = io.open(vocab_file, encoding="utf8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
-- 
GitLab