PaddlePaddle / PALM
Commit a4987b6a (unverified)
Authored on Oct 17, 2019 by Xiaoyao Xi; committed via GitHub on Oct 17, 2019
Merge pull request #8 from xixiaoyao/master
fix bugs
Parents: 6ef6adcb, 5f3bb81e
Showing 14 changed files with 52 additions and 24 deletions (+52 / -24)
Files changed:
  .gitignore                              +4   -0
  mtl_run.py                              +1   -1
  paradigm/answer_matching.py             +1   -1
  paradigm/mask_language_model.py         +1   -0
  paradigm/reading_comprehension.py       +1   -1
  reader/answer_matching_reader.py        +2   -1
  reader/joint_reader.py                  +15  -1
  reader/mask_language_model_reader.py    +2   -1
  reader/reading_comprehension_reader.py  +7   -5
  utils/batching.py                       +1   -0
  utils/configure.py                      +5   -3
  utils/fp16.py                           +1   -0
  utils/init.py                           +1   -0
  utils/tokenization.py                   +10  -10
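Taken together, the changes follow one pattern: make text encoding explicit. Source files switch their encoding comment from "# encoding=utf8" to the standard "# -*- coding: utf-8 -*-", and bare open() calls are replaced with io.open(..., encoding="utf8") so data, config, and vocabulary files are decoded as UTF-8 text on both Python 2 and Python 3. A minimal, self-contained sketch of that pattern (the file name and contents below are invented for illustration and are not part of the repository):

# -*- coding: utf-8 -*-
import io

# Hypothetical file, created here only so the example runs end to end.
with io.open("example_utf8.txt", "w", encoding="utf8") as f:
    f.write(u"answer matching 样例\n")  # mixed ASCII and Chinese text

# The pattern this commit applies: io.open with an explicit encoding returns
# decoded unicode text on both Python 2 and Python 3, whereas a bare open()
# depends on interpreter defaults and yields raw bytes under Python 2.
with io.open("example_utf8.txt", "r", encoding="utf8") as f:
    text = f.read()

print(text)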
.gitignore (new file, mode 100644)

+*.pyc
+__pycache__
+pretrain_model
+output_model
mtl_run.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# encoding=utf8
+# -*- coding: utf-8 -*-
 import os
 import sys
paradigm/answer_matching.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# encoding=utf8
+# -*- coding: utf-8 -*-
 import paddle.fluid as fluid
paradigm/mask_language_model.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 import paddle.fluid as fluid
 from backbone.utils.transformer import pre_process_layer
paradigm/reading_comprehension.py

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# encoding=utf8
+# -*- coding: utf-8 -*-
 import paddle.fluid as fluid
 import collections
reader/answer_matching_reader.py

@@ -17,6 +17,7 @@ import types
 import csv
 import numpy as np
 from utils import tokenization
+import io
 from utils.batching import prepare_batch_data

@@ -115,7 +116,7 @@ class BaseProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with io.open(input_file, "r", encoding="utf8") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
reader/joint_reader.py

-#encoding=utf8
 # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 import os
 import sys
 import random
reader/mask_language_model_reader.py

@@ -22,6 +22,7 @@ import gzip
 import logging
 import re
 import six
+import io
 import collections
 from utils import tokenization
 from utils.batching import prepare_batch_data

@@ -126,7 +127,7 @@ class DataProcessor(object):
     def load_vocab(self, vocab_file):
         """Loads a vocabulary file into a dictionary."""
         vocab = collections.OrderedDict()
-        fin = open(vocab_file)
+        fin = io.open(vocab_file, encoding='utf8')
         for num, line in enumerate(fin):
             items = self.convert_to_unicode(line.strip()).split("\t")
             if len(items) > 2:
reader/reading_comprehension_reader.py

@@ -14,9 +14,11 @@
 """Run MRQA"""
 import six
 import io
 import math
 import json
 import random
 import io
 import collections
 import numpy as np
 from utils import tokenization

@@ -401,14 +403,14 @@ class DataProcessor(object):
            all_nbest_json[example.qas_id] = nbest_json

-        with open(output_prediction_file, "w") as writer:
+        with io.open(output_prediction_file, "w", encoding="utf8") as writer:
             writer.write(json.dumps(all_predictions, indent=4) + "\n")

-        with open(output_nbest_file, "w") as writer:
+        with io.open(output_nbest_file, "w", encoding="utf8") as writer:
             writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

         if with_negative:
-            with open(output_null_log_odds_file, "w") as writer:
+            with io.open(output_null_log_odds_file, "w", encoding="utf8") as writer:
                 writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

@@ -486,7 +488,7 @@ def read_mrqa_examples(input_file, is_training, with_negative=False):
     """Read a MRQA json file into a list of MRQAExample."""
     phase = 'training' if is_training else 'testing'
     print("loading mrqa {} data...".format(phase))
-    with open(input_file, "r") as reader:
+    with io.open(input_file, "r", encoding="utf8") as reader:
         input_data = json.load(reader)["data"]

     def is_whitespace(c):

@@ -736,7 +738,7 @@ def estimate_runtime_examples(data_path, sample_rate, tokenizer, \
     assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0"
     print("loading data with json parser...")
-    with open(data_path, "r") as reader:
+    with io.open(data_path, "r", encoding="utf8") as reader:
         data = json.load(reader)["data"]

     num_raw_examples = 0
utils/batching.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 """Mask, padding and batching."""
 from __future__ import absolute_import
 from __future__ import division
utils/configure.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import division

@@ -18,6 +19,7 @@ from __future__ import print_function
 import os
 import sys
+import io
 import argparse
 import json
 import yaml

@@ -38,7 +40,7 @@ class JsonConfig(object):
     def _parse(self, config_path):
         try:
-            with open(config_path) as json_file:
+            with io.open(config_path, encoding="utf8") as json_file:
                 config_dict = json.load(json_file)
                 assert isinstance(config_dict, dict), "Object in {} is NOT a dict.".format(config_path)
         except:

@@ -216,7 +218,7 @@ class PDConfig(object):
            raise Warning("the json file %s does not exist." % file_path)
            return

-        with open(file_path, "r") as fin:
+        with io.open(file_path, "r", encoding="utf8") as fin:
            self.json_config = json.loads(fin.read())
        fin.close()

@@ -241,7 +243,7 @@ class PDConfig(object):
            raise Warning("the yaml file %s does not exist." % file_path)
            return

-        with open(file_path, "r") as fin:
+        with io.open(file_path, "r", encoding="utf8") as fin:
            self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
        fin.close()
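As a side note, a hedged sketch of how the JsonConfig._parse path above behaves after this change; toy_config.json and its contents are invented here purely so the snippet is runnable and are not part of the repository:

# -*- coding: utf-8 -*-
import io
import json

# Hypothetical config file, written only for this illustration.
with io.open("toy_config.json", "w", encoding="utf8") as f:
    f.write(u'{"batch_size": 32, "max_seq_len": 512}')

# Mirrors the updated _parse: read the file with an explicit UTF-8 encoding,
# parse it as JSON, and require the top-level object to be a dict.
with io.open("toy_config.json", encoding="utf8") as json_file:
    config_dict = json.load(json_file)
assert isinstance(config_dict, dict), "Object in toy_config.json is NOT a dict."
print(config_dict["batch_size"])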
utils/fp16.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 from __future__ import print_function
 import paddle
utils/init.py

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# -*- coding: utf-8 -*-
 from __future__ import print_function
utils/tokenization.py

@@ -20,7 +20,7 @@ from __future__ import print_function
 import collections
 import unicodedata
 import six
+import io

 def convert_to_unicode(text):
     """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""

@@ -68,15 +68,15 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = open(vocab_file)
-    for num, line in enumerate(fin):
-        items = convert_to_unicode(line.strip()).split("\t")
-        if len(items) > 2:
-            break
-        token = items[0]
-        index = items[1] if len(items) == 2 else num
-        token = token.strip()
-        vocab[token] = int(index)
+    with io.open(vocab_file, encoding="utf8") as fin:
+        for num, line in enumerate(fin):
+            items = convert_to_unicode(line.strip()).split("\t")
+            if len(items) > 2:
+                break
+            token = items[0]
+            index = items[1] if len(items) == 2 else num
+            token = token.strip()
+            vocab[token] = int(index)
     return vocab
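Finally, a short usage sketch of the rewritten load_vocab above. The toy vocabulary file and the simplified convert_to_unicode stand-in are invented for illustration; the real helper lives in utils/tokenization.py, and a real run would point at the pretrained model's vocab file:

# -*- coding: utf-8 -*-
import collections
import io

def convert_to_unicode(text):
    # Simplified stand-in for utils.tokenization.convert_to_unicode.
    if isinstance(text, bytes):
        return text.decode("utf-8")
    return text

def load_vocab(vocab_file):
    """Same logic as the rewritten load_vocab above."""
    vocab = collections.OrderedDict()
    with io.open(vocab_file, encoding="utf8") as fin:
        for num, line in enumerate(fin):
            items = convert_to_unicode(line.strip()).split("\t")
            if len(items) > 2:
                break
            token = items[0]
            index = items[1] if len(items) == 2 else num
            token = token.strip()
            vocab[token] = int(index)
    return vocab

# Toy vocabulary, one token per line; ids default to the line number.
with io.open("toy_vocab.txt", "w", encoding="utf8") as f:
    f.write(u"[PAD]\n[UNK]\n[CLS]\n[SEP]\n")

print(load_vocab("toy_vocab.txt"))  # e.g. OrderedDict([('[PAD]', 0), ('[UNK]', 1), ...])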