From b0d11b3b429c72375ba39b8c10fe36e770c83957 Mon Sep 17 00:00:00 2001 From: xixiaoyao Date: Thu, 17 Oct 2019 20:05:00 +0800 Subject: [PATCH] fix bugs --- mtl_run.py | 2 +- paradigm/answer_matching.py | 2 +- paradigm/mask_language_model.py | 1 + paradigm/reading_comprehension.py | 2 +- reader/answer_matching_reader.py | 3 ++- reader/joint_reader.py | 16 +++++++++++++++- reader/mask_language_model_reader.py | 3 ++- reader/reading_comprehension_reader.py | 12 +++++++----- utils/batching.py | 1 + utils/configure.py | 8 +++++--- utils/fp16.py | 1 + utils/init.py | 1 + utils/tokenization.py | 4 ++-- 13 files changed, 40 insertions(+), 16 deletions(-) diff --git a/mtl_run.py b/mtl_run.py index 7f457b9..8bba8bc 100644 --- a/mtl_run.py +++ b/mtl_run.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# encoding=utf8 +# -*- coding: utf-8 -*- import os import sys diff --git a/paradigm/answer_matching.py b/paradigm/answer_matching.py index d1bd202..2d53815 100644 --- a/paradigm/answer_matching.py +++ b/paradigm/answer_matching.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# encoding=utf8 +# -*- coding: utf-8 -*- import paddle.fluid as fluid diff --git a/paradigm/mask_language_model.py b/paradigm/mask_language_model.py index fa13d75..3f8e21e 100644 --- a/paradigm/mask_language_model.py +++ b/paradigm/mask_language_model.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -*- coding: utf-8 -*- import paddle.fluid as fluid from backbone.utils.transformer import pre_process_layer diff --git a/paradigm/reading_comprehension.py b/paradigm/reading_comprehension.py index 5f38842..79e0543 100644 --- a/paradigm/reading_comprehension.py +++ b/paradigm/reading_comprehension.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# encoding=utf8 +# -*- coding: utf-8 -*- import paddle.fluid as fluid import collections diff --git a/reader/answer_matching_reader.py b/reader/answer_matching_reader.py index 1764d0a..e072837 100644 --- a/reader/answer_matching_reader.py +++ b/reader/answer_matching_reader.py @@ -17,6 +17,7 @@ import types import csv import numpy as np from utils import tokenization +import io from utils.batching import prepare_batch_data @@ -115,7 +116,7 @@ class BaseProcessor(object): @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" - with open(input_file, "r") as f: + with io.open(input_file, "r", encoding="utf8") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: diff --git a/reader/joint_reader.py b/reader/joint_reader.py index aced579..151847b 100644 --- a/reader/joint_reader.py +++ b/reader/joint_reader.py @@ -1,4 +1,18 @@ -#encoding=utf8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -*- coding: utf-8 -*- + import os import sys import random diff --git a/reader/mask_language_model_reader.py b/reader/mask_language_model_reader.py index e4accba..134648d 100644 --- a/reader/mask_language_model_reader.py +++ b/reader/mask_language_model_reader.py @@ -22,6 +22,7 @@ import gzip import logging import re import six +import io import collections from utils import tokenization from utils.batching import prepare_batch_data @@ -126,7 +127,7 @@ class DataProcessor(object): def load_vocab(self, vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() - fin = open(vocab_file) + fin = io.open(vocab_file, encoding='utf8') for num, line in enumerate(fin): items = self.convert_to_unicode(line.strip()).split("\t") if len(items) > 2: diff --git a/reader/reading_comprehension_reader.py b/reader/reading_comprehension_reader.py index c3eaa72..afba885 100644 --- a/reader/reading_comprehension_reader.py +++ b/reader/reading_comprehension_reader.py @@ -14,9 +14,11 @@ """Run MRQA""" import six +import io import math import json import random +import io import collections import numpy as np from utils import tokenization @@ -401,14 +403,14 @@ class DataProcessor(object): all_nbest_json[example.qas_id] = nbest_json - with open(output_prediction_file, "w") as writer: + with io.open(output_prediction_file, "w", encoding="utf8") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") - with open(output_nbest_file, "w") as writer: + with io.open(output_nbest_file, "w", encoding="utf8") as writer: writer.write(json.dumps(all_nbest_json, indent=4) + "\n") if with_negative: - with open(output_null_log_odds_file, "w") as writer: + with io.open(output_null_log_odds_file, "w", encoding="utf8") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") @@ -486,7 +488,7 @@ def read_mrqa_examples(input_file, is_training, with_negative=False): """Read a MRQA json file into a list of MRQAExample.""" phase = 'training' if is_training else 'testing' print("loading mrqa {} data...".format(phase)) - with open(input_file, "r") as reader: + with io.open(input_file, "r", encoding="utf8") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): @@ -736,7 +738,7 @@ def estimate_runtime_examples(data_path, sample_rate, tokenizer, \ assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0" print("loading data with json parser...") - with open(data_path, "r") as reader: + with io.open(data_path, "r", encoding="utf8") as reader: data = json.load(reader)["data"] num_raw_examples = 0 diff --git a/utils/batching.py b/utils/batching.py index 13803cf..569a55c 100644 --- a/utils/batching.py +++ b/utils/batching.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -*- coding: utf-8 -*- """Mask, padding and batching.""" from __future__ import absolute_import from __future__ import division diff --git a/utils/configure.py b/utils/configure.py index 2d63b0b..67de31f 100644 --- a/utils/configure.py +++ b/utils/configure.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import division @@ -18,6 +19,7 @@ from __future__ import print_function import os import sys +import io import argparse import json import yaml @@ -38,7 +40,7 @@ class JsonConfig(object): def _parse(self, config_path): try: - with open(config_path) as json_file: + with io.open(config_path, encoding="utf8") as json_file: config_dict = json.load(json_file) assert isinstance(config_dict, dict), "Object in {} is NOT a dict.".format(config_path) except: @@ -216,7 +218,7 @@ class PDConfig(object): raise Warning("the json file %s does not exist." % file_path) return - with open(file_path, "r") as fin: + with io.open(file_path, "r", encoding="utf8") as fin: self.json_config = json.loads(fin.read()) fin.close() @@ -241,7 +243,7 @@ class PDConfig(object): raise Warning("the yaml file %s does not exist." % file_path) return - with open(file_path, "r") as fin: + with io.open(file_path, "r", encoding="utf8") as fin: self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) fin.close() diff --git a/utils/fp16.py b/utils/fp16.py index e153c2b..28e0be4 100644 --- a/utils/fp16.py +++ b/utils/fp16.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -*- coding: utf-8 -*- from __future__ import print_function import paddle diff --git a/utils/init.py b/utils/init.py index abdad1f..0e8c47e 100644 --- a/utils/init.py +++ b/utils/init.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -*- coding: utf-8 -*- from __future__ import print_function diff --git a/utils/tokenization.py b/utils/tokenization.py index 3a52ecf..8001f95 100644 --- a/utils/tokenization.py +++ b/utils/tokenization.py @@ -20,7 +20,7 @@ from __future__ import print_function import collections import unicodedata import six - +import io def convert_to_unicode(text): """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" @@ -68,7 +68,7 @@ def printable_text(text): def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() - fin = open(vocab_file) + fin = io.open(vocab_file, encoding="utf8") for num, line in enumerate(fin): items = convert_to_unicode(line.strip()).split("\t") if len(items) > 2: -- GitLab