From 6ed5f04dcd5ff403565231fc2241560070e7faa6 Mon Sep 17 00:00:00 2001
From: Bond-SYSU <374579557@qq.com>
Date: Tue, 22 Oct 2019 14:53:20 +0800
Subject: [PATCH] replace open with io.open to be compatible with windows
 (#3707)

* update downloads.py

* fix bug on ernie based inferring

* replace open with io.open to be compatible with windows
---
 PaddleNLP/lexical_analysis/compare.py         |  6 +-
 PaddleNLP/lexical_analysis/eval.py            | 72 +++++++++++--------
 PaddleNLP/lexical_analysis/inference_model.py | 57 +++++++--------
 PaddleNLP/lexical_analysis/predict.py         | 66 ++++++++++-------
 PaddleNLP/lexical_analysis/reader.py          |  1 +
 PaddleNLP/lexical_analysis/utils.py           |  3 +-
 6 files changed, 116 insertions(+), 89 deletions(-)

diff --git a/PaddleNLP/lexical_analysis/compare.py b/PaddleNLP/lexical_analysis/compare.py
index 3e21f66d..43cecc62 100644
--- a/PaddleNLP/lexical_analysis/compare.py
+++ b/PaddleNLP/lexical_analysis/compare.py
@@ -12,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 evaluate wordseg for LAC and other open-source wordseg tools
 """
@@ -21,6 +20,7 @@ from __future__ import division
 
 import sys
 import os
+import io
 
 
 def to_unicode(string):
@@ -71,7 +71,7 @@ def load_testdata(datapath="./data/test_data/test_part"):
     """none"""
     sentences = []
     sent_seg_list = []
-    for line in open(datapath):
+    for line in io.open(datapath, 'r', encoding='utf8'):
         sent, label = line.strip().split("\t")
         sentences.append(sent)
 
@@ -110,7 +110,7 @@ def get_lac_result():
     `sh run.sh | tail -n 100 > result.txt`
     """
     sent_seg_list = []
-    for line in open("./result.txt"):
+    for line in io.open("./result.txt", 'r', encoding='utf8'):
         line = line.strip().split(" ")
         words = [pair.split("/")[0] for pair in line]
         labels = [pair.split("/")[1] for pair in line]
diff --git a/PaddleNLP/lexical_analysis/eval.py b/PaddleNLP/lexical_analysis/eval.py
index 03cf1535..3b96d0c7 100644
--- a/PaddleNLP/lexical_analysis/eval.py
+++ b/PaddleNLP/lexical_analysis/eval.py
@@ -31,20 +31,31 @@ from model_check import check_version
 
 parser = argparse.ArgumentParser(__doc__)
 # 1. model parameters
 model_g = utils.ArgumentGroup(parser, "model", "model configuration")
-model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
-model_g.add_arg("grnn_hidden_dim", int, 128, "The number of hidden nodes in the GRNN layer.")
-model_g.add_arg("bigru_num", int, 2, "The number of bi_gru layers in the network.")
+model_g.add_arg("word_emb_dim", int, 128,
+                "The dimension in which a word is embedded.")
+model_g.add_arg("grnn_hidden_dim", int, 128,
+                "The number of hidden nodes in the GRNN layer.")
+model_g.add_arg("bigru_num", int, 2,
+                "The number of bi_gru layers in the network.")
 model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
 # 2. data parameters
 data_g = utils.ArgumentGroup(parser, "data", "data paths")
-data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
-data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
-data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
-data_g.add_arg("test_data", str, "./data/test.tsv", "The folder where the training data is located.")
+data_g.add_arg("word_dict_path", str, "./conf/word.dic",
+               "The path of the word dictionary.")
+data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
+               "The path of the label dictionary.")
+data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
+               "The path of the word replacement Dictionary.")
+data_g.add_arg("test_data", str, "./data/test.tsv",
+               "The folder where the training data is located.")
 data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
-data_g.add_arg("batch_size", int, 200, "The number of sequences contained in a mini-batch, "
-               "or the maximum number of tokens (include paddings) contained in a mini-batch.")
+data_g.add_arg(
+    "batch_size", int, 200,
+    "The number of sequences contained in a mini-batch, "
+    "or the maximum number of tokens (include paddings) contained in a mini-batch."
+)
+
 
 def do_eval(args):
     dataset = reader.Dataset(args)
@@ -62,23 +73,23 @@ def do_eval(args):
     else:
         place = fluid.CPUPlace()
 
-    pyreader = creator.create_pyreader(args, file_name=args.test_data,
-                                       feed_list=test_ret['feed_list'],
-                                       place=place,
-                                       model='lac',
-                                       reader=dataset,
-                                       mode='test')
+    pyreader = creator.create_pyreader(
+        args,
+        file_name=args.test_data,
+        feed_list=test_ret['feed_list'],
+        place=place,
+        model='lac',
+        reader=dataset,
+        mode='test')
 
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
 
     # load model
     utils.init_checkpoint(exe, args.init_checkpoint, test_program)
-    test_process(exe=exe,
-                 program=test_program,
-                 reader=pyreader,
-                 test_ret=test_ret
-                 )
+    test_process(
+        exe=exe, program=test_program, reader=pyreader, test_ret=test_ret)
+
 
 def test_process(exe, program, reader, test_ret):
     """
@@ -93,20 +104,21 @@ def test_process(exe, program, reader, test_ret):
 
     start_time = time.time()
     for data in reader():
-        nums_infer, nums_label, nums_correct = exe.run(program,
-                                                       fetch_list=[
-                                                           test_ret["num_infer_chunks"],
-                                                           test_ret["num_label_chunks"],
-                                                           test_ret["num_correct_chunks"],
-                                                       ],
-                                                       feed=data,
-                                                       )
+        nums_infer, nums_label, nums_correct = exe.run(
+            program,
+            fetch_list=[
+                test_ret["num_infer_chunks"],
+                test_ret["num_label_chunks"],
+                test_ret["num_correct_chunks"],
+            ],
+            feed=data, )
         test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct)
 
     precision, recall, f1 = test_ret["chunk_evaluator"].eval()
     end_time = time.time()
-    print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s"
-          % (precision, recall, f1, end_time - start_time))
+    print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" %
+          (precision, recall, f1, end_time - start_time))
+
 
 if __name__ == '__main__':
     args = parser.parse_args()
diff --git a/PaddleNLP/lexical_analysis/inference_model.py b/PaddleNLP/lexical_analysis/inference_model.py
index 024cc36c..89075723 100644
--- a/PaddleNLP/lexical_analysis/inference_model.py
+++ b/PaddleNLP/lexical_analysis/inference_model.py
@@ -14,6 +14,7 @@ sys.path.append('../models/')
 from model_check import check_cuda
 from model_check import check_version
 
+
 def save_inference_model(args):
 
     # model definition
@@ -30,20 +31,19 @@ def save_inference_model(args):
         args, dataset.vocab_size, dataset.num_labels, mode='infer')
     infer_program = infer_program.clone(for_test=True)
 
-
     # load pretrain check point
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     utils.init_checkpoint(exe, args.init_checkpoint, infer_program)
 
-    fluid.io.save_inference_model(args.inference_save_dir,
-                                  ['words'],
-                                  infer_ret['crf_decode'],
-                                  exe,
-                                  main_program=infer_program,
-                                  model_filename='model.pdmodel',
-                                  params_filename='params.pdparams',
-                                  )
+    fluid.io.save_inference_model(
+        args.inference_save_dir,
+        ['words'],
+        infer_ret['crf_decode'],
+        exe,
+        main_program=infer_program,
+        model_filename='model.pdmodel',
+        params_filename='params.pdparams', )
 
 
 def test_inference_model(model_dir, text_list, dataset):
@@ -68,45 +68,46 @@ def test_inference_model(model_dir, text_list, dataset):
     tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
     # for empty input, output the same empty
-    if(sum(base_shape[0]) == 0 ):
+    if (sum(base_shape[0]) == 0):
         crf_decode = [tensor_words]
     else:
         # load inference model
         inference_scope = fluid.core.Scope()
         with fluid.scope_guard(inference_scope):
             [inferencer, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(model_dir, exe,
-                                                            model_filename='model.pdmodel',
-                                                            params_filename='params.pdparams',
-                                                            )
+             fetch_targets] = fluid.io.load_inference_model(
+                 model_dir,
+                 exe,
+                 model_filename='model.pdmodel',
+                 params_filename='params.pdparams', )
             assert feed_target_names[0] == "words"
-            print("Load inference model from %s"%(model_dir))
+            print("Load inference model from %s" % (model_dir))
 
             # get lac result
-            crf_decode = exe.run(inferencer,
-                                 feed={feed_target_names[0]:tensor_words},
-                                 fetch_list=fetch_targets,
-                                 return_numpy=False,
-                                 use_program_cache=True,
-                                 )
+            crf_decode = exe.run(
+                inferencer,
+                feed={feed_target_names[0]: tensor_words},
+                fetch_list=fetch_targets,
+                return_numpy=False,
+                use_program_cache=True, )
 
     # parse the crf_decode result
-    result = utils.parse_result(tensor_words,crf_decode[0], dataset)
-    for i,(sent, tags) in enumerate(result):
-        result_list = ['(%s, %s)'%(ch, tag) for ch, tag in zip(sent,tags)]
+    result = utils.parse_result(tensor_words, crf_decode[0], dataset)
+    for i, (sent, tags) in enumerate(result):
+        result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
         print(''.join(result_list))
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(__doc__)
-    utils.load_yaml(parser,'conf/args.yaml')
+    utils.load_yaml(parser, 'conf/args.yaml')
     args = parser.parse_args()
     check_cuda(args.use_cuda)
     check_version()
 
     print("save inference model")
     save_inference_model(args)
-
-    print("inference model save in %s"%args.inference_save_dir)
+
+    print("inference model save in %s" % args.inference_save_dir)
     print("test inference model")
     dataset = reader.Dataset(args)
     test_data = [u'百度是一家高科技公司', u'中山大学是岭南第一学府']
diff --git a/PaddleNLP/lexical_analysis/predict.py b/PaddleNLP/lexical_analysis/predict.py
index 002e888a..d3ed22ac 100644
--- a/PaddleNLP/lexical_analysis/predict.py
+++ b/PaddleNLP/lexical_analysis/predict.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import argparse
 import os
 import time
@@ -30,20 +31,31 @@ from model_check import check_version
 
 parser = argparse.ArgumentParser(__doc__)
 # 1. model parameters
 model_g = utils.ArgumentGroup(parser, "model", "model configuration")
-model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
-model_g.add_arg("grnn_hidden_dim", int, 256, "The number of hidden nodes in the GRNN layer.")
-model_g.add_arg("bigru_num", int, 2, "The number of bi_gru layers in the network.")
+model_g.add_arg("word_emb_dim", int, 128,
+                "The dimension in which a word is embedded.")
+model_g.add_arg("grnn_hidden_dim", int, 256,
+                "The number of hidden nodes in the GRNN layer.")
+model_g.add_arg("bigru_num", int, 2,
+                "The number of bi_gru layers in the network.")
 model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
 # 2. data parameters
 data_g = utils.ArgumentGroup(parser, "data", "data paths")
-data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
-data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
-data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
-data_g.add_arg("infer_data", str, "./data/infer.tsv", "The folder where the training data is located.")
+data_g.add_arg("word_dict_path", str, "./conf/word.dic",
+               "The path of the word dictionary.")
+data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
+               "The path of the label dictionary.")
+data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
+               "The path of the word replacement Dictionary.")
+data_g.add_arg("infer_data", str, "./data/infer.tsv",
+               "The folder where the training data is located.")
 data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
-data_g.add_arg("batch_size", int, 200, "The number of sequences contained in a mini-batch, "
-               "or the maximum number of tokens (include paddings) contained in a mini-batch.")
+data_g.add_arg(
+    "batch_size", int, 200,
+    "The number of sequences contained in a mini-batch, "
+    "or the maximum number of tokens (include paddings) contained in a mini-batch."
+)
+
 
 def do_infer(args):
     dataset = reader.Dataset(args)
@@ -61,14 +73,14 @@ def do_infer(args):
     else:
         place = fluid.CPUPlace()
 
-
-
-    pyreader = creator.create_pyreader(args, file_name=args.infer_data,
-                                       feed_list=infer_ret['feed_list'],
-                                       place=place,
-                                       model='lac',
-                                       reader=dataset,
-                                       mode='infer')
+    pyreader = creator.create_pyreader(
+        args,
+        file_name=args.infer_data,
+        feed_list=infer_ret['feed_list'],
+        place=place,
+        model='lac',
+        reader=dataset,
+        mode='infer')
 
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -81,8 +93,7 @@ def do_infer(args):
         program=infer_program,
         reader=pyreader,
         fetch_vars=[infer_ret['words'], infer_ret['crf_decode']],
-        dataset=dataset
-        )
+        dataset=dataset)
     for sent, tags in result:
         result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
         print(''.join(result_list))
@@ -96,8 +107,9 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
     :param reader: data reader
     :return: the list of prediction result
     """
+
     def input_check(data):
-        if data[0]['words'].lod()[0][-1]==0:
+        if data[0]['words'].lod()[0][-1] == 0:
             return data[0]['words']
         return None
 
@@ -108,17 +120,17 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
             results += utils.parse_result(crf_decode, crf_decode, dataset)
             continue
 
-        words, crf_decode = exe.run(program,
-                                    fetch_list=fetch_vars,
-                                    feed=data,
-                                    return_numpy=False,
-                                    use_program_cache=True,
-                                    )
+        words, crf_decode = exe.run(
+            program,
+            fetch_list=fetch_vars,
+            feed=data,
+            return_numpy=False,
+            use_program_cache=True, )
         results += utils.parse_result(words, crf_decode, dataset)
     return results
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
     args = parser.parse_args()
     check_cuda(args.use_cuda)
     check_version()
diff --git a/PaddleNLP/lexical_analysis/reader.py b/PaddleNLP/lexical_analysis/reader.py
index 46101cd4..ddb0030e 100644
--- a/PaddleNLP/lexical_analysis/reader.py
+++ b/PaddleNLP/lexical_analysis/reader.py
@@ -14,6 +14,7 @@
 """
 The file_reader converts raw corpus to input.
 """
+
 import os
 import argparse
 import __future__
diff --git a/PaddleNLP/lexical_analysis/utils.py b/PaddleNLP/lexical_analysis/utils.py
index 8fab3252..d3ee614d 100644
--- a/PaddleNLP/lexical_analysis/utils.py
+++ b/PaddleNLP/lexical_analysis/utils.py
@@ -20,6 +20,7 @@ import sys
 import numpy as np
 import paddle.fluid as fluid
 import yaml
+import io
 
 
 def str2bool(v):
@@ -50,7 +51,7 @@ class ArgumentGroup(object):
 
 
 def load_yaml(parser, file_name, **kwargs):
-    with open(file_name) as f:
+    with io.open(file_name, 'r', encoding='utf8') as f:
         args = yaml.load(f)
         for title in args:
             group = parser.add_argument_group(title=title, description='')
-- 
GitLab
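
A quick illustration of why this change matters (a minimal sketch, not part of the patch; the file name demo_utf8.txt and the sample string below are invented for demonstration). On Python 3, the built-in open() with no encoding argument decodes text with locale.getpreferredencoding(False) -- for example cp936 on a Chinese-locale Windows machine -- so reading the repository's UTF-8 dictionaries and corpora can raise UnicodeDecodeError; on Python 2, open() returns undecoded bytes. io.open() accepts an explicit encoding on both interpreters (on Python 3 it is the same function as the builtin open), which is what the patch relies on:

    # -*- coding: utf-8 -*-
    # Illustrative only: demo_utf8.txt is a throwaway file created here.
    from __future__ import print_function

    import io
    import locale

    SAMPLE = u'百度是一家高科技公司\n'

    # Write a small UTF-8 file, the same way the patched modules read theirs.
    with io.open('demo_utf8.txt', 'w', encoding='utf8') as f:
        f.write(SAMPLE)

    # The platform default that plain open() falls back to on Python 3;
    # on Chinese-locale Windows this prints cp936, not utf-8, so decoding
    # UTF-8 bytes with it can fail or produce mojibake.
    print('default encoding:', locale.getpreferredencoding(False))

    # io.open() pins the codec, so this read behaves identically on Linux,
    # macOS, and Windows, under both Python 2 and Python 3, and always
    # yields unicode text rather than raw bytes.
    with io.open('demo_utf8.txt', 'r', encoding='utf8') as f:
        assert f.read() == SAMPLE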