From 6ed5f04dcd5ff403565231fc2241560070e7faa6 Mon Sep 17 00:00:00 2001
From: Bond-SYSU <374579557@qq.com>
Date: Tue, 22 Oct 2019 14:53:20 +0800
Subject: [PATCH] replace open with io.open to be compatible with windows
 (#3707)

* update downloads.py

* fix bug on ernie based inferring

* replace open with io.open to be compatible with windows
---
 PaddleNLP/lexical_analysis/compare.py         |  6 +-
 PaddleNLP/lexical_analysis/eval.py            | 72 +++++++++++--------
 PaddleNLP/lexical_analysis/inference_model.py | 57 +++++++--------
 PaddleNLP/lexical_analysis/predict.py         | 66 ++++++++++-------
 PaddleNLP/lexical_analysis/reader.py          |  1 +
 PaddleNLP/lexical_analysis/utils.py           |  3 +-
 6 files changed, 116 insertions(+), 89 deletions(-)

diff --git a/PaddleNLP/lexical_analysis/compare.py b/PaddleNLP/lexical_analysis/compare.py
index 3e21f66d..43cecc62 100644
--- a/PaddleNLP/lexical_analysis/compare.py
+++ b/PaddleNLP/lexical_analysis/compare.py
@@ -12,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 evaluate wordseg for LAC and other open-source wordseg tools
 """
@@ -21,6 +20,7 @@ from __future__ import division
 
 import sys
 import os
+import io
 
 
 def to_unicode(string):
@@ -71,7 +71,7 @@ def load_testdata(datapath="./data/test_data/test_part"):
     """none"""
     sentences = []
     sent_seg_list = []
-    for line in open(datapath):
+    for line in io.open(datapath, 'r', encoding='utf8'):
         sent, label = line.strip().split("\t")
         sentences.append(sent)
 
@@ -110,7 +110,7 @@ def get_lac_result():
     `sh run.sh | tail -n 100 > result.txt`
     """
     sent_seg_list = []
-    for line in open("./result.txt"):
+    for line in io.open("./result.txt", 'r', encoding='utf8'):
         line = line.strip().split(" ")
         words = [pair.split("/")[0] for pair in line]
         labels = [pair.split("/")[1] for pair in line]
diff --git a/PaddleNLP/lexical_analysis/eval.py b/PaddleNLP/lexical_analysis/eval.py
index 03cf1535..3b96d0c7 100644
--- a/PaddleNLP/lexical_analysis/eval.py
+++ b/PaddleNLP/lexical_analysis/eval.py
@@ -31,20 +31,31 @@ from model_check import check_version
 
 parser = argparse.ArgumentParser(__doc__)
 # 1. model parameters
 model_g = utils.ArgumentGroup(parser, "model", "model configuration")
-model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
-model_g.add_arg("grnn_hidden_dim", int, 128, "The number of hidden nodes in the GRNN layer.")
-model_g.add_arg("bigru_num", int, 2, "The number of bi_gru layers in the network.")
+model_g.add_arg("word_emb_dim", int, 128,
+                "The dimension in which a word is embedded.")
+model_g.add_arg("grnn_hidden_dim", int, 128,
+                "The number of hidden nodes in the GRNN layer.")
+model_g.add_arg("bigru_num", int, 2,
+                "The number of bi_gru layers in the network.")
 model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
 # 2. data parameters
 data_g = utils.ArgumentGroup(parser, "data", "data paths")
-data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
-data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
-data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
-data_g.add_arg("test_data", str, "./data/test.tsv", "The folder where the training data is located.")
+data_g.add_arg("word_dict_path", str, "./conf/word.dic",
+               "The path of the word dictionary.")
+data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
+               "The path of the label dictionary.")
+data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
+               "The path of the word replacement Dictionary.")
+data_g.add_arg("test_data", str, "./data/test.tsv",
+               "The folder where the training data is located.")
 data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
-data_g.add_arg("batch_size", int, 200, "The number of sequences contained in a mini-batch, "
-               "or the maximum number of tokens (include paddings) contained in a mini-batch.")
+data_g.add_arg(
+    "batch_size", int, 200,
+    "The number of sequences contained in a mini-batch, "
+    "or the maximum number of tokens (include paddings) contained in a mini-batch."
+)
+
 
 def do_eval(args):
     dataset = reader.Dataset(args)
@@ -62,23 +73,23 @@ def do_eval(args):
     else:
         place = fluid.CPUPlace()
 
-    pyreader = creator.create_pyreader(args, file_name=args.test_data,
-                                       feed_list=test_ret['feed_list'],
-                                       place=place,
-                                       model='lac',
-                                       reader=dataset,
-                                       mode='test')
+    pyreader = creator.create_pyreader(
+        args,
+        file_name=args.test_data,
+        feed_list=test_ret['feed_list'],
+        place=place,
+        model='lac',
+        reader=dataset,
+        mode='test')
 
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
 
     # load model
     utils.init_checkpoint(exe, args.init_checkpoint, test_program)
-    test_process(exe=exe,
-                 program=test_program,
-                 reader=pyreader,
-                 test_ret=test_ret
-                 )
+    test_process(
+        exe=exe, program=test_program, reader=pyreader, test_ret=test_ret)
+
 
 def test_process(exe, program, reader, test_ret):
     """
@@ -93,20 +104,21 @@ def test_process(exe, program, reader, test_ret):
 
     start_time = time.time()
     for data in reader():
-        nums_infer, nums_label, nums_correct = exe.run(program,
-                                                       fetch_list=[
-                                                           test_ret["num_infer_chunks"],
-                                                           test_ret["num_label_chunks"],
-                                                           test_ret["num_correct_chunks"],
-                                                       ],
-                                                       feed=data,
-                                                       )
+        nums_infer, nums_label, nums_correct = exe.run(
+            program,
+            fetch_list=[
+                test_ret["num_infer_chunks"],
+                test_ret["num_label_chunks"],
+                test_ret["num_correct_chunks"],
+            ],
+            feed=data, )
         test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct)
 
     precision, recall, f1 = test_ret["chunk_evaluator"].eval()
     end_time = time.time()
-    print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s"
-          % (precision, recall, f1, end_time - start_time))
+    print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" %
+          (precision, recall, f1, end_time - start_time))
+
 
 if __name__ == '__main__':
     args = parser.parse_args()
diff --git a/PaddleNLP/lexical_analysis/inference_model.py b/PaddleNLP/lexical_analysis/inference_model.py
index 024cc36c..89075723 100644
--- a/PaddleNLP/lexical_analysis/inference_model.py
+++ b/PaddleNLP/lexical_analysis/inference_model.py
@@ -14,6 +14,7 @@ sys.path.append('../models/')
 from model_check import check_cuda
 from model_check import check_version
 
+
 def save_inference_model(args):
 
     # model definition
@@ -30,20 +31,19 @@ def save_inference_model(args):
         args, dataset.vocab_size, dataset.num_labels, mode='infer')
     infer_program = infer_program.clone(for_test=True)
 
-
     # load pretrain check point
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     utils.init_checkpoint(exe, args.init_checkpoint, infer_program)
 
-    fluid.io.save_inference_model(args.inference_save_dir,
-                                  ['words'],
-                                  infer_ret['crf_decode'],
-                                  exe,
-                                  main_program=infer_program,
-                                  model_filename='model.pdmodel',
-                                  params_filename='params.pdparams',
-                                  )
+    fluid.io.save_inference_model(
+        args.inference_save_dir,
+        ['words'],
+        infer_ret['crf_decode'],
+        exe,
+        main_program=infer_program,
+        model_filename='model.pdmodel',
+        params_filename='params.pdparams', )
 
 
 def test_inference_model(model_dir, text_list, dataset):
@@ -68,45 +68,46 @@ def test_inference_model(model_dir, text_list, dataset):
     tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
     # for empty input, output the same empty
-    if(sum(base_shape[0]) == 0 ):
+    if (sum(base_shape[0]) == 0):
         crf_decode = [tensor_words]
     else:
         # load inference model
         inference_scope = fluid.core.Scope()
         with fluid.scope_guard(inference_scope):
             [inferencer, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(model_dir, exe,
-                                                            model_filename='model.pdmodel',
-                                                            params_filename='params.pdparams',
-                                                            )
+             fetch_targets] = fluid.io.load_inference_model(
+                 model_dir,
+                 exe,
+                 model_filename='model.pdmodel',
+                 params_filename='params.pdparams', )
             assert feed_target_names[0] == "words"
-            print("Load inference model from %s"%(model_dir))
+            print("Load inference model from %s" % (model_dir))
 
             # get lac result
-            crf_decode = exe.run(inferencer,
-                                 feed={feed_target_names[0]:tensor_words},
-                                 fetch_list=fetch_targets,
-                                 return_numpy=False,
-                                 use_program_cache=True,
-                                 )
+            crf_decode = exe.run(
+                inferencer,
+                feed={feed_target_names[0]: tensor_words},
+                fetch_list=fetch_targets,
+                return_numpy=False,
+                use_program_cache=True, )
 
     # parse the crf_decode result
-    result = utils.parse_result(tensor_words,crf_decode[0], dataset)
-    for i,(sent, tags) in enumerate(result):
-        result_list = ['(%s, %s)'%(ch, tag) for ch, tag in zip(sent,tags)]
+    result = utils.parse_result(tensor_words, crf_decode[0], dataset)
+    for i, (sent, tags) in enumerate(result):
+        result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
         print(''.join(result_list))
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(__doc__)
-    utils.load_yaml(parser,'conf/args.yaml')
+    utils.load_yaml(parser, 'conf/args.yaml')
     args = parser.parse_args()
     check_cuda(args.use_cuda)
     check_version()
 
     print("save inference model")
     save_inference_model(args)
-
-    print("inference model save in %s"%args.inference_save_dir)
+
+    print("inference model save in %s" % args.inference_save_dir)
     print("test inference model")
     dataset = reader.Dataset(args)
     test_data = [u'百度是一家高科技公司', u'中山大学是岭南第一学府']
diff --git a/PaddleNLP/lexical_analysis/predict.py b/PaddleNLP/lexical_analysis/predict.py
index 002e888a..d3ed22ac 100644
--- a/PaddleNLP/lexical_analysis/predict.py
+++ b/PaddleNLP/lexical_analysis/predict.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import argparse
 import os
 import time
@@ -30,20 +31,31 @@ from model_check import check_version
 
 parser = argparse.ArgumentParser(__doc__)
 # 1. model parameters
 model_g = utils.ArgumentGroup(parser, "model", "model configuration")
-model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
-model_g.add_arg("grnn_hidden_dim", int, 256, "The number of hidden nodes in the GRNN layer.")
-model_g.add_arg("bigru_num", int, 2, "The number of bi_gru layers in the network.")
+model_g.add_arg("word_emb_dim", int, 128,
+                "The dimension in which a word is embedded.")
+model_g.add_arg("grnn_hidden_dim", int, 256,
+                "The number of hidden nodes in the GRNN layer.")
+model_g.add_arg("bigru_num", int, 2,
+                "The number of bi_gru layers in the network.")
 model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
 # 2. data parameters
 data_g = utils.ArgumentGroup(parser, "data", "data paths")
-data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
-data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
-data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
-data_g.add_arg("infer_data", str, "./data/infer.tsv", "The folder where the training data is located.")
+data_g.add_arg("word_dict_path", str, "./conf/word.dic",
+               "The path of the word dictionary.")
+data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
+               "The path of the label dictionary.")
+data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
+               "The path of the word replacement Dictionary.")
+data_g.add_arg("infer_data", str, "./data/infer.tsv",
+               "The folder where the training data is located.")
 data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
-data_g.add_arg("batch_size", int, 200, "The number of sequences contained in a mini-batch, "
-               "or the maximum number of tokens (include paddings) contained in a mini-batch.")
+data_g.add_arg(
+    "batch_size", int, 200,
+    "The number of sequences contained in a mini-batch, "
+    "or the maximum number of tokens (include paddings) contained in a mini-batch."
+)
+
 
 def do_infer(args):
     dataset = reader.Dataset(args)
@@ -61,14 +73,14 @@ def do_infer(args):
     else:
         place = fluid.CPUPlace()
 
-
-
-    pyreader = creator.create_pyreader(args, file_name=args.infer_data,
-                                       feed_list=infer_ret['feed_list'],
-                                       place=place,
-                                       model='lac',
-                                       reader=dataset,
-                                       mode='infer')
+    pyreader = creator.create_pyreader(
+        args,
+        file_name=args.infer_data,
+        feed_list=infer_ret['feed_list'],
+        place=place,
+        model='lac',
+        reader=dataset,
+        mode='infer')
 
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -81,8 +93,7 @@ def do_infer(args):
         program=infer_program,
         reader=pyreader,
         fetch_vars=[infer_ret['words'], infer_ret['crf_decode']],
-        dataset=dataset
-        )
+        dataset=dataset)
     for sent, tags in result:
         result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
         print(''.join(result_list))
@@ -96,8 +107,9 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
     :param reader: data reader
     :return: the list of prediction result
     """
+
     def input_check(data):
-        if data[0]['words'].lod()[0][-1]==0:
+        if data[0]['words'].lod()[0][-1] == 0:
             return data[0]['words']
         return None
 
@@ -108,17 +120,17 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
             results += utils.parse_result(crf_decode, crf_decode, dataset)
             continue
 
-        words, crf_decode = exe.run(program,
-                                    fetch_list=fetch_vars,
-                                    feed=data,
-                                    return_numpy=False,
-                                    use_program_cache=True,
-                                    )
+        words, crf_decode = exe.run(
+            program,
+            fetch_list=fetch_vars,
+            feed=data,
+            return_numpy=False,
+            use_program_cache=True, )
         results += utils.parse_result(words, crf_decode, dataset)
     return results
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
     args = parser.parse_args()
     check_cuda(args.use_cuda)
     check_version()
diff --git a/PaddleNLP/lexical_analysis/reader.py b/PaddleNLP/lexical_analysis/reader.py
index 46101cd4..ddb0030e 100644
--- a/PaddleNLP/lexical_analysis/reader.py
+++ b/PaddleNLP/lexical_analysis/reader.py
@@ -14,6 +14,7 @@
 """
 The file_reader converts raw corpus to input.
 """
+
 import os
 import argparse
 import __future__
diff --git a/PaddleNLP/lexical_analysis/utils.py b/PaddleNLP/lexical_analysis/utils.py
index 8fab3252..d3ee614d 100644
--- a/PaddleNLP/lexical_analysis/utils.py
+++ b/PaddleNLP/lexical_analysis/utils.py
@@ -20,6 +20,7 @@ import sys
 import numpy as np
 import paddle.fluid as fluid
 import yaml
+import io
 
 
 def str2bool(v):
@@ -50,7 +51,7 @@ class ArgumentGroup(object):
 
 
 def load_yaml(parser, file_name, **kwargs):
-    with open(file_name) as f:
+    with io.open(file_name, 'r', encoding='utf8') as f:
         args = yaml.load(f)
         for title in args:
             group = parser.add_argument_group(title=title, description='')
-- 
GitLab
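
A quick illustration of why this change matters (a minimal sketch, not part of the patch; the file name demo_utf8.txt and the sample string below are invented for demonstration). On Python 3, the built-in open() with no encoding argument decodes text with locale.getpreferredencoding(False) -- for example cp936 on a Chinese-locale Windows machine -- so reading the repository's UTF-8 dictionaries and corpora can raise UnicodeDecodeError; on Python 2, open() returns undecoded bytes. io.open() accepts an explicit encoding on both interpreters (on Python 3 it is the same function as the builtin open), which is what the patch relies on:

    # -*- coding: utf-8 -*-
    # Illustrative only: demo_utf8.txt is a throwaway file created here.
    from __future__ import print_function

    import io
    import locale

    SAMPLE = u'百度是一家高科技公司\n'

    # Write a small UTF-8 file, the same way the patched modules read theirs.
    with io.open('demo_utf8.txt', 'w', encoding='utf8') as f:
        f.write(SAMPLE)

    # The platform default that plain open() falls back to on Python 3;
    # on Chinese-locale Windows this prints cp936, not utf-8, so decoding
    # UTF-8 bytes with it can fail or produce mojibake.
    print('default encoding:', locale.getpreferredencoding(False))

    # io.open() pins the codec, so this read behaves identically on Linux,
    # macOS, and Windows, under both Python 2 and Python 3, and always
    # yields unicode text rather than raw bytes.
    with io.open('demo_utf8.txt', 'r', encoding='utf8') as f:
        assert f.read() == SAMPLE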