提交 6ed5f04d 编写于 作者: SYSU_BOND's avatar SYSU_BOND 提交者: bbking

replace open with io.open to be compatible with windows (#3707)

* update downloads.py

* fix bug in ERNIE-based inference

* replace open with io.open to be compatible with windows
上级 68d6379c
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
evaluate wordseg for LAC and other open-source wordseg tools
"""
......@@ -21,6 +20,7 @@ from __future__ import division
import sys
import os
import io
def to_unicode(string):
......@@ -71,7 +71,7 @@ def load_testdata(datapath="./data/test_data/test_part"):
"""none"""
sentences = []
sent_seg_list = []
for line in open(datapath):
for line in io.open(datapath, 'r', encoding='utf8'):
sent, label = line.strip().split("\t")
sentences.append(sent)
......@@ -110,7 +110,7 @@ def get_lac_result():
`sh run.sh | tail -n 100 > result.txt`
"""
sent_seg_list = []
for line in open("./result.txt"):
for line in io.open("./result.txt", 'r', encoding='utf8'):
line = line.strip().split(" ")
words = [pair.split("/")[0] for pair in line]
labels = [pair.split("/")[1] for pair in line]
......
......@@ -31,20 +31,31 @@ from model_check import check_version
parser = argparse.ArgumentParser(__doc__)
# 1. model parameters
model_g = utils.ArgumentGroup(parser, "model", "model configuration")
model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 128, "The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2, "The number of bi_gru layers in the network.")
model_g.add_arg("word_emb_dim", int, 128,
"The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 128,
"The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2,
"The number of bi_gru layers in the network.")
model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
# 2. data parameters
data_g = utils.ArgumentGroup(parser, "data", "data paths")
data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
data_g.add_arg("test_data", str, "./data/test.tsv", "The folder where the training data is located.")
data_g.add_arg("word_dict_path", str, "./conf/word.dic",
"The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
"The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
"The path of the word replacement Dictionary.")
data_g.add_arg("test_data", str, "./data/test.tsv",
"The folder where the training data is located.")
data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
data_g.add_arg("batch_size", int, 200, "The number of sequences contained in a mini-batch, "
"or the maximum number of tokens (include paddings) contained in a mini-batch.")
data_g.add_arg(
"batch_size", int, 200,
"The number of sequences contained in a mini-batch, "
"or the maximum number of tokens (include paddings) contained in a mini-batch."
)
def do_eval(args):
dataset = reader.Dataset(args)
......@@ -62,23 +73,23 @@ def do_eval(args):
else:
place = fluid.CPUPlace()
pyreader = creator.create_pyreader(args, file_name=args.test_data,
feed_list=test_ret['feed_list'],
place=place,
model='lac',
reader=dataset,
mode='test')
pyreader = creator.create_pyreader(
args,
file_name=args.test_data,
feed_list=test_ret['feed_list'],
place=place,
model='lac',
reader=dataset,
mode='test')
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
# load model
utils.init_checkpoint(exe, args.init_checkpoint, test_program)
test_process(exe=exe,
program=test_program,
reader=pyreader,
test_ret=test_ret
)
test_process(
exe=exe, program=test_program, reader=pyreader, test_ret=test_ret)
def test_process(exe, program, reader, test_ret):
"""
......@@ -93,20 +104,21 @@ def test_process(exe, program, reader, test_ret):
start_time = time.time()
for data in reader():
nums_infer, nums_label, nums_correct = exe.run(program,
fetch_list=[
test_ret["num_infer_chunks"],
test_ret["num_label_chunks"],
test_ret["num_correct_chunks"],
],
feed=data,
)
nums_infer, nums_label, nums_correct = exe.run(
program,
fetch_list=[
test_ret["num_infer_chunks"],
test_ret["num_label_chunks"],
test_ret["num_correct_chunks"],
],
feed=data, )
test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct)
precision, recall, f1 = test_ret["chunk_evaluator"].eval()
end_time = time.time()
print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s"
% (precision, recall, f1, end_time - start_time))
print("[test] P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" %
(precision, recall, f1, end_time - start_time))
if __name__ == '__main__':
args = parser.parse_args()
......
......@@ -14,6 +14,7 @@ sys.path.append('../models/')
from model_check import check_cuda
from model_check import check_version
def save_inference_model(args):
# model definition
......@@ -30,20 +31,19 @@ def save_inference_model(args):
args, dataset.vocab_size, dataset.num_labels, mode='infer')
infer_program = infer_program.clone(for_test=True)
# load pretrain check point
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
utils.init_checkpoint(exe, args.init_checkpoint, infer_program)
fluid.io.save_inference_model(args.inference_save_dir,
['words'],
infer_ret['crf_decode'],
exe,
main_program=infer_program,
model_filename='model.pdmodel',
params_filename='params.pdparams',
)
fluid.io.save_inference_model(
args.inference_save_dir,
['words'],
infer_ret['crf_decode'],
exe,
main_program=infer_program,
model_filename='model.pdmodel',
params_filename='params.pdparams', )
def test_inference_model(model_dir, text_list, dataset):
......@@ -68,45 +68,46 @@ def test_inference_model(model_dir, text_list, dataset):
tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
# for empty input, output the same empty
if(sum(base_shape[0]) == 0 ):
if (sum(base_shape[0]) == 0):
crf_decode = [tensor_words]
else:
# load inference model
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
[inferencer, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(model_dir, exe,
model_filename='model.pdmodel',
params_filename='params.pdparams',
)
fetch_targets] = fluid.io.load_inference_model(
model_dir,
exe,
model_filename='model.pdmodel',
params_filename='params.pdparams', )
assert feed_target_names[0] == "words"
print("Load inference model from %s"%(model_dir))
print("Load inference model from %s" % (model_dir))
# get lac result
crf_decode = exe.run(inferencer,
feed={feed_target_names[0]:tensor_words},
fetch_list=fetch_targets,
return_numpy=False,
use_program_cache=True,
)
crf_decode = exe.run(
inferencer,
feed={feed_target_names[0]: tensor_words},
fetch_list=fetch_targets,
return_numpy=False,
use_program_cache=True, )
# parse the crf_decode result
result = utils.parse_result(tensor_words,crf_decode[0], dataset)
for i,(sent, tags) in enumerate(result):
result_list = ['(%s, %s)'%(ch, tag) for ch, tag in zip(sent,tags)]
result = utils.parse_result(tensor_words, crf_decode[0], dataset)
for i, (sent, tags) in enumerate(result):
result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
print(''.join(result_list))
if __name__=="__main__":
if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
utils.load_yaml(parser,'conf/args.yaml')
utils.load_yaml(parser, 'conf/args.yaml')
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
print("save inference model")
save_inference_model(args)
print("inference model save in %s"%args.inference_save_dir)
print("inference model save in %s" % args.inference_save_dir)
print("test inference model")
dataset = reader.Dataset(args)
test_data = [u'百度是一家高科技公司', u'中山大学是岭南第一学府']
......
......@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
......@@ -30,20 +31,31 @@ from model_check import check_version
parser = argparse.ArgumentParser(__doc__)
# 1. model parameters
model_g = utils.ArgumentGroup(parser, "model", "model configuration")
model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 256, "The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2, "The number of bi_gru layers in the network.")
model_g.add_arg("word_emb_dim", int, 128,
"The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 256,
"The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2,
"The number of bi_gru layers in the network.")
model_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
# 2. data parameters
data_g = utils.ArgumentGroup(parser, "data", "data paths")
data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
data_g.add_arg("infer_data", str, "./data/infer.tsv", "The folder where the training data is located.")
data_g.add_arg("word_dict_path", str, "./conf/word.dic",
"The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic",
"The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic",
"The path of the word replacement Dictionary.")
data_g.add_arg("infer_data", str, "./data/infer.tsv",
"The folder where the training data is located.")
data_g.add_arg("init_checkpoint", str, "./model_baseline", "Path to init model")
data_g.add_arg("batch_size", int, 200, "The number of sequences contained in a mini-batch, "
"or the maximum number of tokens (include paddings) contained in a mini-batch.")
data_g.add_arg(
"batch_size", int, 200,
"The number of sequences contained in a mini-batch, "
"or the maximum number of tokens (include paddings) contained in a mini-batch."
)
def do_infer(args):
dataset = reader.Dataset(args)
......@@ -61,14 +73,14 @@ def do_infer(args):
else:
place = fluid.CPUPlace()
pyreader = creator.create_pyreader(args, file_name=args.infer_data,
feed_list=infer_ret['feed_list'],
place=place,
model='lac',
reader=dataset,
mode='infer')
pyreader = creator.create_pyreader(
args,
file_name=args.infer_data,
feed_list=infer_ret['feed_list'],
place=place,
model='lac',
reader=dataset,
mode='infer')
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -81,8 +93,7 @@ def do_infer(args):
program=infer_program,
reader=pyreader,
fetch_vars=[infer_ret['words'], infer_ret['crf_decode']],
dataset=dataset
)
dataset=dataset)
for sent, tags in result:
result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
print(''.join(result_list))
......@@ -96,8 +107,9 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
:param reader: data reader
:return: the list of prediction result
"""
def input_check(data):
if data[0]['words'].lod()[0][-1]==0:
if data[0]['words'].lod()[0][-1] == 0:
return data[0]['words']
return None
......@@ -108,17 +120,17 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
results += utils.parse_result(crf_decode, crf_decode, dataset)
continue
words, crf_decode = exe.run(program,
fetch_list=fetch_vars,
feed=data,
return_numpy=False,
use_program_cache=True,
)
words, crf_decode = exe.run(
program,
fetch_list=fetch_vars,
feed=data,
return_numpy=False,
use_program_cache=True, )
results += utils.parse_result(words, crf_decode, dataset)
return results
if __name__=="__main__":
if __name__ == "__main__":
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
......
......@@ -14,6 +14,7 @@
"""
The file_reader converts raw corpus to input.
"""
import os
import argparse
import __future__
......
......@@ -20,6 +20,7 @@ import sys
import numpy as np
import paddle.fluid as fluid
import yaml
import io
def str2bool(v):
......@@ -50,7 +51,7 @@ class ArgumentGroup(object):
def load_yaml(parser, file_name, **kwargs):
with open(file_name) as f:
with io.open(file_name, 'r', encoding='utf8') as f:
args = yaml.load(f)
for title in args:
group = parser.add_argument_group(title=title, description='')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册