From 8db0319c7d3df60e99d981803452f73a701b9c3d Mon Sep 17 00:00:00 2001 From: Bond-SYSU <374579557@qq.com> Date: Mon, 21 Oct 2019 21:17:32 +0800 Subject: [PATCH] Fix infer bug on Release/1.6 (#3693) * update downloads.py * fix bug on ernie based inferring --- PaddleNLP/lexical_analysis/creator.py | 116 ++++++++------ .../run_ernie_sequence_labeling.py | 144 +++++++++++------- PaddleNLP/lexical_analysis/utils.py | 44 ++++++ 3 files changed, 199 insertions(+), 105 deletions(-) diff --git a/PaddleNLP/lexical_analysis/creator.py b/PaddleNLP/lexical_analysis/creator.py index c5f3de9f..e4e1fc94 100644 --- a/PaddleNLP/lexical_analysis/creator.py +++ b/PaddleNLP/lexical_analysis/creator.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Define the function to create lexical analysis model and model's data reader """ @@ -37,22 +36,29 @@ def create_model(args, vocab_size, num_labels, mode='train'): # model's input data words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1) - targets = fluid.data(name='targets', shape=[-1, 1], dtype='int64', lod_level=1) + targets = fluid.data( + name='targets', shape=[-1, 1], dtype='int64', lod_level=1) # for inference process if mode == 'infer': - crf_decode = nets.lex_net(words, args, vocab_size, num_labels, for_infer=True, target=None) - return {"feed_list": [words], "words": words, "crf_decode": crf_decode, } + crf_decode = nets.lex_net( + words, args, vocab_size, num_labels, for_infer=True, target=None) + return { + "feed_list": [words], + "words": words, + "crf_decode": crf_decode, + } # for test or train process - avg_cost, crf_decode = nets.lex_net(words, args, vocab_size, num_labels, for_infer=False, target=targets) + avg_cost, crf_decode = nets.lex_net( + words, args, vocab_size, num_labels, for_infer=False, target=targets) (precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks) = fluid.layers.chunk_eval( - input=crf_decode, - label=targets, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((num_labels - 1) / 2.0))) + input=crf_decode, + label=targets, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((num_labels - 1) / 2.0))) chunk_evaluator = fluid.metrics.ChunkEvaluator() chunk_evaluator.reset() @@ -73,7 +79,14 @@ def create_model(args, vocab_size, num_labels, mode='train'): return ret -def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None, return_reader=False, mode='train'): +def create_pyreader(args, + file_name, + feed_list, + place, + model='lac', + reader=None, + return_reader=False, + mode='train'): # init reader if model == 'lac': @@ -81,8 +94,7 @@ def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None, feed_list=feed_list, capacity=50, use_double_buffer=True, - iterable=True - ) + iterable=True) if reader == None: reader = Dataset(args) @@ -93,20 +105,16 @@ def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None, fluid.io.batch( fluid.io.shuffle( reader.file_reader(file_name), - buf_size=args.traindata_shuffle_buffer - ), - batch_size=args.batch_size - ), - places=place - ) + buf_size=args.traindata_shuffle_buffer), + batch_size=args.batch_size), + places=place) else: pyreader.decorate_sample_list_generator( fluid.io.batch( - reader.file_reader(file_name, mode=mode), - batch_size=args.batch_size - ), - places=place - ) + reader.file_reader( + file_name, mode=mode), + batch_size=args.batch_size), + places=place) elif model == 'ernie': # create ernie pyreader @@ -114,8 +122,7 @@ def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None, feed_list=feed_list, capacity=50, use_double_buffer=True, - iterable=True - ) + iterable=True) if reader == None: reader = SequenceLabelReader( vocab_path=args.vocab_path, @@ -127,17 +134,21 @@ def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None, if mode == 'train': pyreader.set_batch_generator( reader.data_generator( - file_name, args.batch_size, args.epoch, shuffle=True, phase="train" - ), - places=place - ) + file_name, + args.batch_size, + args.epoch, + shuffle=True, + phase="train"), + places=place) else: pyreader.set_batch_generator( reader.data_generator( - file_name, args.batch_size, epoch=1, shuffle=False, phase=mode - ), - places=place - ) + file_name, + args.batch_size, + epoch=1, + shuffle=False, + phase=mode), + places=place) if return_reader: return pyreader, reader else: @@ -150,14 +161,20 @@ def create_ernie_model(args, ernie_config): """ # ERNIE's input data - src_ids = fluid.data(name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') - sent_ids = fluid.data(name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') - pos_ids = fluid.data(name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') - input_mask = fluid.data(name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32') + src_ids = fluid.data( + name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') + sent_ids = fluid.data( + name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') + pos_ids = fluid.data( + name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64') + input_mask = fluid.data( + name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32') - padded_labels = fluid.data(name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64') + padded_labels = fluid.data( + name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64') - seq_lens = fluid.data(name='seq_lens', shape=[-1], dtype='int64', lod_level=0) + seq_lens = fluid.data( + name='seq_lens', shape=[-1], dtype='int64', lod_level=0) squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1]) @@ -187,28 +204,31 @@ def create_ernie_model(args, ernie_config): input=emission, label=padded_labels, param_attr=fluid.ParamAttr( - name='crfw', - learning_rate=args.crf_learning_rate), + name='crfw', learning_rate=args.crf_learning_rate), length=seq_lens) avg_cost = fluid.layers.mean(x=crf_cost) crf_decode = fluid.layers.crf_decoding( - input=emission, param_attr=fluid.ParamAttr(name='crfw'), length=seq_lens) + input=emission, + param_attr=fluid.ParamAttr(name='crfw'), + length=seq_lens) (precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks) = fluid.layers.chunk_eval( - input=crf_decode, - label=squeeze_labels, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)), - seq_length=seq_lens) + input=crf_decode, + label=squeeze_labels, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)), + seq_length=seq_lens) chunk_evaluator = fluid.metrics.ChunkEvaluator() chunk_evaluator.reset() ret = { - "feed_list": [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens], + "feed_list": + [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens], "words": src_ids, "labels": padded_labels, + "seq_lens": seq_lens, "avg_cost": avg_cost, "crf_decode": crf_decode, "precision": precision, diff --git a/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py b/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py index b5664fe2..3ebed4c3 100644 --- a/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py +++ b/PaddleNLP/lexical_analysis/run_ernie_sequence_labeling.py @@ -39,6 +39,7 @@ from models.representation.ernie import ErnieConfig from models.model_check import check_cuda from models.model_check import check_version + def evaluate(exe, test_program, test_pyreader, test_ret): """ Evaluation Function @@ -55,8 +56,7 @@ def evaluate(exe, test_program, test_pyreader, test_ret): test_ret["num_label_chunks"], test_ret["num_correct_chunks"], ], - feed=data[0] - ) + feed=data[0]) total_loss.append(loss) test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct) @@ -64,9 +64,11 @@ def evaluate(exe, test_program, test_pyreader, test_ret): precision, recall, f1 = test_ret["chunk_evaluator"].eval() end_time = time.time() - print("\t[test] loss: %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" + print( + "\t[test] loss: %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s" % (np.mean(total_loss), precision, recall, f1, end_time - start_time)) + def do_train(args): """ Main Function @@ -80,14 +82,15 @@ def do_train(args): else: dev_count = min(multiprocessing.cpu_count(), args.cpu_num) if (dev_count < args.cpu_num): - print("WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. " - "Change the cpu_num from %d to %d"%(dev_count, args.cpu_num, dev_count)) + print( + "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. " + "Change the cpu_num from %d to %d" % + (dev_count, args.cpu_num, dev_count)) os.environ['CPU_NUM'] = str(dev_count) place = fluid.CPUPlace() exe = fluid.Executor(place) - startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed @@ -99,49 +102,56 @@ def do_train(args): train_ret = creator.create_ernie_model(args, ernie_config) # ernie pyreader - train_pyreader = creator.create_pyreader(args, file_name=args.train_data, - feed_list=train_ret['feed_list'], - model="ernie", - place=place) + train_pyreader = creator.create_pyreader( + args, + file_name=args.train_data, + feed_list=train_ret['feed_list'], + model="ernie", + place=place) test_program = train_program.clone(for_test=True) - test_pyreader = creator.create_pyreader(args, file_name=args.test_data, - feed_list=train_ret['feed_list'], - model="ernie", - place=place) - - optimizer = fluid.optimizer.Adam(learning_rate=args.base_learning_rate) - fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + test_pyreader = creator.create_pyreader( + args, + file_name=args.test_data, + feed_list=train_ret['feed_list'], + model="ernie", + place=place) + + optimizer = fluid.optimizer.Adam( + learning_rate=args.base_learning_rate) + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) optimizer.minimize(train_ret["avg_cost"]) - lower_mem, upper_mem, unit = fluid.contrib.memory_usage( program=train_program, batch_size=args.batch_size) print("Theoretical memory usage in training: %.3f - %.3f %s" % - (lower_mem, upper_mem, unit)) + (lower_mem, upper_mem, unit)) print("Device count: %d" % dev_count) exe.run(startup_prog) # load checkpoints if args.init_checkpoint and args.init_pretraining_params: print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' " - "both are set! Only arg 'init_checkpoint' is made valid.") + "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: utils.init_checkpoint(exe, args.init_checkpoint, startup_prog) elif args.init_pretraining_params: - utils.init_pretraining_params(exe, args.init_pretraining_params, startup_prog) + utils.init_pretraining_params(exe, args.init_pretraining_params, + startup_prog) - if dev_count>1 and not args.use_cuda: + if dev_count > 1 and not args.use_cuda: device = "GPU" if args.use_cuda else "CPU" - print("%d %s are used to train model"%(dev_count, device)) + print("%d %s are used to train model" % (dev_count, device)) # multi cpu/gpu config exec_strategy = fluid.ExecutionStrategy() build_strategy = fluid.BuildStrategy() - compiled_prog = fluid.compiler.CompiledProgram(train_program).with_data_parallel( - loss_name=train_ret['avg_cost'].name, - build_strategy=build_strategy, - exec_strategy=exec_strategy) + compiled_prog = fluid.compiler.CompiledProgram( + train_program).with_data_parallel( + loss_name=train_ret['avg_cost'].name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) else: compiled_prog = fluid.compiler.CompiledProgram(train_program) @@ -162,16 +172,23 @@ def do_train(args): start_time = time.time() - outputs = exe.run(program=compiled_prog, feed=data[0], fetch_list=fetch_list) + outputs = exe.run(program=compiled_prog, + feed=data[0], + fetch_list=fetch_list) end_time = time.time() if steps % args.print_steps == 0: - loss, precision, recall, f1_score = [np.mean(x) for x in outputs] - print("[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f, " - "pyreader queue_size: %d " % (steps, loss, precision, recall, f1_score, - end_time - start_time, train_pyreader.queue.size())) + loss, precision, recall, f1_score = [ + np.mean(x) for x in outputs + ] + print( + "[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f, " + "pyreader queue_size: %d " % + (steps, loss, precision, recall, f1_score, + end_time - start_time, train_pyreader.queue.size())) if steps % args.save_steps == 0: - save_path = os.path.join(args.model_save_dir, "step_" + str(steps)) + save_path = os.path.join(args.model_save_dir, + "step_" + str(steps)) print("\tsaving model as %s" % (save_path)) fluid.io.save_persistables(exe, save_path, train_program) @@ -182,7 +199,6 @@ def do_train(args): fluid.io.save_persistables(exe, save_path, train_program) - def do_eval(args): # init executor if args.use_cuda: @@ -198,11 +214,13 @@ def do_eval(args): test_ret = creator.create_ernie_model(args, ernie_config) test_program = test_program.clone(for_test=True) - pyreader = creator.create_pyreader(args, file_name=args.test_data, - feed_list=test_ret['feed_list'], - model="ernie", - place=place, - mode='test',) + pyreader = creator.create_pyreader( + args, + file_name=args.test_data, + feed_list=test_ret['feed_list'], + model="ernie", + place=place, + mode='test', ) print('program startup') @@ -212,11 +230,13 @@ def do_eval(args): print('program loading') # load model if not args.init_checkpoint: - raise ValueError("args 'init_checkpoint' should be set if only doing test or infer!") + raise ValueError( + "args 'init_checkpoint' should be set if only doing test or infer!") utils.init_checkpoint(exe, args.init_checkpoint, test_program) evaluate(exe, test_program, pyreader, test_ret) + def do_infer(args): # init executor if args.use_cuda: @@ -233,41 +253,52 @@ def do_infer(args): infer_ret = creator.create_ernie_model(args, ernie_config) infer_program = infer_program.clone(for_test=True) print(args.test_data) - pyreader, reader = creator.create_pyreader(args, file_name=args.test_data, - feed_list=infer_ret['feed_list'], - model="ernie", - place=place, - return_reader=True, - mode='test') + pyreader, reader = creator.create_pyreader( + args, + file_name=args.test_data, + feed_list=infer_ret['feed_list'], + model="ernie", + place=place, + return_reader=True, + mode='test') exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) # load model if not args.init_checkpoint: - raise ValueError("args 'init_checkpoint' should be set if only doing test or infer!") + raise ValueError( + "args 'init_checkpoint' should be set if only doing test or infer!") utils.init_checkpoint(exe, args.init_checkpoint, infer_program) # create dict - id2word_dict = dict([(str(word_id), word) for word, word_id in reader.vocab.items()]) - id2label_dict = dict([(str(label_id), label) for label, label_id in reader.label_map.items()]) + id2word_dict = dict( + [(str(word_id), word) for word, word_id in reader.vocab.items()]) + id2label_dict = dict([(str(label_id), label) + for label, label_id in reader.label_map.items()]) Dataset = namedtuple("Dataset", ["id2word_dict", "id2label_dict"]) dataset = Dataset(id2word_dict, id2label_dict) # make prediction for data in pyreader(): - (words, crf_decode) = exe.run(infer_program, - fetch_list=[infer_ret["words"], infer_ret["crf_decode"]], - feed=data[0], - return_numpy=False) + (words, crf_decode, seq_lens) = exe.run(infer_program, + fetch_list=[ + infer_ret["words"], + infer_ret["crf_decode"], + infer_ret["seq_lens"] + ], + feed=data[0], + return_numpy=True) # User should notice that words had been clipped if long than args.max_seq_len - results = utils.parse_result(words, crf_decode, dataset) + results = utils.parse_padding_result(words, crf_decode, seq_lens, + dataset) for sent, tags in results: - result_list = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)] + result_list = [ + '(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags) + ] print(''.join(result_list)) - if __name__ == "__main__": parser = argparse.ArgumentParser(__doc__) utils.load_yaml(parser, './conf/ernie_args.yaml') @@ -284,4 +315,3 @@ if __name__ == "__main__": do_infer(args) else: print("Usage: %s --mode train|eval|infer " % sys.argv[0]) - diff --git a/PaddleNLP/lexical_analysis/utils.py b/PaddleNLP/lexical_analysis/utils.py index 04d9d995..8fab3252 100644 --- a/PaddleNLP/lexical_analysis/utils.py +++ b/PaddleNLP/lexical_analysis/utils.py @@ -148,6 +148,50 @@ def parse_result(words, crf_decode, dataset): return batch_out +def parse_padding_result(words, crf_decode, seq_lens, dataset): + """ parse padding result """ + words = np.squeeze(words) + batch_size = len(seq_lens) + + batch_out = [] + for sent_index in range(batch_size): + + sent = [ + dataset.id2word_dict[str(id)] + for id in words[sent_index][1:seq_lens[sent_index] - 1] + ] + tags = [ + dataset.id2label_dict[str(id)] + for id in crf_decode[sent_index][1:seq_lens[sent_index] - 1] + ] + + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + # for the first word + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split('-')[0]) + continue + + # for the beginning of word + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split('-')[0]) + parital_word = sent[ind] + continue + + parital_word += sent[ind] + + # append the last word, except for len(tags)=0 + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + + batch_out.append([sent_out, tags_out]) + return batch_out + + def init_checkpoint(exe, init_checkpoint_path, main_program): """ Init CheckPoint -- GitLab