From 94487f0481ba5b9381e599ec51982235b9a1da4c Mon Sep 17 00:00:00 2001 From: kinghuin Date: Wed, 16 Oct 2019 14:58:39 +0800 Subject: [PATCH] Paddingernie16 (#3598) * Update README.md * padding ernie * modify ernie * modify code style * remove squeeze_labels * fix lod/ernie input_mask dtype * padding LAC/ERNIE revert squeeze * padding LAC/ERNIE revert label shape * padding LAC/ERNIE add padded_token_embeddings --- PaddleNLP/lexical_analysis/README.md | 2 +- PaddleNLP/lexical_analysis/creator.py | 38 ++++++++++++------------ PaddleNLP/models/representation/ernie.py | 2 ++ PaddleNLP/preprocess/padding.py | 2 +- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/PaddleNLP/lexical_analysis/README.md b/PaddleNLP/lexical_analysis/README.md index 1951b2e6..66c3f5b1 100644 --- a/PaddleNLP/lexical_analysis/README.md +++ b/PaddleNLP/lexical_analysis/README.md @@ -16,7 +16,7 @@ Lexical Analysis of Chinese,简称 LAC,是一个联合的词法分析模型 #### 1.PaddlePaddle 安装 -本项目依赖 PaddlePaddle 1.4.0 及以上版本和PaddleHub 1.0.0及以上版本 ,PaddlePaddle安装请参考官网 [快速安装](http://www.paddlepaddle.org/paddle#quick-start),PaddleHub安装参考 [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)。 +本项目依赖 PaddlePaddle 1.6.0 及以上版本和PaddleHub 1.0.0及以上版本 ,PaddlePaddle安装请参考官网 [快速安装](http://www.paddlepaddle.org/paddle#quick-start),PaddleHub安装参考 [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)。 > Warning: GPU 和 CPU 版本的 PaddlePaddle 分别是 paddlepaddle-gpu 和 paddlepaddle,请安装时注意区别。 diff --git a/PaddleNLP/lexical_analysis/creator.py b/PaddleNLP/lexical_analysis/creator.py index 66c8d4d1..3c19ad4f 100644 --- a/PaddleNLP/lexical_analysis/creator.py +++ b/PaddleNLP/lexical_analysis/creator.py @@ -59,7 +59,7 @@ def create_model(args, vocab_size, num_labels, mode = 'train'): "targets": targets, "avg_cost":avg_cost, "crf_decode": crf_decode, - "precision" : precision, + "precision": precision, "recall": recall, "f1_score": f1_score, "chunk_evaluator": chunk_evaluator, @@ -143,9 +143,10 @@ def create_ernie_model(args, ernie_config): src_ids = fluid.layers.data(name='src_ids', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0) sent_ids = fluid.layers.data(name='sent_ids', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0) pos_ids = fluid.layers.data(name='pos_ids', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0) - input_mask = fluid.layers.data(name='input_mask', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0) + input_mask = fluid.layers.data(name='input_mask', shape=[args.max_seq_len, 1], dtype='float32',lod_level=0) padded_labels =fluid.layers.data(name='padded_labels', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0) - seq_lens = fluid.layers.data(name='seq_lens', shape=[1], dtype='int64',lod_level=0) + seq_lens = fluid.layers.data(name='seq_lens', shape=[-1], dtype='int64',lod_level=0) + squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1]) ernie_inputs = { "src_ids": src_ids, @@ -156,44 +157,43 @@ def create_ernie_model(args, ernie_config): } embeddings = ernie_encoder(ernie_inputs, ernie_config=ernie_config) - words = fluid.layers.sequence_unpad(src_ids, seq_lens) - labels = fluid.layers.sequence_unpad(padded_labels, seq_lens) - - token_embeddings = embeddings["token_embeddings"] + padded_token_embeddings = embeddings["padded_token_embeddings"] emission = fluid.layers.fc( size=args.num_labels, - input=token_embeddings, + input=padded_token_embeddings, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-args.init_bound, high=args.init_bound), regularizer=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=1e-4))) + regularization_coeff=1e-4)), + num_flatten_dims=2) crf_cost = fluid.layers.linear_chain_crf( input=emission, - label=labels, + label=padded_labels, param_attr=fluid.ParamAttr( name='crfw', - learning_rate=args.crf_learning_rate)) + learning_rate=args.crf_learning_rate), + length=seq_lens) avg_cost = fluid.layers.mean(x=crf_cost) crf_decode = fluid.layers.crf_decoding( - input=emission, param_attr=fluid.ParamAttr(name='crfw')) - + input=emission, param_attr=fluid.ParamAttr(name='crfw'),length=seq_lens) (precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks) = fluid.layers.chunk_eval( - input=crf_decode, - label=labels, - chunk_scheme="IOB", - num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0))) + input=crf_decode, + label=squeeze_labels, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((args.num_labels - 1) / 2.0)), + seq_length=seq_lens) chunk_evaluator = fluid.metrics.ChunkEvaluator() chunk_evaluator.reset() ret = { "feed_list": [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens], - "words":words, - "labels":labels, + "words":src_ids, + "labels":padded_labels, "avg_cost":avg_cost, "crf_decode":crf_decode, "precision" : precision, diff --git a/PaddleNLP/models/representation/ernie.py b/PaddleNLP/models/representation/ernie.py index 23db3ac3..69831ef2 100644 --- a/PaddleNLP/models/representation/ernie.py +++ b/PaddleNLP/models/representation/ernie.py @@ -80,6 +80,7 @@ def ernie_encoder_with_paddle_hub(ernie_inputs, max_seq_len): embeddings = { "sentence_embeddings": cls_feats, "token_embeddings": unpad_enc_out, + "padded_token_embeddings": enc_out } for k, v in embeddings.items(): @@ -106,6 +107,7 @@ def ernie_encoder(ernie_inputs, ernie_config): embeddings = { "sentence_embeddings": cls_feats, "token_embeddings": unpad_enc_out, + "padded_token_embeddings": enc_out } for k, v in embeddings.items(): diff --git a/PaddleNLP/preprocess/padding.py b/PaddleNLP/preprocess/padding.py index 6094562d..82171e68 100644 --- a/PaddleNLP/preprocess/padding.py +++ b/PaddleNLP/preprocess/padding.py @@ -69,7 +69,7 @@ def pad_batch_data(insts, if return_seq_lens: seq_lens = np.array([len(inst) for inst in insts]) - return_list += [seq_lens.astype("int64").reshape([-1, 1])] + return_list += [seq_lens.astype("int64").reshape([-1])] return return_list if len(return_list) > 1 else return_list[0] -- GitLab