提交 e3558f53 编写于 作者: SYSU_BOND's avatar SYSU_BOND 提交者: bbking

update PaddleNLP lexical_analysis for Release/1.6 (#3664)

* update for paddle 1.6

* update optimize op in paddle 1.6

* fix ernie based in paddle 1.6

* fix coding for windows
上级 0f134803
# -*- encoding: utf8 -*-
# -*- encoding: UTF8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......
......@@ -26,12 +26,20 @@ Lexical Analysis of Chinese,简称 LAC,是一个联合的词法分析模型
git clone https://github.com/PaddlePaddle/models.git
cd models/PaddleNLP/lexical_analysis
```
#### 3. 环境依赖
Python 的版本要求是:Python 2 版本是 2.7.15+、Python 3 版本是 3.5.1+/3.6/3.7。LAC的代码可支持Python2/3,无具体版本限制
### 数据准备
#### 1. 快速下载
本项目涉及的**数据集**与**预训练模型**的数据可通过执行以下脚本进行快速下载,若仅需使用部分数据,可根据需要参照下列介绍进行部分下载
```bash
python download.py all
```
或在支持运行shell脚本的环境下执行:
```bash
sh download.sh
```
......@@ -40,34 +48,18 @@ sh download.sh
下载数据集文件,解压后会生成 `./data/` 文件夹
```bash
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-dataset-2.0.0.tar.gz
tar xvf lexical_analysis-dataset-2.0.0.tar.gz
python download.py dataset
```
#### 3. 预训练模型
我们开源了在自建数据集上训练的词法分析模型,可供用户直接使用,这里提供两种下载方式:
方式一:基于 PaddleHub 命令行工具,PaddleHub 的安装参考 [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)
```bash
# download baseline model
hub download lexical_analysis
tar xvf lexical_analysis-2.0.0.tar.gz
# download ERNIE finetuned model
hub download lexical_analysis_finetuned
tar xvf lexical_analysis_finetuned-1.0.0.tar.gz
```
方式二:直接下载
我们开源了在自建数据集上训练的词法分析模型,可供用户直接使用,可通过下述链接进行下载:
```bash
# download baseline model
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-2.0.0.tar.gz
tar xvf lexical_analysis-2.0.0.tar.gz
python download.py lac
# download ERNIE finetuned model
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis_finetuned-1.0.0.tar.gz
tar xvf lexical_analysis_finetuned-1.0.0.tar.gz
python download.py finetuned
```
注:若需进行ERNIE Finetune训练,需自行下载 [ERNIE](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) 开放的模型,下载链接为: [https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz),下载后解压至 `./pretrained/` 目录下。
......@@ -189,6 +181,7 @@ python inference_model.py \
├── compare.py # 执行LAC与其他开源分词的对比脚本
├── creator.py # 执行创建网络和数据读取器的脚本
├── data/ # 存放数据集的目录
├── downloads.py # 用于下载数据和模型的脚本
├── downloads.sh # 用于下载数据和模型的脚本
├── eval.py # 词法分析评估的脚本
├── inference_model.py # 执行保存inference_model的脚本,用于准备上线部署环境
......
# -*- coding: utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: UTF-8 -*-
"""
evaluate wordseg for LAC and other open-source wordseg tools
"""
......
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,9 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: UTF-8 -*-
"""
The function lex_net(args) define the lexical analysis network structure
Define the function to create lexical analysis model and model's data reader
"""
import sys
import os
......@@ -24,22 +25,24 @@ import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
from reader import Dataset
from ernie_reader import SequenceLabelReader
sys.path.append("..")
from models.sequence_labeling import nets
from models.representation.ernie import ernie_encoder
from preprocess.ernie import task_reader
from models.representation.ernie import ernie_encoder, ernie_pyreader
def create_model(args, vocab_size, num_labels, mode = 'train'):
def create_model(args, vocab_size, num_labels, mode='train'):
"""create lac model"""
# model's input data
words = fluid.layers.data(name='words', shape=[-1, 1], dtype='int64',lod_level=1)
targets = fluid.layers.data(name='targets', shape=[-1, 1], dtype='int64', lod_level= 1)
words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
targets = fluid.data(name='targets', shape=[-1, 1], dtype='int64', lod_level=1)
# for inference process
if mode=='infer':
if mode == 'infer':
crf_decode = nets.lex_net(words, args, vocab_size, num_labels, for_infer=True, target=None)
return { "feed_list":[words],"words":words, "crf_decode":crf_decode,}
return {"feed_list": [words], "words": words, "crf_decode": crf_decode, }
# for test or train process
avg_cost, crf_decode = nets.lex_net(words, args, vocab_size, num_labels, for_infer=False, target=targets)
......@@ -54,10 +57,10 @@ def create_model(args, vocab_size, num_labels, mode = 'train'):
chunk_evaluator.reset()
ret = {
"feed_list":[words, targets],
"feed_list": [words, targets],
"words": words,
"targets": targets,
"avg_cost":avg_cost,
"avg_cost": avg_cost,
"crf_decode": crf_decode,
"precision": precision,
"recall": recall,
......@@ -70,23 +73,25 @@ def create_model(args, vocab_size, num_labels, mode = 'train'):
return ret
def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None, return_reader=False, mode='train'):
# init reader
if model == 'lac':
pyreader = fluid.io.PyReader(
feed_list=feed_list,
capacity=300,
capacity=50,
use_double_buffer=True,
iterable=True
)
if model == 'lac':
if reader==None:
if reader == None:
reader = Dataset(args)
# create lac pyreader
if mode == 'train':
pyreader.decorate_sample_list_generator(
paddle.batch(
paddle.reader.shuffle(
fluid.io.batch(
fluid.io.shuffle(
reader.file_reader(file_name),
buf_size=args.traindata_shuffle_buffer
),
......@@ -96,7 +101,7 @@ def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None,
)
else:
pyreader.decorate_sample_list_generator(
paddle.batch(
fluid.io.batch(
reader.file_reader(file_name, mode=mode),
batch_size=args.batch_size
),
......@@ -105,49 +110,58 @@ def create_pyreader(args, file_name, feed_list, place, model='lac', reader=None,
elif model == 'ernie':
# create ernie pyreader
if reader==None:
reader = task_reader.SequenceLabelReader(
pyreader = fluid.io.DataLoader.from_generator(
feed_list=feed_list,
capacity=50,
use_double_buffer=True,
iterable=True
)
if reader == None:
reader = SequenceLabelReader(
vocab_path=args.vocab_path,
label_map_config=args.label_map_config,
max_seq_len=args.max_seq_len,
do_lower_case=args.do_lower_case,
in_tokens=False,
random_seed=args.random_seed)
if mode == 'train':
pyreader.decorate_batch_generator(
pyreader.set_batch_generator(
reader.data_generator(
file_name, args.batch_size, args.epoch, shuffle=True, phase="train"
),
places=place
)
else:
pyreader.decorate_batch_generator(
pyreader.set_batch_generator(
reader.data_generator(
file_name, args.batch_size, epoch=1, shuffle=False, phase=mode
),
places=place
)
if return_reader:
return pyreader, reader
else:
return pyreader
def create_ernie_model(args, ernie_config):
def create_ernie_model(args, ernie_config):
"""
Create Model for LAC based on ERNIE encoder
"""
# ERNIE's input data
src_ids = fluid.layers.data(name='src_ids', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0)
sent_ids = fluid.layers.data(name='sent_ids', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0)
pos_ids = fluid.layers.data(name='pos_ids', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0)
input_mask = fluid.layers.data(name='input_mask', shape=[args.max_seq_len, 1], dtype='float32',lod_level=0)
padded_labels =fluid.layers.data(name='padded_labels', shape=[args.max_seq_len, 1], dtype='int64',lod_level=0)
seq_lens = fluid.layers.data(name='seq_lens', shape=[-1], dtype='int64',lod_level=0)
src_ids = fluid.data(name='src_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.data(name='sent_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.data(name='pos_ids', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.data(name='input_mask', shape=[-1, args.max_seq_len, 1], dtype='float32')
padded_labels = fluid.data(name='padded_labels', shape=[-1, args.max_seq_len, 1], dtype='int64')
seq_lens = fluid.data(name='seq_lens', shape=[-1], dtype='int64', lod_level=0)
squeeze_labels = fluid.layers.squeeze(padded_labels, axes=[-1])
# ernie_pyreader
ernie_inputs = {
"src_ids": src_ids,
"sent_ids": sent_ids,
......@@ -176,9 +190,10 @@ def create_ernie_model(args, ernie_config):
name='crfw',
learning_rate=args.crf_learning_rate),
length=seq_lens)
avg_cost = fluid.layers.mean(x=crf_cost)
crf_decode = fluid.layers.crf_decoding(
input=emission, param_attr=fluid.ParamAttr(name='crfw'),length=seq_lens)
input=emission, param_attr=fluid.ParamAttr(name='crfw'), length=seq_lens)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval(
......@@ -192,17 +207,17 @@ def create_ernie_model(args, ernie_config):
ret = {
"feed_list": [src_ids, sent_ids, pos_ids, input_mask, padded_labels, seq_lens],
"words":src_ids,
"labels":padded_labels,
"avg_cost":avg_cost,
"crf_decode":crf_decode,
"precision" : precision,
"words": src_ids,
"labels": padded_labels,
"avg_cost": avg_cost,
"crf_decode": crf_decode,
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"chunk_evaluator":chunk_evaluator,
"num_infer_chunks":num_infer_chunks,
"num_label_chunks":num_label_chunks,
"num_correct_chunks":num_correct_chunks
"chunk_evaluator": chunk_evaluator,
"num_infer_chunks": num_infer_chunks,
"num_label_chunks": num_label_chunks,
"num_correct_chunks": num_correct_chunks
}
return ret
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides reader for ernie model
"""
import sys
from collections import namedtuple
import numpy as np
sys.path.append("..")
from preprocess.ernie.task_reader import BaseReader,tokenization
def pad_batch_data(insts,
                   pad_idx=0,
                   max_len=128,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   return_seq_lens=False):
    """Pad instances to a fixed length and build auxiliary batch tensors.

    Every instance (a list of token ids) is right-padded with ``pad_idx``
    up to ``max_len``. Optional extras are appended to the returned list
    in the order of the ``return_*`` flags below.

    Args:
        insts: list of token-id lists; each is assumed to be at most
            ``max_len`` long.
        pad_idx: vocabulary id used to fill padding slots.
        max_len: fixed target length of every padded instance.
        return_pos: also return position ids, shape [-1, max_len, 1].
        return_input_mask: also return a float mask (1.0 for real tokens,
            0.0 for padding), shape [-1, max_len, 1].
        return_max_len: also return the scalar ``max_len``.
        return_num_token: also return the total number of real tokens.
        return_seq_lens: also return per-instance lengths, shape [-1].

    Returns:
        A list of the requested arrays/values, or the padded-id array
        alone when no extra flag is set.
    """
    return_list = []

    # Any token in the vocabulary can serve as padding: padded positions'
    # loss is masked out by weights downstream, so padding has no effect
    # on parameter gradients.
    inst_data = np.array(
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]

    # Position ids: 0..len(inst)-1 for real tokens, pad_idx for padding.
    if return_pos:
        inst_pos = np.array([
            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.array([[1] * len(inst) + [0] *
                                    (max_len - len(inst)) for inst in insts])
        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
        return_list += [input_mask_data.astype("float32")]

    if return_max_len:
        return_list += [max_len]

    if return_num_token:
        # Total count of real (unpadded) tokens across the batch.
        return_list += [sum(len(inst) for inst in insts)]

    if return_seq_lens:
        seq_lens = np.array([len(inst) for inst in insts])
        return_list += [seq_lens.astype("int64").reshape([-1])]

    return return_list if len(return_list) > 1 else return_list[0]
class SequenceLabelReader(BaseReader):
    """Batch reader for sequence labeling on top of ERNIE's BaseReader.

    Converts raw (text, label) examples into padded batches of
    [token_ids, text_type_ids, position_ids, input_mask, label_ids,
    seq_lens] suitable for feeding the ERNIE-based LAC model.
    """

    def _pad_batch_records(self, batch_records):
        """Pad a list of Record namedtuples into fixed-length batch arrays.

        Returns a list of six items: padded token ids, padded text-type
        ids, padded position ids, input mask, padded label ids, and the
        per-instance sequence lengths (in that order).
        """
        # Split the batch of Records into per-field columns.
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]

        # padding
        # NOTE(review): self.max_seq_len / self.pad_id / self.label_map are
        # presumably set by BaseReader.__init__ — confirm against
        # preprocess/ernie/task_reader.py.
        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
            batch_token_ids,
            max_len=self.max_seq_len,
            pad_idx=self.pad_id,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids, max_len=self.max_seq_len, pad_idx=self.pad_id)
        # Labels are padded with the last label id, which
        # _convert_example_to_record treats as the "no entity" label.
        padded_label_ids = pad_batch_data(
            batch_label_ids, max_len=self.max_seq_len, pad_idx=len(self.label_map) - 1)

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            input_mask, padded_label_ids, batch_seq_lens
        ]
        return return_list

    def _reseg_token_label(self, tokens, labels, tokenizer):
        """Re-tokenize tokens with the subword tokenizer, expanding labels.

        The first sub-token of each original token keeps the original
        label; subsequent sub-tokens of a "B-x" token are relabeled "I-x"
        so span boundaries stay consistent after subword splitting.
        """
        assert len(tokens) == len(labels)
        ret_tokens = []
        ret_labels = []
        for token, label in zip(tokens, labels):
            sub_token = tokenizer.tokenize(token)
            if len(sub_token) == 0:
                # Token vanished under subword tokenization; drop its label too.
                continue
            ret_tokens.extend(sub_token)
            ret_labels.append(label)
            if len(sub_token) < 2:
                continue
            sub_label = label
            if label.startswith("B-"):
                sub_label = "I-" + label[2:]
            ret_labels.extend([sub_label] * (len(sub_token) - 1))

        assert len(ret_tokens) == len(ret_labels)
        return ret_tokens, ret_labels

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        """Convert one labeled example into a Record of id sequences.

        Adds [CLS]/[SEP] around the (possibly truncated) token sequence
        and assigns those special positions the "no entity" label id.
        """
        # NOTE(review): the separator inside split(u"") looks garbled —
        # upstream task_reader uses the \x02 control character here;
        # confirm before relying on this literal.
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
        labels = tokenization.convert_to_unicode(example.label).split(u"")
        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)

        # Reserve two positions for the [CLS]/[SEP] special tokens.
        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)

        # Unknown labels fall back to "O"; [CLS]/[SEP] get the
        # "no entity" id, which is the last id in label_map.
        no_entity_id = len(self.label_map) - 1
        labels = [
            label if label in self.label_map else u"O" for label in labels
        ]
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids,
            label_ids=label_ids)
        return record
\ No newline at end of file
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: UTF-8 -*-
import argparse
import os
import time
......@@ -25,6 +26,7 @@ import reader
import creator
sys.path.append('../models/')
from model_check import check_cuda
from model_check import check_version
parser = argparse.ArgumentParser(__doc__)
# 1. model parameters
......@@ -109,4 +111,5 @@ def test_process(exe, program, reader, test_ret):
if __name__ == '__main__':
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
do_eval(args)
......@@ -12,6 +12,7 @@ import reader
import utils
sys.path.append('../models/')
from model_check import check_cuda
from model_check import check_version
def save_inference_model(args):
......@@ -101,6 +102,7 @@ if __name__=="__main__":
utils.load_yaml(parser,'conf/args.yaml')
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
print("save inference model")
save_inference_model(args)
......
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: UTF-8 -*-
import argparse
import os
import time
......@@ -25,6 +25,7 @@ import reader
import creator
sys.path.append('../models/')
from model_check import check_cuda
from model_check import check_version
parser = argparse.ArgumentParser(__doc__)
# 1. model parameters
......@@ -120,4 +121,5 @@ def infer_process(exe, program, reader, fetch_vars, dataset):
if __name__=="__main__":
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
do_infer(args)
......@@ -37,6 +37,7 @@ import utils
sys.path.append("..")
from models.representation.ernie import ErnieConfig
from models.model_check import check_cuda
from models.model_check import check_version
def evaluate(exe, test_program, test_pyreader, test_ret):
"""
......@@ -160,6 +161,7 @@ def do_train(args):
fetch_list = []
start_time = time.time()
outputs = exe.run(program=compiled_prog, feed=data[0], fetch_list=fetch_list)
end_time = time.time()
if steps % args.print_steps == 0:
......@@ -271,6 +273,7 @@ if __name__ == "__main__":
utils.load_yaml(parser, './conf/ernie_args.yaml')
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
utils.print_arguments(args)
if args.mode == 'train':
......
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: UTF-8 -*-
import os
import sys
......@@ -31,6 +31,7 @@ import creator
from eval import test_process
sys.path.append('../models/')
from model_check import check_cuda
from model_check import check_version
# the function to train model
def do_train(args):
......@@ -86,9 +87,9 @@ def do_train(args):
print("%d %s are used to train model"%(dev_count, device))
# multi cpu/gpu config
exec_strategy = fluid.ExecutionStrategy()
# exec_strategy.num_threads = dev_count * 6
build_strategy = fluid.compiler.BuildStrategy()
# build_strategy.enable_inplace = True
compiled_prog = fluid.compiler.CompiledProgram(train_program).with_data_parallel(
loss_name=train_ret['avg_cost'].name,
......@@ -191,6 +192,7 @@ if __name__ == "__main__":
args = parser.parse_args()
check_cuda(args.use_cuda)
check_version()
print(args)
......
......@@ -50,7 +50,7 @@ class ArgumentGroup(object):
def load_yaml(parser, file_name, **kwargs):
with open(file_name) as f:
args = yaml.load(f)
args = yaml.load(f, Loader=yaml.FullLoader)
for title in args:
group = parser.add_argument_group(title=title, description='')
for name in args[title]:
......
......@@ -85,7 +85,7 @@ def lex_net(word, args, vocab_size, num_labels, for_infer = True, target=None):
"""
Configure the network
"""
word_embedding = fluid.layers.embedding(
word_embedding = fluid.embedding(
input=word,
size=[vocab_size, word_emb_dim],
dtype='float32',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册