Unverified commit 352d9fb8, authored by kinghuin, committed by GitHub

optimize lac and msra_ner example (#5270)

* optimize lac and msra_ner example

* optimize ner and lac

* rename filename to lexical_analysis_dataset_tiny_path0

* modify lac dataset url
Parent fee616e8
......@@ -20,14 +20,14 @@
- paddlepaddle >= 2.0.0rc1, for installation instructions see [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0b2, install with: `pip install paddlenlp\>=2.0.0b2`
- paddlenlp >= 2.0.0rc, install with: `pip install paddlenlp\>=2.0.0rc`
### 2.2 Data Preparation
We provide a handful of samples to illustrate the input data format. Run the following commands to download and unpack the sample dataset:
```bash
wget --no-check-certificate https://paddlenlp.bj.bcebos.com/data/lexical_analysis_dataset_tiny.tar.gz
wget --no-check-certificate https://paddlenlp.bj.bcebos.com/datasets/lexical_analysis_dataset_tiny.tar.gz
tar xvf lexical_analysis_dataset_tiny.tar.gz
```
......@@ -54,18 +54,18 @@ tar xvf lexical_analysis_dataset_tiny.tar.gz
Model training supports both CPU and GPU. Before using a GPU, specify the card number(s) to use:
```bash
export CUDA_VISIBLE_DEVICES=0,1 # multi-card training is supported
export CUDA_VISIBLE_DEVICES=0 # multi-card training is supported; for two cards, set this to 0,1
```
Training is launched as follows:
```bash
python -m paddle.distributed.launch train.py \
python train.py \
--data_dir ./lexical_analysis_dataset_tiny \
--model_save_dir ./save_dir \
--epochs 10 \
--batch_size 32 \
--use_gpu True \
--n_gpu 1 \
# --init_checkpoint ./save_dir/final
```
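Multi-card training no longer goes through `python -m paddle.distributed.launch`: train.py now spawns its own workers based on `--n_gpu`. A minimal sketch of the pattern (mirroring the `__main__` change in the train.py diff below; argument parsing and the training body are elided):

```python
import paddle

def train(n_gpu):
    # each spawned worker runs this entry point on its own device
    paddle.set_device("gpu" if n_gpu else "cpu")
    # ... build the dataset and model and call model.fit(), as train.py does ...

if __name__ == "__main__":
    n_gpu = 2  # e.g. parsed from --n_gpu
    if n_gpu > 1:
        paddle.distributed.spawn(train, args=(n_gpu, ), nprocs=n_gpu)
    else:
        train(n_gpu)
```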
......
......@@ -43,15 +43,13 @@ class LacDataset(paddle.io.Dataset):
word_dict_path = os.path.join(self.base_path, 'word.dic')
label_dict_path = os.path.join(self.base_path, 'tag.dic')
word_rep_dict_path = os.path.join(self.base_path, 'q2b.dic')
self.word_vocab = self._load_kv_dict(
word_dict_path, value_func=np.int64, reverse=True)
self.label_vocab = self._load_kv_dict(
label_dict_path, value_func=np.int64, reverse=True)
self.word_replace_dict = self._load_kv_dict(word_rep_dict_path)
self.word_vocab = self._load_vocab(word_dict_path)
self.label_vocab = self._load_vocab(label_dict_path)
self.word_replace_dict = self._load_vocab(word_rep_dict_path)
        # Calculate vocab size and number of labels; note: vocab values start from 0.
self.vocab_size = max(self.word_vocab.values()) + 1
self.num_labels = max(self.label_vocab.values()) + 1
self.vocab_size = len(self.word_vocab)
self.num_labels = len(self.label_vocab)
if self.mode in {"train", "test", "infer"}:
self.dataset_path = os.path.join(self.base_path,
......@@ -109,30 +107,27 @@ class LacDataset(paddle.io.Dataset):
self.total += 1
def _load_kv_dict(self,
dict_path,
delimiter="\t",
key_func=None,
value_func=None,
reverse=False):
def _load_vocab(self, dict_path):
"""
Load key-value dict from file
Load vocab from file
"""
vocab = {}
for line in open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split(delimiter)
if len(terms) != 2:
continue
reverse = None
with open(dict_path, "r", encoding='utf8') as fin:
for i, line in enumerate(fin):
terms = line.strip("\n").split("\t")
if len(terms) == 2:
                    if reverse is None:
                        reverse = terms[0].isdigit()
if reverse:
value, key = terms
else:
key, value = terms
if key in vocab:
raise KeyError("key duplicated with [%s]" % (key))
if key_func:
key = key_func(key)
if value_func:
value = value_func(value)
elif len(terms) == 1:
key, value = terms[0], i
else:
raise ValueError("Error line: %s in file: %s" %
(line, dict_path))
vocab[key] = value
return vocab
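    # Illustration with assumed sample contents, not taken from the real
    # dict files: the three line layouts _load_vocab accepts.
    #   "0\tB-PER" lines -> the first field is a digit, so pairs load
    #                       reversed: {'B-PER': '0', ...}
    #   ",\t," lines     -> plain key\tvalue pairs: {',': ',', ...}
    #   "<UNK>" lines    -> one column; the line number becomes the value:
    #                       {'<UNK>': 0, ...}
    # Two-column files keep string values (the old value_func cast is gone),
    # while one-column files get int line numbers.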
......
......@@ -34,21 +34,16 @@ parser.add_argument("--model_save_dir", type=str, default=None, help="The model
parser.add_argument("--epochs", type=int, default=10, help="Corpus iteration num.")
parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="If set, use GPU for training.")
parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.")
parser.add_argument("--base_lr", type=float, default=0.001, help="The basic learning rate that affects the entire network.")
parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.")
parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.")
args = parser.parse_args()
parser.add_argument("--verbose", type=ast.literal_eval, default=128, help="Print reader and training time in details.")
# yapf: enable
def train(args):
if args.use_gpu:
place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
paddle.set_device("gpu")
else:
place = paddle.CPUPlace()
paddle.set_device("cpu")
paddle.set_device("gpu" if args.n_gpu else "cpu")
# create dataset.
train_dataset = LacDataset(args.data_dir, mode='train')
......@@ -69,7 +64,6 @@ def train(args):
train_loader = paddle.io.DataLoader(
dataset=train_dataset,
batch_sampler=train_sampler,
places=place,
return_list=True,
collate_fn=batchify_fn)
......@@ -81,7 +75,6 @@ def train(args):
test_loader = paddle.io.DataLoader(
dataset=test_dataset,
batch_sampler=test_sampler,
places=place,
return_list=True,
collate_fn=batchify_fn)
......@@ -101,6 +94,8 @@ def train(args):
model.load(args.init_checkpoint)
# Start training
callbacks = paddle.callbacks.ProgBarLogger(
log_freq=10, verbose=3) if args.verbose else None
model.fit(train_data=train_loader,
eval_data=test_loader,
batch_size=args.batch_size,
......@@ -109,9 +104,13 @@ def train(args):
log_freq=10,
save_dir=args.model_save_dir,
save_freq=1,
shuffle=True)
shuffle=True,
callbacks=callbacks)
if __name__ == "__main__":
print(args)
args = parser.parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(train, args=(args, ), nprocs=args.n_gpu)
else:
train(args)
......@@ -12,7 +12,7 @@
- paddlepaddle >= 2.0.0, for installation instructions see [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0rc, install with: `pip install paddlenlp>=2.0.0rc`
- paddlenlp >= 2.0.0rc, install with: `pip install paddlenlp\>=2.0.0rc`
### 2.2 Data Preparation
......
......@@ -69,7 +69,7 @@ def predict(model, data_loader, ds):
pred_list.append(pred.numpy())
len_list.append(lens.numpy())
preds = parse_decodes(ds, pred_list, len_list)
print('\n'.join(preds[:10]))
return preds
def convert_example(example, tokenizer, label_vocab):
......@@ -177,4 +177,5 @@ if __name__ == '__main__':
paddle.save(model.state_dict(),
'./ernie_result/model_%d.pdparams' % step)
pred = predict(model, test_loader, test_ds)
preds = predict(model, test_loader, test_ds)
print('\n'.join(preds[:10]))
......@@ -17,16 +17,16 @@ The MSRA-NER dataset integrated in PaddleNLP adjusts the file format: each
- Python >= 3.6
- paddlepaddle >= 2.0.0rc1, for installation instructions see [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
- paddlepaddle >= 2.0.0, for installation instructions see [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0b2, install with: `pip install paddlenlp>=2.0.0b2`
- paddlenlp >= 2.0.0rc, install with: `pip install paddlenlp\>=2.0.0rc`
### 2.2 Launch the MSRA-NER Task
```shell
export CUDA_VISIBLE_DEVICES=0
python -u ./run_msra_ner.py \
python -u ./train.py \
--model_name_or_path bert-base-multilingual-uncased \
--max_seq_length 128 \
--batch_size 32 \
......@@ -39,7 +39,7 @@ python -u ./run_msra_ner.py \
```
The parameters are described as follows:
- `model_name_or_path`: indicates a model of a particular configuration, along with its pretrained weights and the tokenizer used during pretraining. If the model files are stored locally, a directory path may also be supplied here.
- `model_name_or_path`: indicates a model of a particular configuration, along with its pretrained weights and the tokenizer used during pretraining; every model in [PaddleNLP transformer-style pretrained models](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/docs/transformers.md) except ernie-gen is supported. To use a non-BERT model, modify the script to import the corresponding Task and Tokenizer. If the model files are stored locally, a directory path may also be supplied here.
- `max_seq_length`: the maximum sentence length; longer sentences are truncated.
- `batch_size`: the number of samples **per card** in each iteration.
- `learning_rate`: the base learning rate; it is multiplied by the value produced by the learning rate scheduler to give the current learning rate (see the sketch below).
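To make the scheduler interaction concrete, here is a minimal sketch using the `LinearDecayWithWarmup` scheduler that train.py imports (the step counts are made up for illustration):

```python
from paddlenlp.transformers import LinearDecayWithWarmup

# Toy numbers: base LR 5e-5, 1000 total steps, 100 warmup steps. The
# optimizer reads scheduler.get_lr() every step: the base LR is scaled up
# linearly during warmup, then decayed linearly afterwards.
scheduler = LinearDecayWithWarmup(5e-5, 1000, 100)
for _ in range(50):
    scheduler.step()
print(scheduler.get_lr())  # ~2.5e-05, halfway through warmup
```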
......@@ -67,6 +67,46 @@ Precision | 0.908957 |
Recall | 0.926683 |
F1 | 0.917734 |
## Launch Evaluation
```shell
export CUDA_VISIBLE_DEVICES=0
python -u ./eval.py \
--model_name_or_path bert-base-multilingual-uncased \
--max_seq_length 128 \
--batch_size 32 \
--use_gpu True \
--init_checkpoint_path tmp/msra_ner/model_500.pdparams
```
The parameters are described as follows:
- `model_name_or_path`: indicates a model of a particular configuration, along with its pretrained weights and the tokenizer used during pretraining. If the model files are stored locally, a directory path may also be supplied here.
- `max_seq_length`: the maximum sentence length; longer sentences are truncated.
- `batch_size`: the number of samples **per card** in each iteration.
- `use_gpu`: whether to use the GPU.
- `init_checkpoint_path`: the path to load the model checkpoint from (see the sketch below).
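`init_checkpoint_path` is consumed the same way predict.py (added in this commit) consumes it: `paddle.load` the `.pdparams` file, then `set_dict` it onto the model. A minimal sketch, assuming MSRA-NER's seven-tag label set:

```python
import paddle
from paddlenlp.transformers import BertForTokenClassification

# num_classes=7 assumes the B-/I- tags for PER, ORG and LOC plus O.
model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-uncased", num_classes=7)
model.set_dict(paddle.load("tmp/msra_ner/model_500.pdparams"))
model.eval()
```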
## Launch Prediction
```shell
export CUDA_VISIBLE_DEVICES=0
python -u ./predict.py \
--model_name_or_path bert-base-multilingual-uncased \
--max_seq_length 128 \
--batch_size 32 \
--use_gpu True \
--init_checkpoint_path tmp/msra_ner/model_500.pdparams
```
## Using Other Pretrained Models
This project supports every model in [PaddleNLP transformer-style pretrained models](../../docs/transformers.md) except ernie-gen. To use a non-BERT model, modify the script to import the corresponding Task and Tokenizer. For example, to use an ERNIE model, consult [PaddleNLP transformer-style pretrained models](../../docs/transformers.md) and add the following code:
```python
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
```
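The construction calls then change in kind; a sketch using the `ernie-1.0` shortcut name (an assumed example from the model list above):

```python
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
model = ErnieForTokenClassification.from_pretrained(
    "ernie-1.0", num_classes=label_num)  # label_num as computed in train.py
```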
## References
[The third international Chinese language processing bakeoff: Word segmentation and named entity recognition](https://faculty.washington.edu/levow/papers/sighan06.pdf)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import ast
import random
import time
import math
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
import paddlenlp as ppnlp
from paddlenlp.datasets import MSRA_NER
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: " +
", ".join(list(BertTokenizer.pretrained_init_configuration.keys())))
parser.add_argument(
"--init_checkpoint_path",
default=None,
type=str,
required=True,
help="The model checkpoint path.", )
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.", )
parser.add_argument(
"--batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.", )
parser.add_argument(
"--use_gpu",
type=ast.literal_eval,
default=True,
help="If set, use GPU for training.")
def convert_example(example,
tokenizer,
label_list,
no_entity_id,
max_seq_length=512,
is_test=False):
"""convert a glue example into necessary features"""
def _truncate_seqs(seqs, max_seq_length):
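        """Trim a single sequence (reserving room for [CLS]/[SEP]) or a
        pair (longest-first, reserving three special tokens) so the total
        fits max_seq_length."""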
if len(seqs) == 1: # single sentence
# Account for [CLS] and [SEP] with "- 2"
seqs[0] = seqs[0][0:(max_seq_length - 2)]
else: # sentence pair
# Account for [CLS], [SEP], [SEP] with "- 3"
tokens_a, tokens_b = seqs
max_seq_length -= 3
while True: # truncate with longest_first strategy
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_seq_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
return seqs
def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
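        """Concatenate sequences with their separators, also building
        per-token segment ids (0 for the first sequence, 1 for the next)
        and a mask that is 1 on separator positions, 0 elsewhere."""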
concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
segment_ids = sum(
([i] * (len(seq) + len(sep))
for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
if isinstance(seq_mask, int):
seq_mask = [[seq_mask] * len(seq) for seq in seqs]
if isinstance(separator_mask, int):
separator_mask = [[separator_mask] * len(sep) for sep in separators]
p_mask = sum((s_mask + mask
for sep, seq, s_mask, mask in zip(
separators, seqs, seq_mask, separator_mask)), [])
return concat, segment_ids, p_mask
def _reseg_token_label(tokens, tokenizer, labels=None):
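        """Re-tokenize each word with the wordpiece tokenizer and expand
        the labels so every sub-token keeps one: a B- label continues as
        the matching I- label on the extra pieces."""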
if labels:
if len(tokens) != len(labels):
raise ValueError(
"The length of tokens must be same with labels")
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = tokenizer(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
if len(ret_tokens) != len(ret_labels):
raise ValueError(
"The length of ret_tokens can't match with labels")
return ret_tokens, ret_labels
else:
ret_tokens = []
for token in tokens:
sub_token = tokenizer(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
if len(sub_token) < 2:
continue
return ret_tokens, None
if not is_test:
# get the label
label = example[-1].split("\002")
example = example[0].split("\002")
        # create the label-to-id map
label_map = {}
for (i, l) in enumerate(label_list):
label_map[l] = i
else:
label = None
tokens_raw, labels_raw = _reseg_token_label(
tokens=example, labels=label, tokenizer=tokenizer)
# truncate to the truncate_length,
tokens_trun = _truncate_seqs([tokens_raw], max_seq_length)
# concate the sequences with special tokens
tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
len(tokens_trun))
# convert the token to ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
valid_length = len(input_ids)
if labels_raw:
labels_trun = _truncate_seqs([labels_raw], max_seq_length)[0]
labels_id = [no_entity_id] + [label_map[lbl]
for lbl in labels_trun] + [no_entity_id]
if not is_test:
return input_ids, segment_ids, valid_length, labels_id
else:
return input_ids, segment_ids, valid_length
def parse_decodes(input_words, id2label, decodes, lens):
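    """Stitch batched predictions back into readable text, one line of
    str((words, tag)) chunks per sentence; e.g. a three-character sentence
    tagged [B-PER, I-PER, O] becomes "('张三', 'PER')('说', 'O')"
    (illustrative contents)."""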
decodes = [x for batch in decodes for x in batch]
lens = [x for batch in lens for x in batch]
outputs = []
for idx, end in enumerate(lens):
sent = input_words[idx][0].replace("\002", "")[:end]
tags = [id2label[x] for x in decodes[idx][1:end]]
sent_out = []
tags_out = []
words = ""
for s, t in zip(sent, tags):
if t.startswith('B-') or t == 'O':
if len(words):
sent_out.append(words)
if t.startswith('B-'):
tags_out.append(t.split('-')[1])
else:
tags_out.append(t)
words = s
else:
words += s
if len(sent_out) < len(tags_out):
sent_out.append(words)
outputs.append(''.join(
[str((s, t)) for s, t in zip(sent_out, tags_out)]))
return outputs
def do_predict(args):
paddle.set_device("gpu" if args.use_gpu else "cpu")
train_dataset, predict_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
["train", "test"])
tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
label_list = train_dataset.get_labels()
label_num = len(label_list)
no_entity_id = label_num - 1
trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=label_list,
no_entity_id=label_num - 1,
max_seq_length=args.max_seq_length)
ignore_label = -100
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
Stack(), # length
Pad(axis=0, pad_val=ignore_label) # label
): fn(samples)
raw_data = predict_dataset.data
id2label = dict(enumerate(predict_dataset.get_labels()))
predict_dataset = predict_dataset.apply(trans_func, lazy=True)
predict_batch_sampler = paddle.io.BatchSampler(
predict_dataset,
batch_size=args.batch_size,
shuffle=False,
drop_last=True)
predict_data_loader = DataLoader(
dataset=predict_dataset,
batch_sampler=predict_batch_sampler,
collate_fn=batchify_fn,
num_workers=0,
return_list=True)
model = BertForTokenClassification.from_pretrained(
args.model_name_or_path, num_classes=label_num)
if args.init_checkpoint_path:
model_dict = paddle.load(args.init_checkpoint_path)
model.set_dict(model_dict)
model.eval()
pred_list = []
len_list = []
for step, batch in enumerate(predict_data_loader):
input_ids, segment_ids, length, labels = batch
logits = model(input_ids, segment_ids)
pred = paddle.argmax(logits, axis=-1)
pred_list.append(pred.numpy())
len_list.append(length.numpy())
preds = parse_decodes(raw_data, id2label, pred_list, len_list)
file_path = "results.txt"
with open(file_path, "w", encoding="utf8") as fout:
fout.write("\n".join(preds))
# Print some examples
print(
"The results have been saved in the file: %s, some examples are shown below: "
% file_path)
print("\n".join(preds[:10]))
if __name__ == "__main__":
args = parser.parse_args()
do_predict(args)
......@@ -30,88 +30,78 @@ from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import ChunkEvaluator
parser = argparse.ArgumentParser()
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(list(BertTokenizer.pretrained_init_configuration.keys())))
parser.add_argument(
help="Path to pre-trained model or shortcut name selected in the list: " +
", ".join(list(BertTokenizer.pretrained_init_configuration.keys())))
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.", )
parser.add_argument(
parser.add_argument(
"--batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.", )
parser.add_argument(
parser.add_argument(
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
parser.add_argument(
"--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument(
parser.add_argument(
"--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument(
parser.add_argument(
"--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.", )
parser.add_argument(
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument(
)
parser.add_argument(
"--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument(
"--logging_steps",
type=int,
default=1,
help="Log every X updates steps.")
parser.add_argument(
parser.add_argument(
"--logging_steps", type=int, default=1, help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=100,
help="Save checkpoint every X updates steps.")
parser.add_argument(
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
parser.add_argument(
"--n_gpu", type=int, default=1, help="number of gpus to use, 0 for cpu.")
def evaluate(model, loss_fct, metric, data_loader, label_num):
......@@ -316,6 +306,7 @@ def do_train(args):
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, length, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(
......@@ -337,11 +328,17 @@ def do_train(args):
paddle.save(model.state_dict(),
os.path.join(args.output_dir,
"model_%d.pdparams" % global_step))
global_step += 1
# Save final model
if (global_step) % args.save_steps != 0:
evaluate(model, loss_fct, metric, test_data_loader, label_num)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
paddle.save(model.state_dict(),
os.path.join(args.output_dir,
"model_%d.pdparams" % global_step))
if __name__ == "__main__":
args = parse_args()
args = parser.parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
......