未验证 提交 352d9fb8 编写于 作者: K kinghuin 提交者: GitHub

optimize lac and msra_ner example (#5270)

* optimize lac and msra_ner example

* optimize ner and lac

* rename filename to lexical_analysis_dataset_tiny_path0

* modify lac dataset url
上级 fee616e8
...@@ -20,14 +20,14 @@ ...@@ -20,14 +20,14 @@
- paddlepaddle >= 2.0.0rc1,安装方式请参考 [快速安装](https://www.paddlepaddle.org.cn/install/quick) - paddlepaddle >= 2.0.0rc1,安装方式请参考 [快速安装](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0b2, 安装方式:`pip install paddlenlp\>=2.0.0b2` - paddlenlp >= 2.0.0rc, 安装方式:`pip install paddlenlp\>=2.0.0rc`
### 2.2 数据准备 ### 2.2 数据准备
我们提供了少数样本用以示例输入数据格式。执行以下命令,下载并解压示例数据集: 我们提供了少数样本用以示例输入数据格式。执行以下命令,下载并解压示例数据集:
```bash ```bash
wget --no-check-certificate https://paddlenlp.bj.bcebos.com/data/lexical_analysis_dataset_tiny.tar.gz wget --no-check-certificate https://paddlenlp.bj.bcebos.com/datasets/lexical_analysis_dataset_tiny.tar.gz
tar xvf lexical_analysis_dataset_tiny.tar.gz tar xvf lexical_analysis_dataset_tiny.tar.gz
``` ```
...@@ -54,18 +54,18 @@ tar xvf lexical_analysis_dataset_tiny.tar.gz ...@@ -54,18 +54,18 @@ tar xvf lexical_analysis_dataset_tiny.tar.gz
模型训练支持 CPU 和 GPU,使用 GPU 之前应指定使用的显卡卡号: 模型训练支持 CPU 和 GPU,使用 GPU 之前应指定使用的显卡卡号:
```bash ```bash
export CUDA_VISIBLE_DEVICES=0,1 # 支持多卡训练 export CUDA_VISIBLE_DEVICES=0 # 支持多卡训练,如使用双卡,可以设置为0,1
``` ```
训练启动方式如下: 训练启动方式如下:
```bash ```bash
python -m paddle.distributed.launch train.py \ python train.py \
--data_dir ./lexical_analysis_dataset_tiny \ --data_dir ./lexical_analysis_dataset_tiny \
--model_save_dir ./save_dir \ --model_save_dir ./save_dir \
--epochs 10 \ --epochs 10 \
--batch_size 32 \ --batch_size 32 \
--use_gpu True \ --n_gpu 1 \
# --init_checkpoint ./save_dir/final # --init_checkpoint ./save_dir/final
``` ```
......
...@@ -43,15 +43,13 @@ class LacDataset(paddle.io.Dataset): ...@@ -43,15 +43,13 @@ class LacDataset(paddle.io.Dataset):
word_dict_path = os.path.join(self.base_path, 'word.dic') word_dict_path = os.path.join(self.base_path, 'word.dic')
label_dict_path = os.path.join(self.base_path, 'tag.dic') label_dict_path = os.path.join(self.base_path, 'tag.dic')
word_rep_dict_path = os.path.join(self.base_path, 'q2b.dic') word_rep_dict_path = os.path.join(self.base_path, 'q2b.dic')
self.word_vocab = self._load_kv_dict( self.word_vocab = self._load_vocab(word_dict_path)
word_dict_path, value_func=np.int64, reverse=True) self.label_vocab = self._load_vocab(label_dict_path)
self.label_vocab = self._load_kv_dict( self.word_replace_dict = self._load_vocab(word_rep_dict_path)
label_dict_path, value_func=np.int64, reverse=True)
self.word_replace_dict = self._load_kv_dict(word_rep_dict_path)
# Calculate vocab size and labels number, note: vocab value strats from 0. # Calculate vocab size and labels number, note: vocab value strats from 0.
self.vocab_size = max(self.word_vocab.values()) + 1 self.vocab_size = len(self.word_vocab)
self.num_labels = max(self.label_vocab.values()) + 1 self.num_labels = len(self.label_vocab)
if self.mode in {"train", "test", "infer"}: if self.mode in {"train", "test", "infer"}:
self.dataset_path = os.path.join(self.base_path, self.dataset_path = os.path.join(self.base_path,
...@@ -109,30 +107,27 @@ class LacDataset(paddle.io.Dataset): ...@@ -109,30 +107,27 @@ class LacDataset(paddle.io.Dataset):
self.total += 1 self.total += 1
def _load_kv_dict(self, def _load_vocab(self, dict_path):
dict_path,
delimiter="\t",
key_func=None,
value_func=None,
reverse=False):
""" """
Load key-value dict from file Load vocab from file
""" """
vocab = {} vocab = {}
for line in open(dict_path, "r", encoding='utf8'): reverse = None
terms = line.strip("\n").split(delimiter) with open(dict_path, "r", encoding='utf8') as fin:
if len(terms) != 2: for i, line in enumerate(fin):
continue terms = line.strip("\n").split("\t")
if len(terms) == 2:
if reverse == None:
reverse = True if terms[0].isdigit() else False
if reverse: if reverse:
value, key = terms value, key = terms
else: else:
key, value = terms key, value = terms
if key in vocab: elif len(terms) == 1:
raise KeyError("key duplicated with [%s]" % (key)) key, value = terms[0], i
if key_func: else:
key = key_func(key) raise ValueError("Error line: %s in file: %s" %
if value_func: (line, dict_path))
value = value_func(value)
vocab[key] = value vocab[key] = value
return vocab return vocab
......
...@@ -34,21 +34,16 @@ parser.add_argument("--model_save_dir", type=str, default=None, help="The model ...@@ -34,21 +34,16 @@ parser.add_argument("--model_save_dir", type=str, default=None, help="The model
parser.add_argument("--epochs", type=int, default=10, help="Corpus iteration num.") parser.add_argument("--epochs", type=int, default=10, help="Corpus iteration num.")
parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.") parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest seqence.") parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="If set, use GPU for training.") parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.")
parser.add_argument("--base_lr", type=float, default=0.001, help="The basic learning rate that affects the entire network.") parser.add_argument("--base_lr", type=float, default=0.001, help="The basic learning rate that affects the entire network.")
parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.") parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.")
parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.") parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.")
args = parser.parse_args() parser.add_argument("--verbose", type=ast.literal_eval, default=128, help="Print reader and training time in details.")
# yapf: enable # yapf: enable
def train(args): def train(args):
if args.use_gpu: paddle.set_device("gpu" if args.n_gpu else "cpu")
place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
paddle.set_device("gpu")
else:
place = paddle.CPUPlace()
paddle.set_device("cpu")
# create dataset. # create dataset.
train_dataset = LacDataset(args.data_dir, mode='train') train_dataset = LacDataset(args.data_dir, mode='train')
...@@ -69,7 +64,6 @@ def train(args): ...@@ -69,7 +64,6 @@ def train(args):
train_loader = paddle.io.DataLoader( train_loader = paddle.io.DataLoader(
dataset=train_dataset, dataset=train_dataset,
batch_sampler=train_sampler, batch_sampler=train_sampler,
places=place,
return_list=True, return_list=True,
collate_fn=batchify_fn) collate_fn=batchify_fn)
...@@ -81,7 +75,6 @@ def train(args): ...@@ -81,7 +75,6 @@ def train(args):
test_loader = paddle.io.DataLoader( test_loader = paddle.io.DataLoader(
dataset=test_dataset, dataset=test_dataset,
batch_sampler=test_sampler, batch_sampler=test_sampler,
places=place,
return_list=True, return_list=True,
collate_fn=batchify_fn) collate_fn=batchify_fn)
...@@ -101,6 +94,8 @@ def train(args): ...@@ -101,6 +94,8 @@ def train(args):
model.load(args.init_checkpoint) model.load(args.init_checkpoint)
# Start training # Start training
callbacks = paddle.callbacks.ProgBarLogger(
log_freq=10, verbose=3) if args.verbose else None
model.fit(train_data=train_loader, model.fit(train_data=train_loader,
eval_data=test_loader, eval_data=test_loader,
batch_size=args.batch_size, batch_size=args.batch_size,
...@@ -109,9 +104,13 @@ def train(args): ...@@ -109,9 +104,13 @@ def train(args):
log_freq=10, log_freq=10,
save_dir=args.model_save_dir, save_dir=args.model_save_dir,
save_freq=1, save_freq=1,
shuffle=True) shuffle=True,
callbacks=callbacks)
if __name__ == "__main__": if __name__ == "__main__":
print(args) args = parser.parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(train, args=(args, ), nprocs=args.n_gpu)
else:
train(args) train(args)
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
- paddlepaddle >= 2.0.0,安装方式请参考 [快速安装](https://www.paddlepaddle.org.cn/install/quick) - paddlepaddle >= 2.0.0,安装方式请参考 [快速安装](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0rc, 安装方式:`pip install paddlenlp>=2.0.0rc` - paddlenlp >= 2.0.0rc, 安装方式:`pip install paddlenlp\>=2.0.0rc`
### 2.2 数据准备 ### 2.2 数据准备
......
...@@ -69,7 +69,7 @@ def predict(model, data_loader, ds): ...@@ -69,7 +69,7 @@ def predict(model, data_loader, ds):
pred_list.append(pred.numpy()) pred_list.append(pred.numpy())
len_list.append(lens.numpy()) len_list.append(lens.numpy())
preds = parse_decodes(ds, pred_list, len_list) preds = parse_decodes(ds, pred_list, len_list)
print('\n'.join(preds[:10])) return preds
def convert_example(example, tokenizer, label_vocab): def convert_example(example, tokenizer, label_vocab):
...@@ -177,4 +177,5 @@ if __name__ == '__main__': ...@@ -177,4 +177,5 @@ if __name__ == '__main__':
paddle.save(model.state_dict(), paddle.save(model.state_dict(),
'./ernie_result/model_%d.pdparams' % step) './ernie_result/model_%d.pdparams' % step)
pred = predict(model, test_loader, test_ds) preds = predict(model, test_loader, test_ds)
print('\n'.join(preds[:10]))
...@@ -17,16 +17,16 @@ PaddleNLP集成的数据集MSRA-NER数据集对文件格式做了调整:每一 ...@@ -17,16 +17,16 @@ PaddleNLP集成的数据集MSRA-NER数据集对文件格式做了调整:每一
- Python >= 3.6 - Python >= 3.6
- paddlepaddle >= 2.0.0rc1,安装方式请参考 [快速安装](https://www.paddlepaddle.org.cn/install/quick) - paddlepaddle >= 2.0.0,安装方式请参考 [快速安装](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0b2, 安装方式:`pip install paddlenlp>=2.0.0b2` - paddlenlp >= 2.0.0rc, 安装方式:`pip install paddlenlp\>=2.0.0rc`
### 2.2 启动MSRA-NER任务 ### 2.2 启动MSRA-NER任务
```shell ```shell
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
python -u ./run_msra_ner.py \ python -u ./train.py \
--model_name_or_path bert-base-multilingual-uncased \ --model_name_or_path bert-base-multilingual-uncased \
--max_seq_length 128 \ --max_seq_length 128 \
--batch_size 32 \ --batch_size 32 \
...@@ -39,7 +39,7 @@ python -u ./run_msra_ner.py \ ...@@ -39,7 +39,7 @@ python -u ./run_msra_ner.py \
``` ```
其中参数释义如下: 其中参数释义如下:
- `model_name_or_path`: 指示了某种特定配置的模型,对应有其预训练模型和预训练时使用的 tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。 - `model_name_or_path`: 指示了某种特定配置的模型,对应有其预训练模型和预训练时使用的 tokenizer,支持[PaddleNLP transformer类预训练模型](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/docs/transformers.md)中除ernie-gen以外的所有模型。若使用非BERT系列模型,需修改脚本导入相应的Task和Tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。
- `max_seq_length`: 表示最大句子长度,超过该长度将被截断。 - `max_seq_length`: 表示最大句子长度,超过该长度将被截断。
- `batch_size`: 表示每次迭代**每张卡**上的样本数目。 - `batch_size`: 表示每次迭代**每张卡**上的样本数目。
- `learning_rate`: 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。 - `learning_rate`: 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。
...@@ -67,6 +67,46 @@ Precision | 0.908957 | ...@@ -67,6 +67,46 @@ Precision | 0.908957 |
Recall | 0.926683 | Recall | 0.926683 |
F1 | 0.917734 | F1 | 0.917734 |
## 启动评估
```shell
export CUDA_VISIBLE_DEVICES=0
python -u ./eval.py \
--model_name_or_path bert-base-multilingual-uncased \
--max_seq_length 128 \
--batch_size 32 \
--use_gpu True \
--init_checkpoint_path tmp/msra_ner/model_500.pdparams
```
其中参数释义如下:
- `model_name_or_path`: 指示了某种特定配置的模型,对应有其预训练模型和预训练时使用的 tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。
- `max_seq_length`: 表示最大句子长度,超过该长度将被截断。
- `batch_size`: 表示每次迭代**每张卡**上的样本数目。
- `use_gpu`: 是否使用GPU。
- `init_checkpoint_path`: 模型加载路径。
## 启动预测
```shell
export CUDA_VISIBLE_DEVICES=0
python -u ./predict.py \
--model_name_or_path bert-base-multilingual-uncased \
--max_seq_length 128 \
--batch_size 32 \
--use_gpu True \
--init_checkpoint_path tmp/msra_ner/model_500.pdparams
```
## 使用其它预训练模型
本项目支持[PaddleNLP transformer类预训练模型](../../docs/transformers.md)中除ernie-gen以外的所有模型。若使用非BERT系列模型,需修改脚本导入相应的Task和Tokenizer。例如使用ERNIE系列模型,经查[PaddleNLP transformer类预训练模型](../../docs/transformers.md),需要加入以下代码:
```python
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
```
## 参考 ## 参考
[The third international Chinese language processing bakeoff: Word segmentation and named entity recognition](https://faculty.washington.edu/levow/papers/sighan06.pdf) [The third international Chinese language processing bakeoff: Word segmentation and named entity recognition](https://faculty.washington.edu/levow/papers/sighan06.pdf)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import ast
import random
import time
import math
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
import paddlenlp as ppnlp
from paddlenlp.datasets import MSRA_NER
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
# CLI definition for the MSRA-NER prediction script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_name_or_path",
    type=str,
    default=None,
    required=True,
    help="Path to pre-trained model or shortcut name selected in the list: " +
    ", ".join(list(BertTokenizer.pretrained_init_configuration.keys())))
parser.add_argument(
    "--init_checkpoint_path",
    type=str,
    default=None,
    required=True,
    help="The model checkpoint path.", )
parser.add_argument(
    "--max_seq_length",
    type=int,
    default=128,
    help="The maximum total input sequence length after tokenization. Sequences longer "
    "than this will be truncated, sequences shorter will be padded.", )
parser.add_argument(
    "--batch_size",
    type=int,
    default=8,
    help="Batch size per GPU/CPU for training.", )
parser.add_argument(
    "--use_gpu",
    default=True,
    type=ast.literal_eval,
    help="If set, use GPU for training.")
def convert_example(example,
                    tokenizer,
                    label_list,
                    no_entity_id,
                    max_seq_length=512,
                    is_test=False):
    """Convert one raw MSRA-NER example into model-ready features.

    Args:
        example: When ``is_test`` is False, a sequence whose first element is
            the "\\002"-joined text and whose last element is the
            "\\002"-joined label string. When ``is_test`` is True the example
            is consumed as-is (it is NOT split here; callers must supply the
            token sequence directly — TODO confirm that asymmetry is wanted).
        tokenizer: Callable mapping a token to its sub-tokens; must also
            expose ``cls_token``, ``sep_token`` and ``convert_tokens_to_ids``.
        label_list: Ordered label names; list position becomes the label id.
        no_entity_id: Label id assigned to the [CLS]/[SEP] positions.
        max_seq_length: Maximum sequence length including special tokens.
        is_test: If True, no label features are produced.

    Returns:
        ``(input_ids, segment_ids, valid_length, labels_id)`` for labeled
        examples, ``(input_ids, segment_ids, valid_length)`` when ``is_test``.

    Raises:
        ValueError: If token and label counts disagree.
    """

    def _truncate_seqs(seqs, max_seq_length):
        """Truncate one sequence (or a pair) in place to fit max_seq_length."""
        if len(seqs) == 1:  # single sentence
            # Account for [CLS] and [SEP] with "- 2"
            seqs[0] = seqs[0][0:(max_seq_length - 2)]
        else:  # sentence pair
            # Account for [CLS], [SEP], [SEP] with "- 3"
            tokens_a, tokens_b = seqs
            max_seq_length -= 3
            while True:  # truncate with longest_first strategy
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= max_seq_length:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        return seqs

    def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
        """Join sequences with separators; build segment ids and p_mask."""
        concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
        segment_ids = sum(
            ([i] * (len(seq) + len(sep))
             for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
        if isinstance(seq_mask, int):
            seq_mask = [[seq_mask] * len(seq) for seq in seqs]
        if isinstance(separator_mask, int):
            separator_mask = [[separator_mask] * len(sep) for sep in separators]
        p_mask = sum((s_mask + mask
                      for sep, seq, s_mask, mask in zip(
                          separators, seqs, seq_mask, separator_mask)), [])
        return concat, segment_ids, p_mask

    def _reseg_token_label(tokens, tokenizer, labels=None):
        """Re-tokenize with the sub-word tokenizer, expanding labels.

        A "B-*" label on a token that splits into several sub-tokens stays on
        the first sub-token only; the remaining sub-tokens get "I-*".
        """
        if labels:
            if len(tokens) != len(labels):
                raise ValueError(
                    "The length of tokens must be same with labels")
            ret_tokens = []
            ret_labels = []
            for token, label in zip(tokens, labels):
                sub_token = tokenizer(token)
                # Skip tokens the tokenizer maps to nothing.
                if len(sub_token) == 0:
                    continue
                ret_tokens.extend(sub_token)
                ret_labels.append(label)
                if len(sub_token) < 2:
                    continue
                sub_label = label
                if label.startswith("B-"):
                    sub_label = "I-" + label[2:]
                ret_labels.extend([sub_label] * (len(sub_token) - 1))
            if len(ret_tokens) != len(ret_labels):
                raise ValueError(
                    "The length of ret_tokens can't match with labels")
            return ret_tokens, ret_labels
        else:
            ret_tokens = []
            for token in tokens:
                sub_token = tokenizer(token)
                # Skip tokens the tokenizer maps to nothing.
                if sub_token:
                    ret_tokens.extend(sub_token)
            return ret_tokens, None

    if not is_test:
        # Split the packed text/label strings and map label names to ids.
        label = example[-1].split("\002")
        example = example[0].split("\002")
        label_map = {name: i for i, name in enumerate(label_list)}
    else:
        label = None

    tokens_raw, labels_raw = _reseg_token_label(
        tokens=example, labels=label, tokenizer=tokenizer)
    # Truncate to max_seq_length, then add the special tokens.
    tokens_trun = _truncate_seqs([tokens_raw], max_seq_length)
    tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
    tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
                                          len(tokens_trun))
    # Convert tokens to vocabulary ids.
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    valid_length = len(input_ids)
    if labels_raw:
        labels_trun = _truncate_seqs([labels_raw], max_seq_length)[0]
        # Special-token positions take the "no entity" label.
        labels_id = [no_entity_id] + [label_map[lbl]
                                      for lbl in labels_trun] + [no_entity_id]
    if not is_test:
        return input_ids, segment_ids, valid_length, labels_id
    else:
        return input_ids, segment_ids, valid_length
def parse_decodes(input_words, id2label, decodes, lens):
    """Turn per-batch tag-id predictions into human-readable entity strings.

    For each sample, characters are grouped into segments: a tag of "O" or
    "B-*" opens a new segment, any other tag extends the current one. Each
    output line is the concatenation of ``str((segment, tag))`` pairs, where
    a "B-*" tag is reported by its entity name (the part after the dash).
    The first predicted position (the [CLS] slot) is skipped.
    """
    flat_decodes = [seq for batch in decodes for seq in batch]
    flat_lens = [seq_len for batch in lens for seq_len in batch]

    results = []
    for sample_idx, seq_len in enumerate(flat_lens):
        raw_sent = input_words[sample_idx][0].replace("\002", "")[:seq_len]
        tag_names = [
            id2label[tid] for tid in flat_decodes[sample_idx][1:seq_len]
        ]

        segments = []
        segment_tags = []
        current = ""
        for ch, tag in zip(raw_sent, tag_names):
            if tag.startswith('B-') or tag == 'O':
                if current:
                    segments.append(current)
                segment_tags.append(
                    tag.split('-')[1] if tag.startswith('B-') else tag)
                current = ch
            else:
                current += ch
        # Flush the trailing segment if one is still open.
        if len(segments) < len(segment_tags):
            segments.append(current)
        results.append(''.join(
            str(pair) for pair in zip(segments, segment_tags)))
    return results
def do_predict(args):
    """Run NER prediction over the MSRA test split and write results.txt.

    Loads the tokenizer/model named by ``args.model_name_or_path``, optionally
    restores weights from ``args.init_checkpoint_path``, decodes the argmax
    tag for every position, and dumps the parsed entity spans to results.txt.
    """
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # The train split is loaded only to obtain the label vocabulary.
    train_dataset, predict_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
        ["train", "test"])
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_dataset.get_labels()
    label_num = len(label_list)
    # Last label in the list is treated as the "no entity" label.
    no_entity_id = label_num - 1
    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_list=label_list,
        no_entity_id=label_num - 1,
        max_seq_length=args.max_seq_length)
    # Padding value for the label column; labels are unused during prediction.
    ignore_label = -100
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
        Stack(),  # length
        Pad(axis=0, pad_val=ignore_label)  # label
    ): fn(samples)
    # Keep the raw text around: parse_decodes needs it to rebuild sentences.
    raw_data = predict_dataset.data
    id2label = dict(enumerate(predict_dataset.get_labels()))
    predict_dataset = predict_dataset.apply(trans_func, lazy=True)
    # NOTE(review): drop_last=True discards a final incomplete batch, so the
    # tail of the test set gets no predictions — confirm this is intended.
    predict_batch_sampler = paddle.io.BatchSampler(
        predict_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=True)
    predict_data_loader = DataLoader(
        dataset=predict_dataset,
        batch_sampler=predict_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    model = BertForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        # labels are batched by batchify_fn but not consumed here.
        input_ids, segment_ids, length, labels = batch
        logits = model(input_ids, segment_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())
    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
if __name__ == "__main__":
    # Script entry point: parse CLI flags and run prediction.
    args = parser.parse_args()
    do_predict(args)
...@@ -30,88 +30,78 @@ from paddlenlp.transformers import BertForTokenClassification, BertTokenizer ...@@ -30,88 +30,78 @@ from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.metrics import ChunkEvaluator from paddlenlp.metrics import ChunkEvaluator
parser = argparse.ArgumentParser()
def parse_args(): parser.add_argument(
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name_or_path", "--model_name_or_path",
default=None, default=None,
type=str, type=str,
required=True, required=True,
help="Path to pre-trained model or shortcut name selected in the list: " help="Path to pre-trained model or shortcut name selected in the list: " +
+ ", ".join(list(BertTokenizer.pretrained_init_configuration.keys()))) ", ".join(list(BertTokenizer.pretrained_init_configuration.keys())))
parser.add_argument( parser.add_argument(
"--output_dir", "--output_dir",
default=None, default=None,
type=str, type=str,
required=True, required=True,
help="The output directory where the model predictions and checkpoints will be written.", help="The output directory where the model predictions and checkpoints will be written.",
) )
parser.add_argument( parser.add_argument(
"--max_seq_length", "--max_seq_length",
default=128, default=128,
type=int, type=int,
help="The maximum total input sequence length after tokenization. Sequences longer " help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.", ) "than this will be truncated, sequences shorter will be padded.", )
parser.add_argument( parser.add_argument(
"--batch_size", "--batch_size",
default=8, default=8,
type=int, type=int,
help="Batch size per GPU/CPU for training.", ) help="Batch size per GPU/CPU for training.", )
parser.add_argument( parser.add_argument(
"--learning_rate", "--learning_rate",
default=5e-5, default=5e-5,
type=float, type=float,
help="The initial learning rate for Adam.") help="The initial learning rate for Adam.")
parser.add_argument( parser.add_argument(
"--weight_decay", "--weight_decay",
default=0.0, default=0.0,
type=float, type=float,
help="Weight decay if we apply some.") help="Weight decay if we apply some.")
parser.add_argument( parser.add_argument(
"--adam_epsilon", "--adam_epsilon",
default=1e-8, default=1e-8,
type=float, type=float,
help="Epsilon for Adam optimizer.") help="Epsilon for Adam optimizer.")
parser.add_argument( parser.add_argument(
"--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument( parser.add_argument(
"--num_train_epochs", "--num_train_epochs",
default=3, default=3,
type=int, type=int,
help="Total number of training epochs to perform.", ) help="Total number of training epochs to perform.", )
parser.add_argument( parser.add_argument(
"--max_steps", "--max_steps",
default=-1, default=-1,
type=int, type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.", help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
) )
parser.add_argument( parser.add_argument(
"--warmup_steps", "--warmup_steps",
default=0, default=0,
type=int, type=int,
help="Linear warmup over warmup_steps.") help="Linear warmup over warmup_steps.")
parser.add_argument( parser.add_argument(
"--logging_steps", "--logging_steps", type=int, default=1, help="Log every X updates steps.")
type=int, parser.add_argument(
default=1,
help="Log every X updates steps.")
parser.add_argument(
"--save_steps", "--save_steps",
type=int, type=int,
default=100, default=100,
help="Save checkpoint every X updates steps.") help="Save checkpoint every X updates steps.")
parser.add_argument( parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization") "--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument( parser.add_argument(
"--n_gpu", "--n_gpu", type=int, default=1, help="number of gpus to use, 0 for cpu.")
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
def evaluate(model, loss_fct, metric, data_loader, label_num): def evaluate(model, loss_fct, metric, data_loader, label_num):
...@@ -316,6 +306,7 @@ def do_train(args): ...@@ -316,6 +306,7 @@ def do_train(args):
tic_train = time.time() tic_train = time.time()
for epoch in range(args.num_train_epochs): for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader): for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, length, labels = batch input_ids, segment_ids, length, labels = batch
logits = model(input_ids, segment_ids) logits = model(input_ids, segment_ids)
loss = loss_fct( loss = loss_fct(
...@@ -337,11 +328,17 @@ def do_train(args): ...@@ -337,11 +328,17 @@ def do_train(args):
paddle.save(model.state_dict(), paddle.save(model.state_dict(),
os.path.join(args.output_dir, os.path.join(args.output_dir,
"model_%d.pdparams" % global_step)) "model_%d.pdparams" % global_step))
global_step += 1 # Save final model
if (global_step) % args.save_steps != 0:
evaluate(model, loss_fct, metric, test_data_loader, label_num)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
paddle.save(model.state_dict(),
os.path.join(args.output_dir,
"model_%d.pdparams" % global_step))
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parser.parse_args()
if args.n_gpu > 1: if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu) paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else: else:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册