From 352d9fb880c0356916312bbf45594751cbfa66d6 Mon Sep 17 00:00:00 2001
From: kinghuin
Date: Fri, 5 Feb 2021 22:01:53 +0800
Subject: [PATCH] optimize lac and msra_ner example (#5270)

* optimize lac and msra_ner example

* optimize ner and lac

* rename filename to lexical_analysis_dataset_tiny_path0

* modify lac dataset url
---
 PaddleNLP/examples/lexical_analysis/README.md |  10 +-
 PaddleNLP/examples/lexical_analysis/data.py   |  53 ++--
 PaddleNLP/examples/lexical_analysis/train.py  |  25 +-
 .../express_ner/README.md                     |   2 +-
 .../express_ner/run_ernie.py                  |   5 +-
 .../msra_ner/README.md                        |  48 +++-
 .../msra_ner/predict.py                       | 272 ++++++++++++++++++
 .../msra_ner/{run_msra_ner.py => train.py}    | 165 ++++++-----
 8 files changed, 442 insertions(+), 138 deletions(-)
 create mode 100644 PaddleNLP/examples/named_entity_recognition/msra_ner/predict.py
 rename PaddleNLP/examples/named_entity_recognition/msra_ner/{run_msra_ner.py => train.py} (80%)

diff --git a/PaddleNLP/examples/lexical_analysis/README.md b/PaddleNLP/examples/lexical_analysis/README.md
index cb651e7e..166156a1 100644
--- a/PaddleNLP/examples/lexical_analysis/README.md
+++ b/PaddleNLP/examples/lexical_analysis/README.md
@@ -20,14 +20,14 @@
 
 - paddlepaddle >= 2.0.0rc1; for installation instructions, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick).
 
-- paddlenlp >= 2.0.0b2, installed with: `pip install paddlenlp\>=2.0.0b2`
+- paddlenlp >= 2.0.0rc, installed with: `pip install paddlenlp\>=2.0.0rc`
 
 ### 2.2 Data preparation
 
 We provide a small set of samples to illustrate the input data format. Run the following commands to download and extract the sample dataset:
 
 ```bash
-wget --no-check-certificate https://paddlenlp.bj.bcebos.com/data/lexical_analysis_dataset_tiny.tar.gz
+wget --no-check-certificate https://paddlenlp.bj.bcebos.com/datasets/lexical_analysis_dataset_tiny.tar.gz
 tar xvf lexical_analysis_dataset_tiny.tar.gz
 ```
 
@@ -54,18 +54,18 @@ tar xvf lexical_analysis_dataset_tiny.tar.gz
 
 Training runs on both CPU and GPU. Before using a GPU, specify the id(s) of the card(s) to use:
 
 ```bash
-export CUDA_VISIBLE_DEVICES=0,1 # multi-card training is supported
+export CUDA_VISIBLE_DEVICES=0 # multi-card training is supported; for two cards, set this to 0,1
 ```
 
 Start training as follows:
 
 ```bash
-python -m paddle.distributed.launch train.py \
+python train.py \
         --data_dir ./lexical_analysis_dataset_tiny \
         --model_save_dir ./save_dir \
         --epochs 10 \
         --batch_size 32 \
-        --use_gpu True \
+        --n_gpu 1 \
         # --init_checkpoint ./save_dir/final
 ```
 
diff --git a/PaddleNLP/examples/lexical_analysis/data.py b/PaddleNLP/examples/lexical_analysis/data.py
index 3342b7c8..d9477216 100644
--- a/PaddleNLP/examples/lexical_analysis/data.py
+++ b/PaddleNLP/examples/lexical_analysis/data.py
@@ -43,15 +43,13 @@ class LacDataset(paddle.io.Dataset):
         word_dict_path = os.path.join(self.base_path, 'word.dic')
         label_dict_path = os.path.join(self.base_path, 'tag.dic')
         word_rep_dict_path = os.path.join(self.base_path, 'q2b.dic')
-        self.word_vocab = self._load_kv_dict(
-            word_dict_path, value_func=np.int64, reverse=True)
-        self.label_vocab = self._load_kv_dict(
-            label_dict_path, value_func=np.int64, reverse=True)
-        self.word_replace_dict = self._load_kv_dict(word_rep_dict_path)
+        self.word_vocab = self._load_vocab(word_dict_path)
+        self.label_vocab = self._load_vocab(label_dict_path)
+        self.word_replace_dict = self._load_vocab(word_rep_dict_path)
 
         # Calculate vocab size and labels number, note: vocab value starts from 0.
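         # Ids in word.dic and tag.dic start at 0 and are contiguous, so the
         # dictionary length equals the vocabulary size.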
-        self.vocab_size = max(self.word_vocab.values()) + 1
-        self.num_labels = max(self.label_vocab.values()) + 1
+        self.vocab_size = len(self.word_vocab)
+        self.num_labels = len(self.label_vocab)
 
         if self.mode in {"train", "test", "infer"}:
             self.dataset_path = os.path.join(self.base_path,
@@ -109,31 +107,28 @@ class LacDataset(paddle.io.Dataset):
 
                 self.total += 1
 
-    def _load_kv_dict(self,
-                      dict_path,
-                      delimiter="\t",
-                      key_func=None,
-                      value_func=None,
-                      reverse=False):
+    def _load_vocab(self, dict_path):
         """
-        Load key-value dict from file
+        Load vocab from file
         """
         vocab = {}
-        for line in open(dict_path, "r", encoding='utf8'):
-            terms = line.strip("\n").split(delimiter)
-            if len(terms) != 2:
-                continue
-            if reverse:
-                value, key = terms
-            else:
-                key, value = terms
-            if key in vocab:
-                raise KeyError("key duplicated with [%s]" % (key))
-            if key_func:
-                key = key_func(key)
-            if value_func:
-                value = value_func(value)
-            vocab[key] = value
+        reverse = None
+        with open(dict_path, "r", encoding='utf8') as fin:
+            for i, line in enumerate(fin):
+                terms = line.strip("\n").split("\t")
+                if len(terms) == 2:
+                    if reverse is None:
+                        # Lines such as "0\tPAD" store the id first.
+                        reverse = terms[0].isdigit()
+                    if reverse:
+                        value, key = terms
+                        # Keep ids as integers, as _load_kv_dict did via np.int64.
+                        value = int(value)
+                    else:
+                        key, value = terms
+                elif len(terms) == 1:
+                    key, value = terms[0], i
+                else:
+                    raise ValueError("Error line: %s in file: %s" %
+                                     (line, dict_path))
+                vocab[key] = value
         return vocab
 
     def _convert_tokens_to_ids(self,
diff --git a/PaddleNLP/examples/lexical_analysis/train.py b/PaddleNLP/examples/lexical_analysis/train.py
index 5ad9dc4b..1ddb0c12 100644
--- a/PaddleNLP/examples/lexical_analysis/train.py
+++ b/PaddleNLP/examples/lexical_analysis/train.py
@@ -34,21 +34,16 @@ parser.add_argument("--model_save_dir", type=str, default=None, help="The model
 parser.add_argument("--epochs", type=int, default=10, help="Corpus iteration num.")
 parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
 parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest sequence.")
-parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="If set, use GPU for training.")
+parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.")
 parser.add_argument("--base_lr", type=float, default=0.001, help="The basic learning rate that affects the entire network.")
 parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.")
 parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.")
-args = parser.parse_args()
+parser.add_argument("--verbose", type=ast.literal_eval, default=False, help="Print reader and training time in detail.")
 # yapf: enable


def train(args):
-    if args.use_gpu:
-        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
-        paddle.set_device("gpu")
-    else:
-        place = paddle.CPUPlace()
-        paddle.set_device("cpu")
+    paddle.set_device("gpu" if args.n_gpu else "cpu")
 
     # create dataset.
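     # LacDataset builds the word and label vocabs from word.dic and tag.dic
     # under data_dir and reads the split selected by `mode`.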
     train_dataset = LacDataset(args.data_dir, mode='train')
@@ -69,7 +64,6 @@ def train(args):
     train_loader = paddle.io.DataLoader(
         dataset=train_dataset,
         batch_sampler=train_sampler,
-        places=place,
         return_list=True,
         collate_fn=batchify_fn)
 
@@ -81,7 +75,6 @@ def train(args):
     test_loader = paddle.io.DataLoader(
         dataset=test_dataset,
         batch_sampler=test_sampler,
-        places=place,
         return_list=True,
         collate_fn=batchify_fn)
 
@@ -101,6 +94,8 @@ def train(args):
         model.load(args.init_checkpoint)
 
     # Start training
+    callbacks = paddle.callbacks.ProgBarLogger(
+        log_freq=10, verbose=3) if args.verbose else None
     model.fit(train_data=train_loader,
               eval_data=test_loader,
               batch_size=args.batch_size,
@@ -109,9 +104,13 @@ def train(args):
               log_freq=10,
               save_dir=args.model_save_dir,
               save_freq=1,
-              shuffle=True)
+              shuffle=True,
+              callbacks=callbacks)
 
 
 if __name__ == "__main__":
-    print(args)
-    train(args)
+    args = parser.parse_args()
+    if args.n_gpu > 1:
+        paddle.distributed.spawn(train, args=(args, ), nprocs=args.n_gpu)
+    else:
+        train(args)
diff --git a/PaddleNLP/examples/named_entity_recognition/express_ner/README.md b/PaddleNLP/examples/named_entity_recognition/express_ner/README.md
index 6e486480..dffabf00 100644
--- a/PaddleNLP/examples/named_entity_recognition/express_ner/README.md
+++ b/PaddleNLP/examples/named_entity_recognition/express_ner/README.md
@@ -12,7 +12,7 @@
 
 - paddlepaddle >= 2.0.0; for installation instructions, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick).
 
-- paddlenlp >= 2.0.0rc, installed with: `pip install paddlenlp>=2.0.0rc`
+- paddlenlp >= 2.0.0rc, installed with: `pip install paddlenlp\>=2.0.0rc`
 
 ### 2.2 Data preparation
 
diff --git a/PaddleNLP/examples/named_entity_recognition/express_ner/run_ernie.py b/PaddleNLP/examples/named_entity_recognition/express_ner/run_ernie.py
index 33389731..895a61e3 100644
--- a/PaddleNLP/examples/named_entity_recognition/express_ner/run_ernie.py
+++ b/PaddleNLP/examples/named_entity_recognition/express_ner/run_ernie.py
@@ -69,7 +69,7 @@ def predict(model, data_loader, ds):
         pred_list.append(pred.numpy())
         len_list.append(lens.numpy())
     preds = parse_decodes(ds, pred_list, len_list)
-    print('\n'.join(preds[:10]))
+    return preds
 
 
 def convert_example(example, tokenizer, label_vocab):
@@ -177,4 +177,5 @@ if __name__ == '__main__':
             paddle.save(model.state_dict(),
                         './ernie_result/model_%d.pdparams' % step)
 
-    pred = predict(model, test_loader, test_ds)
+    preds = predict(model, test_loader, test_ds)
+    print('\n'.join(preds[:10]))
diff --git a/PaddleNLP/examples/named_entity_recognition/msra_ner/README.md b/PaddleNLP/examples/named_entity_recognition/msra_ner/README.md
index 23dc4666..d0cb3a6d 100644
--- a/PaddleNLP/examples/named_entity_recognition/msra_ner/README.md
+++ b/PaddleNLP/examples/named_entity_recognition/msra_ner/README.md
@@ -17,16 +17,16 @@ The MSRA-NER dataset bundled with PaddleNLP adjusts the file format: each
 
 - Python >= 3.6
 
-- paddlepaddle >= 2.0.0rc1; for installation instructions, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick).
+- paddlepaddle >= 2.0.0; for installation instructions, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick).
 
-- paddlenlp >= 2.0.0b2, installed with: `pip install paddlenlp>=2.0.0b2`
+- paddlenlp >= 2.0.0rc, installed with: `pip install paddlenlp\>=2.0.0rc`
 
 ### 2.2 Run the MSRA-NER task
 
 ```shell
 export CUDA_VISIBLE_DEVICES=0
 
-python -u ./run_msra_ner.py \
+python -u ./train.py \
     --model_name_or_path bert-base-multilingual-uncased \
     --max_seq_length 128 \
     --batch_size 32 \
@@ -39,7 +39,7 @@
 ```
 
 The parameters are described as follows:
- `model_name_or_path`:
 specifies a model with a particular configuration, together with its pretrained weights and the tokenizer used during pretraining. All models in [PaddleNLP transformer-style pretrained models](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/docs/transformers.md) are supported except ernie-gen; to use a non-BERT model, modify the script to import the corresponding task class and tokenizer. If the model files are stored locally, a directory path can also be given here.
 - `max_seq_length`: the maximum sentence length; sequences longer than this are truncated.
 - `batch_size`: the number of samples **per card** in each iteration.
 - `learning_rate`: the base learning rate; it is multiplied by the value produced by the learning rate scheduler to give the current learning rate.
@@ -67,6 +67,46 @@
 Precision | 0.908957 |
 Recall    | 0.926683 |
 F1        | 0.917734 |
 
+## Run evaluation
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+
+python -u ./eval.py \
+    --model_name_or_path bert-base-multilingual-uncased \
+    --max_seq_length 128 \
+    --batch_size 32 \
+    --use_gpu True \
+    --init_checkpoint_path tmp/msra_ner/model_500.pdparams
+```
+
+The parameters are described as follows:
+- `model_name_or_path`: specifies a model with a particular configuration, together with its pretrained weights and the tokenizer used during pretraining. If the model files are stored locally, a directory path can also be given here.
+- `max_seq_length`: the maximum sentence length; sequences longer than this are truncated.
+- `batch_size`: the number of samples **per card** in each iteration.
+- `use_gpu`: whether to use the GPU.
+- `init_checkpoint_path`: the path of the model checkpoint to load.
+
+## Run prediction
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+
+python -u ./predict.py \
+    --model_name_or_path bert-base-multilingual-uncased \
+    --max_seq_length 128 \
+    --batch_size 32 \
+    --use_gpu True \
+    --init_checkpoint_path tmp/msra_ner/model_500.pdparams
+```
+
+## Use other pretrained models
+
+This project supports all models in [PaddleNLP transformer-style pretrained models](../../docs/transformers.md) except ernie-gen. To use a non-BERT model, modify the script to import the corresponding task class and tokenizer. For example, to use an ERNIE model, [PaddleNLP transformer-style pretrained models](../../docs/transformers.md) shows that the following import is needed:
+```python
+from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
+```
+
 ## References
 
 [The third international Chinese language processing bakeoff: Word segmentation and named entity recognition](https://faculty.washington.edu/levow/papers/sighan06.pdf)
diff --git a/PaddleNLP/examples/named_entity_recognition/msra_ner/predict.py b/PaddleNLP/examples/named_entity_recognition/msra_ner/predict.py
new file mode 100644
index 00000000..bc400cd9
--- /dev/null
+++ b/PaddleNLP/examples/named_entity_recognition/msra_ner/predict.py
@@ -0,0 +1,272 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
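+
+# Predict named-entity labels for the MSRA-NER test set with a fine-tuned
+# BertForTokenClassification checkpoint and write the results to results.txt.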
+
+import argparse
+import os
+import ast
+import random
+import time
+import math
+from functools import partial
+
+import numpy as np
+import paddle
+from paddle.io import DataLoader
+
+import paddlenlp as ppnlp
+from paddlenlp.datasets import MSRA_NER
+from paddlenlp.data import Stack, Tuple, Pad
+from paddlenlp.transformers import BertForTokenClassification, BertTokenizer
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--model_name_or_path",
+    default=None,
+    type=str,
+    required=True,
+    help="Path to pre-trained model or shortcut name selected in the list: "
+    + ", ".join(list(BertTokenizer.pretrained_init_configuration.keys())))
+parser.add_argument(
+    "--init_checkpoint_path",
+    default=None,
+    type=str,
+    required=True,
+    help="The model checkpoint path.", )
+parser.add_argument(
+    "--max_seq_length",
+    default=128,
+    type=int,
+    help="The maximum total input sequence length after tokenization. Sequences longer "
+    "than this will be truncated, sequences shorter will be padded.", )
+parser.add_argument(
+    "--batch_size",
+    default=8,
+    type=int,
+    help="Batch size per GPU/CPU for prediction.", )
+parser.add_argument(
+    "--use_gpu",
+    type=ast.literal_eval,
+    default=True,
+    help="If set, use GPU for prediction.")
+
+
+def convert_example(example,
+                    tokenizer,
+                    label_list,
+                    no_entity_id,
+                    max_seq_length=512,
+                    is_test=False):
+    """Convert a token-classification example into the necessary features."""
+
+    def _truncate_seqs(seqs, max_seq_length):
+        if len(seqs) == 1:  # single sentence
+            # Account for [CLS] and [SEP] with "- 2"
+            seqs[0] = seqs[0][0:(max_seq_length - 2)]
+        else:  # sentence pair
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            tokens_a, tokens_b = seqs
+            max_seq_length -= 3
+            while True:  # truncate with longest_first strategy
+                total_length = len(tokens_a) + len(tokens_b)
+                if total_length <= max_seq_length:
+                    break
+                if len(tokens_a) > len(tokens_b):
+                    tokens_a.pop()
+                else:
+                    tokens_b.pop()
+        return seqs
+
+    def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
+        concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
+        segment_ids = sum(
+            ([i] * (len(seq) + len(sep))
+             for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
+        if isinstance(seq_mask, int):
+            seq_mask = [[seq_mask] * len(seq) for seq in seqs]
+        if isinstance(separator_mask, int):
+            separator_mask = [[separator_mask] * len(sep) for sep in separators]
+        p_mask = sum((s_mask + mask
+                      for sep, seq, s_mask, mask in zip(
+                          separators, seqs, seq_mask, separator_mask)), [])
+        return concat, segment_ids, p_mask
+
+    def _reseg_token_label(tokens, tokenizer, labels=None):
+        if labels:
+            if len(tokens) != len(labels):
+                raise ValueError(
+                    "The length of tokens must be same with labels")
+            ret_tokens = []
+            ret_labels = []
+            for token, label in zip(tokens, labels):
+                sub_token = tokenizer(token)
+                if len(sub_token) == 0:
+                    continue
+                ret_tokens.extend(sub_token)
+                ret_labels.append(label)
+                if len(sub_token) < 2:
+                    continue
+                sub_label = label
+                if label.startswith("B-"):
+                    sub_label = "I-" + label[2:]
+                ret_labels.extend([sub_label] * (len(sub_token) - 1))
+
+            if len(ret_tokens) != len(ret_labels):
+                raise ValueError(
+                    "The length of ret_tokens can't match with labels")
+            return ret_tokens, ret_labels
+        else:
+            ret_tokens = []
+            for token in tokens:
+                sub_token = tokenizer(token)
+                if len(sub_token) == 0:
+                    continue
+                ret_tokens.extend(sub_token)
+                if len(sub_token) < 2:
+                    continue
+
+            return ret_tokens, None
+
+    if not is_test:
+        # get the label
+        label = example[-1].split("\002")
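+        # The text field below is packed with the same "\002" separator as the
+        # label field above; parse_decodes strips it again to recover raw text.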
+        example = example[0].split("\002")
+        # create the label map
+        label_map = {}
+        for (i, l) in enumerate(label_list):
+            label_map[l] = i
+    else:
+        label = None
+
+    tokens_raw, labels_raw = _reseg_token_label(
+        tokens=example, labels=label, tokenizer=tokenizer)
+    # truncate to max_seq_length
+    tokens_trun = _truncate_seqs([tokens_raw], max_seq_length)
+    # concatenate the sequences with special tokens
+    tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
+    tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
+                                          len(tokens_trun))
+    # convert the tokens to ids
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+    valid_length = len(input_ids)
+    if labels_raw:
+        labels_trun = _truncate_seqs([labels_raw], max_seq_length)[0]
+        labels_id = [no_entity_id] + [label_map[lbl]
+                                      for lbl in labels_trun] + [no_entity_id]
+    if not is_test:
+        return input_ids, segment_ids, valid_length, labels_id
+    else:
+        return input_ids, segment_ids, valid_length
+
+
+def parse_decodes(input_words, id2label, decodes, lens):
+    decodes = [x for batch in decodes for x in batch]
+    lens = [x for batch in lens for x in batch]
+
+    outputs = []
+    for idx, end in enumerate(lens):
+        sent = input_words[idx][0].replace("\002", "")[:end]
+        tags = [id2label[x] for x in decodes[idx][1:end]]
+        sent_out = []
+        tags_out = []
+        words = ""
+        for s, t in zip(sent, tags):
+            if t.startswith('B-') or t == 'O':
+                if len(words):
+                    sent_out.append(words)
+                if t.startswith('B-'):
+                    tags_out.append(t.split('-')[1])
+                else:
+                    tags_out.append(t)
+                words = s
+            else:
+                words += s
+        if len(sent_out) < len(tags_out):
+            sent_out.append(words)
+        outputs.append(''.join(
+            [str((s, t)) for s, t in zip(sent_out, tags_out)]))
+    return outputs
+
+
+def do_predict(args):
+    paddle.set_device("gpu" if args.use_gpu else "cpu")
+
+    train_dataset, predict_dataset = ppnlp.datasets.MSRA_NER.get_datasets(
+        ["train", "test"])
+    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
+
+    label_list = train_dataset.get_labels()
+    label_num = len(label_list)
+    no_entity_id = label_num - 1
+    trans_func = partial(
+        convert_example,
+        tokenizer=tokenizer,
+        label_list=label_list,
+        no_entity_id=no_entity_id,
+        max_seq_length=args.max_seq_length)
+    ignore_label = -100
+    batchify_fn = lambda samples, fn=Tuple(
+        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
+        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
+        Stack(),  # length
+        Pad(axis=0, pad_val=ignore_label)  # label
+    ): fn(samples)
+    raw_data = predict_dataset.data
+
+    id2label = dict(enumerate(predict_dataset.get_labels()))
+
+    predict_dataset = predict_dataset.apply(trans_func, lazy=True)
+    predict_batch_sampler = paddle.io.BatchSampler(
+        predict_dataset,
+        batch_size=args.batch_size,
+        shuffle=False,
+        # keep the tail batch so that every example gets a prediction
+        drop_last=False)
+    predict_data_loader = DataLoader(
+        dataset=predict_dataset,
+        batch_sampler=predict_batch_sampler,
+        collate_fn=batchify_fn,
+        num_workers=0,
+        return_list=True)
+
+    model = BertForTokenClassification.from_pretrained(
+        args.model_name_or_path, num_classes=label_num)
+    if args.init_checkpoint_path:
+        model_dict = paddle.load(args.init_checkpoint_path)
+        model.set_dict(model_dict)
+
+    model.eval()
+    pred_list = []
+    len_list = []
+    for step, batch in enumerate(predict_data_loader):
+        input_ids, segment_ids, length, labels = batch
+        logits = model(input_ids, segment_ids)
+        pred = paddle.argmax(logits, axis=-1)
+        pred_list.append(pred.numpy())
+        len_list.append(length.numpy())
+
+    preds =
parse_decodes(raw_data, id2label, pred_list, len_list) + + file_path = "results.txt" + with open(file_path, "w", encoding="utf8") as fout: + fout.write("\n".join(preds)) + # Print some examples + print( + "The results have been saved in the file: %s, some examples are shown below: " + % file_path) + print("\n".join(preds[:10])) + + +if __name__ == "__main__": + args = parser.parse_args() + do_predict(args) diff --git a/PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py b/PaddleNLP/examples/named_entity_recognition/msra_ner/train.py similarity index 80% rename from PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py rename to PaddleNLP/examples/named_entity_recognition/msra_ner/train.py index 99339e67..99357b9c 100644 --- a/PaddleNLP/examples/named_entity_recognition/msra_ner/run_msra_ner.py +++ b/PaddleNLP/examples/named_entity_recognition/msra_ner/train.py @@ -30,88 +30,78 @@ from paddlenlp.transformers import BertForTokenClassification, BertTokenizer from paddlenlp.transformers import LinearDecayWithWarmup from paddlenlp.metrics import ChunkEvaluator - -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pre-trained model or shortcut name selected in the list: " - + ", ".join(list(BertTokenizer.pretrained_init_configuration.keys()))) - parser.add_argument( - "--output_dir", - default=None, - type=str, - required=True, - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - parser.add_argument( - "--batch_size", - default=8, - type=int, - help="Batch size per GPU/CPU for training.", ) - parser.add_argument( - "--learning_rate", - default=5e-5, - type=float, - help="The initial learning rate for Adam.") - parser.add_argument( - "--weight_decay", - default=0.0, - type=float, - help="Weight decay if we apply some.") - parser.add_argument( - "--adam_epsilon", - default=1e-8, - type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--num_train_epochs", - default=3, - type=int, - help="Total number of training epochs to perform.", ) - parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", - ) - parser.add_argument( - "--warmup_steps", - default=0, - type=int, - help="Linear warmup over warmup_steps.") - - parser.add_argument( - "--logging_steps", - type=int, - default=1, - help="Log every X updates steps.") - parser.add_argument( - "--save_steps", - type=int, - default=100, - help="Save checkpoint every X updates steps.") - parser.add_argument( - "--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument( - "--n_gpu", - type=int, - default=1, - help="number of gpus to use, 0 for cpu.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() + +parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + ", ".join(list(BertTokenizer.pretrained_init_configuration.keys()))) +parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", +) +parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) +parser.add_argument( + "--batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", ) +parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") +parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") +parser.add_argument( + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.") +parser.add_argument( + "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") +parser.add_argument( + "--num_train_epochs", + default=3, + type=int, + help="Total number of training epochs to perform.", ) +parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", +) +parser.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps.") + +parser.add_argument( + "--logging_steps", type=int, default=1, help="Log every X updates steps.") +parser.add_argument( + "--save_steps", + type=int, + default=100, + help="Save checkpoint every X updates steps.") +parser.add_argument( + "--seed", type=int, default=42, help="random seed for initialization") +parser.add_argument( + "--n_gpu", type=int, default=1, help="number of gpus to use, 0 for cpu.") def evaluate(model, loss_fct, metric, data_loader, label_num): @@ -316,6 +306,7 @@ def do_train(args): tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): + global_step += 1 input_ids, segment_ids, length, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct( @@ -337,11 +328,17 @@ def do_train(args): paddle.save(model.state_dict(), os.path.join(args.output_dir, "model_%d.pdparams" % global_step)) - global_step += 1 + # Save final model + if (global_step) % args.save_steps != 0: + evaluate(model, loss_fct, metric, test_data_loader, label_num) + if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0: + paddle.save(model.state_dict(), + os.path.join(args.output_dir, + "model_%d.pdparams" % global_step)) if __name__ == "__main__": - args = parse_args() + args = parser.parse_args() if args.n_gpu > 1: paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu) else: -- GitLab
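
For a quick manual check, the fine-tuned checkpoint can also be queried directly from Python. A minimal sketch (the checkpoint path and example sentence are illustrative; it assumes the seven MSRA-NER labels with the no-entity tag `O` last, matching `no_entity_id = label_num - 1` in predict.py):

```python
import paddle
from paddlenlp.transformers import BertForTokenClassification, BertTokenizer

# Labels as used by the MSRA_NER dataset; "O" (no entity) comes last.
labels = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]

model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-uncased", num_classes=len(labels))
# Illustrative checkpoint path; use the one written by train.py.
model.set_dict(paddle.load("tmp/msra_ner/model_500.pdparams"))
model.eval()

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
tokens = list("海钓比赛地点在厦门与金门之间的海域。")
ids = tokenizer.convert_tokens_to_ids(
    [tokenizer.cls_token] + tokens + [tokenizer.sep_token])

input_ids = paddle.to_tensor([ids])
segment_ids = paddle.zeros_like(input_ids)  # single-sentence input
logits = model(input_ids, segment_ids)
pred = paddle.argmax(logits, axis=-1).numpy()[0][1:-1]  # drop [CLS]/[SEP]
print([(token, labels[p]) for token, p in zip(tokens, pred)])
```

The zeroed `segment_ids` mirror the single-sentence case in `convert_example`, which assigns segment id 0 to the whole sequence.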