From 3ce83f24313d874478ca154e50bba66938a32bbd Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Thu, 4 Feb 2021 00:14:20 +0800 Subject: [PATCH] Refine text classification and matching examples (#5251) * update docs * update codes * update docs * update codes * update codes * update codes * add method tokenizer.get_special_tokens_mask() --- .../pretrained_models/README.md | 54 +++++-- .../pretrained_models/export_model.py | 41 ++--- .../pretrained_models/predict.py | 65 ++------ .../pretrained_models/train.py | 149 +++++------------- .../text_classification/rnn/README.md | 2 +- .../text_classification/rnn/export_model.py | 5 +- .../text_classification/rnn/predict.py | 40 +++-- .../examples/text_classification/rnn/train.py | 21 +-- .../examples/text_classification/rnn/utils.py | 101 +----------- .../sentence_transformers/README.md | 49 +++++- .../sentence_transformers/predict.py | 62 ++------ .../sentence_transformers/train.py | 149 +++++------------- .../examples/text_matching/simnet/README.md | 2 +- .../examples/text_matching/simnet/predict.py | 53 ++++--- .../examples/text_matching/simnet/train.py | 53 +++++-- .../examples/text_matching/simnet/utils.py | 112 ++----------- PaddleNLP/paddlenlp/__init__.py | 2 +- PaddleNLP/paddlenlp/models/senta.py | 36 ++--- PaddleNLP/paddlenlp/seq2vec/encoder.py | 91 ++++++----- .../paddlenlp/transformers/bert/tokenizer.py | 33 ++++ .../transformers/electra/tokenizer.py | 33 ++++ .../paddlenlp/transformers/ernie/tokenizer.py | 33 ++++ .../transformers/roberta/tokenizer.py | 33 ++++ .../paddlenlp/transformers/tokenizer_utils.py | 20 +++ 24 files changed, 534 insertions(+), 705 deletions(-) diff --git a/PaddleNLP/examples/text_classification/pretrained_models/README.md b/PaddleNLP/examples/text_classification/pretrained_models/README.md index 539c4821..71d8b8af 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/README.md +++ b/PaddleNLP/examples/text_classification/pretrained_models/README.md @@ -52,7 +52,7 @@ - paddlepaddle >= 2.0.0-rc1 ``` -pip install paddlenlp==2.0.0b +pip install paddlenlp>=2.0.0rc ``` ### 代码结构说明 @@ -73,17 +73,12 @@ pretrained_models/ ```shell # 设置使用的GPU卡号 CUDA_VISIBLE_DEVICES=0 -python train.py --model_type ernie --model_name ernie-tiny --n_gpu 1 --save_dir ./checkpoints +python train.py --n_gpu 1 --save_dir ./checkpoints ``` 可支持配置的参数: -* `model_type`:必选,模型类型,可以选择bert,ernie,roberta。 -* `model_name`: 必选,具体的模型简称。 - 如`model_type=ernie`,则model_name可以选择`ernie-1.0`和`ernie-tiny`。 - 如`model_type=bert`,则model_name可以选择`bert-base-chinese`,`bert-wwm-chinese`,`bert-wwm-ext-chinese`。 - 如`model_type=roberta`,则model_name可以选择`roberta-wwm-ext-large`,`roberta-wwm-ext`,`rbt3`,`rbtl3`。 -* `save_dir`:必选,保存训练模型的目录。 +* `save_dir`:可选,保存训练模型的目录;默认保存在当前目录checkpoints文件夹下。 * `max_seq_length`:可选,ERNIE/BERT模型使用的最大序列长度,最大不能超过512, 若出现显存不足,请适当调低这一参数;默认为128。 * `batch_size`:可选,批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为32。 * `learning_rate`:可选,Fine-tune的最大学习率;默认为5e-5。 @@ -94,6 +89,45 @@ python train.py --model_type ernie --model_name ernie-tiny --n_gpu 1 --save_dir * `seed`:可选,随机种子,默认为1000. 
* `n_gpu`:可选,训练过程中使用GPU卡数量,默认为1。若n_gpu=0,则使用CPU训练。 +代码示例中使用的预训练模型是ERNIE,如果想要使用其他预训练模型如BERT,RoBERTa,Electra等,只需更换`model` 和 `tokenizer`即可。 + +```python +# 使用ernie预训练模型 +# ernie +model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained('ernie',num_classes=2)) +tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie') + +# ernie-tiny +# model = ppnlp.transformers.ErnieForSequenceClassification.rom_pretrained('ernie-tiny',num_classes=2)) +# tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained('ernie-tiny') + + +# 使用bert预训练模型 +# bert-base-chinese +model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2) +tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + +# bert-wwm-chinese +# model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-wwm-chinese', num_class=2) +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-chinese') + +# bert-wwm-ext-chinese +# model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-wwm-ext-chinese', num_class=2) +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-ext-chinese') + + +# 使用roberta预训练模型 +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2) +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext-large', num_class=2) +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext-large') + +``` +更多预训练模型,参考[transformers](../../../docs/transformers.md) + 程序运行时将会自动进行训练,评估,测试。同时训练过程中会自动保存模型在指定的`save_dir`中。 如: @@ -114,7 +148,7 @@ checkpoints/ 运行方式: ```shell -python export_model.py --model_type=roberta --model_name=roberta-wwm-ext --params_path=./checkpoint/model_200/model_state.pdparams --output_path=./static_graph_params +python export_model.py --params_path=./checkpoint/model_900/model_state.pdparams --output_path=./static_graph_params ``` 其中`params_path`是指动态图训练保存的参数路径,`output_path`是指静态图参数导出路径。 @@ -123,7 +157,7 @@ python export_model.py --model_type=roberta --model_name=roberta-wwm-ext --param 启动预测: ```shell export CUDA_VISIBLE_DEVICES=0 -python predict.py --model_type ernie --model_name ernie-tiny --params_path checkpoints/model_400/model_state.pdparams +python predict.py --params_path checkpoints/model_900/model_state.pdparams ``` 将待预测数据如以下示例: diff --git a/PaddleNLP/examples/text_classification/pretrained_models/export_model.py b/PaddleNLP/examples/text_classification/pretrained_models/export_model.py index d057d5bf..da860ee2 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/export_model.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/export_model.py @@ -25,46 +25,23 @@ import paddle.nn.functional as F from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertForSequenceClassification, - ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieForSequenceClassification, - ppnlp.transformers.ErnieTokenizer), - 'roberta': (ppnlp.transformers.RobertaForSequenceClassification, - ppnlp.transformers.RobertaTokenizer), -} - - # yapf: disable -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--model_type", default='roberta', required=True, type=str, 
help="Model type selected in the list: " +", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default='roberta-wwm-ext', required=True, type=str, help="Path to pre-trained model or shortcut name selected in the list: " + - ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], []))) - parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_200/model_state.pdparams', help="The path to model parameters to be loaded.") - parser.add_argument("--output_path", type=str, default='./static_graph_params', help="The path of model parameter in static graph to be saved.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() +parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.") +parser.add_argument("--output_path", type=str, default='./static_graph_params', help="The path of model parameter in static graph to be saved.") +args = parser.parse_args() # yapf: enable if __name__ == "__main__": - args = parse_args() - - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - if args.model_name_or_path == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. - tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name_or_path) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') # The number of labels should be in accordance with the training dataset. 
label_map = {0: 'negative', 1: 'positive'} - model = model_class.from_pretrained( - args.model_name_or_path, num_classes=len(label_map)) + model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( + "ernie-tiny", num_classes=len(label_map)) if args.params_path and os.path.isfile(args.params_path): state_dict = paddle.load(args.params_path) diff --git a/PaddleNLP/examples/text_classification/pretrained_models/predict.py b/PaddleNLP/examples/text_classification/pretrained_models/predict.py index 3ba0bf78..880d06ad 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/predict.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/predict.py @@ -25,31 +25,14 @@ import paddle.nn.functional as F from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertForSequenceClassification, - ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieForSequenceClassification, - ppnlp.transformers.ErnieTokenizer), - 'roberta': (ppnlp.transformers.RobertaForSequenceClassification, - ppnlp.transformers.RobertaTokenizer), -} - - # yapf: disable -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--model_type", default='ernie', required=True, type=str, help="Model type selected in the list: " +", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default='ernie-tiny', required=True, type=str, help="Path to pre-trained model or shortcut name selected in the list: " + - ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], []))) - parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.") - - parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() +parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.") +parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() # yapf: enable @@ -134,23 +117,16 @@ def predict(model, data, tokenizer, label_map, batch_size=1): is_test=True) examples.append((input_ids, segment_ids)) + # Seperates data into some batches. + batches = [ + examples[idx:idx + batch_size] + for idx in range(0, len(examples), batch_size) + ] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment ): fn(samples) - # Seperates data into some batches. 
- batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - results = [] model.eval() for batch in batches: @@ -167,18 +143,11 @@ def predict(model, data, tokenizer, label_map, batch_size=1): if __name__ == "__main__": - args = parse_args() paddle.set_device("gpu" if args.n_gpu else "cpu") - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - - if args.model_name_or_path == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. - tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name_or_path) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') data = [ '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', @@ -187,8 +156,8 @@ if __name__ == "__main__": ] label_map = {0: 'negative', 1: 'positive'} - model = model_class.from_pretrained( - args.model_name_or_path, num_classes=len(label_map)) + model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( + 'ernie-tiny', num_classes=len(label_map)) if args.params_path and os.path.isfile(args.params_path): state_dict = paddle.load(args.params_path) diff --git a/PaddleNLP/examples/text_classification/pretrained_models/train.py b/PaddleNLP/examples/text_classification/pretrained_models/train.py index 756b15fa..73bec44e 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/train.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/train.py @@ -25,97 +25,28 @@ import paddle.nn.functional as F from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertForSequenceClassification, - ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieForSequenceClassification, - ppnlp.transformers.ErnieTokenizer), - 'roberta': (ppnlp.transformers.RobertaForSequenceClassification, - ppnlp.transformers.RobertaTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_type", - default='ernie', - required=True, - type=str, - help="Model type selected in the list: " + - ", ".join(MODEL_CLASSES.keys())) - parser.add_argument( - "--model_name", - default='ernie-tiny', - required=True, - type=str, - help="Path to pre-trained model or shortcut name selected in the list: " - + ", ".join( - sum([ - list(classes[-1].pretrained_init_configuration.keys()) - for classes in MODEL_CLASSES.values() - ], []))) - parser.add_argument( - "--save_dir", - default='./checkpoint', - required=True, - type=str, - help="The output directory where the model checkpoints will be written.") - - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded." 
- ) - parser.add_argument( - "--batch_size", - default=32, - type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--learning_rate", - default=5e-5, - type=float, - help="The initial learning rate for Adam.") - parser.add_argument( - "--weight_decay", - default=0.0, - type=float, - help="Weight decay if we apply some.") - parser.add_argument( - "--epochs", - default=3, - type=int, - help="Total number of training epochs to perform.") - parser.add_argument( - "--warmup_proption", - default=0.0, - type=float, - help="Linear warmup proption over the training process.") - parser.add_argument( - "--init_from_ckpt", - type=str, - default=None, - help="The path of checkpoint to be loaded.") - parser.add_argument( - "--seed", type=int, default=1000, help="random seed for initialization") - parser.add_argument( - "--n_gpu", - type=int, - default=1, - help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args - - -def set_seed(args): +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.") +parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") +parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") +parser.add_argument("--epochs", default=3, type=int, help="Total number of training epochs to perform.") +parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proption over the training process.") +parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.") +parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization") +parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() +# yapf: enable + + +def set_seed(seed): """sets random seed""" - random.seed(args.seed) - np.random.seed(args.seed) - paddle.seed(args.seed) + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) @paddle.no_grad() @@ -223,24 +154,30 @@ def create_dataloader(dataset, return_list=True) -def do_train(args): - set_seed(args) +def do_train(): + set_seed(args.seed) paddle.set_device("gpu" if args.n_gpu else "cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - train_dataset, dev_dataset, test_dataset = ppnlp.datasets.ChnSentiCorp.get_datasets( ['train', 'dev', 'test']) - if args.model_name == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. 
- tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name) + + # If you wanna use bert/roberta/electra pretrained model, + # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2) + # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2) + # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2) + model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( + 'ernie-tiny', num_classes=len(train_dataset.get_labels())) + + # If you wanna use bert/roberta/electra pretrained model, + # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small', num_classes=2) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') trans_func = partial( convert_example, @@ -271,16 +208,13 @@ def do_train(args): batchify_fn=batchify_fn, trans_fn=trans_func) - model = model_class.from_pretrained( - args.model_name, num_classes=len(train_dataset.get_labels())) - if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs - num_warmup_steps = int(args.warmup_proption * num_training_steps) + num_warmup_steps = int(args.warmup_proportion * num_training_steps) def get_lr_factor(current_step): if current_step < num_warmup_steps: @@ -342,8 +276,7 @@ def do_train(args): if __name__ == "__main__": - args = parse_args() if args.n_gpu > 1: - paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu) + paddle.distributed.spawn(do_train, nprocs=args.n_gpu) else: - do_train(args) + do_train() diff --git a/PaddleNLP/examples/text_classification/rnn/README.md b/PaddleNLP/examples/text_classification/rnn/README.md index 490a2c50..fa5dbaca 100644 --- a/PaddleNLP/examples/text_classification/rnn/README.md +++ b/PaddleNLP/examples/text_classification/rnn/README.md @@ -115,7 +115,7 @@ PaddleNLP提供了一系列的文本表示技术,如`seq2vec`模块。 - paddlepaddle >= 2.0.0-rc1 ``` -pip install paddlenlp==2.0.0b +pip install paddlenlp>=2.0.0rc ``` ### 代码结构说明 diff --git a/PaddleNLP/examples/text_classification/rnn/export_model.py b/PaddleNLP/examples/text_classification/rnn/export_model.py index 0e9a2dc0..693e8b1d 100644 --- a/PaddleNLP/examples/text_classification/rnn/export_model.py +++ b/PaddleNLP/examples/text_classification/rnn/export_model.py @@ -16,8 +16,7 @@ import argparse import paddle import paddlenlp as ppnlp - -from utils import load_vocab +from paddlenlp.data import Vocab # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -31,7 +30,7 @@ args = parser.parse_args() def main(): # Load vocab. - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary(args.vocab_path) label_map = {0: 'negative', 1: 'positive'} # Construct the newtork. 
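For reference, here is a minimal, self-contained sketch of how the refactored export flow for the RNN example fits together after switching to `Vocab.load_vocabulary`. The vocabulary and checkpoint paths, the `Senta` network name, and the two `InputSpec` shapes are illustrative assumptions rather than part of this change:

```python
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Vocab

# Load the vocabulary with the new Vocab API (path is a placeholder).
vocab = Vocab.load_vocabulary(
    "./senta_word_dict.txt", unk_token='[UNK]', pad_token='[PAD]')
label_map = {0: 'negative', 1: 'positive'}

# Construct the network; Senta and the "bilstm" choice are assumptions here.
model = ppnlp.models.Senta(
    network="bilstm", vocab_size=len(vocab), num_classes=len(label_map))
model.set_dict(paddle.load("./checkpoints/final.pdparams"))
model.eval()

# Trace to a static graph: one batch of token ids plus their sequence lengths.
static_model = paddle.jit.to_static(
    model,
    input_spec=[
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # token ids
        paddle.static.InputSpec(shape=[None], dtype="int64"),        # seq lens
    ])
paddle.jit.save(static_model, "./static_graph_params")
```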
diff --git a/PaddleNLP/examples/text_classification/rnn/predict.py b/PaddleNLP/examples/text_classification/rnn/predict.py index 42cb5d93..3de3fb33 100644 --- a/PaddleNLP/examples/text_classification/rnn/predict.py +++ b/PaddleNLP/examples/text_classification/rnn/predict.py @@ -14,10 +14,11 @@ import argparse import paddle -import paddlenlp as ppnlp import paddle.nn.functional as F +import paddlenlp as ppnlp +from paddlenlp.data import JiebaTokenizer, Stack, Tuple, Pad -from utils import load_vocab, generate_batch, preprocess_prediction_data +from utils import preprocess_prediction_data # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -30,7 +31,7 @@ args = parser.parse_args() # yapf: enable -def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): +def predict(model, data, label_map, batch_size=1, pad_token_id=0): """ Predicts the data labels. @@ -39,8 +40,6 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object. A Example object contains `text`(word_ids) and `se_len`(sequence length). label_map(obj:`dict`): The label id (key) to label str (value) map. - collate_fn(obj: `callable`): function to generate mini-batch data by merging - the sample list. batch_size(obj:`int`, defaults to 1): The number of batch. pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. @@ -49,22 +48,18 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): """ # Seperates data into some batches. - batches = [] - one_batch = [] - for example in data: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) + batches = [ + data[idx:idx + batch_size] for idx in range(0, len(data), batch_size) + ] + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=pad_token_id), # input_ids + Stack(dtype="int64"), # seq len + ): [data for data in fn(samples)] results = [] model.eval() for batch in batches: - texts, seq_lens = collate_fn( - batch, pad_token_id=pad_token_id, return_label=False) + texts, seq_lens = batchify_fn(batch) texts = paddle.to_tensor(texts) seq_lens = paddle.to_tensor(seq_lens) logits = model(texts, seq_lens) @@ -78,8 +73,9 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): if __name__ == "__main__": paddle.set_device("gpu") if args.use_gpu else paddle.set_device("cpu") - # Loads vocab. - vocab = load_vocab(args.vocab_path) + # Loads vocab.s + vocab = ppnlp.data.Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') label_map = {0: 'negative', 1: 'positive'} # Constructs the newtork. 
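To make the new batching path in `predict.py` concrete, the short sketch below shows what the `Tuple(Pad, Stack)` function produces for a toy batch built with `JiebaTokenizer.encode`; the vocabulary path and sample sentences are placeholders:

```python
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab

vocab = Vocab.load_vocabulary(
    "./senta_word_dict.txt", unk_token='[UNK]', pad_token='[PAD]')
tokenizer = JiebaTokenizer(vocab)

# Each example is (token ids, sequence length), as in preprocess_prediction_data.
examples = []
for text in ['非常不错,服务很好', '房间太小']:
    ids = tokenizer.encode(text)
    examples.append((ids, len(ids)))

pad_id = vocab.token_to_idx.get('[PAD]', 0)
batchify_fn = Tuple(
    Pad(axis=0, pad_val=pad_id),  # pad token ids up to the longest sequence in the batch
    Stack(dtype="int64"),         # stack the valid lengths into a single array
)
input_ids, seq_lens = batchify_fn(examples)
print(input_ids.shape, seq_lens)  # (2, max_len) and an array of the two lengths
```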
@@ -97,14 +93,14 @@ if __name__ == "__main__": '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', ] - examples = preprocess_prediction_data(data, vocab) + tokenizer = JiebaTokenizer(vocab) + examples = preprocess_prediction_data(data, tokenizer) results = predict( model, examples, label_map=label_map, batch_size=args.batch_size, - collate_fn=generate_batch) - + pad_token_id=vocab.token_to_idx.get("[PAD]", 0)) for idx, text in enumerate(data): print('Data: {} \t Label: {}'.format(text, results[idx])) diff --git a/PaddleNLP/examples/text_classification/rnn/train.py b/PaddleNLP/examples/text_classification/rnn/train.py index a314b8c9..6412d281 100644 --- a/PaddleNLP/examples/text_classification/rnn/train.py +++ b/PaddleNLP/examples/text_classification/rnn/train.py @@ -19,10 +19,10 @@ import random import numpy as np import paddle import paddlenlp as ppnlp -from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab from paddlenlp.datasets import ChnSentiCorp -from utils import load_vocab, convert_example +from utils import convert_example # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -50,7 +50,6 @@ def create_dataloader(dataset, mode='train', batch_size=1, use_gpu=False, - pad_token_id=0, batchify_fn=None): """ Creats dataloader. @@ -61,7 +60,6 @@ def create_dataloader(dataset, mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly. batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch. use_gpu(obj:`bool`, optional, defaults to obj:`False`): Whether to use gpu to run. - pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. batchify_fn(obj:`callable`, optional, defaults to `None`): function to generate mini-batch data by merging the sample list, None for only stack each fields of sample in axis 0(same as :attr::`np.stack(..., axis=0)`). @@ -95,8 +93,9 @@ if __name__ == "__main__": if not os.path.exists(args.vocab_path): raise RuntimeError('The vocab_path can not be found in the path %s' % args.vocab_path) - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') # Loads dataset. train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets( ['train', 'dev', 'test']) @@ -110,13 +109,10 @@ if __name__ == "__main__": model = paddle.Model(model) # Reads data and generates mini-batches. 
- trans_fn = partial( - convert_example, - vocab=vocab, - unk_token_id=vocab.get('[UNK]', 1), - is_test=False) + tokenizer = JiebaTokenizer(vocab) + trans_fn = partial(convert_example, tokenizer=tokenizer, is_test=False) batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=vocab['[PAD]']), # input_ids + Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # input_ids Stack(dtype="int64"), # seq len Stack(dtype="int64") # label ): [data for data in fn(samples)] @@ -126,7 +122,6 @@ if __name__ == "__main__": batch_size=args.batch_size, mode='train', use_gpu=args.use_gpu, - pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) dev_loader = create_dataloader( dev_ds, @@ -134,7 +129,6 @@ if __name__ == "__main__": batch_size=args.batch_size, mode='validation', use_gpu=args.use_gpu, - pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) test_loader = create_dataloader( test_ds, @@ -142,7 +136,6 @@ if __name__ == "__main__": batch_size=args.batch_size, mode='test', use_gpu=args.use_gpu, - pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) optimizer = paddle.optimizer.Adam( diff --git a/PaddleNLP/examples/text_classification/rnn/utils.py b/PaddleNLP/examples/text_classification/rnn/utils.py index 8c489c6c..24455038 100644 --- a/PaddleNLP/examples/text_classification/rnn/utils.py +++ b/PaddleNLP/examples/text_classification/rnn/utils.py @@ -11,113 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import jieba import numpy as np -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = {} - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n").split("\t")[0] - vocab[token] = index - return vocab - - -def convert_ids_to_tokens(wids, inversed_vocab): - """ Converts a token string (or a sequence of tokens) in a single integer id - (or a sequence of ids), using the vocabulary. - """ - tokens = [] - for wid in wids: - wstr = inversed_vocab.get(wid, None) - if wstr: - tokens.append(wstr) - return tokens - - -def convert_tokens_to_ids(tokens, vocab): - """ Converts a token id (or a sequence of id) in a token string - (or a sequence of tokens), using the vocabulary. - """ - - ids = [] - unk_id = vocab.get('[UNK]', None) - for token in tokens: - wid = vocab.get(token, unk_id) - if wid: - ids.append(wid) - return ids - - -def pad_texts_to_max_seq_len(texts, max_seq_len, pad_token_id=0): - """ - Padded the texts to the max sequence length if the length of text is lower than it. - Unless it truncates the text. - - Args: - texts(obj:`list`): Texts which contrains a sequence of word ids. - max_seq_len(obj:`int`): Max sequence length. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - """ - for index, text in enumerate(texts): - seq_len = len(text) - if seq_len < max_seq_len: - padded_tokens = [pad_token_id for _ in range(max_seq_len - seq_len)] - new_text = text + padded_tokens - texts[index] = new_text - elif seq_len > max_seq_len: - new_text = text[:max_seq_len] - texts[index] = new_text - - -def generate_batch(batch, pad_token_id=0, return_label=True): - """ - Generates a batch whose text will be padded to the max sequence length in the batch. - - Args: - batch(obj:`List[Example]`) : One batch, which contains texts, labels and the true sequence lengths. 
- pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - - Returns: - batch(:obj:`Tuple[list]`): The batch data which contains texts, seq_lens and labels. - """ - seq_lens = [entry[1] for entry in batch] - - batch_max_seq_len = max(seq_lens) - texts = [entry[0] for entry in batch] - pad_texts_to_max_seq_len(texts, batch_max_seq_len, pad_token_id) - - if return_label: - labels = [[entry[-1]] for entry in batch] - return texts, seq_lens, labels - else: - return texts, seq_lens - - -def convert_example(example, vocab, unk_token_id=1, is_test=False): +def convert_example(example, tokenizer, is_test=False): """ Builds model inputs from a sequence for sequence classification tasks. It use `jieba.cut` to tokenize text. Args: example(obj:`list[str]`): List of input data, containing text and label if it have label. - vocab(obj:`dict`): The vocabulary. - unk_token_id(obj:`int`, defaults to 1): The unknown token id. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. is_test(obj:`False`, defaults to `False`): Whether the example contains label or not. Returns: - input_ids(obj:`list[int]`): The list of token ids.s + input_ids(obj:`list[int]`): The list of token ids. valid_length(obj:`int`): The input sequence valid length. label(obj:`numpy.array`, data type of int64, optional): The input label if not is_test. """ - input_ids = [] - for token in jieba.cut(example[0]): - token_id = vocab.get(token, unk_token_id) - input_ids.append(token_id) + input_ids = tokenizer.encode(example[0]) valid_length = np.array(len(input_ids), dtype='int64') input_ids = np.array(input_ids, dtype='int64') @@ -128,12 +41,13 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False): return input_ids, valid_length -def preprocess_prediction_data(data, vocab): +def preprocess_prediction_data(data, tokenizer): """ It process the prediction data as the format used as training. Args: data (obj:`List[str]`): The prediction data whose each element is a tokenized text. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. Returns: examples (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object. 
@@ -142,7 +56,6 @@ def preprocess_prediction_data(data, vocab): """ examples = [] for text in data: - tokens = " ".join(jieba.cut(text)).split(' ') - ids = convert_tokens_to_ids(tokens, vocab) + ids = tokenizer.encode(text) examples.append([ids, len(ids)]) return examples diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/README.md b/PaddleNLP/examples/text_matching/sentence_transformers/README.md index f35931c7..d4837056 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/README.md +++ b/PaddleNLP/examples/text_matching/sentence_transformers/README.md @@ -89,17 +89,12 @@ sentence_transformers/ ```shell # 设置使用的GPU卡号 CUDA_VISIBLE_DEVICES=0 -python train.py --model_type ernie --model_name ernie-1.0 --n_gpu 1 --save_dir ./checkpoints +python train.py --save_dir ./checkpoints ``` 可支持配置的参数: -* `model_type`:必选,模型类型,可以选择bert,ernie,roberta。 -* `model_name`: 必选,具体的模型简称。 - 如`model_type=ernie`,则model_name可以选择`ernie-1.0`和`ernie-tiny`。 - 如`model_type=bert`,则model_name可以选择`bert-base-chinese`,`bert-wwm-chinese`,`bert-wwm-ext-chinese`。 - 如`model_type=roberta`,则model_name可以选择`roberta-wwm-ext`,`rbt3`,`rbtl3`。 -* `save_dir`:必选,保存训练模型的目录。 +* `save_dir`:可选,保存训练模型的目录;默认保存在当前目录checkpoints文件夹下。 * `max_seq_length`:可选,ERNIE/BERT模型使用的最大序列长度,最大不能超过512, 若出现显存不足,请适当调低这一参数;默认为128。 * `batch_size`:可选,批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为32。 * `learning_rate`:可选,Fine-tune的最大学习率;默认为5e-5。 @@ -110,6 +105,44 @@ python train.py --model_type ernie --model_name ernie-1.0 --n_gpu 1 --save_dir . * `seed`:可选,随机种子,默认为1000. * `n_gpu`:可选,训练过程中使用GPU卡数量,默认为1。若n_gpu=0,则使用CPU训练。 +代码示例中使用的预训练模型是ERNIE,如果想要使用其他预训练模型如BERT,RoBERTa,Electra等,只需更换`model` 和 `tokenizer`即可。 + +```python +# 使用ernie预训练模型 +# ernie +model = ppnlp.transformers.ErnieModel.from_pretrained('ernie')) +tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie') + +# ernie-tiny +# model = ppnlp.transformers.ErnieModel.from_pretrained('ernie-tiny')) +# tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained('ernie-tiny') + + +# 使用bert预训练模型 +# bert-base-chinese +# model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese') +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + +# bert-wwm-chinese +# model = ppnlp.transformers.BertModel.from_pretrained('bert-wwm-chinese') +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-chinese') + +# bert-wwm-ext-chinese +# model = ppnlp.transformers.BertModel.from_pretrained('bert-wwm-ext-chinese') +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-ext-chinese') + + +# 使用roberta预训练模型 +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext') +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext-large') +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext-large') + +``` +更多预训练模型,参考[transformers](../../../docs/transformers.md) 程序运行时将会自动进行训练,评估,测试。同时训练过程中会自动保存模型在指定的`save_dir`中。 如: @@ -132,7 +165,7 @@ checkpoints/ 启动预测: ```shell CUDA_VISIBLE_DEVICES=0 -python predict.py --model_type ernie --model_name ernie-tiny --params_path checkpoints/model_400/model_state.pdparams +python predict.py --params_path checkpoints/model_400/model_state.pdparams ``` 将待预测数据如以下示例: diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/predict.py b/PaddleNLP/examples/text_matching/sentence_transformers/predict.py 
index b72ab2d1..228394d7 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/predict.py +++ b/PaddleNLP/examples/text_matching/sentence_transformers/predict.py @@ -26,29 +26,14 @@ from paddlenlp.data import Stack, Tuple, Pad from model import SentenceTransformer -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertModel, ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieModel, ppnlp.transformers.ErnieTokenizer), - 'roberta': - (ppnlp.transformers.RobertaModel, ppnlp.transformers.RobertaTokenizer) -} - - # yapf: disable -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--model_type", default='ernie', type=str, help="Model type selected in the list: " +", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name", default='ernie-1.0', type=str, help="Path to pre-trained model or shortcut name selected in the list: " + - ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], []))) - parser.add_argument("--params_path", type=str, default='./checkpoint/model_4900/model_state.pdparams', help="The path to model parameters to be loaded.") - - parser.add_argument("--max_seq_length", default=50, type=int, help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--n_gpu", type=int, default=0, help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() +parser.add_argument("--params_path", type=str, default='./checkpoint/model_2700/model_state.pdparams', help="The path to model parameters to be loaded.") +parser.add_argument("--max_seq_length", default=50, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--n_gpu", type=int, default=0, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() # yapf: enable @@ -143,6 +128,11 @@ def predict(model, data, tokenizer, label_map, batch_size=1): examples.append((query_input_ids, query_segment_ids, title_input_ids, title_segment_ids)) + # Seperates data into some batches. + batches = [ + examples[idx:idx + batch_size] + for idx in range(0, len(examples), batch_size) + ] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_input Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_segment @@ -150,18 +140,6 @@ def predict(model, data, tokenizer, label_map, batch_size=1): Pad(axis=0, pad_val=tokenizer.pad_token_id), # tilte_segment ): [data for data in fn(samples)] - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. 
- batches.append(one_batch) - results = [] model.eval() for batch in batches: @@ -186,18 +164,11 @@ def predict(model, data, tokenizer, label_map, batch_size=1): if __name__ == "__main__": - args = parse_args() paddle.set_device("gpu" if args.n_gpu else "cpu") - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - - if args.model_name == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie_tiny pretained model. - tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') data = [ ['世界上什么东西最小', '世界上什么东西最小?'], @@ -206,7 +177,8 @@ if __name__ == "__main__": ] label_map = {0: 'dissimilar', 1: 'similar'} - pretrained_model = model_class.from_pretrained(args.model_name) + pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( + "ernie-tiny") model = SentenceTransformer(pretrained_model) if args.params_path and os.path.isfile(args.params_path): diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/train.py b/PaddleNLP/examples/text_matching/sentence_transformers/train.py index 93cc88eb..f37f86de 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/train.py +++ b/PaddleNLP/examples/text_matching/sentence_transformers/train.py @@ -27,95 +27,28 @@ import paddlenlp as ppnlp from model import SentenceTransformer -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertModel, ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieModel, ppnlp.transformers.ErnieTokenizer), - 'roberta': - (ppnlp.transformers.RobertaModel, ppnlp.transformers.RobertaTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_type", - default='ernie', - required=True, - type=str, - help="Model type selected in the list: " + - ", ".join(MODEL_CLASSES.keys())) - parser.add_argument( - "--model_name", - default='ernie-1.0', - required=True, - type=str, - help="Path to pre-trained model or shortcut name selected in the list: " - + ", ".join( - sum([ - list(classes[-1].pretrained_init_configuration.keys()) - for classes in MODEL_CLASSES.values() - ], []))) - parser.add_argument( - "--save_dir", - default='./checkpoint', - required=True, - type=str, - help="The output directory where the model checkpoints will be written.") - - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded." 
- ) - parser.add_argument( - "--batch_size", - default=32, - type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--learning_rate", - default=5e-5, - type=float, - help="The initial learning rate for Adam.") - parser.add_argument( - "--weight_decay", - default=0.0, - type=float, - help="Weight decay if we apply some.") - parser.add_argument( - "--epochs", - default=3, - type=int, - help="Total number of training epochs to perform.") - parser.add_argument( - "--warmup_proption", - default=0.0, - type=float, - help="Linear warmup proption over the training process.") - parser.add_argument( - "--init_from_ckpt", - type=str, - default=None, - help="The path of checkpoint to be loaded.") - parser.add_argument( - "--seed", type=int, default=1000, help="random seed for initialization") - parser.add_argument( - "--n_gpu", - type=int, - default=1, - help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args - - -def set_seed(args): +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.") +parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") +parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") +parser.add_argument("--epochs", default=3, type=int, help="Total number of training epochs to perform.") +parser.add_argument("--warmup_proption", default=0.0, type=float, help="Linear warmup proption over the training process.") +parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.") +parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization") +parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() +# yapf: enable + + +def set_seed(seed): """sets random seed""" - random.seed(args.seed) - np.random.seed(args.seed) - paddle.seed(args.seed) + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) @paddle.no_grad() @@ -135,8 +68,8 @@ def evaluate(model, criterion, metric, data_loader): for batch in data_loader: query_input_ids, query_segment_ids, title_input_ids, title_segment_ids, labels = batch probs = model( - query_input_ids, - title_input_ids, + query_input_ids=query_input_ids, + title_input_ids=title_input_ids, query_token_type_ids=query_segment_ids, title_token_type_ids=title_segment_ids) loss = criterion(probs, labels) @@ -236,24 +169,28 @@ def create_dataloader(dataset, return_list=True) -def do_train(args): - set_seed(args) +def do_train(): + set_seed(args.seed) paddle.set_device("gpu" if args.n_gpu else "cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - train_dataset, dev_dataset, test_dataset = ppnlp.datasets.LCQMC.get_datasets( ['train', 'dev', 'test']) - if args.model_name == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. 
- tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name) + + # If you wanna use bert/roberta pretrained model, + # pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese') + # pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext') + pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( + 'ernie-tiny') + + # If you wanna use bert/roberta pretrained model, + # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') trans_func = partial( convert_example, @@ -286,7 +223,6 @@ def do_train(args): batchify_fn=batchify_fn, trans_fn=trans_func) - pretrained_model = model_class.from_pretrained(args.model_name) model = SentenceTransformer(pretrained_model) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): @@ -326,8 +262,8 @@ def do_train(args): for step, batch in enumerate(train_data_loader, start=1): query_input_ids, query_segment_ids, title_input_ids, title_segment_ids, labels = batch probs = model( - query_input_ids, - title_input_ids, + query_input_ids=query_input_ids, + title_input_ids=title_input_ids, query_token_type_ids=query_segment_ids, title_token_type_ids=title_segment_ids) loss = criterion(probs, labels) @@ -361,8 +297,7 @@ def do_train(args): if __name__ == "__main__": - args = parse_args() if args.n_gpu > 1: - paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu) + paddle.distributed.spawn(do_train, nprocs=args.n_gpu) else: - do_train(args) + do_train() diff --git a/PaddleNLP/examples/text_matching/simnet/README.md b/PaddleNLP/examples/text_matching/simnet/README.md index 91c66b00..49735114 100644 --- a/PaddleNLP/examples/text_matching/simnet/README.md +++ b/PaddleNLP/examples/text_matching/simnet/README.md @@ -38,7 +38,7 @@ SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MM * PaddleNLP 安装 ```shell - pip install paddlenlp + pip install paddlenlp>=2.0rc0 ``` * 环境依赖 diff --git a/PaddleNLP/examples/text_matching/simnet/predict.py b/PaddleNLP/examples/text_matching/simnet/predict.py index 519bff3e..e5cc3894 100644 --- a/PaddleNLP/examples/text_matching/simnet/predict.py +++ b/PaddleNLP/examples/text_matching/simnet/predict.py @@ -16,23 +16,24 @@ from functools import partial import argparse import paddle -import paddlenlp as ppnlp import paddle.nn.functional as F +import paddlenlp as ppnlp +from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab -from utils import load_vocab, generate_batch, preprocess_prediction_data +from utils import preprocess_prediction_data # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--use_gpu", type=eval, default=False, help="Whether use GPU for training, input should be True or False") parser.add_argument("--batch_size", type=int, default=64, help="Total examples' number of a batch for training.") -parser.add_argument("--vocab_path", type=str, default="./data/term2id.dict", help="The path to vocabulary.") +parser.add_argument("--vocab_path", type=str, default="./simnet_word_dict.txt", help="The path to vocabulary.") parser.add_argument('--network', type=str, default="lstm", help="Which network you would like to choose bow, cnn, lstm or gru ?") 
-parser.add_argument("--params_path", type=str, default='./chekpoints/final.pdparams', help="The path of model parameter to be loaded.") +parser.add_argument("--params_path", type=str, default='./checkpoints/final.pdparams', help="The path of model parameter to be loaded.") args = parser.parse_args() # yapf: enable -def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): +def predict(model, data, label_map, batch_size=1, pad_token_id=0): """ Predicts the data labels. @@ -41,37 +42,35 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object. A Example object contains `text`(word_ids) and `seq_len`(sequence length). label_map(obj:`dict`): The label id (key) to label str (value) map. - collate_fn(obj: `callable`): function to generate mini-batch data by merging - the sample list. batch_size(obj:`int`, defaults to 1): The number of batch. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. + pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. Returns: results(obj:`dict`): All the predictions labels. """ # Seperates data into some batches. - batches = [] - one_batch = [] - for example in data: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) + batches = [ + data[idx:idx + batch_size] for idx in range(0, len(data), batch_size) + ] + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=pad_token_id), # query_ids + Pad(axis=0, pad_val=pad_token_id), # title_ids + Stack(dtype="int64"), # query_seq_lens + Stack(dtype="int64"), # title_seq_lens + ): [data for data in fn(samples)] results = [] model.eval() for batch in batches: - queries, titles, query_seq_lens, title_seq_lens = collate_fn( - batch, pad_token_id=pad_token_id, return_label=False) - queries = paddle.to_tensor(queries) - titles = paddle.to_tensor(titles) + query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn( + batch) + query_ids = paddle.to_tensor(query_ids) + title_ids = paddle.to_tensor(title_ids) query_seq_lens = paddle.to_tensor(query_seq_lens) title_seq_lens = paddle.to_tensor(title_seq_lens) - logits = model(queries, titles, query_seq_lens, title_seq_lens) + logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens) probs = F.softmax(logits, axis=1) idx = paddle.argmax(probs, axis=1).numpy() idx = idx.tolist() @@ -83,7 +82,9 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): if __name__ == "__main__": paddle.set_device("gpu") if args.use_gpu else paddle.set_device("cpu") # Loads vocab. - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') + tokenizer = JiebaTokenizer(vocab) label_map = {0: 'dissimilar', 1: 'similar'} # Constructs the newtork. 
@@ -101,13 +102,13 @@ if __name__ == "__main__": ['光眼睛大就好看吗', '眼睛好看吗?'], ['小蝌蚪找妈妈怎么样', '小蝌蚪找妈妈是谁画的'], ] - examples = preprocess_prediction_data(data, vocab) + examples = preprocess_prediction_data(data, tokenizer) results = predict( model, examples, label_map=label_map, batch_size=args.batch_size, - collate_fn=generate_batch) + pad_token_id=vocab.token_to_idx.get('[PAD]', 0)) for idx, text in enumerate(data): print('Data: {} \t Label: {}'.format(text, results[idx])) diff --git a/PaddleNLP/examples/text_matching/simnet/train.py b/PaddleNLP/examples/text_matching/simnet/train.py index df308ab8..84a6d9c1 100644 --- a/PaddleNLP/examples/text_matching/simnet/train.py +++ b/PaddleNLP/examples/text_matching/simnet/train.py @@ -20,16 +20,17 @@ import time import paddle import paddlenlp as ppnlp +from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab from paddlenlp.datasets import LCQMC -from utils import load_vocab, generate_batch, convert_example +from utils import convert_example # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--epochs", type=int, default=10, help="Number of epoches for training.") parser.add_argument('--use_gpu', type=eval, default=False, help="Whether use GPU for training, input should be True or False") parser.add_argument("--lr", type=float, default=5e-4, help="Learning rate used to train.") -parser.add_argument("--save_dir", type=str, default='chekpoints/', help="Directory to save model checkpoint") +parser.add_argument("--save_dir", type=str, default='checkpoints/', help="Directory to save model checkpoint") parser.add_argument("--batch_size", type=int, default=64, help="Total examples' number of a batch for training.") parser.add_argument("--vocab_path", type=str, default="./simnet_word_dict.txt", help="The directory to dataset.") parser.add_argument('--network', type=str, default="lstm", help="Which network you would like to choose bow, cnn, lstm or gru ?") @@ -43,16 +44,19 @@ def create_dataloader(dataset, mode='train', batch_size=1, use_gpu=False, - pad_token_id=0): + batchify_fn=None): """ Creats dataloader. Args: dataset(obj:`paddle.io.Dataset`): Dataset instance. + trans_fn(obj:`callable`, optional, defaults to `None`): function to convert a data sample to input ids, etc. mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly. batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch. use_gpu(obj:`bool`, optional, defaults to obj:`False`): Whether to use gpu to run. - pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. + batchify_fn(obj:`callable`, optional, defaults to `None`): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0(same as :attr::`np.stack(..., axis=0)`). Returns: dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches. @@ -71,7 +75,7 @@ def create_dataloader(dataset, dataset, batch_sampler=sampler, return_list=True, - collate_fn=lambda batch: generate_batch(batch, pad_token_id=pad_token_id)) + collate_fn=batchify_fn) return dataloader @@ -82,11 +86,11 @@ if __name__ == "__main__": if not os.path.exists(args.vocab_path): raise RuntimeError('The vocab_path can not be found in the path %s' % args.vocab_path) - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') # Loads dataset. 
- train_ds, dev_dataset, test_ds = LCQMC.get_datasets( - ['train', 'dev', 'test']) + train_ds, dev_ds, test_ds = LCQMC.get_datasets(['train', 'dev', 'test']) # Constructs the newtork. label_list = train_ds.get_labels() @@ -95,18 +99,41 @@ if __name__ == "__main__": vocab_size=len(vocab), num_classes=len(label_list)) model = paddle.Model(model) + new_vocab_file = open("./new_simnet_word_dict.txt", 'w', encoding='utf8') + for token, index in vocab.token_to_idx.items(): + new_vocab_file.write(token + "\n") # Reads data and generates mini-batches. - trans_fn = partial(convert_example, vocab=vocab, is_test=False) + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # query_ids + Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # title_ids + Stack(dtype="int64"), # query_seq_lens + Stack(dtype="int64"), # title_seq_lens + Stack(dtype="int64") # label + ): [data for data in fn(samples)] + tokenizer = ppnlp.data.JiebaTokenizer(vocab) + trans_fn = partial(convert_example, tokenizer=tokenizer, is_test=False) train_loader = create_dataloader( - train_ds, trans_fn=trans_fn, batch_size=args.batch_size, mode='train') + train_ds, + trans_fn=trans_fn, + batch_size=args.batch_size, + mode='train', + use_gpu=args.use_gpu, + batchify_fn=batchify_fn) dev_loader = create_dataloader( - dev_dataset, + dev_ds, trans_fn=trans_fn, batch_size=args.batch_size, - mode='validation') + mode='validation', + use_gpu=args.use_gpu, + batchify_fn=batchify_fn) test_loader = create_dataloader( - test_ds, trans_fn=trans_fn, batch_size=args.batch_size, mode='test') + test_ds, + trans_fn=trans_fn, + batch_size=args.batch_size, + mode='test', + use_gpu=args.use_gpu, + batchify_fn=batchify_fn) optimizer = paddle.optimizer.Adam( parameters=model.parameters(), learning_rate=args.lr) diff --git a/PaddleNLP/examples/text_matching/simnet/utils.py b/PaddleNLP/examples/text_matching/simnet/utils.py index 79854825..5384ad3f 100644 --- a/PaddleNLP/examples/text_matching/simnet/utils.py +++ b/PaddleNLP/examples/text_matching/simnet/utils.py @@ -11,105 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import jieba import numpy as np -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = {} - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n").split("\t")[0] - vocab[token] = index - return vocab - - -def convert_ids_to_tokens(wids, inversed_vocab): - """ Converts a token string (or a sequence of tokens) in a single integer id - (or a sequence of ids), using the vocabulary. - """ - tokens = [] - for wid in wids: - wstr = inversed_vocab.get(wid, None) - if wstr: - tokens.append(wstr) - return tokens - - -def convert_tokens_to_ids(tokens, vocab): - """ Converts a token id (or a sequence of id) in a token string - (or a sequence of tokens), using the vocabulary. - """ - - ids = [] - unk_id = vocab.get('[UNK]', None) - for token in tokens: - wid = vocab.get(token, unk_id) - if wid: - ids.append(wid) - return ids - - -def pad_texts_to_max_seq_len(texts, max_seq_len, pad_token_id=0): - """ - Padded the texts to the max sequence length if the length of text is lower than it. - Unless it truncates the text. - - Args: - texts(obj:`list`): Texts which contrains a sequence of word ids. 
- max_seq_len(obj:`int`): Max sequence length. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - """ - for index, text in enumerate(texts): - seq_len = len(text) - if seq_len < max_seq_len: - padded_tokens = [pad_token_id for _ in range(max_seq_len - seq_len)] - new_text = text + padded_tokens - texts[index] = new_text - elif seq_len > max_seq_len: - new_text = text[:max_seq_len] - texts[index] = new_text - - -def generate_batch(batch, pad_token_id=0, return_label=True): - """ - Generates a batch whose text will be padded to the max sequence length in the batch. - - Args: - batch(obj:`List[Example]`) : One batch, which contains texts, labels and the true sequence lengths. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - - Returns: - batch(:obj:`Tuple[list]`): The batch data which contains texts, seq_lens and labels. - """ - queries = [entry[0] for entry in batch] - titles = [entry[1] for entry in batch] - query_seq_lens = [entry[2] for entry in batch] - title_seq_lens = [entry[3] for entry in batch] - - query_batch_max_seq_len = max(query_seq_lens) - pad_texts_to_max_seq_len(queries, query_batch_max_seq_len, pad_token_id) - title_batch_max_seq_len = max(title_seq_lens) - pad_texts_to_max_seq_len(titles, title_batch_max_seq_len, pad_token_id) - - if return_label: - labels = [entry[-1] for entry in batch] - return queries, titles, query_seq_lens, title_seq_lens, labels - else: - return queries, titles, query_seq_lens, title_seq_lens - - -def convert_example(example, vocab, unk_token_id=1, is_test=False): +def convert_example(example, tokenizer, is_test=False): """ Builds model inputs from a sequence for sequence classification tasks. It use `jieba.cut` to tokenize text. Args: example(obj:`list[str]`): List of input data, containing text and label if it have label. - vocab(obj:`dict`): The vocabulary. - unk_token_id(obj:`int`, defaults to 1): The unknown token id. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. is_test(obj:`False`, defaults to `False`): Whether the example contains label or not. Returns: @@ -121,13 +33,10 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False): """ query, title = example[0], example[1] - query_tokens = jieba.lcut(query) - title_tokens = jieba.lcut(title) - - query_ids = convert_tokens_to_ids(query_tokens, vocab) - query_seq_len = len(query_ids) - title_ids = convert_tokens_to_ids(title_tokens, vocab) - title_seq_len = len(title_ids) + query_ids = np.array(tokenizer.encode(query), dtype="int64") + query_seq_len = np.array(len(query_ids), dtype="int64") + title_ids = np.array(tokenizer.encode(title), dtype="int64") + title_seq_len = np.array(len(title_ids), dtype="int64") if not is_test: label = np.array(example[-1], dtype="int64") @@ -136,7 +45,7 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False): return query_ids, title_ids, query_seq_len, title_seq_len -def preprocess_prediction_data(data, vocab): +def preprocess_prediction_data(data, tokenizer): """ It process the prediction data as the format used as training. @@ -144,6 +53,7 @@ def preprocess_prediction_data(data, vocab): data (obj:`List[List[str, str]]`): The prediction data whose each element is a text pair. Each text will be tokenized by jieba.lcut() function. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. 
Returns: examples (obj:`list`): The processed data whose each element @@ -157,9 +67,7 @@ def preprocess_prediction_data(data, vocab): """ examples = [] for query, title in data: - query_tokens = jieba.lcut(query) - title_tokens = jieba.lcut(title) - query_ids = convert_tokens_to_ids(query_tokens, vocab) - title_ids = convert_tokens_to_ids(title_tokens, vocab) + query_ids = tokenizer.encode(query) + title_ids = tokenizer.encode(title) examples.append([query_ids, title_ids, len(query_ids), len(title_ids)]) return examples diff --git a/PaddleNLP/paddlenlp/__init__.py b/PaddleNLP/paddlenlp/__init__.py index 06fd93c8..9b31b4b7 100644 --- a/PaddleNLP/paddlenlp/__init__.py +++ b/PaddleNLP/paddlenlp/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '2.0.0b3' +__version__ = '2.0.0rc1' from . import data from . import datasets diff --git a/PaddleNLP/paddlenlp/models/senta.py b/PaddleNLP/paddlenlp/models/senta.py index 4aa36acd..a34b4ecd 100644 --- a/PaddleNLP/paddlenlp/models/senta.py +++ b/PaddleNLP/paddlenlp/models/senta.py @@ -112,13 +112,7 @@ class BoWModel(nn.Layer): a word embedding. Then, we encode these epresentations with a `BoWEncoder`. Lastly, we take the output of the encoder to create a final representation, which is passed through some feed-forward layers to output a logits (`output_layer`). - Args: - vocab_size (obj:`int`): The vocabulary size. - emb_dim (obj:`int`, optional, defaults to 128): The embedding dimension. - padding_idx (obj:`int`, optinal, defaults to 0) : The pad token index. - hidden_size (obj:`int`, optional, defaults to 128): The first full-connected layer hidden size. - fc_hidden_size (obj:`int`, optional, defaults to 96): The second full-connected layer hidden size. - num_classes (obj:`int`): All the labels that the data has. + """ def __init__(self, @@ -331,7 +325,7 @@ class SelfAttention(nn.Layer): Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification (Zhou et al., 2016). ref: https://www.aclweb.org/anthology/P16-2034/ Args: - hidden_size (obj:`int`): The number of expected features in the input x. + hidden_size (int): The number of expected features in the input x. """ def __init__(self, hidden_size): @@ -343,9 +337,10 @@ class SelfAttention(nn.Layer): def forward(self, input, mask=None): """ Args: - input (obj: `paddle.Tensor`) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. - mask (obj: `paddle.Tensor`, optional, defaults to `None`) of shape (batch, seq_len) : - Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. + input (paddle.Tensor) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. + mask (paddle.Tensor) of shape (batch, seq_len) : + Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. + Defaults to `None`. """ forward_input, backward_input = paddle.chunk(input, chunks=2, axis=2) # elementwise-sum forward_x and backward_x @@ -378,7 +373,7 @@ class SelfInteractiveAttention(nn.Layer): A close implementation of attention network of NAACL 2016 paper, Hierarchical Attention Networks for Document Classification (Yang et al., 2016). ref: https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf Args: - hidden_size (obj:`int`): The number of expected features in the input x. 
+ hidden_size (int): The number of expected features in the input x. """ def __init__(self, hidden_size): @@ -393,9 +388,10 @@ class SelfInteractiveAttention(nn.Layer): def forward(self, input, mask=None): """ Args: - input (obj: `paddle.Tensor`) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. - mask (obj: `paddle.Tensor`, optional, defaults to `None`) of shape (batch, seq_len) : + input (paddle.Tensor) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. + mask (paddle.Tensor) of shape (batch, seq_len) : Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. + Defaults to `None """ weight = self.input_weight.tile( repeat_times=(paddle.shape(input)[0], 1, 1)) @@ -434,11 +430,7 @@ class CNNModel(nn.Layer): outputs from the convolution layer and outputs the max. Lastly, we take the output of the encoder to create a final representation, which is passed through some feed-forward layers to output a logits (`output_layer`). - Args: - vocab_size (obj:`int`): The vocabulary size. - emb_dim (obj:`int`, optional, defaults to 128): The embedding dimension. - padding_idx (obj:`int`, optinal, defaults to 0) : The pad token index. - num_classes (obj:`int`): All the labels that the data has. + """ def __init__(self, @@ -483,11 +475,7 @@ class TextCNNModel(nn.Layer): outputs from the convolution layer and outputs the max. Lastly, we take the output of the encoder to create a final representation, which is passed through some feed-forward layers to output a logits (`output_layer`). - Args: - vocab_size (obj:`int`): The vocabulary size. - emb_dim (obj:`int`, optional, defaults to 128): The embedding dimension. - padding_idx (obj:`int`, optinal, defaults to 0) : The pad token index. - num_classes (obj:`int`): All the labels that the data has. + """ def __init__(self, diff --git a/PaddleNLP/paddlenlp/seq2vec/encoder.py b/PaddleNLP/paddlenlp/seq2vec/encoder.py index b34f66f9..28864131 100644 --- a/PaddleNLP/paddlenlp/seq2vec/encoder.py +++ b/PaddleNLP/paddlenlp/seq2vec/encoder.py @@ -31,8 +31,7 @@ class BoWEncoder(nn.Layer): and the output is of shape `(batch_size, emb_dim)`. Args: - # TODO: unify the docstring style with PaddlePaddle. - emb_dim(obj:`int`, required): It is the input dimension to the encoder. + emb_dim(int): It is the input dimension to the encoder. """ def __init__(self, emb_dim): @@ -59,12 +58,12 @@ class BoWEncoder(nn.Layer): It simply sums the embeddings of a sequence across the time dimension. Args: - inputs (obj: `paddle.Tensor`): Shape as `(batch_size, num_tokens, emb_dim)` + inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, emb_dim)` mask (obj: `paddle.Tensor`, optional, defaults to `None`): Shape same as `inputs`. Its each elements identify whether is padding token or not. If True, not padding token. If False, padding token. Returns: - summed (obj: `paddle.Tensor`): Shape of `(batch_size, emb_dim)`. The result vector of BagOfEmbedding. + summed (paddle.Tensor): Shape of `(batch_size, emb_dim)`. The result vector of BagOfEmbedding. """ if mask is not None: @@ -97,18 +96,18 @@ class CNNEncoder(nn.Layer): ref: https://arxiv.org/abs/1510.03820 Args: - emb_dim(object:`int`, required): + emb_dim(int): This is the input dimension to the encoder. - num_filter(object:`int`, required): + num_filter(int): This is the output dim for each convolutional layer, which is the number of "filters" learned by that layer. 
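The `seq2vec` encoders documented here all map a batch of token embeddings of shape `(batch_size, num_tokens, emb_dim)` to one vector per example. A small sketch of the `BoWEncoder` contract, with arbitrary sizes:

```python
import paddle
import paddle.nn as nn
from paddlenlp.seq2vec import BoWEncoder

embedder = nn.Embedding(num_embeddings=10000, embedding_dim=128, padding_idx=0)
encoder = BoWEncoder(emb_dim=128)

token_ids = paddle.randint(low=1, high=10000, shape=[4, 12])  # (batch_size, num_tokens)
embedded = embedder(token_ids)                                # (4, 12, 128)
summed = encoder(embedded)                                    # sum over the token axis
print(summed.shape)  # [4, 128]
```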
- ngram_filter_sizes(object: `Tuple[int]`, optional, default to `(2, 3, 4, 5)`):
+ ngram_filter_sizes(Tuple[int]):
 This specifies both the number of convolutional layers we will create and their sizes. The
 default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding ngrams
 of size 2 to 5 with some number of filters.
- conv_layer_activation(object: `str`, optional, default to `tanh`):
+ conv_layer_activation(str):
 Activation to use after the convolution layers.
- output_dim(object: `int`, optional, default to `None`):
+ output_dim(int):
 After doing convolutions and pooling, we'll project the collected features into a vector of this size.
 If this value is `None`, we will just return the result of the max pooling,
 giving an output of shape `len(ngram_filter_sizes) * num_filter`.
@@ -165,13 +164,13 @@ class CNNEncoder(nn.Layer):
 The combination of multiple convolution layers and max pooling layers.

 Args:
- inputs (obj: `paddle.Tensor`, required): Shape as `(batch_size, num_tokens, emb_dim)`
+ inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, emb_dim)`
 mask (obj: `paddle.Tensor`, optional, defaults to `None`): Shape same as `inputs`.
 Its each elements identify whether is padding token or not.
 If True, not padding token. If False, padding token.

 Returns:
- result (obj: `paddle.Tensor`): If output_dim is None, the result shape
+ result (paddle.Tensor): If output_dim is None, the result shape
 is of `(batch_size, output_dim)`; if not, the result shape
 is of `(batch_size, len(ngram_filter_sizes) * num_filter)`.

@@ -188,8 +187,8 @@ class CNNEncoder(nn.Layer):
 self._activation(conv(inputs)).squeeze(3) for conv in self.convs
 ]
 maxpool_out = [
- F.max_pool1d(
- t, kernel_size=t.shape[2]).squeeze(2) for t in convs_out
+ F.adaptive_max_pool1d(
+ t, output_size=1).squeeze(2) for t in convs_out
 ]
 result = paddle.concat(maxpool_out, axis=1)

@@ -221,8 +220,8 @@ class GRUEncoder(nn.Layer):
 E.g., setting num_layers=2 would mean stacking two GRUs together to form a stacked GRU,
 with the second GRU taking in outputs of the first GRU and computing the final results.
 direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
- It can be "forward" and "bidirect" (it means bidirection network).
- When "bidirect", the way to merge outputs of forward and backward is concatenating.
+ It can be `forward` or `bidirect` (i.e. a bidirectional network).
+ If `bidirect`, it is a bidirectional GRU, and returns the concatenated output from both directions.
 dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
 on the outputs of each GRU layer except the last layer, with dropout probability equal to dropout.
 pooling_type (obj: `str`, optional, defaults to obj:`None`): If `pooling_type` is None,
@@ -280,11 +279,11 @@ class GRUEncoder(nn.Layer):
 If not, output is of shape `(batch_size, hidden_size)`.

 Args:
- inputs (obj:`Paddle.Tensor`, required): Shape as `(batch_size, num_tokens, input_size)`.
- sequence_length (obj:`Paddle.Tensor`, required): Shape as `(batch_size)`.
+ inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, input_size)`.
+ sequence_length (paddle.Tensor): Shape as `(batch_size)`.

 Returns:
- last_hidden (obj:`Paddle.Tensor`, required): Shape as `(batch_size, hidden_size)`.
+ last_hidden (paddle.Tensor): Shape as `(batch_size, hidden_size)`.
 The hidden state at the last time step for every layer.
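The switch from `F.max_pool1d(t, kernel_size=t.shape[2])` to `F.adaptive_max_pool1d(t, output_size=1)` keeps the behaviour of a global max over the time axis without reading the static shape. A quick equivalence check with made-up shapes:

```python
import paddle
import paddle.nn.functional as F

t = paddle.rand([4, 256, 17])  # (batch_size, num_filter, seq_len)

old = F.max_pool1d(t, kernel_size=t.shape[2]).squeeze(2)   # pre-change pooling
new = F.adaptive_max_pool1d(t, output_size=1).squeeze(2)   # post-change pooling

print(paddle.allclose(old, new))  # True: both take the max over the whole sequence
```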
""" @@ -305,8 +304,8 @@ class GRUEncoder(nn.Layer): else: # We exploit the `encoded_text` (the hidden state at the every time step for last layer) # to create a single vector. We perform pooling on the encoded text. - # If gru is not bidirection, output is shape of `(batch_size, hidden_size)`. - # If gru is bidirection, then output is shape of `(batch_size, hidden_size*2)`. + # The output shape is `(batch_size, hidden_size*2)` if use bidirectional GRU, + # otherwise the output shape is `(batch_size, hidden_size*2)`. if self._pooling_type == 'sum': output = paddle.sum(encoded_text, axis=1) elif self._pooling_type == 'max': @@ -338,17 +337,17 @@ class LSTMEncoder(nn.Layer): lstm and backward lstm layer to create a single vector (shape of `(batch_size, hidden_size*2)`). Args: - input_size (obj:`int`, required): The number of expected features in the input (the last dimension). - hidden_size (obj:`int`, required): The number of features in the hidden state. - num_layers (obj:`int`, optional, defaults to 1): Number of recurrent layers. + input_size (int): The number of expected features in the input (the last dimension). + hidden_size (int): The number of features in the hidden state. + num_layers (int): Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. - direction (obj:`str`, optional, defaults to obj:`forwrd`): The direction of the network. - It can be "forward" and "bidirect" (it means bidirection network). - When "bidirection", the way to merge outputs of forward and backward is concatenating. - dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer + direction (str): The direction of the network. + It can be `forward` or `bidirect` (it means bidirection network). + If `biderect`, it is a birectional LSTM, and returns the concat output from both directions. + dropout (float): If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. - pooling_type (obj: `str`, optional, defaults to obj:`None`): If `pooling_type` is None, + pooling_type (str): If `pooling_type` is None, then the LSTMEncoder will return the hidden state of the last time step at last layer as a single vector. If pooling_type is not None, it must be one of `sum`, `max` and `mean`. Then it will be pooled on the LSTM output (the hidden state of every time step at last layer) to create a single vector. @@ -404,11 +403,11 @@ class LSTMEncoder(nn.Layer): If not, output is of shape `(batch_size, hidden_size)`. Args: - inputs (obj:`Paddle.Tensor`, required): Shape as `(batch_size, num_tokens, input_size)`. - sequence_length (obj:`Paddle.Tensor`, required): Shape as `(batch_size)`. + inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, input_size)`. + sequence_length (paddle.Tensor): Shape as `(batch_size)`. Returns: - last_hidden (obj:`Paddle.Tensor`, required): Shape as `(batch_size, hidden_size)`. + last_hidden (paddle.Tensor): Shape as `(batch_size, hidden_size)`. The hidden state at the last time step for every layer. """ @@ -429,8 +428,8 @@ class LSTMEncoder(nn.Layer): else: # We exploit the `encoded_text` (the hidden state at the every time step for last layer) # to create a single vector. We perform pooling on the encoded text. - # If lstm is not bidirection, output is shape of `(batch_size, hidden_size)`. 
- # If lstm is bidirection, then output is shape of `(batch_size, hidden_size*2)`.
+ # The output shape is `(batch_size, hidden_size*2)` if a bidirectional LSTM is used,
+ # otherwise the output shape is `(batch_size, hidden_size)`.
 if self._pooling_type == 'sum':
 output = paddle.sum(encoded_text, axis=1)
 elif self._pooling_type == 'max':
@@ -467,9 +466,9 @@ class RNNEncoder(nn.Layer):
 num_layers (obj:`int`, optional, defaults to 1): Number of recurrent layers.
 E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN,
 with the second RNN taking in outputs of the first RNN and computing the final results.
- direction (obj:`str`, optional, defaults to obj:`forwrd`): The direction of the network.
+ direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
 It can be "forward" and "bidirect" (it means bidirection network).
- When "bidirection", the way to merge outputs of forward and backward is concatenating.
+ If `bidirect`, it is a bidirectional RNN, and returns the concatenated output from both directions.
 dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
 on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout.
 pooling_type (obj: `str`, optional, defaults to obj:`None`): If `pooling_type` is None,
@@ -528,11 +527,11 @@ class RNNEncoder(nn.Layer):
 If not, output is of shape `(batch_size, hidden_size)`.

 Args:
- inputs (obj:`Paddle.Tensor`, required): Shape as `(batch_size, num_tokens, input_size)`.
- sequence_length (obj:`Paddle.Tensor`, required): Shape as `(batch_size)`.
+ inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, input_size)`.
+ sequence_length (paddle.Tensor): Shape as `(batch_size)`.

 Returns:
- last_hidden (obj:`Paddle.Tensor`, required): Shape as `(batch_size, hidden_size)`.
+ last_hidden (paddle.Tensor): Shape as `(batch_size, hidden_size)`.
 The hidden state at the last time step for every layer.
 """

@@ -553,8 +552,8 @@ class RNNEncoder(nn.Layer):
 else:
 # We exploit the `encoded_text` (the hidden state at the every time step for last layer)
 # to create a single vector. We perform pooling on the encoded text.
- # If rnn is not bidirection, output is shape of `(batch_size, hidden_size)`.
- # If rnn is bidirection, then output is shape of `(batch_size, hidden_size*2)`.
+ # The output shape is `(batch_size, hidden_size*2)` if a bidirectional RNN is used,
+ # otherwise the output shape is `(batch_size, hidden_size)`.
 if self._pooling_type == 'sum':
 output = paddle.sum(encoded_text, axis=1)
 elif self._pooling_type == 'max':
@@ -676,10 +675,10 @@ class TCNEncoder(nn.Layer):
 such as LSTMs in many tasks. See https://arxiv.org/pdf/1803.01271.pdf for more details.

 Args:
- input_size (obj:`int`, required): The number of expected features in the input (the last dimension).
- num_channels (obj:`list` or obj:`tuple`, required): The number of channels in different layer.
- kernel_size (obj:`int`, optional): The kernel size. Defaults to 2.
- dropout (obj:`float`, optional): The dropout probability. Defaults to 0.2.
+ input_size (int): The number of expected features in the input (the last dimension).
+ num_channels (list): The number of channels in different layers.
+ kernel_size (int): The kernel size. Defaults to 2.
+ dropout (float): The dropout probability. Defaults to 0.2.
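For these recurrent encoders the pooled (or last-step) vector is `hidden_size` wide for a forward network and `hidden_size * 2` for a bidirectional one. A shape check with arbitrary sizes:

```python
import paddle
from paddlenlp.seq2vec import LSTMEncoder

encoder = LSTMEncoder(
    input_size=128, hidden_size=64, direction='bidirect', pooling_type='max')

inputs = paddle.rand([4, 10, 128])                        # (batch_size, num_tokens, emb_dim)
seq_lens = paddle.to_tensor([10, 7, 9, 5], dtype='int64')

output = encoder(inputs, sequence_length=seq_lens)
print(output.shape)  # [4, 128] == hidden_size * 2; a forward-only encoder gives [4, 64]
```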
""" def __init__(self, input_size, num_channels, kernel_size=2, dropout=0.2): @@ -733,10 +732,10 @@ class TCNEncoder(nn.Layer): receptive filed = $2 * \sum_{i=0}^{len(num\_channels)-1}2^i(kernel\_size-1)$. Args: - inputs (obj:`Paddle.Tensor`, required): The input tensor with shape `[batch_size, num_tokens, input_size]`. + inputs (paddle.Tensor): The input tensor with shape `[batch_size, num_tokens, input_size]`. Returns: - output (obj:`Paddle.Tensor`): The output tensor with shape `[batch_size, num_channels[-1]]`. + output (paddle.Tensor): The output tensor with shape `[batch_size, num_channels[-1]]`. """ inputs_t = inputs.transpose([0, 2, 1]) output = self.network(inputs_t).transpose([2, 0, 1])[-1] diff --git a/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py b/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py index 67b5b811..0b2caf25 100644 --- a/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py @@ -442,6 +442,39 @@ class BertTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py b/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py index 1d81f5b2..04e34f27 100644 --- a/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py @@ -211,6 +211,39 @@ class ElectraTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py b/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py index 32f59b11..c20fb20b 100644 --- a/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py @@ -650,6 +650,39 @@ class ErnieTinyTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py b/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py index 79733436..1d40ce0f 100644 --- a/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py @@ -216,6 +216,39 @@ class RobertaTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py b/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py index da0cd64c..9c4d6613 100644 --- a/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py +++ b/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py @@ -437,3 +437,23 @@ class PretrainedTokenizer(object): "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" ) return (ids, pair_ids, overflowing_tokens) + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + return [0] * ((len(token_ids_1) + if token_ids_1 else 0) + len(token_ids_0)) -- GitLab
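A short usage sketch for the newly added `get_special_tokens_mask()`; the model name and sample texts are examples only:

```python
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('这个宾馆比较陈旧了'))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('交通方便'))

# Positions that will receive special tokens are marked 1, sequence tokens 0.
mask_single = tokenizer.get_special_tokens_mask(ids_a)
# -> [1] + [0] * len(ids_a) + [1]                            ([CLS] a [SEP])
mask_pair = tokenizer.get_special_tokens_mask(ids_a, ids_b)
# -> [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]   ([CLS] a [SEP] b [SEP])

# For ids that already contain [CLS]/[SEP], pass already_has_special_tokens=True.
formatted = [tokenizer.cls_token_id] + ids_a + [tokenizer.sep_token_id]
mask_formatted = tokenizer.get_special_tokens_mask(
    formatted, already_has_special_tokens=True)
```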