From 3ce83f24313d874478ca154e50bba66938a32bbd Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Thu, 4 Feb 2021 00:14:20 +0800 Subject: [PATCH] Refine text classification and matching examples (#5251) * update docs * update codes * update docs * update codes * update codes * update codes * add method tokenizer.get_special_tokens_mask() --- .../pretrained_models/README.md | 54 +++++-- .../pretrained_models/export_model.py | 41 ++--- .../pretrained_models/predict.py | 65 ++------ .../pretrained_models/train.py | 149 +++++------------- .../text_classification/rnn/README.md | 2 +- .../text_classification/rnn/export_model.py | 5 +- .../text_classification/rnn/predict.py | 40 +++-- .../examples/text_classification/rnn/train.py | 21 +-- .../examples/text_classification/rnn/utils.py | 101 +----------- .../sentence_transformers/README.md | 49 +++++- .../sentence_transformers/predict.py | 62 ++------ .../sentence_transformers/train.py | 149 +++++------------- .../examples/text_matching/simnet/README.md | 2 +- .../examples/text_matching/simnet/predict.py | 53 ++++--- .../examples/text_matching/simnet/train.py | 53 +++++-- .../examples/text_matching/simnet/utils.py | 112 ++----------- PaddleNLP/paddlenlp/__init__.py | 2 +- PaddleNLP/paddlenlp/models/senta.py | 36 ++--- PaddleNLP/paddlenlp/seq2vec/encoder.py | 91 ++++++----- .../paddlenlp/transformers/bert/tokenizer.py | 33 ++++ .../transformers/electra/tokenizer.py | 33 ++++ .../paddlenlp/transformers/ernie/tokenizer.py | 33 ++++ .../transformers/roberta/tokenizer.py | 33 ++++ .../paddlenlp/transformers/tokenizer_utils.py | 20 +++ 24 files changed, 534 insertions(+), 705 deletions(-) diff --git a/PaddleNLP/examples/text_classification/pretrained_models/README.md b/PaddleNLP/examples/text_classification/pretrained_models/README.md index 539c4821..71d8b8af 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/README.md +++ b/PaddleNLP/examples/text_classification/pretrained_models/README.md @@ -52,7 +52,7 @@ - paddlepaddle >= 2.0.0-rc1 ``` -pip install paddlenlp==2.0.0b +pip install paddlenlp>=2.0.0rc ``` ### 代码结构说明 @@ -73,17 +73,12 @@ pretrained_models/ ```shell # 设置使用的GPU卡号 CUDA_VISIBLE_DEVICES=0 -python train.py --model_type ernie --model_name ernie-tiny --n_gpu 1 --save_dir ./checkpoints +python train.py --n_gpu 1 --save_dir ./checkpoints ``` 可支持配置的参数: -* `model_type`:必选,模型类型,可以选择bert,ernie,roberta。 -* `model_name`: 必选,具体的模型简称。 - 如`model_type=ernie`,则model_name可以选择`ernie-1.0`和`ernie-tiny`。 - 如`model_type=bert`,则model_name可以选择`bert-base-chinese`,`bert-wwm-chinese`,`bert-wwm-ext-chinese`。 - 如`model_type=roberta`,则model_name可以选择`roberta-wwm-ext-large`,`roberta-wwm-ext`,`rbt3`,`rbtl3`。 -* `save_dir`:必选,保存训练模型的目录。 +* `save_dir`:可选,保存训练模型的目录;默认保存在当前目录checkpoints文件夹下。 * `max_seq_length`:可选,ERNIE/BERT模型使用的最大序列长度,最大不能超过512, 若出现显存不足,请适当调低这一参数;默认为128。 * `batch_size`:可选,批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为32。 * `learning_rate`:可选,Fine-tune的最大学习率;默认为5e-5。 @@ -94,6 +89,45 @@ python train.py --model_type ernie --model_name ernie-tiny --n_gpu 1 --save_dir * `seed`:可选,随机种子,默认为1000. 
* `n_gpu`:可选,训练过程中使用GPU卡数量,默认为1。若n_gpu=0,则使用CPU训练。 +代码示例中使用的预训练模型是ERNIE,如果想要使用其他预训练模型如BERT,RoBERTa,Electra等,只需更换`model` 和 `tokenizer`即可。 + +```python +# 使用ernie预训练模型 +# ernie +model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained('ernie',num_classes=2)) +tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie') + +# ernie-tiny +# model = ppnlp.transformers.ErnieForSequenceClassification.rom_pretrained('ernie-tiny',num_classes=2)) +# tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained('ernie-tiny') + + +# 使用bert预训练模型 +# bert-base-chinese +model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2) +tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + +# bert-wwm-chinese +# model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-wwm-chinese', num_class=2) +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-chinese') + +# bert-wwm-ext-chinese +# model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-wwm-ext-chinese', num_class=2) +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-ext-chinese') + + +# 使用roberta预训练模型 +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2) +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext-large', num_class=2) +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext-large') + +``` +更多预训练模型,参考[transformers](../../../docs/transformers.md) + 程序运行时将会自动进行训练,评估,测试。同时训练过程中会自动保存模型在指定的`save_dir`中。 如: @@ -114,7 +148,7 @@ checkpoints/ 运行方式: ```shell -python export_model.py --model_type=roberta --model_name=roberta-wwm-ext --params_path=./checkpoint/model_200/model_state.pdparams --output_path=./static_graph_params +python export_model.py --params_path=./checkpoint/model_900/model_state.pdparams --output_path=./static_graph_params ``` 其中`params_path`是指动态图训练保存的参数路径,`output_path`是指静态图参数导出路径。 @@ -123,7 +157,7 @@ python export_model.py --model_type=roberta --model_name=roberta-wwm-ext --param 启动预测: ```shell export CUDA_VISIBLE_DEVICES=0 -python predict.py --model_type ernie --model_name ernie-tiny --params_path checkpoints/model_400/model_state.pdparams +python predict.py --params_path checkpoints/model_900/model_state.pdparams ``` 将待预测数据如以下示例: diff --git a/PaddleNLP/examples/text_classification/pretrained_models/export_model.py b/PaddleNLP/examples/text_classification/pretrained_models/export_model.py index d057d5bf..da860ee2 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/export_model.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/export_model.py @@ -25,46 +25,23 @@ import paddle.nn.functional as F from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertForSequenceClassification, - ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieForSequenceClassification, - ppnlp.transformers.ErnieTokenizer), - 'roberta': (ppnlp.transformers.RobertaForSequenceClassification, - ppnlp.transformers.RobertaTokenizer), -} - - # yapf: disable -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--model_type", default='roberta', required=True, type=str, 
help="Model type selected in the list: " +", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default='roberta-wwm-ext', required=True, type=str, help="Path to pre-trained model or shortcut name selected in the list: " + - ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], []))) - parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_200/model_state.pdparams', help="The path to model parameters to be loaded.") - parser.add_argument("--output_path", type=str, default='./static_graph_params', help="The path of model parameter in static graph to be saved.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() +parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.") +parser.add_argument("--output_path", type=str, default='./static_graph_params', help="The path of model parameter in static graph to be saved.") +args = parser.parse_args() # yapf: enable if __name__ == "__main__": - args = parse_args() - - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - if args.model_name_or_path == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. - tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name_or_path) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') # The number of labels should be in accordance with the training dataset. 
label_map = {0: 'negative', 1: 'positive'} - model = model_class.from_pretrained( - args.model_name_or_path, num_classes=len(label_map)) + model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( + "ernie-tiny", num_classes=len(label_map)) if args.params_path and os.path.isfile(args.params_path): state_dict = paddle.load(args.params_path) diff --git a/PaddleNLP/examples/text_classification/pretrained_models/predict.py b/PaddleNLP/examples/text_classification/pretrained_models/predict.py index 3ba0bf78..880d06ad 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/predict.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/predict.py @@ -25,31 +25,14 @@ import paddle.nn.functional as F from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertForSequenceClassification, - ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieForSequenceClassification, - ppnlp.transformers.ErnieTokenizer), - 'roberta': (ppnlp.transformers.RobertaForSequenceClassification, - ppnlp.transformers.RobertaTokenizer), -} - - # yapf: disable -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--model_type", default='ernie', required=True, type=str, help="Model type selected in the list: " +", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default='ernie-tiny', required=True, type=str, help="Path to pre-trained model or shortcut name selected in the list: " + - ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], []))) - parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.") - - parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() +parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.") +parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() # yapf: enable @@ -134,23 +117,16 @@ def predict(model, data, tokenizer, label_map, batch_size=1): is_test=True) examples.append((input_ids, segment_ids)) + # Seperates data into some batches. + batches = [ + examples[idx:idx + batch_size] + for idx in range(0, len(examples), batch_size) + ] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment ): fn(samples) - # Seperates data into some batches. 
- batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) - results = [] model.eval() for batch in batches: @@ -167,18 +143,11 @@ def predict(model, data, tokenizer, label_map, batch_size=1): if __name__ == "__main__": - args = parse_args() paddle.set_device("gpu" if args.n_gpu else "cpu") - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - - if args.model_name_or_path == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. - tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name_or_path) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') data = [ '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', @@ -187,8 +156,8 @@ if __name__ == "__main__": ] label_map = {0: 'negative', 1: 'positive'} - model = model_class.from_pretrained( - args.model_name_or_path, num_classes=len(label_map)) + model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( + 'ernie-tiny', num_classes=len(label_map)) if args.params_path and os.path.isfile(args.params_path): state_dict = paddle.load(args.params_path) diff --git a/PaddleNLP/examples/text_classification/pretrained_models/train.py b/PaddleNLP/examples/text_classification/pretrained_models/train.py index 756b15fa..73bec44e 100644 --- a/PaddleNLP/examples/text_classification/pretrained_models/train.py +++ b/PaddleNLP/examples/text_classification/pretrained_models/train.py @@ -25,97 +25,28 @@ import paddle.nn.functional as F from paddlenlp.data import Stack, Tuple, Pad import paddlenlp as ppnlp -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertForSequenceClassification, - ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieForSequenceClassification, - ppnlp.transformers.ErnieTokenizer), - 'roberta': (ppnlp.transformers.RobertaForSequenceClassification, - ppnlp.transformers.RobertaTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_type", - default='ernie', - required=True, - type=str, - help="Model type selected in the list: " + - ", ".join(MODEL_CLASSES.keys())) - parser.add_argument( - "--model_name", - default='ernie-tiny', - required=True, - type=str, - help="Path to pre-trained model or shortcut name selected in the list: " - + ", ".join( - sum([ - list(classes[-1].pretrained_init_configuration.keys()) - for classes in MODEL_CLASSES.values() - ], []))) - parser.add_argument( - "--save_dir", - default='./checkpoint', - required=True, - type=str, - help="The output directory where the model checkpoints will be written.") - - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded." 
- ) - parser.add_argument( - "--batch_size", - default=32, - type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--learning_rate", - default=5e-5, - type=float, - help="The initial learning rate for Adam.") - parser.add_argument( - "--weight_decay", - default=0.0, - type=float, - help="Weight decay if we apply some.") - parser.add_argument( - "--epochs", - default=3, - type=int, - help="Total number of training epochs to perform.") - parser.add_argument( - "--warmup_proption", - default=0.0, - type=float, - help="Linear warmup proption over the training process.") - parser.add_argument( - "--init_from_ckpt", - type=str, - default=None, - help="The path of checkpoint to be loaded.") - parser.add_argument( - "--seed", type=int, default=1000, help="random seed for initialization") - parser.add_argument( - "--n_gpu", - type=int, - default=1, - help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args - - -def set_seed(args): +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.") +parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") +parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") +parser.add_argument("--epochs", default=3, type=int, help="Total number of training epochs to perform.") +parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proption over the training process.") +parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.") +parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization") +parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() +# yapf: enable + + +def set_seed(seed): """sets random seed""" - random.seed(args.seed) - np.random.seed(args.seed) - paddle.seed(args.seed) + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) @paddle.no_grad() @@ -223,24 +154,30 @@ def create_dataloader(dataset, return_list=True) -def do_train(args): - set_seed(args) +def do_train(): + set_seed(args.seed) paddle.set_device("gpu" if args.n_gpu else "cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - train_dataset, dev_dataset, test_dataset = ppnlp.datasets.ChnSentiCorp.get_datasets( ['train', 'dev', 'test']) - if args.model_name == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. 
- tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name) + + # If you wanna use bert/roberta/electra pretrained model, + # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2) + # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2) + # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2) + model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( + 'ernie-tiny', num_classes=len(train_dataset.get_labels())) + + # If you wanna use bert/roberta/electra pretrained model, + # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small', num_classes=2) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') trans_func = partial( convert_example, @@ -271,16 +208,13 @@ def do_train(args): batchify_fn=batchify_fn, trans_fn=trans_func) - model = model_class.from_pretrained( - args.model_name, num_classes=len(train_dataset.get_labels())) - if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs - num_warmup_steps = int(args.warmup_proption * num_training_steps) + num_warmup_steps = int(args.warmup_proportion * num_training_steps) def get_lr_factor(current_step): if current_step < num_warmup_steps: @@ -342,8 +276,7 @@ def do_train(args): if __name__ == "__main__": - args = parse_args() if args.n_gpu > 1: - paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu) + paddle.distributed.spawn(do_train, nprocs=args.n_gpu) else: - do_train(args) + do_train() diff --git a/PaddleNLP/examples/text_classification/rnn/README.md b/PaddleNLP/examples/text_classification/rnn/README.md index 490a2c50..fa5dbaca 100644 --- a/PaddleNLP/examples/text_classification/rnn/README.md +++ b/PaddleNLP/examples/text_classification/rnn/README.md @@ -115,7 +115,7 @@ PaddleNLP提供了一系列的文本表示技术,如`seq2vec`模块。 - paddlepaddle >= 2.0.0-rc1 ``` -pip install paddlenlp==2.0.0b +pip install paddlenlp>=2.0.0rc ``` ### 代码结构说明 diff --git a/PaddleNLP/examples/text_classification/rnn/export_model.py b/PaddleNLP/examples/text_classification/rnn/export_model.py index 0e9a2dc0..693e8b1d 100644 --- a/PaddleNLP/examples/text_classification/rnn/export_model.py +++ b/PaddleNLP/examples/text_classification/rnn/export_model.py @@ -16,8 +16,7 @@ import argparse import paddle import paddlenlp as ppnlp - -from utils import load_vocab +from paddlenlp.data import Vocab # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -31,7 +30,7 @@ args = parser.parse_args() def main(): # Load vocab. - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary(args.vocab_path) label_map = {0: 'negative', 1: 'positive'} # Construct the newtork. 
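For reference, here is a minimal, self-contained sketch of how the refactored export flow for the RNN example fits together after switching to `Vocab.load_vocabulary`. The vocabulary and checkpoint paths, the `Senta` network name, and the two `InputSpec` shapes are illustrative assumptions rather than part of this change:

```python
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Vocab

# Load the vocabulary with the new Vocab API (path is a placeholder).
vocab = Vocab.load_vocabulary(
    "./senta_word_dict.txt", unk_token='[UNK]', pad_token='[PAD]')
label_map = {0: 'negative', 1: 'positive'}

# Construct the network; Senta and the "bilstm" choice are assumptions here.
model = ppnlp.models.Senta(
    network="bilstm", vocab_size=len(vocab), num_classes=len(label_map))
model.set_dict(paddle.load("./checkpoints/final.pdparams"))
model.eval()

# Trace to a static graph: one batch of token ids plus their sequence lengths.
static_model = paddle.jit.to_static(
    model,
    input_spec=[
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # token ids
        paddle.static.InputSpec(shape=[None], dtype="int64"),        # seq lens
    ])
paddle.jit.save(static_model, "./static_graph_params")
```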
diff --git a/PaddleNLP/examples/text_classification/rnn/predict.py b/PaddleNLP/examples/text_classification/rnn/predict.py index 42cb5d93..3de3fb33 100644 --- a/PaddleNLP/examples/text_classification/rnn/predict.py +++ b/PaddleNLP/examples/text_classification/rnn/predict.py @@ -14,10 +14,11 @@ import argparse import paddle -import paddlenlp as ppnlp import paddle.nn.functional as F +import paddlenlp as ppnlp +from paddlenlp.data import JiebaTokenizer, Stack, Tuple, Pad -from utils import load_vocab, generate_batch, preprocess_prediction_data +from utils import preprocess_prediction_data # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -30,7 +31,7 @@ args = parser.parse_args() # yapf: enable -def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): +def predict(model, data, label_map, batch_size=1, pad_token_id=0): """ Predicts the data labels. @@ -39,8 +40,6 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object. A Example object contains `text`(word_ids) and `se_len`(sequence length). label_map(obj:`dict`): The label id (key) to label str (value) map. - collate_fn(obj: `callable`): function to generate mini-batch data by merging - the sample list. batch_size(obj:`int`, defaults to 1): The number of batch. pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. @@ -49,22 +48,18 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): """ # Seperates data into some batches. - batches = [] - one_batch = [] - for example in data: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) + batches = [ + data[idx:idx + batch_size] for idx in range(0, len(data), batch_size) + ] + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=pad_token_id), # input_ids + Stack(dtype="int64"), # seq len + ): [data for data in fn(samples)] results = [] model.eval() for batch in batches: - texts, seq_lens = collate_fn( - batch, pad_token_id=pad_token_id, return_label=False) + texts, seq_lens = batchify_fn(batch) texts = paddle.to_tensor(texts) seq_lens = paddle.to_tensor(seq_lens) logits = model(texts, seq_lens) @@ -78,8 +73,9 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): if __name__ == "__main__": paddle.set_device("gpu") if args.use_gpu else paddle.set_device("cpu") - # Loads vocab. - vocab = load_vocab(args.vocab_path) + # Loads vocab.s + vocab = ppnlp.data.Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') label_map = {0: 'negative', 1: 'positive'} # Constructs the newtork. 
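To make the new batching path in `predict.py` concrete, the short sketch below shows what the `Tuple(Pad, Stack)` function produces for a toy batch built with `JiebaTokenizer.encode`; the vocabulary path and sample sentences are placeholders:

```python
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab

vocab = Vocab.load_vocabulary(
    "./senta_word_dict.txt", unk_token='[UNK]', pad_token='[PAD]')
tokenizer = JiebaTokenizer(vocab)

# Each example is (token ids, sequence length), as in preprocess_prediction_data.
examples = []
for text in ['非常不错,服务很好', '房间太小']:
    ids = tokenizer.encode(text)
    examples.append((ids, len(ids)))

pad_id = vocab.token_to_idx.get('[PAD]', 0)
batchify_fn = Tuple(
    Pad(axis=0, pad_val=pad_id),  # pad token ids up to the longest sequence in the batch
    Stack(dtype="int64"),         # stack the valid lengths into a single array
)
input_ids, seq_lens = batchify_fn(examples)
print(input_ids.shape, seq_lens)  # (2, max_len) and an array of the two lengths
```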
@@ -97,14 +93,14 @@ if __name__ == "__main__": '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', ] - examples = preprocess_prediction_data(data, vocab) + tokenizer = JiebaTokenizer(vocab) + examples = preprocess_prediction_data(data, tokenizer) results = predict( model, examples, label_map=label_map, batch_size=args.batch_size, - collate_fn=generate_batch) - + pad_token_id=vocab.token_to_idx.get("[PAD]", 0)) for idx, text in enumerate(data): print('Data: {} \t Label: {}'.format(text, results[idx])) diff --git a/PaddleNLP/examples/text_classification/rnn/train.py b/PaddleNLP/examples/text_classification/rnn/train.py index a314b8c9..6412d281 100644 --- a/PaddleNLP/examples/text_classification/rnn/train.py +++ b/PaddleNLP/examples/text_classification/rnn/train.py @@ -19,10 +19,10 @@ import random import numpy as np import paddle import paddlenlp as ppnlp -from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab from paddlenlp.datasets import ChnSentiCorp -from utils import load_vocab, convert_example +from utils import convert_example # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -50,7 +50,6 @@ def create_dataloader(dataset, mode='train', batch_size=1, use_gpu=False, - pad_token_id=0, batchify_fn=None): """ Creats dataloader. @@ -61,7 +60,6 @@ def create_dataloader(dataset, mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly. batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch. use_gpu(obj:`bool`, optional, defaults to obj:`False`): Whether to use gpu to run. - pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. batchify_fn(obj:`callable`, optional, defaults to `None`): function to generate mini-batch data by merging the sample list, None for only stack each fields of sample in axis 0(same as :attr::`np.stack(..., axis=0)`). @@ -95,8 +93,9 @@ if __name__ == "__main__": if not os.path.exists(args.vocab_path): raise RuntimeError('The vocab_path can not be found in the path %s' % args.vocab_path) - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') # Loads dataset. train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets( ['train', 'dev', 'test']) @@ -110,13 +109,10 @@ if __name__ == "__main__": model = paddle.Model(model) # Reads data and generates mini-batches. 
- trans_fn = partial( - convert_example, - vocab=vocab, - unk_token_id=vocab.get('[UNK]', 1), - is_test=False) + tokenizer = JiebaTokenizer(vocab) + trans_fn = partial(convert_example, tokenizer=tokenizer, is_test=False) batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=vocab['[PAD]']), # input_ids + Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # input_ids Stack(dtype="int64"), # seq len Stack(dtype="int64") # label ): [data for data in fn(samples)] @@ -126,7 +122,6 @@ if __name__ == "__main__": batch_size=args.batch_size, mode='train', use_gpu=args.use_gpu, - pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) dev_loader = create_dataloader( dev_ds, @@ -134,7 +129,6 @@ if __name__ == "__main__": batch_size=args.batch_size, mode='validation', use_gpu=args.use_gpu, - pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) test_loader = create_dataloader( test_ds, @@ -142,7 +136,6 @@ if __name__ == "__main__": batch_size=args.batch_size, mode='test', use_gpu=args.use_gpu, - pad_token_id=vocab.get('[PAD]', 0), batchify_fn=batchify_fn) optimizer = paddle.optimizer.Adam( diff --git a/PaddleNLP/examples/text_classification/rnn/utils.py b/PaddleNLP/examples/text_classification/rnn/utils.py index 8c489c6c..24455038 100644 --- a/PaddleNLP/examples/text_classification/rnn/utils.py +++ b/PaddleNLP/examples/text_classification/rnn/utils.py @@ -11,113 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import jieba import numpy as np -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = {} - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n").split("\t")[0] - vocab[token] = index - return vocab - - -def convert_ids_to_tokens(wids, inversed_vocab): - """ Converts a token string (or a sequence of tokens) in a single integer id - (or a sequence of ids), using the vocabulary. - """ - tokens = [] - for wid in wids: - wstr = inversed_vocab.get(wid, None) - if wstr: - tokens.append(wstr) - return tokens - - -def convert_tokens_to_ids(tokens, vocab): - """ Converts a token id (or a sequence of id) in a token string - (or a sequence of tokens), using the vocabulary. - """ - - ids = [] - unk_id = vocab.get('[UNK]', None) - for token in tokens: - wid = vocab.get(token, unk_id) - if wid: - ids.append(wid) - return ids - - -def pad_texts_to_max_seq_len(texts, max_seq_len, pad_token_id=0): - """ - Padded the texts to the max sequence length if the length of text is lower than it. - Unless it truncates the text. - - Args: - texts(obj:`list`): Texts which contrains a sequence of word ids. - max_seq_len(obj:`int`): Max sequence length. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - """ - for index, text in enumerate(texts): - seq_len = len(text) - if seq_len < max_seq_len: - padded_tokens = [pad_token_id for _ in range(max_seq_len - seq_len)] - new_text = text + padded_tokens - texts[index] = new_text - elif seq_len > max_seq_len: - new_text = text[:max_seq_len] - texts[index] = new_text - - -def generate_batch(batch, pad_token_id=0, return_label=True): - """ - Generates a batch whose text will be padded to the max sequence length in the batch. - - Args: - batch(obj:`List[Example]`) : One batch, which contains texts, labels and the true sequence lengths. 
- pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - - Returns: - batch(:obj:`Tuple[list]`): The batch data which contains texts, seq_lens and labels. - """ - seq_lens = [entry[1] for entry in batch] - - batch_max_seq_len = max(seq_lens) - texts = [entry[0] for entry in batch] - pad_texts_to_max_seq_len(texts, batch_max_seq_len, pad_token_id) - - if return_label: - labels = [[entry[-1]] for entry in batch] - return texts, seq_lens, labels - else: - return texts, seq_lens - - -def convert_example(example, vocab, unk_token_id=1, is_test=False): +def convert_example(example, tokenizer, is_test=False): """ Builds model inputs from a sequence for sequence classification tasks. It use `jieba.cut` to tokenize text. Args: example(obj:`list[str]`): List of input data, containing text and label if it have label. - vocab(obj:`dict`): The vocabulary. - unk_token_id(obj:`int`, defaults to 1): The unknown token id. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. is_test(obj:`False`, defaults to `False`): Whether the example contains label or not. Returns: - input_ids(obj:`list[int]`): The list of token ids.s + input_ids(obj:`list[int]`): The list of token ids. valid_length(obj:`int`): The input sequence valid length. label(obj:`numpy.array`, data type of int64, optional): The input label if not is_test. """ - input_ids = [] - for token in jieba.cut(example[0]): - token_id = vocab.get(token, unk_token_id) - input_ids.append(token_id) + input_ids = tokenizer.encode(example[0]) valid_length = np.array(len(input_ids), dtype='int64') input_ids = np.array(input_ids, dtype='int64') @@ -128,12 +41,13 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False): return input_ids, valid_length -def preprocess_prediction_data(data, vocab): +def preprocess_prediction_data(data, tokenizer): """ It process the prediction data as the format used as training. Args: data (obj:`List[str]`): The prediction data whose each element is a tokenized text. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. Returns: examples (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object. 
@@ -142,7 +56,6 @@ def preprocess_prediction_data(data, vocab): """ examples = [] for text in data: - tokens = " ".join(jieba.cut(text)).split(' ') - ids = convert_tokens_to_ids(tokens, vocab) + ids = tokenizer.encode(text) examples.append([ids, len(ids)]) return examples diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/README.md b/PaddleNLP/examples/text_matching/sentence_transformers/README.md index f35931c7..d4837056 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/README.md +++ b/PaddleNLP/examples/text_matching/sentence_transformers/README.md @@ -89,17 +89,12 @@ sentence_transformers/ ```shell # 设置使用的GPU卡号 CUDA_VISIBLE_DEVICES=0 -python train.py --model_type ernie --model_name ernie-1.0 --n_gpu 1 --save_dir ./checkpoints +python train.py --save_dir ./checkpoints ``` 可支持配置的参数: -* `model_type`:必选,模型类型,可以选择bert,ernie,roberta。 -* `model_name`: 必选,具体的模型简称。 - 如`model_type=ernie`,则model_name可以选择`ernie-1.0`和`ernie-tiny`。 - 如`model_type=bert`,则model_name可以选择`bert-base-chinese`,`bert-wwm-chinese`,`bert-wwm-ext-chinese`。 - 如`model_type=roberta`,则model_name可以选择`roberta-wwm-ext`,`rbt3`,`rbtl3`。 -* `save_dir`:必选,保存训练模型的目录。 +* `save_dir`:可选,保存训练模型的目录;默认保存在当前目录checkpoints文件夹下。 * `max_seq_length`:可选,ERNIE/BERT模型使用的最大序列长度,最大不能超过512, 若出现显存不足,请适当调低这一参数;默认为128。 * `batch_size`:可选,批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为32。 * `learning_rate`:可选,Fine-tune的最大学习率;默认为5e-5。 @@ -110,6 +105,44 @@ python train.py --model_type ernie --model_name ernie-1.0 --n_gpu 1 --save_dir . * `seed`:可选,随机种子,默认为1000. * `n_gpu`:可选,训练过程中使用GPU卡数量,默认为1。若n_gpu=0,则使用CPU训练。 +代码示例中使用的预训练模型是ERNIE,如果想要使用其他预训练模型如BERT,RoBERTa,Electra等,只需更换`model` 和 `tokenizer`即可。 + +```python +# 使用ernie预训练模型 +# ernie +model = ppnlp.transformers.ErnieModel.from_pretrained('ernie')) +tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie') + +# ernie-tiny +# model = ppnlp.transformers.ErnieModel.from_pretrained('ernie-tiny')) +# tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained('ernie-tiny') + + +# 使用bert预训练模型 +# bert-base-chinese +# model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese') +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + +# bert-wwm-chinese +# model = ppnlp.transformers.BertModel.from_pretrained('bert-wwm-chinese') +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-chinese') + +# bert-wwm-ext-chinese +# model = ppnlp.transformers.BertModel.from_pretrained('bert-wwm-ext-chinese') +# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-ext-chinese') + + +# 使用roberta预训练模型 +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext') +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + +# roberta-wwm-ext +# model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext-large') +# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext-large') + +``` +更多预训练模型,参考[transformers](../../../docs/transformers.md) 程序运行时将会自动进行训练,评估,测试。同时训练过程中会自动保存模型在指定的`save_dir`中。 如: @@ -132,7 +165,7 @@ checkpoints/ 启动预测: ```shell CUDA_VISIBLE_DEVICES=0 -python predict.py --model_type ernie --model_name ernie-tiny --params_path checkpoints/model_400/model_state.pdparams +python predict.py --params_path checkpoints/model_400/model_state.pdparams ``` 将待预测数据如以下示例: diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/predict.py b/PaddleNLP/examples/text_matching/sentence_transformers/predict.py 
index b72ab2d1..228394d7 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/predict.py +++ b/PaddleNLP/examples/text_matching/sentence_transformers/predict.py @@ -26,29 +26,14 @@ from paddlenlp.data import Stack, Tuple, Pad from model import SentenceTransformer -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertModel, ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieModel, ppnlp.transformers.ErnieTokenizer), - 'roberta': - (ppnlp.transformers.RobertaModel, ppnlp.transformers.RobertaTokenizer) -} - - # yapf: disable -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--model_type", default='ernie', type=str, help="Model type selected in the list: " +", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name", default='ernie-1.0', type=str, help="Path to pre-trained model or shortcut name selected in the list: " + - ", ".join(sum([list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], []))) - parser.add_argument("--params_path", type=str, default='./checkpoint/model_4900/model_state.pdparams', help="The path to model parameters to be loaded.") - - parser.add_argument("--max_seq_length", default=50, type=int, help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--n_gpu", type=int, default=0, help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args +parser = argparse.ArgumentParser() +parser.add_argument("--params_path", type=str, default='./checkpoint/model_2700/model_state.pdparams', help="The path to model parameters to be loaded.") +parser.add_argument("--max_seq_length", default=50, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--n_gpu", type=int, default=0, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() # yapf: enable @@ -143,6 +128,11 @@ def predict(model, data, tokenizer, label_map, batch_size=1): examples.append((query_input_ids, query_segment_ids, title_input_ids, title_segment_ids)) + # Seperates data into some batches. + batches = [ + examples[idx:idx + batch_size] + for idx in range(0, len(examples), batch_size) + ] batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_input Pad(axis=0, pad_val=tokenizer.pad_token_id), # query_segment @@ -150,18 +140,6 @@ def predict(model, data, tokenizer, label_map, batch_size=1): Pad(axis=0, pad_val=tokenizer.pad_token_id), # tilte_segment ): [data for data in fn(samples)] - # Seperates data into some batches. - batches = [] - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. 
- batches.append(one_batch) - results = [] model.eval() for batch in batches: @@ -186,18 +164,11 @@ def predict(model, data, tokenizer, label_map, batch_size=1): if __name__ == "__main__": - args = parse_args() paddle.set_device("gpu" if args.n_gpu else "cpu") - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - - if args.model_name == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie_tiny pretained model. - tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name) + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') data = [ ['世界上什么东西最小', '世界上什么东西最小?'], @@ -206,7 +177,8 @@ if __name__ == "__main__": ] label_map = {0: 'dissimilar', 1: 'similar'} - pretrained_model = model_class.from_pretrained(args.model_name) + pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( + "ernie-tiny") model = SentenceTransformer(pretrained_model) if args.params_path and os.path.isfile(args.params_path): diff --git a/PaddleNLP/examples/text_matching/sentence_transformers/train.py b/PaddleNLP/examples/text_matching/sentence_transformers/train.py index 93cc88eb..f37f86de 100644 --- a/PaddleNLP/examples/text_matching/sentence_transformers/train.py +++ b/PaddleNLP/examples/text_matching/sentence_transformers/train.py @@ -27,95 +27,28 @@ import paddlenlp as ppnlp from model import SentenceTransformer -MODEL_CLASSES = { - "bert": (ppnlp.transformers.BertModel, ppnlp.transformers.BertTokenizer), - 'ernie': (ppnlp.transformers.ErnieModel, ppnlp.transformers.ErnieTokenizer), - 'roberta': - (ppnlp.transformers.RobertaModel, ppnlp.transformers.RobertaTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_type", - default='ernie', - required=True, - type=str, - help="Model type selected in the list: " + - ", ".join(MODEL_CLASSES.keys())) - parser.add_argument( - "--model_name", - default='ernie-1.0', - required=True, - type=str, - help="Path to pre-trained model or shortcut name selected in the list: " - + ", ".join( - sum([ - list(classes[-1].pretrained_init_configuration.keys()) - for classes in MODEL_CLASSES.values() - ], []))) - parser.add_argument( - "--save_dir", - default='./checkpoint', - required=True, - type=str, - help="The output directory where the model checkpoints will be written.") - - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, sequences shorter will be padded." 
- ) - parser.add_argument( - "--batch_size", - default=32, - type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument( - "--learning_rate", - default=5e-5, - type=float, - help="The initial learning rate for Adam.") - parser.add_argument( - "--weight_decay", - default=0.0, - type=float, - help="Weight decay if we apply some.") - parser.add_argument( - "--epochs", - default=3, - type=int, - help="Total number of training epochs to perform.") - parser.add_argument( - "--warmup_proption", - default=0.0, - type=float, - help="Linear warmup proption over the training process.") - parser.add_argument( - "--init_from_ckpt", - type=str, - default=None, - help="The path of checkpoint to be loaded.") - parser.add_argument( - "--seed", type=int, default=1000, help="random seed for initialization") - parser.add_argument( - "--n_gpu", - type=int, - default=1, - help="Number of GPUs to use, 0 for CPU.") - args = parser.parse_args() - return args - - -def set_seed(args): +# yapf: disable +parser = argparse.ArgumentParser() +parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.") +parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded.") +parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") +parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") +parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") +parser.add_argument("--epochs", default=3, type=int, help="Total number of training epochs to perform.") +parser.add_argument("--warmup_proption", default=0.0, type=float, help="Linear warmup proption over the training process.") +parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.") +parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization") +parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs to use, 0 for CPU.") +args = parser.parse_args() +# yapf: enable + + +def set_seed(seed): """sets random seed""" - random.seed(args.seed) - np.random.seed(args.seed) - paddle.seed(args.seed) + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) @paddle.no_grad() @@ -135,8 +68,8 @@ def evaluate(model, criterion, metric, data_loader): for batch in data_loader: query_input_ids, query_segment_ids, title_input_ids, title_segment_ids, labels = batch probs = model( - query_input_ids, - title_input_ids, + query_input_ids=query_input_ids, + title_input_ids=title_input_ids, query_token_type_ids=query_segment_ids, title_token_type_ids=title_segment_ids) loss = criterion(probs, labels) @@ -236,24 +169,28 @@ def create_dataloader(dataset, return_list=True) -def do_train(args): - set_seed(args) +def do_train(): + set_seed(args.seed) paddle.set_device("gpu" if args.n_gpu else "cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - train_dataset, dev_dataset, test_dataset = ppnlp.datasets.LCQMC.get_datasets( ['train', 'dev', 'test']) - if args.model_name == 'ernie-tiny': - # ErnieTinyTokenizer is special for ernie-tiny pretained model. 
- tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( - args.model_name) - else: - tokenizer = tokenizer_class.from_pretrained(args.model_name) + + # If you wanna use bert/roberta pretrained model, + # pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese') + # pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext') + pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained( + 'ernie-tiny') + + # If you wanna use bert/roberta pretrained model, + # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') + # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') + # ErnieTinyTokenizer is special for ernie-tiny pretained model. + tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( + 'ernie-tiny') trans_func = partial( convert_example, @@ -286,7 +223,6 @@ def do_train(args): batchify_fn=batchify_fn, trans_fn=trans_func) - pretrained_model = model_class.from_pretrained(args.model_name) model = SentenceTransformer(pretrained_model) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): @@ -326,8 +262,8 @@ def do_train(args): for step, batch in enumerate(train_data_loader, start=1): query_input_ids, query_segment_ids, title_input_ids, title_segment_ids, labels = batch probs = model( - query_input_ids, - title_input_ids, + query_input_ids=query_input_ids, + title_input_ids=title_input_ids, query_token_type_ids=query_segment_ids, title_token_type_ids=title_segment_ids) loss = criterion(probs, labels) @@ -361,8 +297,7 @@ def do_train(args): if __name__ == "__main__": - args = parse_args() if args.n_gpu > 1: - paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu) + paddle.distributed.spawn(do_train, nprocs=args.n_gpu) else: - do_train(args) + do_train() diff --git a/PaddleNLP/examples/text_matching/simnet/README.md b/PaddleNLP/examples/text_matching/simnet/README.md index 91c66b00..49735114 100644 --- a/PaddleNLP/examples/text_matching/simnet/README.md +++ b/PaddleNLP/examples/text_matching/simnet/README.md @@ -38,7 +38,7 @@ SimNet框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MM * PaddleNLP 安装 ```shell - pip install paddlenlp + pip install paddlenlp>=2.0rc0 ``` * 环境依赖 diff --git a/PaddleNLP/examples/text_matching/simnet/predict.py b/PaddleNLP/examples/text_matching/simnet/predict.py index 519bff3e..e5cc3894 100644 --- a/PaddleNLP/examples/text_matching/simnet/predict.py +++ b/PaddleNLP/examples/text_matching/simnet/predict.py @@ -16,23 +16,24 @@ from functools import partial import argparse import paddle -import paddlenlp as ppnlp import paddle.nn.functional as F +import paddlenlp as ppnlp +from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab -from utils import load_vocab, generate_batch, preprocess_prediction_data +from utils import preprocess_prediction_data # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--use_gpu", type=eval, default=False, help="Whether use GPU for training, input should be True or False") parser.add_argument("--batch_size", type=int, default=64, help="Total examples' number of a batch for training.") -parser.add_argument("--vocab_path", type=str, default="./data/term2id.dict", help="The path to vocabulary.") +parser.add_argument("--vocab_path", type=str, default="./simnet_word_dict.txt", help="The path to vocabulary.") parser.add_argument('--network', type=str, default="lstm", help="Which network you would like to choose bow, cnn, lstm or gru ?") 
-parser.add_argument("--params_path", type=str, default='./chekpoints/final.pdparams', help="The path of model parameter to be loaded.") +parser.add_argument("--params_path", type=str, default='./checkpoints/final.pdparams', help="The path of model parameter to be loaded.") args = parser.parse_args() # yapf: enable -def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): +def predict(model, data, label_map, batch_size=1, pad_token_id=0): """ Predicts the data labels. @@ -41,37 +42,35 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): data (obj:`List(Example)`): The processed data whose each element is a Example (numedtuple) object. A Example object contains `text`(word_ids) and `seq_len`(sequence length). label_map(obj:`dict`): The label id (key) to label str (value) map. - collate_fn(obj: `callable`): function to generate mini-batch data by merging - the sample list. batch_size(obj:`int`, defaults to 1): The number of batch. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. + pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. Returns: results(obj:`dict`): All the predictions labels. """ # Seperates data into some batches. - batches = [] - one_batch = [] - for example in data: - one_batch.append(example) - if len(one_batch) == batch_size: - batches.append(one_batch) - one_batch = [] - if one_batch: - # The last batch whose size is less than the config batch_size setting. - batches.append(one_batch) + batches = [ + data[idx:idx + batch_size] for idx in range(0, len(data), batch_size) + ] + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=pad_token_id), # query_ids + Pad(axis=0, pad_val=pad_token_id), # title_ids + Stack(dtype="int64"), # query_seq_lens + Stack(dtype="int64"), # title_seq_lens + ): [data for data in fn(samples)] results = [] model.eval() for batch in batches: - queries, titles, query_seq_lens, title_seq_lens = collate_fn( - batch, pad_token_id=pad_token_id, return_label=False) - queries = paddle.to_tensor(queries) - titles = paddle.to_tensor(titles) + query_ids, title_ids, query_seq_lens, title_seq_lens = batchify_fn( + batch) + query_ids = paddle.to_tensor(query_ids) + title_ids = paddle.to_tensor(title_ids) query_seq_lens = paddle.to_tensor(query_seq_lens) title_seq_lens = paddle.to_tensor(title_seq_lens) - logits = model(queries, titles, query_seq_lens, title_seq_lens) + logits = model(query_ids, title_ids, query_seq_lens, title_seq_lens) probs = F.softmax(logits, axis=1) idx = paddle.argmax(probs, axis=1).numpy() idx = idx.tolist() @@ -83,7 +82,9 @@ def predict(model, data, label_map, collate_fn, batch_size=1, pad_token_id=0): if __name__ == "__main__": paddle.set_device("gpu") if args.use_gpu else paddle.set_device("cpu") # Loads vocab. - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') + tokenizer = JiebaTokenizer(vocab) label_map = {0: 'dissimilar', 1: 'similar'} # Constructs the newtork. 
@@ -101,13 +102,13 @@ if __name__ == "__main__": ['光眼睛大就好看吗', '眼睛好看吗?'], ['小蝌蚪找妈妈怎么样', '小蝌蚪找妈妈是谁画的'], ] - examples = preprocess_prediction_data(data, vocab) + examples = preprocess_prediction_data(data, tokenizer) results = predict( model, examples, label_map=label_map, batch_size=args.batch_size, - collate_fn=generate_batch) + pad_token_id=vocab.token_to_idx.get('[PAD]', 0)) for idx, text in enumerate(data): print('Data: {} \t Label: {}'.format(text, results[idx])) diff --git a/PaddleNLP/examples/text_matching/simnet/train.py b/PaddleNLP/examples/text_matching/simnet/train.py index df308ab8..84a6d9c1 100644 --- a/PaddleNLP/examples/text_matching/simnet/train.py +++ b/PaddleNLP/examples/text_matching/simnet/train.py @@ -20,16 +20,17 @@ import time import paddle import paddlenlp as ppnlp +from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab from paddlenlp.datasets import LCQMC -from utils import load_vocab, generate_batch, convert_example +from utils import convert_example # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--epochs", type=int, default=10, help="Number of epoches for training.") parser.add_argument('--use_gpu', type=eval, default=False, help="Whether use GPU for training, input should be True or False") parser.add_argument("--lr", type=float, default=5e-4, help="Learning rate used to train.") -parser.add_argument("--save_dir", type=str, default='chekpoints/', help="Directory to save model checkpoint") +parser.add_argument("--save_dir", type=str, default='checkpoints/', help="Directory to save model checkpoint") parser.add_argument("--batch_size", type=int, default=64, help="Total examples' number of a batch for training.") parser.add_argument("--vocab_path", type=str, default="./simnet_word_dict.txt", help="The directory to dataset.") parser.add_argument('--network', type=str, default="lstm", help="Which network you would like to choose bow, cnn, lstm or gru ?") @@ -43,16 +44,19 @@ def create_dataloader(dataset, mode='train', batch_size=1, use_gpu=False, - pad_token_id=0): + batchify_fn=None): """ Creats dataloader. Args: dataset(obj:`paddle.io.Dataset`): Dataset instance. + trans_fn(obj:`callable`, optional, defaults to `None`): function to convert a data sample to input ids, etc. mode(obj:`str`, optional, defaults to obj:`train`): If mode is 'train', it will shuffle the dataset randomly. batch_size(obj:`int`, optional, defaults to 1): The sample number of a mini-batch. use_gpu(obj:`bool`, optional, defaults to obj:`False`): Whether to use gpu to run. - pad_token_id(obj:`int`, optional, defaults to 0): The pad token index. + batchify_fn(obj:`callable`, optional, defaults to `None`): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0(same as :attr::`np.stack(..., axis=0)`). Returns: dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches. @@ -71,7 +75,7 @@ def create_dataloader(dataset, dataset, batch_sampler=sampler, return_list=True, - collate_fn=lambda batch: generate_batch(batch, pad_token_id=pad_token_id)) + collate_fn=batchify_fn) return dataloader @@ -82,11 +86,11 @@ if __name__ == "__main__": if not os.path.exists(args.vocab_path): raise RuntimeError('The vocab_path can not be found in the path %s' % args.vocab_path) - vocab = load_vocab(args.vocab_path) + vocab = Vocab.load_vocabulary( + args.vocab_path, unk_token='[UNK]', pad_token='[PAD]') # Loads dataset. 
- train_ds, dev_dataset, test_ds = LCQMC.get_datasets( - ['train', 'dev', 'test']) + train_ds, dev_ds, test_ds = LCQMC.get_datasets(['train', 'dev', 'test']) # Constructs the newtork. label_list = train_ds.get_labels() @@ -95,18 +99,41 @@ if __name__ == "__main__": vocab_size=len(vocab), num_classes=len(label_list)) model = paddle.Model(model) + new_vocab_file = open("./new_simnet_word_dict.txt", 'w', encoding='utf8') + for token, index in vocab.token_to_idx.items(): + new_vocab_file.write(token + "\n") # Reads data and generates mini-batches. - trans_fn = partial(convert_example, vocab=vocab, is_test=False) + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # query_ids + Pad(axis=0, pad_val=vocab.token_to_idx.get('[PAD]', 0)), # title_ids + Stack(dtype="int64"), # query_seq_lens + Stack(dtype="int64"), # title_seq_lens + Stack(dtype="int64") # label + ): [data for data in fn(samples)] + tokenizer = ppnlp.data.JiebaTokenizer(vocab) + trans_fn = partial(convert_example, tokenizer=tokenizer, is_test=False) train_loader = create_dataloader( - train_ds, trans_fn=trans_fn, batch_size=args.batch_size, mode='train') + train_ds, + trans_fn=trans_fn, + batch_size=args.batch_size, + mode='train', + use_gpu=args.use_gpu, + batchify_fn=batchify_fn) dev_loader = create_dataloader( - dev_dataset, + dev_ds, trans_fn=trans_fn, batch_size=args.batch_size, - mode='validation') + mode='validation', + use_gpu=args.use_gpu, + batchify_fn=batchify_fn) test_loader = create_dataloader( - test_ds, trans_fn=trans_fn, batch_size=args.batch_size, mode='test') + test_ds, + trans_fn=trans_fn, + batch_size=args.batch_size, + mode='test', + use_gpu=args.use_gpu, + batchify_fn=batchify_fn) optimizer = paddle.optimizer.Adam( parameters=model.parameters(), learning_rate=args.lr) diff --git a/PaddleNLP/examples/text_matching/simnet/utils.py b/PaddleNLP/examples/text_matching/simnet/utils.py index 79854825..5384ad3f 100644 --- a/PaddleNLP/examples/text_matching/simnet/utils.py +++ b/PaddleNLP/examples/text_matching/simnet/utils.py @@ -11,105 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import jieba import numpy as np -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = {} - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n").split("\t")[0] - vocab[token] = index - return vocab - - -def convert_ids_to_tokens(wids, inversed_vocab): - """ Converts a token string (or a sequence of tokens) in a single integer id - (or a sequence of ids), using the vocabulary. - """ - tokens = [] - for wid in wids: - wstr = inversed_vocab.get(wid, None) - if wstr: - tokens.append(wstr) - return tokens - - -def convert_tokens_to_ids(tokens, vocab): - """ Converts a token id (or a sequence of id) in a token string - (or a sequence of tokens), using the vocabulary. - """ - - ids = [] - unk_id = vocab.get('[UNK]', None) - for token in tokens: - wid = vocab.get(token, unk_id) - if wid: - ids.append(wid) - return ids - - -def pad_texts_to_max_seq_len(texts, max_seq_len, pad_token_id=0): - """ - Padded the texts to the max sequence length if the length of text is lower than it. - Unless it truncates the text. - - Args: - texts(obj:`list`): Texts which contrains a sequence of word ids. 
- max_seq_len(obj:`int`): Max sequence length. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - """ - for index, text in enumerate(texts): - seq_len = len(text) - if seq_len < max_seq_len: - padded_tokens = [pad_token_id for _ in range(max_seq_len - seq_len)] - new_text = text + padded_tokens - texts[index] = new_text - elif seq_len > max_seq_len: - new_text = text[:max_seq_len] - texts[index] = new_text - - -def generate_batch(batch, pad_token_id=0, return_label=True): - """ - Generates a batch whose text will be padded to the max sequence length in the batch. - - Args: - batch(obj:`List[Example]`) : One batch, which contains texts, labels and the true sequence lengths. - pad_token_id(obj:`int`, optinal, defaults to 0) : The pad token index. - - Returns: - batch(:obj:`Tuple[list]`): The batch data which contains texts, seq_lens and labels. - """ - queries = [entry[0] for entry in batch] - titles = [entry[1] for entry in batch] - query_seq_lens = [entry[2] for entry in batch] - title_seq_lens = [entry[3] for entry in batch] - - query_batch_max_seq_len = max(query_seq_lens) - pad_texts_to_max_seq_len(queries, query_batch_max_seq_len, pad_token_id) - title_batch_max_seq_len = max(title_seq_lens) - pad_texts_to_max_seq_len(titles, title_batch_max_seq_len, pad_token_id) - - if return_label: - labels = [entry[-1] for entry in batch] - return queries, titles, query_seq_lens, title_seq_lens, labels - else: - return queries, titles, query_seq_lens, title_seq_lens - - -def convert_example(example, vocab, unk_token_id=1, is_test=False): +def convert_example(example, tokenizer, is_test=False): """ Builds model inputs from a sequence for sequence classification tasks. It use `jieba.cut` to tokenize text. Args: example(obj:`list[str]`): List of input data, containing text and label if it have label. - vocab(obj:`dict`): The vocabulary. - unk_token_id(obj:`int`, defaults to 1): The unknown token id. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. is_test(obj:`False`, defaults to `False`): Whether the example contains label or not. Returns: @@ -121,13 +33,10 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False): """ query, title = example[0], example[1] - query_tokens = jieba.lcut(query) - title_tokens = jieba.lcut(title) - - query_ids = convert_tokens_to_ids(query_tokens, vocab) - query_seq_len = len(query_ids) - title_ids = convert_tokens_to_ids(title_tokens, vocab) - title_seq_len = len(title_ids) + query_ids = np.array(tokenizer.encode(query), dtype="int64") + query_seq_len = np.array(len(query_ids), dtype="int64") + title_ids = np.array(tokenizer.encode(title), dtype="int64") + title_seq_len = np.array(len(title_ids), dtype="int64") if not is_test: label = np.array(example[-1], dtype="int64") @@ -136,7 +45,7 @@ def convert_example(example, vocab, unk_token_id=1, is_test=False): return query_ids, title_ids, query_seq_len, title_seq_len -def preprocess_prediction_data(data, vocab): +def preprocess_prediction_data(data, tokenizer): """ It process the prediction data as the format used as training. @@ -144,6 +53,7 @@ def preprocess_prediction_data(data, vocab): data (obj:`List[List[str, str]]`): The prediction data whose each element is a text pair. Each text will be tokenized by jieba.lcut() function. + tokenizer(obj: paddlenlp.data.JiebaTokenizer): It use jieba to cut the chinese string. 
Returns: examples (obj:`list`): The processed data whose each element @@ -157,9 +67,7 @@ def preprocess_prediction_data(data, vocab): """ examples = [] for query, title in data: - query_tokens = jieba.lcut(query) - title_tokens = jieba.lcut(title) - query_ids = convert_tokens_to_ids(query_tokens, vocab) - title_ids = convert_tokens_to_ids(title_tokens, vocab) + query_ids = tokenizer.encode(query) + title_ids = tokenizer.encode(title) examples.append([query_ids, title_ids, len(query_ids), len(title_ids)]) return examples diff --git a/PaddleNLP/paddlenlp/__init__.py b/PaddleNLP/paddlenlp/__init__.py index 06fd93c8..9b31b4b7 100644 --- a/PaddleNLP/paddlenlp/__init__.py +++ b/PaddleNLP/paddlenlp/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '2.0.0b3' +__version__ = '2.0.0rc1' from . import data from . import datasets diff --git a/PaddleNLP/paddlenlp/models/senta.py b/PaddleNLP/paddlenlp/models/senta.py index 4aa36acd..a34b4ecd 100644 --- a/PaddleNLP/paddlenlp/models/senta.py +++ b/PaddleNLP/paddlenlp/models/senta.py @@ -112,13 +112,7 @@ class BoWModel(nn.Layer): a word embedding. Then, we encode these epresentations with a `BoWEncoder`. Lastly, we take the output of the encoder to create a final representation, which is passed through some feed-forward layers to output a logits (`output_layer`). - Args: - vocab_size (obj:`int`): The vocabulary size. - emb_dim (obj:`int`, optional, defaults to 128): The embedding dimension. - padding_idx (obj:`int`, optinal, defaults to 0) : The pad token index. - hidden_size (obj:`int`, optional, defaults to 128): The first full-connected layer hidden size. - fc_hidden_size (obj:`int`, optional, defaults to 96): The second full-connected layer hidden size. - num_classes (obj:`int`): All the labels that the data has. + """ def __init__(self, @@ -331,7 +325,7 @@ class SelfAttention(nn.Layer): Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification (Zhou et al., 2016). ref: https://www.aclweb.org/anthology/P16-2034/ Args: - hidden_size (obj:`int`): The number of expected features in the input x. + hidden_size (int): The number of expected features in the input x. """ def __init__(self, hidden_size): @@ -343,9 +337,10 @@ class SelfAttention(nn.Layer): def forward(self, input, mask=None): """ Args: - input (obj: `paddle.Tensor`) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. - mask (obj: `paddle.Tensor`, optional, defaults to `None`) of shape (batch, seq_len) : - Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. + input (paddle.Tensor) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. + mask (paddle.Tensor) of shape (batch, seq_len) : + Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. + Defaults to `None`. """ forward_input, backward_input = paddle.chunk(input, chunks=2, axis=2) # elementwise-sum forward_x and backward_x @@ -378,7 +373,7 @@ class SelfInteractiveAttention(nn.Layer): A close implementation of attention network of NAACL 2016 paper, Hierarchical Attention Networks for Document Classification (Yang et al., 2016). ref: https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf Args: - hidden_size (obj:`int`): The number of expected features in the input x. 
+ hidden_size (int): The number of expected features in the input x. """ def __init__(self, hidden_size): @@ -393,9 +388,10 @@ class SelfInteractiveAttention(nn.Layer): def forward(self, input, mask=None): """ Args: - input (obj: `paddle.Tensor`) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. - mask (obj: `paddle.Tensor`, optional, defaults to `None`) of shape (batch, seq_len) : + input (paddle.Tensor) of shape (batch, seq_len, input_size): Tensor containing the features of the input sequence. + mask (paddle.Tensor) of shape (batch, seq_len) : Tensor is a bool tensor, whose each element identifies whether the input word id is pad token or not. + Defaults to `None """ weight = self.input_weight.tile( repeat_times=(paddle.shape(input)[0], 1, 1)) @@ -434,11 +430,7 @@ class CNNModel(nn.Layer): outputs from the convolution layer and outputs the max. Lastly, we take the output of the encoder to create a final representation, which is passed through some feed-forward layers to output a logits (`output_layer`). - Args: - vocab_size (obj:`int`): The vocabulary size. - emb_dim (obj:`int`, optional, defaults to 128): The embedding dimension. - padding_idx (obj:`int`, optinal, defaults to 0) : The pad token index. - num_classes (obj:`int`): All the labels that the data has. + """ def __init__(self, @@ -483,11 +475,7 @@ class TextCNNModel(nn.Layer): outputs from the convolution layer and outputs the max. Lastly, we take the output of the encoder to create a final representation, which is passed through some feed-forward layers to output a logits (`output_layer`). - Args: - vocab_size (obj:`int`): The vocabulary size. - emb_dim (obj:`int`, optional, defaults to 128): The embedding dimension. - padding_idx (obj:`int`, optinal, defaults to 0) : The pad token index. - num_classes (obj:`int`): All the labels that the data has. + """ def __init__(self, diff --git a/PaddleNLP/paddlenlp/seq2vec/encoder.py b/PaddleNLP/paddlenlp/seq2vec/encoder.py index b34f66f9..28864131 100644 --- a/PaddleNLP/paddlenlp/seq2vec/encoder.py +++ b/PaddleNLP/paddlenlp/seq2vec/encoder.py @@ -31,8 +31,7 @@ class BoWEncoder(nn.Layer): and the output is of shape `(batch_size, emb_dim)`. Args: - # TODO: unify the docstring style with PaddlePaddle. - emb_dim(obj:`int`, required): It is the input dimension to the encoder. + emb_dim(int): It is the input dimension to the encoder. """ def __init__(self, emb_dim): @@ -59,12 +58,12 @@ class BoWEncoder(nn.Layer): It simply sums the embeddings of a sequence across the time dimension. Args: - inputs (obj: `paddle.Tensor`): Shape as `(batch_size, num_tokens, emb_dim)` + inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, emb_dim)` mask (obj: `paddle.Tensor`, optional, defaults to `None`): Shape same as `inputs`. Its each elements identify whether is padding token or not. If True, not padding token. If False, padding token. Returns: - summed (obj: `paddle.Tensor`): Shape of `(batch_size, emb_dim)`. The result vector of BagOfEmbedding. + summed (paddle.Tensor): Shape of `(batch_size, emb_dim)`. The result vector of BagOfEmbedding. """ if mask is not None: @@ -97,18 +96,18 @@ class CNNEncoder(nn.Layer): ref: https://arxiv.org/abs/1510.03820 Args: - emb_dim(object:`int`, required): + emb_dim(int): This is the input dimension to the encoder. - num_filter(object:`int`, required): + num_filter(int): This is the output dim for each convolutional layer, which is the number of "filters" learned by that layer. 
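The `seq2vec` encoders documented here all map a batch of token embeddings of shape `(batch_size, num_tokens, emb_dim)` to one vector per example. A small sketch of the `BoWEncoder` contract, with arbitrary sizes:

```python
import paddle
import paddle.nn as nn
from paddlenlp.seq2vec import BoWEncoder

embedder = nn.Embedding(num_embeddings=10000, embedding_dim=128, padding_idx=0)
encoder = BoWEncoder(emb_dim=128)

token_ids = paddle.randint(low=1, high=10000, shape=[4, 12])  # (batch_size, num_tokens)
embedded = embedder(token_ids)                                # (4, 12, 128)
summed = encoder(embedded)                                    # sum over the token axis
print(summed.shape)  # [4, 128]
```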
- ngram_filter_sizes(object: `Tuple[int]`, optional, default to `(2, 3, 4, 5)`):
+ ngram_filter_sizes(Tuple[int]):
 This specifies both the number of convolutional layers we will create and their sizes. The
 default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding ngrams
 of size 2 to 5 with some number of filters.
- conv_layer_activation(object: `str`, optional, default to `tanh`):
+ conv_layer_activation(str):
 Activation to use after the convolution layers.
- output_dim(object: `int`, optional, default to `None`):
+ output_dim(int):
 After doing convolutions and pooling, we'll project the collected features into a vector of this size.
 If this value is `None`, we will just return the result of the max pooling,
 giving an output of shape `len(ngram_filter_sizes) * num_filter`.
@@ -165,13 +164,13 @@ class CNNEncoder(nn.Layer):
 The combination of multiple convolution layers and max pooling layers.

 Args:
- inputs (obj: `paddle.Tensor`, required): Shape as `(batch_size, num_tokens, emb_dim)`
+ inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, emb_dim)`
 mask (obj: `paddle.Tensor`, optional, defaults to `None`): Shape same as `inputs`.
 Its each elements identify whether is padding token or not.
 If True, not padding token. If False, padding token.

 Returns:
- result (obj: `paddle.Tensor`): If output_dim is None, the result shape
+ result (paddle.Tensor): If output_dim is None, the result shape
 is of `(batch_size, output_dim)`; if not, the result shape
 is of `(batch_size, len(ngram_filter_sizes) * num_filter)`.

@@ -188,8 +187,8 @@ class CNNEncoder(nn.Layer):
 self._activation(conv(inputs)).squeeze(3) for conv in self.convs
 ]
 maxpool_out = [
- F.max_pool1d(
- t, kernel_size=t.shape[2]).squeeze(2) for t in convs_out
+ F.adaptive_max_pool1d(
+ t, output_size=1).squeeze(2) for t in convs_out
 ]
 result = paddle.concat(maxpool_out, axis=1)

@@ -221,8 +220,8 @@ class GRUEncoder(nn.Layer):
 E.g., setting num_layers=2 would mean stacking two GRUs together to form a stacked GRU,
 with the second GRU taking in outputs of the first GRU and computing the final results.
 direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
- It can be "forward" and "bidirect" (it means bidirection network).
- When "bidirect", the way to merge outputs of forward and backward is concatenating.
+ It can be `forward` or `bidirect` (i.e. a bidirectional network).
+ If `bidirect`, it is a bidirectional GRU, and returns the concatenated output from both directions.
 dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
 on the outputs of each GRU layer except the last layer, with dropout probability equal to dropout.
 pooling_type (obj: `str`, optional, defaults to obj:`None`): If `pooling_type` is None,
@@ -280,11 +279,11 @@ class GRUEncoder(nn.Layer):
 If not, output is of shape `(batch_size, hidden_size)`.

 Args:
- inputs (obj:`Paddle.Tensor`, required): Shape as `(batch_size, num_tokens, input_size)`.
- sequence_length (obj:`Paddle.Tensor`, required): Shape as `(batch_size)`.
+ inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, input_size)`.
+ sequence_length (paddle.Tensor): Shape as `(batch_size)`.

 Returns:
- last_hidden (obj:`Paddle.Tensor`, required): Shape as `(batch_size, hidden_size)`.
+ last_hidden (paddle.Tensor): Shape as `(batch_size, hidden_size)`.
 The hidden state at the last time step for every layer.
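The switch from `F.max_pool1d(t, kernel_size=t.shape[2])` to `F.adaptive_max_pool1d(t, output_size=1)` keeps the behaviour of a global max over the time axis without reading the static shape. A quick equivalence check with made-up shapes:

```python
import paddle
import paddle.nn.functional as F

t = paddle.rand([4, 256, 17])  # (batch_size, num_filter, seq_len)

old = F.max_pool1d(t, kernel_size=t.shape[2]).squeeze(2)   # pre-change pooling
new = F.adaptive_max_pool1d(t, output_size=1).squeeze(2)   # post-change pooling

print(paddle.allclose(old, new))  # True: both take the max over the whole sequence
```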
""" @@ -305,8 +304,8 @@ class GRUEncoder(nn.Layer): else: # We exploit the `encoded_text` (the hidden state at the every time step for last layer) # to create a single vector. We perform pooling on the encoded text. - # If gru is not bidirection, output is shape of `(batch_size, hidden_size)`. - # If gru is bidirection, then output is shape of `(batch_size, hidden_size*2)`. + # The output shape is `(batch_size, hidden_size*2)` if use bidirectional GRU, + # otherwise the output shape is `(batch_size, hidden_size*2)`. if self._pooling_type == 'sum': output = paddle.sum(encoded_text, axis=1) elif self._pooling_type == 'max': @@ -338,17 +337,17 @@ class LSTMEncoder(nn.Layer): lstm and backward lstm layer to create a single vector (shape of `(batch_size, hidden_size*2)`). Args: - input_size (obj:`int`, required): The number of expected features in the input (the last dimension). - hidden_size (obj:`int`, required): The number of features in the hidden state. - num_layers (obj:`int`, optional, defaults to 1): Number of recurrent layers. + input_size (int): The number of expected features in the input (the last dimension). + hidden_size (int): The number of features in the hidden state. + num_layers (int): Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. - direction (obj:`str`, optional, defaults to obj:`forwrd`): The direction of the network. - It can be "forward" and "bidirect" (it means bidirection network). - When "bidirection", the way to merge outputs of forward and backward is concatenating. - dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer + direction (str): The direction of the network. + It can be `forward` or `bidirect` (it means bidirection network). + If `biderect`, it is a birectional LSTM, and returns the concat output from both directions. + dropout (float): If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. - pooling_type (obj: `str`, optional, defaults to obj:`None`): If `pooling_type` is None, + pooling_type (str): If `pooling_type` is None, then the LSTMEncoder will return the hidden state of the last time step at last layer as a single vector. If pooling_type is not None, it must be one of `sum`, `max` and `mean`. Then it will be pooled on the LSTM output (the hidden state of every time step at last layer) to create a single vector. @@ -404,11 +403,11 @@ class LSTMEncoder(nn.Layer): If not, output is of shape `(batch_size, hidden_size)`. Args: - inputs (obj:`Paddle.Tensor`, required): Shape as `(batch_size, num_tokens, input_size)`. - sequence_length (obj:`Paddle.Tensor`, required): Shape as `(batch_size)`. + inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, input_size)`. + sequence_length (paddle.Tensor): Shape as `(batch_size)`. Returns: - last_hidden (obj:`Paddle.Tensor`, required): Shape as `(batch_size, hidden_size)`. + last_hidden (paddle.Tensor): Shape as `(batch_size, hidden_size)`. The hidden state at the last time step for every layer. """ @@ -429,8 +428,8 @@ class LSTMEncoder(nn.Layer): else: # We exploit the `encoded_text` (the hidden state at the every time step for last layer) # to create a single vector. We perform pooling on the encoded text. - # If lstm is not bidirection, output is shape of `(batch_size, hidden_size)`. 
- # If lstm is bidirection, then output is shape of `(batch_size, hidden_size*2)`.
+ # The output shape is `(batch_size, hidden_size*2)` if a bidirectional LSTM is used,
+ # otherwise the output shape is `(batch_size, hidden_size)`.
 if self._pooling_type == 'sum':
 output = paddle.sum(encoded_text, axis=1)
 elif self._pooling_type == 'max':
@@ -467,9 +466,9 @@ class RNNEncoder(nn.Layer):
 num_layers (obj:`int`, optional, defaults to 1): Number of recurrent layers.
 E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN,
 with the second RNN taking in outputs of the first RNN and computing the final results.
- direction (obj:`str`, optional, defaults to obj:`forwrd`): The direction of the network.
+ direction (obj:`str`, optional, defaults to obj:`forward`): The direction of the network.
 It can be "forward" and "bidirect" (it means bidirection network).
- When "bidirection", the way to merge outputs of forward and backward is concatenating.
+ If `bidirect`, it is a bidirectional RNN, and returns the concatenated output from both directions.
 dropout (obj:`float`, optional, defaults to 0.0): If non-zero, introduces a Dropout layer
 on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout.
 pooling_type (obj: `str`, optional, defaults to obj:`None`): If `pooling_type` is None,
@@ -528,11 +527,11 @@ class RNNEncoder(nn.Layer):
 If not, output is of shape `(batch_size, hidden_size)`.

 Args:
- inputs (obj:`Paddle.Tensor`, required): Shape as `(batch_size, num_tokens, input_size)`.
- sequence_length (obj:`Paddle.Tensor`, required): Shape as `(batch_size)`.
+ inputs (paddle.Tensor): Shape as `(batch_size, num_tokens, input_size)`.
+ sequence_length (paddle.Tensor): Shape as `(batch_size)`.

 Returns:
- last_hidden (obj:`Paddle.Tensor`, required): Shape as `(batch_size, hidden_size)`.
+ last_hidden (paddle.Tensor): Shape as `(batch_size, hidden_size)`.
 The hidden state at the last time step for every layer.
 """

@@ -553,8 +552,8 @@ class RNNEncoder(nn.Layer):
 else:
 # We exploit the `encoded_text` (the hidden state at the every time step for last layer)
 # to create a single vector. We perform pooling on the encoded text.
- # If rnn is not bidirection, output is shape of `(batch_size, hidden_size)`.
- # If rnn is bidirection, then output is shape of `(batch_size, hidden_size*2)`.
+ # The output shape is `(batch_size, hidden_size*2)` if a bidirectional RNN is used,
+ # otherwise the output shape is `(batch_size, hidden_size)`.
 if self._pooling_type == 'sum':
 output = paddle.sum(encoded_text, axis=1)
 elif self._pooling_type == 'max':
@@ -676,10 +675,10 @@ class TCNEncoder(nn.Layer):
 such as LSTMs in many tasks. See https://arxiv.org/pdf/1803.01271.pdf for more details.

 Args:
- input_size (obj:`int`, required): The number of expected features in the input (the last dimension).
- num_channels (obj:`list` or obj:`tuple`, required): The number of channels in different layer.
- kernel_size (obj:`int`, optional): The kernel size. Defaults to 2.
- dropout (obj:`float`, optional): The dropout probability. Defaults to 0.2.
+ input_size (int): The number of expected features in the input (the last dimension).
+ num_channels (list): The number of channels in different layers.
+ kernel_size (int): The kernel size. Defaults to 2.
+ dropout (float): The dropout probability. Defaults to 0.2.
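For these recurrent encoders the pooled (or last-step) vector is `hidden_size` wide for a forward network and `hidden_size * 2` for a bidirectional one. A shape check with arbitrary sizes:

```python
import paddle
from paddlenlp.seq2vec import LSTMEncoder

encoder = LSTMEncoder(
    input_size=128, hidden_size=64, direction='bidirect', pooling_type='max')

inputs = paddle.rand([4, 10, 128])                        # (batch_size, num_tokens, emb_dim)
seq_lens = paddle.to_tensor([10, 7, 9, 5], dtype='int64')

output = encoder(inputs, sequence_length=seq_lens)
print(output.shape)  # [4, 128] == hidden_size * 2; a forward-only encoder gives [4, 64]
```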
""" def __init__(self, input_size, num_channels, kernel_size=2, dropout=0.2): @@ -733,10 +732,10 @@ class TCNEncoder(nn.Layer): receptive filed = $2 * \sum_{i=0}^{len(num\_channels)-1}2^i(kernel\_size-1)$. Args: - inputs (obj:`Paddle.Tensor`, required): The input tensor with shape `[batch_size, num_tokens, input_size]`. + inputs (paddle.Tensor): The input tensor with shape `[batch_size, num_tokens, input_size]`. Returns: - output (obj:`Paddle.Tensor`): The output tensor with shape `[batch_size, num_channels[-1]]`. + output (paddle.Tensor): The output tensor with shape `[batch_size, num_channels[-1]]`. """ inputs_t = inputs.transpose([0, 2, 1]) output = self.network(inputs_t).transpose([2, 0, 1])[-1] diff --git a/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py b/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py index 67b5b811..0b2caf25 100644 --- a/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/bert/tokenizer.py @@ -442,6 +442,39 @@ class BertTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py b/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py index 1d81f5b2..04e34f27 100644 --- a/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/electra/tokenizer.py @@ -211,6 +211,39 @@ class ElectraTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py b/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py index 32f59b11..c20fb20b 100644 --- a/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/ernie/tokenizer.py @@ -650,6 +650,39 @@ class ErnieTinyTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py b/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py index 79733436..1d40ce0f 100644 --- a/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py +++ b/PaddleNLP/paddlenlp/transformers/roberta/tokenizer.py @@ -216,6 +216,39 @@ class RobertaTokenizer(PretrainedTokenizer): return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + def encode(self, text, text_pair=None, diff --git a/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py b/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py index da0cd64c..9c4d6613 100644 --- a/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py +++ b/PaddleNLP/paddlenlp/transformers/tokenizer_utils.py @@ -437,3 +437,23 @@ class PretrainedTokenizer(object): "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" ) return (ids, pair_ids, overflowing_tokens) + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optinal): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + return [0] * ((len(token_ids_1) + if token_ids_1 else 0) + len(token_ids_0)) -- GitLab
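A short usage sketch for the newly added `get_special_tokens_mask()`; the model name and sample texts are examples only:

```python
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('这个宾馆比较陈旧了'))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('交通方便'))

# Positions that will receive special tokens are marked 1, sequence tokens 0.
mask_single = tokenizer.get_special_tokens_mask(ids_a)
# -> [1] + [0] * len(ids_a) + [1]                            ([CLS] a [SEP])
mask_pair = tokenizer.get_special_tokens_mask(ids_a, ids_b)
# -> [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]   ([CLS] a [SEP] b [SEP])

# For ids that already contain [CLS]/[SEP], pass already_has_special_tokens=True.
formatted = [tokenizer.cls_token_id] + ids_a + [tokenizer.sep_token_id]
mask_formatted = tokenizer.get_special_tokens_mask(
    formatted, already_has_special_tokens=True)
```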